Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import {
test,
expect,
jest,
describe,
beforeEach,
afterAll,
} from "@jest/globals";
import { SupadataLoader } from "../web/supadata.js";

// Stubs for the two SDK client methods the loader calls.
const mockTranscript = jest.fn();
const mockYoutubeVideo = jest.fn();

// Fake `Supadata` constructor: each instantiation hands back a client object
// wired to the stubs above, while recording the constructor arguments.
const mockSupadataConstructor = jest.fn().mockImplementation(() => {
  const fakeClient = {
    transcript: mockTranscript,
    youtube: { video: mockYoutubeVideo },
  };
  return fakeClient;
});

// Replace the real SDK module with one exposing only the fake constructor.
jest.mock("@supadata/js", () => ({ Supadata: mockSupadataConstructor }));

// The environment object that was in place when this test module loaded.
const REAL_ENV = process.env;

beforeEach(() => {
  // Give each test its own shallow copy so that edits to `process.env`
  // never reach the original object.
  process.env = { ...REAL_ENV };

  // Wipe recorded calls on every mock, fully reset the SDK method stubs, and
  // clear the constructor's call history (its implementation is kept).
  jest.clearAllMocks();
  for (const sdkMethodMock of [mockTranscript, mockYoutubeVideo]) {
    sdkMethodMock.mockReset();
  }
  mockSupadataConstructor.mockClear();
});

afterAll(() => {
  // Put the original environment object back once the suite is done.
  process.env = REAL_ENV;
});

/**
 * Unit tests for SupadataLoader.
 *
 * `@supadata/js` is mocked in this file, so the loader talks to the
 * `mockTranscript` / `mockYoutubeVideo` stubs instead of the real SDK.
 */
describe("SupadataLoader", () => {
  const videoUrl = "https://youtube.com/watch?v=123";

  test("rejects an empty `urls` list at construction time", () => {
    expect(() => new SupadataLoader({ urls: [] })).toThrow(
      "at least one URL is required",
    );
  });

  test("initializes with API key", async () => {
    mockTranscript.mockResolvedValue({ content: "test", lang: "en" });

    const loader = new SupadataLoader({
      urls: [videoUrl],
      apiKey: "test-key",
    });

    await loader.load();

    expect(mockSupadataConstructor).toHaveBeenCalledWith({
      apiKey: "test-key",
    });
  });

  test("falls back to the SUPADATA_API_KEY environment variable", async () => {
    // `process.env` is a per-test copy (see beforeEach), so this does not leak.
    process.env.SUPADATA_API_KEY = "env-key";
    mockTranscript.mockResolvedValue({ content: "test", lang: "en" });

    const loader = new SupadataLoader({ urls: [videoUrl] });

    await loader.load();

    expect(mockSupadataConstructor).toHaveBeenCalledWith({
      apiKey: "env-key",
    });
  });

  test("fails when no API key is available", async () => {
    // Remove the variable explicitly in case the host environment defines it.
    delete process.env.SUPADATA_API_KEY;

    const loader = new SupadataLoader({ urls: [videoUrl] });

    await expect(loader.load()).rejects.toThrow("Supadata API key not found");
    expect(mockSupadataConstructor).not.toHaveBeenCalled();
  });

  test("fetches transcript successfully", async () => {
    mockTranscript.mockResolvedValue({
      content: "Hello world",
      lang: "en",
    });

    const loader = new SupadataLoader({
      urls: [videoUrl],
      apiKey: "test-key",
      operation: "transcript",
    });

    const docs = await loader.load();

    expect(mockTranscript).toHaveBeenCalledWith(
      expect.objectContaining({
        url: videoUrl,
        text: true,
      }),
    );
    expect(docs).toHaveLength(1);
    expect(docs[0].pageContent).toBe("Hello world");
  });

  test("fetches metadata successfully", async () => {
    mockYoutubeVideo.mockResolvedValue({ title: "Awesome Video" });

    const loader = new SupadataLoader({
      urls: [videoUrl],
      apiKey: "test-key",
      operation: "metadata",
    });

    const docs = await loader.load();

    expect(mockYoutubeVideo).toHaveBeenCalled();
    expect(docs).toHaveLength(1);
    expect(docs[0].pageContent).toContain("Awesome Video");
    expect(docs[0].metadata.supadataOperation).toBe("metadata");
  });
});
200 changes: 200 additions & 0 deletions libs/langchain-community/src/document_loaders/web/supadata.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import { Document } from "@langchain/core/documents";
import { getEnvironmentVariable } from "@langchain/core/utils/env";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";

/** Operations the loader can run against each URL. */
export type SupadataOperation = "metadata" | "transcript";

/** Constructor options accepted by `SupadataLoader`. */
export interface SupadataLoaderParams {
  /** URLs to load (YouTube, web pages, etc.). Must contain at least one entry. */
  urls: string[];

  /**
   * Supadata API key. When left out, the loader falls back to the
   * SUPADATA_API_KEY environment variable.
   */
  apiKey?: string;

  /**
   * Which operation to run: "metadata" yields structured info, "transcript"
   * yields textual content. Defaults to "transcript".
   */
  operation?: SupadataOperation;

  /** Preferred transcript language, e.g. "en". */
  lang?: string;

  /**
   * When true, request a plain-text transcript rather than timestamped
   * chunks. Defaults to true.
   */
  text?: boolean;

  /** Transcript mode, e.g. "native", "auto", or "generate". */
  mode?: "native" | "auto" | "generate";

  /** Extra parameters forwarded directly to the Supadata SDK. */
  params?: Record<string, unknown>;
}

/**
 * Document loader that wraps the Supadata JavaScript SDK (`@supadata/js`).
 *
 * Supports two operations:
 * - "transcript": fetch a transcript for each URL (the default)
 * - "metadata": fetch metadata for each URL
 *
 * The Supadata API key is read either from the `apiKey` parameter or from
 * the `SUPADATA_API_KEY` environment variable. The SDK is imported lazily
 * inside `load()`.
 *
 * Loading is best-effort per URL: a URL whose request fails is reported via
 * `console.warn` and skipped, so `load()` can return fewer documents than
 * there are URLs.
 */
export class SupadataLoader extends BaseDocumentLoader {
  private readonly urls: string[];

  private readonly apiKey?: string;

  private readonly operation: SupadataOperation;

  private readonly lang?: string;

  private readonly text: boolean;

  private readonly mode?: "native" | "auto" | "generate";

  private readonly params: Record<string, unknown>;

  /**
   * @param params Loader options; see `SupadataLoaderParams`.
   * @throws If `urls` is missing or empty.
   */
  constructor(params: SupadataLoaderParams) {
    super();

    if (!params.urls || params.urls.length === 0) {
      throw new Error(
        "SupadataLoader: at least one URL is required in `urls`.",
      );
    }

    // Copy so that later mutation of the caller's array cannot change what
    // this loader fetches.
    this.urls = [...params.urls];
    this.apiKey = params.apiKey;
    this.operation = params.operation ?? "transcript";
    this.lang = params.lang;
    this.text = params.text ?? true;
    this.mode = params.mode;
    this.params = params.params ?? {};
  }

  /**
   * Runs the configured operation against every URL, one after another.
   *
   * @returns Documents for the URLs that loaded successfully, in input order.
   * @throws If the operation is unsupported, no API key can be resolved, or
   *   `@supadata/js` cannot be imported.
   */
  async load(): Promise<Document[]> {
    // An unsupported operation is a configuration error that affects every
    // URL, so raise it once here instead of letting the per-URL catch below
    // swallow it. Widened to `string` because untyped callers can pass
    // values outside the declared union.
    const operation: string = this.operation;
    if (operation !== "metadata" && operation !== "transcript") {
      throw new Error(
        `SupadataLoader: unsupported operation "${operation}". Use "metadata" or "transcript".`,
      );
    }

    const client = await this.getClient();
    const docs: Document[] = [];

    for (const url of this.urls) {
      try {
        const doc =
          operation === "metadata"
            ? await this.loadMetadata(client, url)
            : await this.loadTranscript(client, url);
        docs.push(doc);
      } catch (e: unknown) {
        // Surface the failure but keep other URLs processing.
        // Prefer a `message` property when the thrown value has one;
        // otherwise fall back to the thrown value itself.
        const message = (e as { message?: unknown } | null | undefined)
          ?.message;
        // eslint-disable-next-line no-console
        console.warn(
          `SupadataLoader: failed to load ${url}: ${String(message ?? e)}`,
        );
      }
    }

    return docs;
  }

  /**
   * Picks the API key: the explicit `apiKey` option wins, then the
   * `SUPADATA_API_KEY` environment variable.
   *
   * @throws If neither source provides a key.
   */
  private resolveApiKey(): string {
    if (this.apiKey) {
      return this.apiKey;
    }

    const envKey = getEnvironmentVariable("SUPADATA_API_KEY");
    if (!envKey) {
      throw new Error(
        "SupadataLoader: Supadata API key not found. Pass `apiKey` to the loader or set the SUPADATA_API_KEY environment variable.",
      );
    }
    return envKey;
  }

  /**
   * Lazily imports `@supadata/js` and builds a client with the resolved key.
   *
   * @throws If the API key is missing or the package cannot be imported.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private async getClient(): Promise<any> {
    const apiKey = this.resolveApiKey();

    // Only the import is guarded: an error thrown by the SDK constructor is
    // not an installation problem and must not be reported as one.
    let sdk: typeof import("@supadata/js");
    try {
      sdk = await import("@supadata/js");
    } catch {
      throw new Error(
        "SupadataLoader: failed to load `@supadata/js`. Please install it with `npm install @supadata/js` (or `pnpm add @supadata/js`).",
      );
    }

    return new sdk.Supadata({ apiKey });
  }

  /**
   * Reports whether `url` points at a YouTube host (`youtube.com`, any of its
   * subdomains, or `youtu.be`). Unparseable URLs count as non-YouTube.
   */
  private static isYoutubeUrl(url: string): boolean {
    try {
      const hostname = new URL(url).hostname.toLowerCase();
      return (
        hostname === "youtube.com" ||
        hostname.endsWith(".youtube.com") ||
        hostname === "youtu.be"
      );
    } catch {
      // If URL parsing fails, treat as non-YouTube
      return false;
    }
  }

  /**
   * Fetches metadata for one URL and wraps the pretty-printed JSON result in
   * a Document. YouTube URLs go to `client.youtube.video` when the client
   * exposes it; everything else goes to `client.web.scrape`.
   *
   * @throws If the client exposes no usable method for this URL.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private async loadMetadata(client: any, url: string): Promise<Document> {
    const isYoutube = SupadataLoader.isYoutubeUrl(url);

    // NOTE(review): confirm these argument shapes against the installed
    // `@supadata/js` version — the SDK may expect `youtube.video({ id })`
    // and a bare URL string for `web.scrape`.
    let result: unknown;
    if (isYoutube && client.youtube?.video) {
      result = await client.youtube.video({ url, ...this.params });
    } else if (client.web?.scrape) {
      result = await client.web.scrape({ url, ...this.params });
    } else {
      throw new Error(
        "SupadataLoader: could not determine a Supadata SDK method to call for metadata. " +
          "Ensure the SDK version exposes either `youtube.video` or `web.scrape`.",
      );
    }

    return new Document({
      pageContent: JSON.stringify(result, null, 2),
      metadata: {
        source: url,
        supadataOperation: "metadata",
      },
    });
  }

  /**
   * Requests a transcript for one URL.
   *
   * If the response carries a `jobId`, the returned Document is a placeholder
   * tagged `transcript_job` that records the job ID. Otherwise the transcript
   * content becomes `pageContent`; non-string content (e.g. timestamped
   * chunks when `text` is false) is serialized as pretty-printed JSON so that
   * `pageContent` is always a string.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private async loadTranscript(client: any, url: string): Promise<Document> {
    // `params` is spread after `url`/`text` and may override them; `lang` and
    // `mode` are applied afterwards and therefore take precedence over it.
    const payload: Record<string, unknown> = {
      url,
      text: this.text,
      ...this.params,
    };

    if (this.lang) {
      payload.lang = this.lang;
    }
    if (this.mode) {
      payload.mode = this.mode;
    }

    const result = await client.transcript(payload);

    if (result.jobId) {
      return new Document({
        pageContent: `Transcript processing. Job ID: ${result.jobId}`,
        metadata: {
          source: url,
          supadataOperation: "transcript_job",
          jobId: result.jobId,
        },
      });
    }

    const content: unknown = result.content;
    let pageContent: string;
    if (typeof content === "string") {
      pageContent = content;
    } else if (content == null) {
      pageContent = "";
    } else {
      // Structured content: serialize the same way loadMetadata does.
      pageContent = JSON.stringify(content, null, 2);
    }

    return new Document({
      pageContent,
      metadata: {
        source: url,
        supadataOperation: "transcript",
        lang: result.lang,
      },
    });
  }
}