|
| 1 | +import FirecrawlApp from "@mendable/firecrawl-js"; |
| 2 | +import { Document, type DocumentInterface } from "@langchain/core/documents"; |
| 3 | +import { getEnvironmentVariable } from "@langchain/core/utils/env"; |
| 4 | +import { BaseDocumentLoader } from "../base.js"; |
| 5 | + |
/**
 * Options accepted by the Firecrawl loader constructor: the target URL,
 * credentials, the operating mode, and extra request options.
 */
interface FirecrawlLoaderParameters {
  /**
   * The URL that will be scraped or crawled.
   */
  url: string;

  /**
   * Selects the operation to run: "scrape" or "crawl".
   * When omitted, the loader uses "crawl".
   */
  mode?: "scrape" | "crawl";

  /**
   * Firecrawl API key. When omitted, the loader reads the
   * FIRECRAWL_API_KEY environment variable instead.
   */
  apiKey?: string;

  /**
   * Extra options handed, unchanged, to the Firecrawl client call as its
   * second argument.
   */
  params?: Record<string, unknown>;
}
/**
 * Shape of a single document as consumed from Firecrawl responses.
 *
 * Both fields are optional because the loader does not rely on either being
 * present: it substitutes an empty string / empty object when one is missing.
 */
interface FirecrawlDocument {
  /**
   * Markdown content of the page; used as the document's page content.
   */
  markdown?: string;

  /**
   * Metadata attached to the page; copied onto the resulting document.
   */
  metadata?: Record<string, unknown>;
}
| 31 | + |
/**
 * Class representing a document loader for loading data from
 * Firecrawl (firecrawl.dev). It extends the BaseDocumentLoader class.
 *
 * Depending on the configured mode it either scrapes the single URL
 * ("scrape") or crawls starting from it ("crawl", the default), and maps
 * every returned item to a Document whose page content is the item's
 * markdown and whose metadata is the item's metadata.
 * @example
 * ```typescript
 * const loader = new FireCrawlLoader({
 *   url: "{url}",
 *   apiKey: "{apiKey}",
 *   mode: "crawl"
 * });
 * const docs = await loader.load();
 * ```
 */
export class FireCrawlLoader extends BaseDocumentLoader {
  private readonly apiKey: string;

  private readonly url: string;

  private readonly mode: "crawl" | "scrape";

  private readonly params?: Record<string, unknown>;

  /**
   * Creates a loader for the given URL.
   * @param loaderParams URL, optional API key, optional mode and optional
   * extra options forwarded to the Firecrawl client.
   * @throws An error if no API key is passed and FIRECRAWL_API_KEY is not set.
   */
  constructor(loaderParams: FirecrawlLoaderParameters) {
    super();
    const {
      // Destructuring defaults apply only when the property is undefined.
      apiKey = getEnvironmentVariable("FIRECRAWL_API_KEY"),
      url,
      mode = "crawl",
      params,
    } = loaderParams;
    if (!apiKey) {
      throw new Error(
        "Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl."
      );
    }

    this.apiKey = apiKey;
    this.url = url;
    this.mode = mode;
    this.params = params;
  }

  /**
   * Loads the data from the Firecrawl.
   * @returns An array of Documents representing the retrieved data.
   * @throws An error if the scrape is reported as unsuccessful, if the
   * response carries no usable data, or if the mode is not recognized.
   */
  public async load(): Promise<DocumentInterface[]> {
    const app = new FirecrawlApp({ apiKey: this.apiKey });
    let firecrawlDocs: FirecrawlDocument[];

    if (this.mode === "scrape") {
      const response = await app.scrapeUrl(this.url, this.params);
      if (!response.success) {
        throw new Error(
          `Firecrawl: Failed to scrape URL. Error: ${response.error}`
        );
      }
      // A "successful" response without a payload would otherwise surface
      // later as an opaque TypeError while mapping; fail with a clear message.
      if (response.data == null) {
        throw new Error(
          "Firecrawl: Scrape reported success but returned no data."
        );
      }
      firecrawlDocs = [response.data as FirecrawlDocument];
    } else if (this.mode === "crawl") {
      // NOTE(review): the third argument `true` presumably asks the client to
      // wait for the crawl to finish and return the documents — confirm
      // against the installed firecrawl-js version.
      const response: unknown = await app.crawlUrl(
        this.url,
        this.params,
        true
      );
      // Guard the cast: anything other than an array cannot be mapped below.
      if (!Array.isArray(response)) {
        throw new Error(
          "Firecrawl: Crawl did not return a list of documents."
        );
      }
      firecrawlDocs = response as FirecrawlDocument[];
    } else {
      // Unreachable for type-checked callers; kept as a runtime guard for
      // untyped (plain JavaScript) usage.
      throw new Error(
        `Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`
      );
    }

    return firecrawlDocs.map(
      (doc) =>
        new Document({
          pageContent: doc.markdown ?? "",
          metadata: doc.metadata ?? {},
        })
    );
  }
}
0 commit comments