Skip to content

Commit e0da231

Browse files
nickscamarabracesprouljacoblee93
authored
langchain[minor]: Firecrawl Document Loader (#5180)
* Nick: init * Update firecrawl.ts * Nick: * Nick: * Update package.json * Nick: fixes docs * Update yarn.lock * Update examples/src/document_loaders/firecrawl.ts Co-authored-by: Brace Sproul <[email protected]> * Update langchain/src/document_loaders/web/firecrawl.ts Co-authored-by: Brace Sproul <[email protected]> * Nick: fixes * Update yarn.lock * Fix yarn.lock * lint & format * Update firecrawl.ts * Add entrypoint --------- Co-authored-by: Brace Sproul <[email protected]> Co-authored-by: Jacob Lee <[email protected]>
1 parent e77fcec commit e0da231

File tree

9 files changed

+233
-1
lines changed

9 files changed

+233
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
---
2+
hide_table_of_contents: true
3+
---
4+
5+
# Firecrawl
6+
7+
This guide shows how to use [Firecrawl](https://firecrawl.dev) with LangChain to load web data into an LLM-ready format.
8+
9+
## Overview
10+
11+
[FireCrawl](https://firecrawl.dev) crawls and converts any website into LLM-ready data. It crawls all accessible subpages and gives you clean markdown and metadata for each. No sitemap required.
12+
13+
FireCrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. Built by the [mendable.ai](https://mendable.ai) team.
14+
15+
This guide shows how to scrape and crawl entire websites and load them using the `FireCrawlLoader` in LangChain.
16+
17+
## Setup
18+
19+
Sign up and get your free [FireCrawl API key](https://firecrawl.dev) to start. FireCrawl offers 100 free credits to get you started, and it's [open-source](https://github.com/mendableai/firecrawl) in case you want to self-host.
20+
21+
## Usage
22+
23+
Here's an example of how to use the `FireCrawlLoader` to load web pages as documents:
24+
25+
Firecrawl offers 2 modes: `scrape` and `crawl`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website.
26+
27+
import CodeBlock from "@theme/CodeBlock";
28+
import Example from "@examples/document_loaders/firecrawl.ts";
29+
30+
```bash npm2yarn
31+
npm install @mendable/firecrawl-js
32+
```
33+
34+
<CodeBlock language="typescript">{Example}</CodeBlock>
35+
36+
### Additional Parameters
37+
38+
For `params` you can pass any of the params according to the [Firecrawl documentation](https://docs.firecrawl.dev).
+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import { FireCrawlLoader } from "langchain/document_loaders/web/firecrawl";
2+
3+
// Configure the loader for a single page; switch `mode` to "crawl" to load
// every accessible subpage instead.
const loader = new FireCrawlLoader({
  // The URL to scrape
  url: "https://firecrawl.dev",
  // Optional, defaults to `FIRECRAWL_API_KEY` in your env.
  apiKey: process.env.FIRECRAWL_API_KEY,
  // The mode to run the crawler in. Can be "scrape" for single urls or "crawl" for all accessible subpages
  mode: "scrape",
  params: {
    // optional parameters based on Firecrawl API docs
    // For API documentation, visit https://docs.firecrawl.dev
  },
});

// Fetch the page(s) and convert them into LangChain documents.
const docs = await loader.load();

langchain/.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,10 @@ document_loaders/web/figma.cjs
566566
document_loaders/web/figma.js
567567
document_loaders/web/figma.d.ts
568568
document_loaders/web/figma.d.cts
569+
document_loaders/web/firecrawl.cjs
570+
document_loaders/web/firecrawl.js
571+
document_loaders/web/firecrawl.d.ts
572+
document_loaders/web/firecrawl.d.cts
569573
document_loaders/web/github.cjs
570574
document_loaders/web/github.js
571575
document_loaders/web/github.d.ts

langchain/langchain.config.js

+2
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ export const config = {
191191
"document_loaders/web/hn": "document_loaders/web/hn",
192192
"document_loaders/web/imsdb": "document_loaders/web/imsdb",
193193
"document_loaders/web/figma": "document_loaders/web/figma",
194+
"document_loaders/web/firecrawl": "document_loaders/web/firecrawl",
194195
"document_loaders/web/github": "document_loaders/web/github",
195196
"document_loaders/web/notiondb": "document_loaders/web/notiondb",
196197
"document_loaders/web/notionapi": "document_loaders/web/notionapi",
@@ -637,6 +638,7 @@ export const config = {
637638
"document_loaders/web/hn",
638639
"document_loaders/web/imsdb",
639640
"document_loaders/web/figma",
641+
"document_loaders/web/firecrawl",
640642
"document_loaders/web/github",
641643
"document_loaders/web/pdf",
642644
"document_loaders/web/notiondb",

langchain/package.json

+18
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,10 @@
578578
"document_loaders/web/figma.js",
579579
"document_loaders/web/figma.d.ts",
580580
"document_loaders/web/figma.d.cts",
581+
"document_loaders/web/firecrawl.cjs",
582+
"document_loaders/web/firecrawl.js",
583+
"document_loaders/web/firecrawl.d.ts",
584+
"document_loaders/web/firecrawl.d.cts",
581585
"document_loaders/web/github.cjs",
582586
"document_loaders/web/github.js",
583587
"document_loaders/web/github.d.ts",
@@ -1230,6 +1234,7 @@
12301234
"@google-cloud/storage": "^7.7.0",
12311235
"@jest/globals": "^29.5.0",
12321236
"@langchain/scripts": "~0.0",
1237+
"@mendable/firecrawl-js": "^0.0.13",
12331238
"@notionhq/client": "^2.2.10",
12341239
"@pinecone-database/pinecone": "^1.1.0",
12351240
"@supabase/supabase-js": "^2.10.0",
@@ -1314,6 +1319,7 @@
13141319
"@gomomento/sdk-web": "^1.51.1",
13151320
"@google-ai/generativelanguage": "^0.2.1",
13161321
"@google-cloud/storage": "^6.10.1 || ^7.7.0",
1322+
"@mendable/firecrawl-js": "^0.0.13",
13171323
"@notionhq/client": "^2.2.10",
13181324
"@pinecone-database/pinecone": "*",
13191325
"@supabase/supabase-js": "^2.10.0",
@@ -1386,6 +1392,9 @@
13861392
"@google-cloud/storage": {
13871393
"optional": true
13881394
},
1395+
"@mendable/firecrawl-js": {
1396+
"optional": true
1397+
},
13891398
"@notionhq/client": {
13901399
"optional": true
13911400
},
@@ -2826,6 +2835,15 @@
28262835
"import": "./document_loaders/web/figma.js",
28272836
"require": "./document_loaders/web/figma.cjs"
28282837
},
2838+
"./document_loaders/web/firecrawl": {
2839+
"types": {
2840+
"import": "./document_loaders/web/firecrawl.d.ts",
2841+
"require": "./document_loaders/web/firecrawl.d.cts",
2842+
"default": "./document_loaders/web/firecrawl.d.ts"
2843+
},
2844+
"import": "./document_loaders/web/firecrawl.js",
2845+
"require": "./document_loaders/web/firecrawl.cjs"
2846+
},
28292847
"./document_loaders/web/github": {
28302848
"types": {
28312849
"import": "./document_loaders/web/github.d.ts",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/* eslint-disable no-process-env */
2+
/* eslint-disable @typescript-eslint/no-non-null-assertion */
3+
import { test } from "@jest/globals";
4+
import { Document } from "@langchain/core/documents";
5+
import { FireCrawlLoader } from "../web/firecrawl.js";
6+
7+
// Both tests hit the live Firecrawl API, so each gets an explicit timeout
// instead of relying on Jest's 5 second default.
test("Test FireCrawlLoader load method with scrape mode", async () => {
  const loader = new FireCrawlLoader({
    url: "https://firecrawl.dev",
    apiKey: process.env.FIRECRAWL_API_KEY,
    mode: "scrape",
  });

  const documents = await loader.load();
  // Scrape mode loads exactly the one page that was requested.
  expect(documents).toHaveLength(1);
  const document = documents[0];
  expect(document).toBeInstanceOf(Document);
  expect(document.pageContent).toBeTruthy();
  expect(document.metadata).toBeTruthy();
}, 15000);

test("Test FireCrawlLoader load method with crawl mode", async () => {
  const loader = new FireCrawlLoader({
    url: "https://firecrawl.dev",
    apiKey: process.env.FIRECRAWL_API_KEY,
    mode: "crawl",
  });

  const documents = await loader.load();
  // The number of crawled pages is not fixed; only require a non-empty result
  // so that an empty crawl fails here rather than on `documents[0]` below.
  expect(documents.length).toBeGreaterThan(0);
  const document = documents[0];
  expect(document).toBeInstanceOf(Document);
  expect(document.pageContent).toBeTruthy();
  expect(document.metadata).toBeTruthy();
}, 15000);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import FirecrawlApp from "@mendable/firecrawl-js";
2+
import { Document, type DocumentInterface } from "@langchain/core/documents";
3+
import { getEnvironmentVariable } from "@langchain/core/utils/env";
4+
import { BaseDocumentLoader } from "../base.js";
5+
6+
/**
 * Interface representing the parameters for the Firecrawl loader. It
 * includes properties such as the URL to scrape or crawl and the API key.
 */
interface FirecrawlLoaderParameters {
  /**
   * URL to scrape or crawl
   */
  url: string;

  /**
   * API key for Firecrawl. If not provided, the default value is the value of the FIRECRAWL_API_KEY environment variable.
   */
  apiKey?: string;

  /**
   * Mode of operation. Can be either "crawl" or "scrape". If not provided, the default value is "crawl".
   */
  mode?: "crawl" | "scrape";

  /**
   * Additional options passed through unchanged as the second argument of
   * the Firecrawl client's `scrapeUrl` / `crawlUrl` call.
   */
  params?: Record<string, unknown>;
}

/**
 * The fields of a Firecrawl result that the loader reads. Both are treated
 * as optional because the loader falls back to defaults when they are absent.
 */
interface FirecrawlDocument {
  markdown?: string;
  metadata?: Record<string, unknown>;
}

/**
 * Class representing a document loader for loading data from
 * Firecrawl (firecrawl.dev). It extends the BaseDocumentLoader class.
 * @example
 * ```typescript
 * const loader = new FireCrawlLoader({
 *   url: "{url}",
 *   apiKey: "{apiKey}",
 *   mode: "crawl"
 * });
 * const docs = await loader.load();
 * ```
 */
export class FireCrawlLoader extends BaseDocumentLoader {
  private apiKey: string;

  private url: string;

  private mode: "crawl" | "scrape";

  private params?: Record<string, unknown>;

  /**
   * @param loaderParams URL, optional API key, mode and extra Firecrawl params.
   * @throws An error if no API key is passed and FIRECRAWL_API_KEY is not set.
   */
  constructor(loaderParams: FirecrawlLoaderParameters) {
    super();
    const {
      apiKey = getEnvironmentVariable("FIRECRAWL_API_KEY"),
      url,
      mode = "crawl",
      params,
    } = loaderParams;
    if (!apiKey) {
      throw new Error(
        "Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl."
      );
    }

    this.apiKey = apiKey;
    this.url = url;
    this.mode = mode;
    this.params = params;
  }

  /**
   * Loads the data from the Firecrawl.
   * @returns An array of Documents representing the retrieved data.
   * @throws An error if the data could not be loaded.
   */
  public async load(): Promise<DocumentInterface[]> {
    const app = new FirecrawlApp({ apiKey: this.apiKey });
    let firecrawlDocs: FirecrawlDocument[];

    if (this.mode === "scrape") {
      const response = await app.scrapeUrl(this.url, this.params);
      if (!response.success) {
        throw new Error(
          `Firecrawl: Failed to scrape URL. Error: ${response.error}`
        );
      }
      // A "successful" response without a payload would otherwise surface as
      // an opaque TypeError while mapping below.
      if (!response.data) {
        throw new Error(
          "Firecrawl: Failed to scrape URL. Error: response contained no data"
        );
      }
      firecrawlDocs = [response.data as FirecrawlDocument];
    } else if (this.mode === "crawl") {
      // Third argument `true` asks the client to wait for the crawl job to
      // finish (assumed from the SDK's `waitUntilDone` flag; confirm against
      // the pinned version), so the result is used as the list of pages.
      const response = await app.crawlUrl(this.url, this.params, true);
      if (!Array.isArray(response)) {
        throw new Error(
          "Firecrawl: Failed to crawl URL. Error: response was not a list of documents"
        );
      }
      firecrawlDocs = response as FirecrawlDocument[];
    } else {
      // Unreachable for type-checked callers; guards plain JavaScript usage.
      throw new Error(
        `Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`
      );
    }

    return firecrawlDocs.map(
      (doc) =>
        new Document({
          // Optional chaining tolerates null entries in a crawl result.
          pageContent: doc?.markdown || "",
          metadata: doc?.metadata || {},
        })
    );
  }
}

langchain/src/load/import_constants.ts

+1
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ export const optionalImportEntrypoints: string[] = [
9191
"langchain/document_loaders/web/hn",
9292
"langchain/document_loaders/web/imsdb",
9393
"langchain/document_loaders/web/figma",
94+
"langchain/document_loaders/web/firecrawl",
9495
"langchain/document_loaders/web/github",
9596
"langchain/document_loaders/web/notiondb",
9697
"langchain/document_loaders/web/notionapi",

yarn.lock

+15-1
Original file line numberDiff line numberDiff line change
@@ -10092,6 +10092,16 @@ __metadata:
1009210092
languageName: node
1009310093
linkType: hard
1009410094

10095+
"@mendable/firecrawl-js@npm:^0.0.13":
10096+
version: 0.0.13
10097+
resolution: "@mendable/firecrawl-js@npm:0.0.13"
10098+
dependencies:
10099+
axios: ^1.6.8
10100+
dotenv: ^16.4.5
10101+
checksum: 9dbc0b6e5d300bb9ef9f45cebd5c0026ac468863984cdc73a57ed6fdf888eaead5f9e2325c6848d03897c72cab195fffb4ce7d832e39696a11216bc53b417b6d
10102+
languageName: node
10103+
linkType: hard
10104+
1009510105
"@mistralai/mistralai@npm:^0.1.3":
1009610106
version: 0.1.3
1009710107
resolution: "@mistralai/mistralai@npm:0.1.3"
@@ -16954,7 +16964,7 @@ __metadata:
1695416964
languageName: node
1695516965
linkType: hard
1695616966

16957-
"axios@npm:^1.6.2":
16967+
"axios@npm:^1.6.2, axios@npm:^1.6.8":
1695816968
version: 1.6.8
1695916969
resolution: "axios@npm:1.6.8"
1696016970
dependencies:
@@ -26670,6 +26680,7 @@ __metadata:
2667026680
"@langchain/openai": ~0.0.28
2667126681
"@langchain/scripts": ~0.0
2667226682
"@langchain/textsplitters": ~0.0.0
26683+
"@mendable/firecrawl-js": ^0.0.13
2667326684
"@notionhq/client": ^2.2.10
2667426685
"@pinecone-database/pinecone": ^1.1.0
2667526686
"@supabase/supabase-js": ^2.10.0
@@ -26766,6 +26777,7 @@ __metadata:
2676626777
"@gomomento/sdk-web": ^1.51.1
2676726778
"@google-ai/generativelanguage": ^0.2.1
2676826779
"@google-cloud/storage": ^6.10.1 || ^7.7.0
26780+
"@mendable/firecrawl-js": ^0.0.13
2676926781
"@notionhq/client": ^2.2.10
2677026782
"@pinecone-database/pinecone": "*"
2677126783
"@supabase/supabase-js": ^2.10.0
@@ -26827,6 +26839,8 @@ __metadata:
2682726839
optional: true
2682826840
"@google-cloud/storage":
2682926841
optional: true
26842+
"@mendable/firecrawl-js":
26843+
optional: true
2683026844
"@notionhq/client":
2683126845
optional: true
2683226846
"@pinecone-database/pinecone":

0 commit comments

Comments
 (0)