Skip to content

Commit 7bc5f0f

Browse files
authored
Merge pull request #610 from hwchase17/nc/web-loader-async-caller
Add timeout and retry to web loaders
2 parents 69f6268 + b871089 commit 7bc5f0f

File tree

2 files changed

+30
-5
lines changed

2 files changed

+30
-5
lines changed

langchain/src/document_loaders/cheerio_web_base.ts

Lines changed: 29 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,24 +2,49 @@ import type { CheerioAPI, load as LoadT } from "cheerio";
22
import { Document } from "../document.js";
33
import { BaseDocumentLoader } from "./base.js";
44
import type { DocumentLoader } from "./base.js";
5+
import { AsyncCaller, AsyncCallerParams } from "../util/async_caller.js";
6+
7+
/**
 * Options accepted by the web loader constructor: every field of
 * `AsyncCallerParams` plus an optional per-request fetch timeout.
 */
export interface WebBaseLoaderParams extends AsyncCallerParams {
8+
/**
9+
* The timeout in milliseconds for the fetch request. Defaults to 10s.
10+
*/
11+
timeout?: number;
12+
}
513

614
export class CheerioWebBaseLoader
715
extends BaseDocumentLoader
816
implements DocumentLoader
917
{
10-
constructor(public webPath: string) {
18+
timeout: number;
19+
20+
caller: AsyncCaller;
21+
22+
/**
 * Creates a loader for a single web page.
 *
 * @param webPath - URL of the page to load; stored as a public property.
 * @param fields - Optional settings. `timeout` is split off and stored on
 *   the instance (falling back to 10000 ms when it is null or undefined);
 *   all remaining fields are passed unchanged to the `AsyncCaller`
 *   constructor.
 */
constructor(public webPath: string, fields?: WebBaseLoaderParams) {
1123
super();
24+
const { timeout, ...rest } = fields ?? {};
25+
this.timeout = timeout ?? 10000;
26+
this.caller = new AsyncCaller(rest);
1227
}
1328

14-
static async _scrape(url: string): Promise<CheerioAPI> {
29+
/**
 * Fetches `url` and parses the response body text with the `load` function
 * obtained from `CheerioWebBaseLoader.imports()` (presumably cheerio's
 * `load` - confirm against `imports()`, which is not shown here).
 *
 * The request is issued through `caller.call(fetch, url, init)`; any retry
 * or concurrency behaviour is whatever `AsyncCaller` implements.
 *
 * @param url - Address of the page to fetch.
 * @param caller - Wrapper through which `fetch` is invoked.
 * @param timeout - Milliseconds before the request is aborted via
 *   `AbortSignal.timeout`. The check is a truthiness test, so `undefined`
 *   and `0` both mean that no abort signal is attached.
 * @returns The value returned by `load(html)`.
 *
 * NOTE(review): the response status is never inspected, so an HTTP error
 * page appears to be parsed like any other body - confirm this is intended.
 */
static async _scrape(
30+
url: string,
31+
caller: AsyncCaller,
32+
timeout: number | undefined
33+
): Promise<CheerioAPI> {
1534
const { load } = await CheerioWebBaseLoader.imports();
16-
const response = await fetch(url);
35+
const response = await caller.call(fetch, url, {
36+
signal: timeout ? AbortSignal.timeout(timeout) : undefined,
37+
});
1738
const html = await response.text();
1839
return load(html);
1940
}
2041

2142
/**
 * Scrapes this loader's own page by delegating to the static `_scrape`
 * with the instance's `webPath`, `caller` and `timeout`.
 *
 * @returns Whatever `CheerioWebBaseLoader._scrape` resolves to.
 */
async scrape(): Promise<CheerioAPI> {
22-
return CheerioWebBaseLoader._scrape(this.webPath);
43+
return CheerioWebBaseLoader._scrape(
44+
this.webPath,
45+
this.caller,
46+
this.timeout
47+
);
2348
}
2449

2550
async load(): Promise<Document[]> {

langchain/src/document_loaders/gitbook.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ export class GitbookLoader extends CheerioWebBaseLoader {
5454
for (const path of relative_paths) {
5555
const url = this.webPath + path;
5656
console.log(`Fetching text from ${url}`);
57-
const html = await GitbookLoader._scrape(url);
57+
const html = await GitbookLoader._scrape(url, this.caller, this.timeout);
5858
documents.push(...this.loadPath(html, url));
5959
}
6060
console.log(`Fetched ${documents.length} documents.`);

0 commit comments

Comments
 (0)