@@ -2,24 +2,49 @@ import type { CheerioAPI, load as LoadT } from "cheerio";
2
2
import { Document } from "../document.js" ;
3
3
import { BaseDocumentLoader } from "./base.js" ;
4
4
import type { DocumentLoader } from "./base.js" ;
5
+ import { AsyncCaller , AsyncCallerParams } from "../util/async_caller.js" ;
6
+
7
/**
 * Options for `CheerioWebBaseLoader`.
 *
 * Extends `AsyncCallerParams`: the loader's constructor strips `timeout` off
 * and passes every remaining field to the `AsyncCaller` it creates.
 */
export interface WebBaseLoaderParams extends AsyncCallerParams {
  /**
   * The timeout in milliseconds for the fetch request. Defaults to 10s.
   */
  timeout?: number;
}
5
13
6
14
export class CheerioWebBaseLoader
7
15
extends BaseDocumentLoader
8
16
implements DocumentLoader
9
17
{
10
/** Timeout, in milliseconds, handed to `_scrape` for each fetch request. */
timeout: number;

/** Caller through which the fetch request is dispatched. */
caller: AsyncCaller;

/**
 * @param webPath URL of the page to fetch.
 * @param fields Optional settings. `timeout` falls back to 10000 ms when
 *   omitted; all other fields are passed to the `AsyncCaller` constructor.
 */
constructor(public webPath: string, fields?: WebBaseLoaderParams) {
  super();
  // Separate the loader-specific option from the ones meant for AsyncCaller.
  const { timeout, ...rest } = fields ?? {};
  this.timeout = timeout ?? 10000;
  this.caller = new AsyncCaller(rest);
}
13
28
14
/**
 * Fetches `url` through `caller` and parses the response body as HTML.
 *
 * @param url Address of the page to fetch.
 * @param caller `AsyncCaller` used to invoke `fetch`.
 * @param timeout Milliseconds before the request is aborted. A falsy value
 *   (`undefined` or `0`) attaches no abort signal.
 * @returns The result of passing the response text to the `load` function
 *   obtained from `CheerioWebBaseLoader.imports()`.
 */
static async _scrape(
  url: string,
  caller: AsyncCaller,
  timeout: number | undefined
): Promise<CheerioAPI> {
  const { load } = await CheerioWebBaseLoader.imports();
  const response = await caller.call(fetch, url, {
    // Only build an abort signal when a non-zero timeout was supplied.
    signal: timeout ? AbortSignal.timeout(timeout) : undefined,
  });
  const html = await response.text();
  return load(html);
}
20
41
21
42
/**
 * Fetches and parses this loader's `webPath`, using the instance's own
 * `caller` and `timeout`.
 *
 * @returns The parsed document produced by `CheerioWebBaseLoader._scrape`.
 */
async scrape(): Promise<CheerioAPI> {
  return CheerioWebBaseLoader._scrape(
    this.webPath,
    this.caller,
    this.timeout
  );
}
24
49
25
50
async load ( ) : Promise < Document [ ] > {
0 commit comments