@@ -22,7 +22,7 @@ import {
22
22
import { Telemetry } from "../../util/posthog.js" ;
23
23
import TransformersJsEmbeddingsProvider from "../embeddings/TransformersJsEmbeddingsProvider.js" ;
24
24
import { Article , chunkArticle , pageToArticle } from "./article.js" ;
25
- import { crawlPage } from "./crawl .js" ;
25
+ import { crawlSite } from "./crawlSite .js" ;
26
26
import { runLanceMigrations , runSqliteMigrations } from "./migrations.js" ;
27
27
import {
28
28
downloadFromS3 ,
@@ -202,11 +202,36 @@ export default class DocsService {
202
202
let processedPages = 0 ;
203
203
let maxKnownPages = 1 ;
204
204
205
+ // crawlSite(
206
+ // new URL(startUrl),
207
+ // siteIndexingConfig.maxDepth,
208
+ // async function* (page) {
209
+ // processedPages++;
210
+
211
+ // const article = pageToArticle(page);
212
+
213
+ // if (article) {
214
+ // articles.push(article);
215
+ // }
216
+
217
+ // // Use a heuristic approach for progress calculation
218
+ // const progress = Math.min(processedPages / maxKnownPages, 1);
219
+
220
+ // yield {
221
+ // progress, // Yield the heuristic progress
222
+ // desc: `Finding subpages (${page.path})`,
223
+ // status: "indexing",
224
+ // };
225
+
226
+ // // Increase maxKnownPages to delay progress reaching 100% too soon
227
+ // if (processedPages === maxKnownPages) {
228
+ // maxKnownPages *= 2;
229
+ // }
230
+ // },
231
+ // );
232
+
205
233
// Crawl pages and retrieve info as articles
206
- for await ( const page of crawlPage (
207
- new URL ( startUrl ) ,
208
- siteIndexingConfig . maxDepth ,
209
- ) ) {
234
+ for await ( const page of crawlSite ( startUrl ) ) {
210
235
processedPages ++ ;
211
236
212
237
const article = pageToArticle ( page ) ;
@@ -521,8 +546,9 @@ export default class DocsService {
521
546
private async getLanceTableNameFromEmbeddingsProvider (
522
547
isPreIndexedDoc : boolean ,
523
548
) {
524
- const embeddingsProvider =
525
- await this . getEmbeddingsProvider ( isPreIndexedDoc ) ;
549
+ const embeddingsProvider = await this . getEmbeddingsProvider (
550
+ isPreIndexedDoc ,
551
+ ) ;
526
552
const embeddingsProviderId = this . removeInvalidLanceTableNameChars (
527
553
embeddingsProvider . id ,
528
554
) ;
0 commit comments