Skip to content

Commit a51c520

Browse files
feat: use crawlee for docs service
1 parent 4bb1846 commit a51c520

17 files changed

+2652
-640
lines changed

core/core.ts

+12
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,10 @@ import type { IMessenger, Message } from "./util/messenger";
2727
import { editConfigJson } from "./util/paths";
2828
import { Telemetry } from "./util/posthog";
2929
import { streamDiffLines } from "./util/verticalEdit";
30+
import {
31+
installChromium,
32+
isChromiumInstalled,
33+
} from "./indexing/docs/installChromium";
3034

3135
export class Core {
3236
// implements IMessenger<ToCoreProtocol, FromCoreProtocol>
@@ -167,6 +171,14 @@ export class Core {
167171
(..._) => Promise.resolve([]),
168172
);
169173

174+
try {
175+
if (!isChromiumInstalled()) {
176+
installChromium();
177+
}
178+
} catch (err) {
179+
console.debug(`Failed to install Chromium: ${err}`);
180+
}
181+
170182
const on = this.messenger.on.bind(this.messenger);
171183

172184
this.messenger.onError((err) => {

core/indexing/docs/DocsService.ts

+33 -7
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ import {
2222
import { Telemetry } from "../../util/posthog.js";
2323
import TransformersJsEmbeddingsProvider from "../embeddings/TransformersJsEmbeddingsProvider.js";
2424
import { Article, chunkArticle, pageToArticle } from "./article.js";
25-
import { crawlPage } from "./crawl.js";
25+
import { crawlSite } from "./crawlSite.js";
2626
import { runLanceMigrations, runSqliteMigrations } from "./migrations.js";
2727
import {
2828
downloadFromS3,
@@ -202,11 +202,36 @@ export default class DocsService {
202202
let processedPages = 0;
203203
let maxKnownPages = 1;
204204

205+
// crawlSite(
206+
// new URL(startUrl),
207+
// siteIndexingConfig.maxDepth,
208+
// async function* (page) {
209+
// processedPages++;
210+
211+
// const article = pageToArticle(page);
212+
213+
// if (article) {
214+
// articles.push(article);
215+
// }
216+
217+
// // Use a heuristic approach for progress calculation
218+
// const progress = Math.min(processedPages / maxKnownPages, 1);
219+
220+
// yield {
221+
// progress, // Yield the heuristic progress
222+
// desc: `Finding subpages (${page.path})`,
223+
// status: "indexing",
224+
// };
225+
226+
// // Increase maxKnownPages to delay progress reaching 100% too soon
227+
// if (processedPages === maxKnownPages) {
228+
// maxKnownPages *= 2;
229+
// }
230+
// },
231+
// );
232+
205233
// Crawl pages and retrieve info as articles
206-
for await (const page of crawlPage(
207-
new URL(startUrl),
208-
siteIndexingConfig.maxDepth,
209-
)) {
234+
for await (const page of crawlSite(startUrl)) {
210235
processedPages++;
211236

212237
const article = pageToArticle(page);
@@ -521,8 +546,9 @@ export default class DocsService {
521546
private async getLanceTableNameFromEmbeddingsProvider(
522547
isPreIndexedDoc: boolean,
523548
) {
524-
const embeddingsProvider =
525-
await this.getEmbeddingsProvider(isPreIndexedDoc);
549+
const embeddingsProvider = await this.getEmbeddingsProvider(
550+
isPreIndexedDoc,
551+
);
526552
const embeddingsProviderId = this.removeInvalidLanceTableNameChars(
527553
embeddingsProvider.id,
528554
);

core/indexing/docs/article.ts

+2 -2
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ import { Readability } from "@mozilla/readability";
22
import { JSDOM } from "jsdom";
33
import { Chunk } from "../../index.js";
44
import { cleanFragment, cleanHeader } from "../chunk/markdown.js";
5-
import { PageData } from "./crawl.js";
5+
import { PageData } from "./crawlSite.js";
66

77
export type ArticleComponent = {
88
title: string;
@@ -148,7 +148,7 @@ export function stringToArticle(
148148

149149
export function pageToArticle(page: PageData): Article | undefined {
150150
try {
151-
return stringToArticle(page.url, page.html, page.path);
151+
return stringToArticle(page.url, page.content, page.path);
152152
} catch (err) {
153153
console.error("Error converting URL to article components", err);
154154
return undefined;

core/indexing/docs/crawl.test.ts

-103
This file was deleted.

0 commit comments

Comments
 (0)