Commit b9b58e2

use async generator
1 parent e8f1e3a commit b9b58e2
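
For context: this commit reworks the docs crawler around async generators. The new crawl(url) entry point in core/indexing/docs/crawl.ts yields PageData objects ({ url, path, content }) one page at a time, and callers consume it with for await. A minimal consumption sketch, not part of the commit (the start URL, logging, and relative import path below are illustrative assumptions):

import { crawl } from "./indexing/docs/crawl"; // path depends on the caller's location

// Hedged sketch: stream pages as they are crawled rather than waiting for the
// whole site, e.g. to report per-page indexing progress.
async function printSitePages(startUrl: string): Promise<void> {
  let processedPages = 0;

  for await (const page of crawl(new URL(startUrl))) {
    processedPages++;
    console.log(`${processedPages}: ${page.url} (${page.content.length} chars)`);
  }
}

printSitePages("https://example.com/docs").catch(console.error);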

File tree: 6 files changed, +267 -365 lines


core/core.ts (+1 -1)

@@ -27,7 +27,7 @@ import type { IMessenger, Message } from "./util/messenger";
 import { editConfigJson } from "./util/paths";
 import { Telemetry } from "./util/posthog";
 import { streamDiffLines } from "./util/verticalEdit";
-import { verifyOrInstallChromium } from "./indexing/docs/crawlSite";
+import { verifyOrInstallChromium } from "./indexing/docs/crawl";
 
 export class Core {
   // implements IMessenger<ToCoreProtocol, FromCoreProtocol>

core/indexing/docs/DocsService.ts (+20 -18)

@@ -1,36 +1,36 @@
 import { open, type Database } from "sqlite";
 import sqlite3 from "sqlite3";
 import lancedb, { Connection } from "vectordb";
-import { ConfigHandler } from "../../config/ConfigHandler.js";
-import DocsContextProvider from "../../context/providers/DocsContextProvider.js";
+import { ConfigHandler } from "../../config/ConfigHandler";
+import DocsContextProvider from "../../context/providers/DocsContextProvider";
 import {
   Chunk,
   ContinueConfig,
   EmbeddingsProvider,
   IDE,
   IndexingProgressUpdate,
   SiteIndexingConfig,
-} from "../../index.js";
-import { FromCoreProtocol, ToCoreProtocol } from "../../protocol/index.js";
-import { GlobalContext } from "../../util/GlobalContext.js";
-import { IMessenger } from "../../util/messenger.js";
+} from "../..";
+import { FromCoreProtocol, ToCoreProtocol } from "../../protocol";
+import { GlobalContext } from "../../util/GlobalContext";
+import { IMessenger } from "../../util/messenger";
 import {
   editConfigJson,
   getDocsSqlitePath,
   getLanceDbPath,
-} from "../../util/paths.js";
-import { Telemetry } from "../../util/posthog.js";
-import TransformersJsEmbeddingsProvider from "../embeddings/TransformersJsEmbeddingsProvider.js";
-import { Article, chunkArticle, pageToArticle } from "./article.js";
-import { crawlSite } from "./crawlSite.js";
-import { runLanceMigrations, runSqliteMigrations } from "./migrations.js";
+} from "../../util/paths";
+import { Telemetry } from "../../util/posthog";
+import TransformersJsEmbeddingsProvider from "../embeddings/TransformersJsEmbeddingsProvider";
+import { Article, chunkArticle, pageToArticle } from "./article";
+import { crawl } from "./crawl";
+import { runLanceMigrations, runSqliteMigrations } from "./migrations";
 import {
   downloadFromS3,
   getS3Filename,
   S3Buckets,
   SiteIndexingResults,
-} from "./preIndexed.js";
-import preIndexedDocs from "./preIndexedDocs.js";
+} from "./preIndexed";
+import preIndexedDocs from "./preIndexedDocs";
 
 // Purposefully lowercase because lancedb converts
 export interface LanceDbDocsRow {
@@ -112,8 +112,8 @@ export default class DocsService {
   }
 
   /*
-   * Currently, we generate and host embeddings for pre-indexed docs using transformers.js.
-   * However, we don't ship transformers.js with the JetBrains extension.
+   * Currently, we generate and host embeddings for pre-indexed docs using transformers.
+   * However, we don't ship transformers with the JetBrains extension.
    * So, we only include pre-indexed docs in the submenu for non-JetBrains IDEs.
    */
   async canUsePreindexedDocs() {
@@ -203,7 +203,7 @@
     let maxKnownPages = 1;
 
     // Crawl pages and retrieve info as articles
-    for await (const page of crawlSite(startUrl)) {
+    for await (const page of crawl(new URL(startUrl))) {
       processedPages++;
 
       const article = pageToArticle(page);
@@ -276,6 +276,8 @@
         status: "failed",
       };
 
+      this.docsIndexingQueue.delete(startUrl);
+
       return;
     }
 
@@ -721,7 +723,7 @@
 
     if (isJetBrainsAndPreIndexedDocsProvider) {
       this.ide.errorPopup(
-        "The 'transformers.js' embeddings provider currently cannot be used to index " +
+        "The 'transformers' embeddings provider currently cannot be used to index " +
           "documentation in JetBrains. To enable documentation indexing, you can use " +
           "any of the other providers described in the docs: " +
           "https://docs.continue.dev/walkthroughs/codebase-embeddings#embeddings-providers",

core/indexing/docs/article.ts (+1 -1)

@@ -2,7 +2,7 @@ import { Readability } from "@mozilla/readability";
 import { JSDOM } from "jsdom";
 import { Chunk } from "../../";
 import { cleanFragment, cleanHeader } from "../chunk/markdown";
-import { PageData } from "./crawlSite";
+import { PageData } from "./crawl";
 
 export type ArticleComponent = {
   title: string;

core/indexing/docs/crawlSite.test.ts renamed to core/indexing/docs/crawl.test.ts (+5 -5)

@@ -1,4 +1,4 @@
-import { crawlSite, PageData } from "./crawlSite";
+import { crawl, PageData } from "./crawl";
 import preIndexedDocs from "./preIndexedDocs";
 
 // Temporary workaround until we have better caching of Chromium
@@ -7,7 +7,7 @@ const TIMEOUT = 1_000_000;
 
 // Skipped until we have a better way to cache Chromium installs
 // between tests and in CI
-describe.skip("crawlSite", () => {
+describe.skip("crawl", () => {
   describe("GitHub repositories", () => {
     const repoUrl =
       "https://github.com/Patrick-Erichsen/test-github-repo-for-crawling";
@@ -16,7 +16,7 @@ describe.skip("crawlSite", () => {
 
     beforeAll(async () => {
       crawlResults = [];
-      for await (const page of crawlSite(repoUrl)) {
+      for await (const page of crawl(new URL(repoUrl))) {
         crawlResults.push(page);
       }
     }, TIMEOUT);
@@ -66,7 +66,7 @@ describe.skip("crawlSite", () => {
     for (const site of TEST_SITES) {
       const crawlResults: PageData[] = [];
 
-      for await (const page of crawlSite(site, NUM_PAGES_TO_CRAWL)) {
+      for await (const page of crawl(new URL(site), NUM_PAGES_TO_CRAWL)) {
        crawlResults.push(page);
      }
 
@@ -94,7 +94,7 @@ describe.skip("crawlSite", () => {
      let pageFound = false;
 
      try {
-        for await (const page of crawlSite(url, 1)) {
+        for await (const page of crawl(new URL(url), 1)) {
          if (page.url === url) {
            pageFound = true;
            break;

core/indexing/docs/crawl.ts (new file, +240)

@@ -0,0 +1,240 @@
+import { Octokit } from "@octokit/rest";
+import { URL } from "node:url";
+import { EventEmitter } from "events";
+import { getChromiumPath, getContinueUtilsPath } from "../../util/paths";
+import { executablePath, Page } from "puppeteer";
+// @ts-ignore
+import PCR from "puppeteer-chromium-resolver";
+import * as fs from "fs";
+
+export type PageData = {
+  url: string;
+  path: string;
+  content: string;
+};
+
+const MAX_TIME_TO_CRAWL = 1000 * 5;
+const LINK_GROUP_SIZE = 2; // Controls parallelization of crawler
+const GITHUB_HOST = "github.com";
+const MAX_REQUESTS_PER_CRAWL = 1000;
+const markdownRegex = new RegExp(/\.(md|mdx)$/);
+const octokit = new Octokit({
+  auth: undefined,
+});
+
+const PCR_CONFIG = {
+  downloadPath: getContinueUtilsPath(),
+};
+
+export function verifyOrInstallChromium() {
+  if (!fs.existsSync(getChromiumPath())) {
+    PCR(PCR_CONFIG);
+  }
+}
+
+async function getGithubRepoDefaultBranch(
+  owner: string,
+  repo: string,
+): Promise<string> {
+  const repoInfo = await octokit.repos.get({
+    owner,
+    repo,
+  });
+
+  return repoInfo.data.default_branch;
+}
+
+async function getGitHubRepoPaths(owner: string, repo: string, branch: string) {
+  const tree = await octokit.request(
+    "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
+    {
+      owner,
+      repo,
+      tree_sha: branch,
+      headers: {
+        "X-GitHub-Api-Version": "2022-11-28",
+      },
+      recursive: "true",
+    },
+  );
+
+  const paths = tree.data.tree
+    .filter(
+      (file: any) =>
+        file.type === "blob" && markdownRegex.test(file.path ?? ""),
+    )
+    .map((file: any) => file.path);
+
+  return paths;
+}
+
+async function getGithubRepoFileContent(
+  path: string,
+  owner: string,
+  repo: string,
+) {
+  try {
+    const response = await octokit.repos.getContent({
+      owner,
+      repo,
+      path,
+      headers: {
+        Accept: "application/vnd.github.raw+json",
+      },
+    });
+
+    return response.data as unknown as string;
+  } catch (error) {
+    console.debug("Error fetching file contents:", error);
+    return null;
+  }
+}
+
+async function* crawlGithubRepo(url: URL) {
+  const urlStr = url.toString();
+  const [_, owner, repo] = url.pathname.split("/");
+  const branch = await getGithubRepoDefaultBranch(owner, repo);
+  const paths = await getGitHubRepoPaths(owner, repo, branch);
+
+  for await (const path of paths) {
+    const content = await getGithubRepoFileContent(path, owner, repo);
+
+    yield {
+      path,
+      url: urlStr,
+      content: content ?? "",
+    };
+  }
+}
+
+async function getLinksFromPage(page: Page) {
+  // The URL lib is not available by default in the page scope,
+  // so we need to expose it to the page through this fn.
+  await page.exposeFunction(
+    "getCleanedUrlFromAnchorTag",
+    (a: HTMLAnchorElement) => {
+      let url = new URL(a.href);
+      url.hash = "";
+      return url.href;
+    },
+  );
+
+  const links: string[] = await page.$$eval("a", (links) =>
+    links.map((a) => (window as any).getCleanedUrlFromAnchorTag),
+  );
+
+  return links;
+}
+
+async function getLinkGroups(page: Page) {
+  const links = await getLinksFromPage(page);
+
+  const groups = links.reduce((acc, link, i) => {
+    const groupIndex = Math.floor(i / LINK_GROUP_SIZE);
+
+    if (!acc[groupIndex]) {
+      acc.push([]);
+    }
+
+    acc[groupIndex].push(link);
+
+    return acc;
+  }, [] as string[][]);
+
+  return groups;
+}
+
+function shouldSkipPage(url: URL, rootUrl: URL, visitedLinks: Set<string>) {
+  const hasVisitedLink = visitedLinks.has(url.toString());
+  const isInvalidHostOrPath =
+    !url.pathname.startsWith(rootUrl.pathname) || rootUrl.host !== url.host;
+
+  return hasVisitedLink || isInvalidHostOrPath;
+}
+
+async function* crawlSitePages(
+  page: Page,
+  url: URL,
+  rootUrl: URL,
+  maxRequestsPerCrawl: number,
+  visitedLinks: Set<string> = new Set(),
+  currentRequests: number = 0,
+): AsyncGenerator<any> {
+  if (currentRequests >= maxRequestsPerCrawl) {
+    console.warn("Max requests per crawl reached. Stopping crawler.");
+    return;
+  }
+
+  if (shouldSkipPage(url, rootUrl, visitedLinks)) {
+    console.warn("Skipping ", url.toString());
+    return;
+  }
+
+  await page.goto(url.toString());
+
+  const htmlContent = await page.content();
+  const linkGroups = await getLinkGroups(page);
+  const requestCount = currentRequests + 1;
+
+  visitedLinks.add(url.toString());
+
+  yield {
+    path: url.pathname,
+    url: url.toString(),
+    content: htmlContent,
+  };
+
+  for (const linkGroup of linkGroups) {
+    for (const link of linkGroup) {
+      yield* crawlSitePages(
+        page,
+        new URL(link),
+        rootUrl,
+        maxRequestsPerCrawl,
+        visitedLinks,
+        requestCount,
+      );
+    }
+  }
+}
+
+async function* crawlSite(
+  startUrl: URL,
+  rootUrl: URL,
+  maxRequestsPerCrawl: number,
+): AsyncGenerator<PageData> {
+  const stats = await PCR(PCR_CONFIG);
+
+  const browser = await stats.puppeteer.launch({
+    args: [
+      "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
+    ],
+    executablePath: stats.executablePath,
+    // From the docs: https://pptr.dev/guides/headless-modes
+    // If the performance is more important for your use case, switch to chrome-headless-shell as following:
+    // { headless: "shell" }
+    headless: "shell",
+  });
+
+  const page = await browser.newPage();
+
+  try {
+    yield* crawlSitePages(page, startUrl, rootUrl, maxRequestsPerCrawl);
+  } catch (e) {
+    console.debug("Error getting links: ", e);
+  } finally {
+    await browser.close();
+  }
+}
+
+export async function* crawl(
+  url: URL,
+  maxRequestsPerCrawl: number = MAX_REQUESTS_PER_CRAWL,
+): AsyncGenerator<PageData> {
+  if (url.host === GITHUB_HOST) {
+    yield* crawlGithubRepo(url);
+  } else {
+    // TODO: Why both
+    yield* crawlSite(url, url, maxRequestsPerCrawl);
+  }
+}
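
The core of the new file is crawlSitePages, a recursive async generator: each call yields the current page and then uses yield* to delegate to a child call for every discovered link, so the caller's single for await loop receives pages as soon as they are fetched, with a shared visitedLinks set and a request counter bounding the recursion. A standalone toy sketch of that delegation pattern, not taken from the commit:

// Toy illustration of recursive yield* delegation: yield the current node,
// then delegate to a child generator for each neighbour, sharing a visited
// set so cycles terminate. This mirrors the shape of crawlSitePages above.
async function* walk(
  node: string,
  edges: Map<string, string[]>,
  visited: Set<string> = new Set(),
): AsyncGenerator<string> {
  if (visited.has(node)) {
    return;
  }
  visited.add(node);

  yield node;

  for (const next of edges.get(node) ?? []) {
    yield* walk(next, edges, visited);
  }
}

// Usage: logs "root", "a", "b" once each, despite the cycle back to "root".
// const edges = new Map<string, string[]>([
//   ["root", ["a", "b"]],
//   ["a", ["root"]],
//   ["b", []],
// ]);
// for await (const node of walk("root", edges)) {
//   console.log(node);
// }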
