import { Octokit } from "@octokit/rest";
import { URL } from "node:url";
import { EventEmitter } from "events";
import { getChromiumPath, getContinueUtilsPath } from "../../util/paths";
import { executablePath, Page } from "puppeteer";
// @ts-ignore
import PCR from "puppeteer-chromium-resolver";
import * as fs from "fs";

export type PageData = {
  url: string;
  path: string;
  content: string;
};

const MAX_TIME_TO_CRAWL = 1000 * 5;
const LINK_GROUP_SIZE = 2; // Controls parallelization of crawler
const GITHUB_HOST = "github.com";
const MAX_REQUESTS_PER_CRAWL = 1000;
const markdownRegex = new RegExp(/\.(md|mdx)$/);
const octokit = new Octokit({
  auth: undefined,
});

const PCR_CONFIG = {
  downloadPath: getContinueUtilsPath(),
};

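// Kicks off a Chromium download via puppeteer-chromium-resolver if no local
// copy exists yet. The PCR call is not awaited, so the download proceeds in
// the background.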
export function verifyOrInstallChromium() {
  if (!fs.existsSync(getChromiumPath())) {
    PCR(PCR_CONFIG);
  }
}

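// Resolves the default branch (e.g. "main") of a GitHub repository.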
async function getGithubRepoDefaultBranch(
  owner: string,
  repo: string,
): Promise<string> {
  const repoInfo = await octokit.repos.get({
    owner,
    repo,
  });

  return repoInfo.data.default_branch;
}

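// Lists the paths of all markdown (.md/.mdx) files in the repo by walking the
// git tree of the given branch recursively.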
async function getGitHubRepoPaths(owner: string, repo: string, branch: string) {
  const tree = await octokit.request(
    "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
    {
      owner,
      repo,
      tree_sha: branch,
      headers: {
        "X-GitHub-Api-Version": "2022-11-28",
      },
      recursive: "true",
    },
  );

  const paths = tree.data.tree
    .filter(
      (file: any) =>
        file.type === "blob" && markdownRegex.test(file.path ?? ""),
    )
    .map((file: any) => file.path);

  return paths;
}

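// Fetches the raw contents of a single file through the GitHub contents API.
// Returns null if the request fails.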
async function getGithubRepoFileContent(
  path: string,
  owner: string,
  repo: string,
) {
  try {
    const response = await octokit.repos.getContent({
      owner,
      repo,
      path,
      headers: {
        Accept: "application/vnd.github.raw+json",
      },
    });

    return response.data as unknown as string;
  } catch (error) {
    console.debug("Error fetching file contents:", error);
    return null;
  }
}

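// GitHub repos are crawled through the REST API rather than Puppeteer:
// yield one entry per markdown file found on the default branch.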
async function* crawlGithubRepo(url: URL) {
  const urlStr = url.toString();
  const [_, owner, repo] = url.pathname.split("/");
  const branch = await getGithubRepoDefaultBranch(owner, repo);
  const paths = await getGitHubRepoPaths(owner, repo, branch);

  for await (const path of paths) {
    const content = await getGithubRepoFileContent(path, owner, repo);

    yield {
      path,
      url: urlStr,
      content: content ?? "",
    };
  }
}

async function getLinksFromPage(page: Page) {
  // Normalize each anchor's URL (dropping the hash) in the Node context.
  // Only the href string crosses the exposeFunction boundary, since DOM nodes
  // can't be serialized, and exposing the same binding twice on a page throws,
  // so that error is ignored on subsequent pages.
  try {
    await page.exposeFunction("getCleanedUrlFromAnchorTag", (href: string) => {
      const url = new URL(href);
      url.hash = "";
      return url.href;
    });
  } catch (e) {
    // Binding already exists on this page
  }

  const links: string[] = await page.$$eval("a", (anchors) =>
    Promise.all(
      anchors.map((a) => (window as any).getCleanedUrlFromAnchorTag(a.href)),
    ),
  );

  return links;
}

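// Chunks the links found on a page into groups of LINK_GROUP_SIZE.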
async function getLinkGroups(page: Page) {
  const links = await getLinksFromPage(page);

  const groups = links.reduce((acc, link, i) => {
    const groupIndex = Math.floor(i / LINK_GROUP_SIZE);

    if (!acc[groupIndex]) {
      acc.push([]);
    }

    acc[groupIndex].push(link);

    return acc;
  }, [] as string[][]);

  return groups;
}

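// Skip links that have already been visited or that point outside the root
// URL's host and path prefix.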
function shouldSkipPage(url: URL, rootUrl: URL, visitedLinks: Set<string>) {
  const hasVisitedLink = visitedLinks.has(url.toString());
  const isInvalidHostOrPath =
    !url.pathname.startsWith(rootUrl.pathname) || rootUrl.host !== url.host;

  return hasVisitedLink || isInvalidHostOrPath;
}

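// Depth-first crawl that reuses a single Puppeteer page: yields the current
// page's HTML, then recurses into every link found on it. The visitedLinks
// set is shared across the whole recursion, so it both prevents revisiting
// pages and caps the total number of requests.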
async function* crawlSitePages(
  page: Page,
  url: URL,
  rootUrl: URL,
  maxRequestsPerCrawl: number,
  visitedLinks: Set<string> = new Set(),
): AsyncGenerator<PageData> {
  if (visitedLinks.size >= maxRequestsPerCrawl) {
    console.warn("Max requests per crawl reached. Stopping crawler.");
    return;
  }

  if (shouldSkipPage(url, rootUrl, visitedLinks)) {
    console.warn("Skipping ", url.toString());
    return;
  }

  await page.goto(url.toString());

  const htmlContent = await page.content();
  const linkGroups = await getLinkGroups(page);

  visitedLinks.add(url.toString());

  yield {
    path: url.pathname,
    url: url.toString(),
    content: htmlContent,
  };

  for (const linkGroup of linkGroups) {
    for (const link of linkGroup) {
      yield* crawlSitePages(
        page,
        new URL(link),
        rootUrl,
        maxRequestsPerCrawl,
        visitedLinks,
      );
    }
  }
}

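// Launches a headless Chromium (resolved or downloaded by PCR) and streams
// crawled pages from the site rooted at rootUrl.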
async function* crawlSite(
  startUrl: URL,
  rootUrl: URL,
  maxRequestsPerCrawl: number,
): AsyncGenerator<PageData> {
  const stats = await PCR(PCR_CONFIG);

  const browser = await stats.puppeteer.launch({
    args: [
      "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
    ],
    executablePath: stats.executablePath,
    // From the docs (https://pptr.dev/guides/headless-modes): if performance
    // is more important for your use case, switch to chrome-headless-shell
    // by passing { headless: "shell" }.
    headless: "shell",
  });

  const page = await browser.newPage();

  try {
    yield* crawlSitePages(page, startUrl, rootUrl, maxRequestsPerCrawl);
  } catch (e) {
    console.debug("Error getting links: ", e);
  } finally {
    await browser.close();
  }
}

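// Entry point: GitHub URLs are crawled through the GitHub API, everything
// else through the Puppeteer-based site crawler.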
export async function* crawl(
  url: URL,
  maxRequestsPerCrawl: number = MAX_REQUESTS_PER_CRAWL,
): AsyncGenerator<PageData> {
  if (url.host === GITHUB_HOST) {
    yield* crawlGithubRepo(url);
  } else {
    // TODO: Why pass the same URL as both startUrl and rootUrl?
    yield* crawlSite(url, url, maxRequestsPerCrawl);
  }
}
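
// Example usage (a minimal sketch; the docs URL below is a placeholder, and
// consuming code would normally live in whatever module indexes the results):
//
//   for await (const page of crawl(new URL("https://docs.example.com"))) {
//     console.log(page.url, page.path, page.content.length);
//   }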