Skip to content

Commit d9ca1c7

Browse files
spew and Rob Leidle authored
Make indexing functional in large workspaces by chunking segments of the workspace instead of working on the entire workspace at once (#1876)
Co-authored-by: Rob Leidle <[email protected]>
1 parent 4136087 commit d9ca1c7

File tree

2 files changed

+173
-111
lines changed

2 files changed

+173
-111
lines changed

core/indexing/CodebaseIndexer.ts

Lines changed: 140 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import { LanceDbIndex } from "./LanceDbIndex.js";
77
import { ChunkCodebaseIndex } from "./chunk/ChunkCodebaseIndex.js";
88
import { getComputeDeleteAddRemove } from "./refreshIndex.js";
99
import { CodebaseIndex, IndexResultType } from "./types.js";
10-
import { walkDir } from "./walkDir.js";
10+
import { walkDirAsync } from "./walkDir.js";
1111

1212
export class PauseToken {
1313
constructor(private _paused: boolean) {}
@@ -62,25 +62,7 @@ export class CodebaseIndexer {
6262
}
6363
const branch = await this.ide.getBranch(workspaceDir);
6464
const repoName = await this.ide.getRepoName(workspaceDir);
65-
const stats = await this.ide.getLastModified([file]);
66-
const indexesToBuild = await this.getIndexesToBuild();
67-
for (const codebaseIndex of indexesToBuild) {
68-
const tag: IndexTag = {
69-
directory: workspaceDir,
70-
branch,
71-
artifactId: codebaseIndex.artifactId,
72-
};
73-
const [results, lastUpdated, markComplete] = await getComputeDeleteAddRemove(
74-
tag,
75-
{ ...stats },
76-
(filepath) => this.ide.readFile(filepath),
77-
repoName,
78-
);
79-
for await (const _ of codebaseIndex.update(tag, results, markComplete, repoName)) {
80-
lastUpdated.forEach((lastUpdated, path) => {
81-
markComplete([lastUpdated], IndexResultType.UpdateLastUpdated);
82-
});
83-
}
65+
for await (const updateDesc of this.indexFiles(workspaceDir, branch, repoName, [file])) {
8466
}
8567
}
8668

@@ -131,33 +113,40 @@ export class CodebaseIndexer {
131113
desc: "Starting indexing...",
132114
status: "loading",
133115
};
116+
const beginTime = Date.now();
134117

135118
for (const directory of workspaceDirs) {
136-
const files = await walkDir(directory, this.ide);
137-
const stats = await this.ide.getLastModified(files);
119+
const dirBasename = await this.basename(directory);
120+
yield {
121+
progress,
122+
desc: `Discovering files in ${dirBasename}...`,
123+
status: "indexing"
124+
};
125+
// compute the number of files in this directory to display an accurate progress bar
126+
let totalFileCount = 0;
127+
for await (const p of walkDirAsync(directory, this.ide)) {
128+
totalFileCount += 1;
129+
if (abortSignal.aborted) {
130+
yield {
131+
progress: 1,
132+
desc: "Indexing cancelled",
133+
status: "disabled",
134+
};
135+
return;
136+
}
137+
if (this.pauseToken.paused) {
138+
yield *this.yieldUpdateAndPause();
139+
}
140+
}
141+
138142
const branch = await this.ide.getBranch(directory);
139143
const repoName = await this.ide.getRepoName(directory);
140-
let completedRelativeExpectedTime = 0;
141-
142-
for (const codebaseIndex of indexesToBuild) {
143-
// TODO: IndexTag type should use repoName rather than directory
144-
const tag: IndexTag = {
145-
directory,
146-
branch,
147-
artifactId: codebaseIndex.artifactId,
148-
};
149-
const [results, lastUpdated, markComplete] = await getComputeDeleteAddRemove(
150-
tag,
151-
{ ...stats },
152-
(filepath) => this.ide.readFile(filepath),
153-
repoName,
154-
);
144+
const batchSize = this.getBatchSize(totalFileCount);
145+
let completedFileCount = 0;
155146

147+
for await (const files of this.walkDirInBatches(directory, batchSize)) {
156148
try {
157-
for await (let {
158-
progress: indexProgress,
159-
desc,
160-
} of codebaseIndex.update(tag, results, markComplete, repoName)) {
149+
for await (const updateDesc of this.indexFiles(directory, branch, repoName, files)) {
161150
// Handle pausing in this loop because it's the only one really taking time
162151
if (abortSignal.aborted) {
163152
yield {
@@ -167,77 +156,125 @@ export class CodebaseIndexer {
167156
};
168157
return;
169158
}
170-
171159
if (this.pauseToken.paused) {
172-
yield {
173-
progress,
174-
desc: "Paused",
175-
status: "paused",
176-
};
177-
while (this.pauseToken.paused) {
178-
await new Promise((resolve) => setTimeout(resolve, 100));
179-
}
160+
yield *this.yieldUpdateAndPause();
180161
}
181-
182-
progress =
183-
(completedDirs +
184-
(completedRelativeExpectedTime +
185-
Math.min(1.0, indexProgress) *
186-
codebaseIndex.relativeExpectedTime) /
187-
totalRelativeExpectedTime) /
188-
workspaceDirs.length;
189162
yield {
190-
progress,
191-
desc,
163+
progress: progress,
164+
desc: updateDesc,
192165
status: "indexing",
193166
};
194167
}
168+
} catch (err) {
169+
yield this.handleErrorAndGetProgressUpdate(err);
170+
return;
171+
}
172+
completedFileCount += files.length;
173+
progress = completedFileCount / totalFileCount / workspaceDirs.length + completedDirs / workspaceDirs.length;
174+
this.logProgress(beginTime, completedFileCount, progress);
175+
}
176+
completedDirs += 1;
177+
}
178+
yield {
179+
progress: 100,
180+
desc: "Indexing Complete",
181+
status: "done",
182+
};
183+
}
195184

196-
lastUpdated.forEach((lastUpdated, path) => {
197-
markComplete([lastUpdated], IndexResultType.UpdateLastUpdated);
198-
});
185+
/**
 * Logs a failure raised while indexing and turns it into a "failed" progress update.
 *
 * Error instances are delegated to errorToProgressUpdate; any other thrown
 * value is stringified into the description.
 *
 * @param err - whatever was caught by the caller
 * @returns a progress update with status "failed"
 */
private handleErrorAndGetProgressUpdate(err: unknown): IndexingProgressUpdate {
  console.log("error when indexing: ", err);
  if (!(err instanceof Error)) {
    // Non-Error throwables (strings, plain objects, ...) have no message to inspect.
    return { progress: 0, desc: `Indexing failed: ${err}`, status: "failed" };
  }
  return this.errorToProgressUpdate(err);
}
199196

200-
completedRelativeExpectedTime += codebaseIndex.relativeExpectedTime;
201-
yield {
202-
progress:
203-
(completedDirs +
204-
completedRelativeExpectedTime / totalRelativeExpectedTime) /
205-
workspaceDirs.length,
206-
desc: "Completed indexing " + codebaseIndex.artifactId,
207-
status: "indexing",
208-
};
209-
} catch (e: any) {
210-
let errMsg = `${e}`;
197+
/**
 * Converts an Error raised during indexing into a "failed" progress update.
 *
 * If the message matches the "Values length ... is less than the length ..."
 * pattern, the description becomes a hint about the embedding length mismatch
 * built from the two captured numbers; otherwise the stringified error is used.
 *
 * @param err - the error thrown while indexing
 * @returns a progress update with progress 0 and status "failed"
 */
private errorToProgressUpdate(err: Error): IndexingProgressUpdate {
  const embeddingLengthMismatch =
    /Invalid argument error: Values length (\d+) is less than the length \((\d+)\) multiplied by the value size \(\d+\)/;
  const captured = err.message.match(embeddingLengthMismatch);
  // captured[1] is the actual values length, captured[2] the expected length.
  const desc = captured
    ? `Generated embedding had length ${captured[1]} but was expected to be ${captured[2]}. This may be solved by deleting ~/.continue/index and refreshing the window to re-index.`
    : `${err}`;
  return { progress: 0, desc, status: "failed" };
}
211214

212-
const errorRegex =
213-
/Invalid argument error: Values length (\d+) is less than the length \((\d+)\) multiplied by the value size \(\d+\)/;
214-
const match = e.message.match(errorRegex);
215+
/**
 * Logs indexing progress to the console: percentage complete, elapsed whole
 * seconds, and throughput in files per second.
 *
 * @param beginTime - Date.now() timestamp (milliseconds) taken when indexing began
 * @param completedFileCount - number of files processed so far, as counted by the caller
 * @param progress - completion value; it is multiplied by 100 for display, so a 0-1 fraction is expected
 */
private logProgress(beginTime: number, completedFileCount: number, progress: number) {
  const elapsedMs = Date.now() - beginTime;
  const seconds = Math.round(elapsedMs / 1000);
  const progressPercentage = (progress * 100).toFixed(1);
  // Compute the rate from the unrounded elapsed time: dividing by the rounded
  // seconds produced "Infinity" (or "NaN") while less than half a second had passed.
  const filesPerSec = elapsedMs > 0 ? (completedFileCount / (elapsedMs / 1000)).toFixed(2) : "0.00";
  console.log(`Indexing: ${progressPercentage}% complete, elapsed time: ${seconds}s, ${filesPerSec} file/sec`);
}
215222

216-
if (match) {
217-
const [_, valuesLength, expectedLength] = match;
218-
errMsg = `Generated embedding had length ${valuesLength} but was expected to be ${expectedLength}. This may be solved by deleting ~/.continue/index and refreshing the window to re-index.`;
219-
}
223+
/**
 * Emits a single "paused" progress update and then waits, polling every
 * 100ms, until the pause token is no longer paused.
 *
 * @param progress - progress value to report while paused. Defaults to 0 so
 *   existing callers behave as before; callers that know the current progress
 *   can pass it so the reported value does not drop to 0 while paused.
 */
private async* yieldUpdateAndPause(progress: number = 0): AsyncGenerator<IndexingProgressUpdate> {
  yield {
    progress,
    desc: "Indexing Paused",
    status: "paused",
  };
  while (this.pauseToken.paused) {
    await new Promise((resolve) => setTimeout(resolve, 100));
  }
}
220233

221-
yield {
222-
progress: 0,
223-
desc: errMsg,
224-
status: "failed",
225-
};
234+
/**
 * Picks how many files to index per batch: one percent of the workspace size
 * (rounded down), clamped to the range 10..100. The upper bound is reached at
 * 10000 files.
 *
 * @param workspaceSize - total number of files in the workspace directory
 * @returns the batch size
 */
private getBatchSize(workspaceSize: number): number {
  const smallestBatch = 10;
  const largestBatch = 100;
  const onePercent = Math.floor(workspaceSize / 100);
  const atLeastSmallest = Math.max(smallestBatch, onePercent);
  return Math.min(largestBatch, atLeastSmallest);
}
226238

227-
console.warn(
228-
`Error updating the ${codebaseIndex.artifactId} index: ${e}`,
229-
);
230-
return;
231-
}
239+
/*
 * Enables the indexing operation to be completed in small batches. This is important in large
 * repositories where indexing can quickly use up all the memory available.
 *
 * Yields arrays of at most `batchSize` paths produced by walkDirAsync; the final array may be
 * shorter. A batch size that is not at least 1 (zero, negative, NaN) is treated as 1, and a
 * fractional size is rounded down, so a bad size can never silently collapse the whole walk
 * into a single unbounded batch.
 */
private async* walkDirInBatches(directory: string, batchSize: number): AsyncGenerator<string[]> {
  const limit = batchSize >= 1 ? Math.floor(batchSize) : 1;
  let batch: string[] = [];
  for await (const p of walkDirAsync(directory, this.ide)) {
    batch.push(p);
    if (batch.length >= limit) {
      yield batch;
      // start a fresh array so the consumer keeps sole ownership of the yielded one
      batch = [];
    }
  }
  if (batch.length > 0) {
    yield batch;
  }
}
233256

234-
completedDirs++;
235-
progress = completedDirs / workspaceDirs.length;
236-
yield {
237-
progress,
238-
desc: "Indexing Complete",
239-
status: "done",
257+
/**
 * Runs every index returned by getIndexesToBuild() over the given files and
 * yields each progress description reported by that index's update().
 *
 * For each index, getComputeDeleteAddRemove is called with the files'
 * last-modified stats to obtain the work to do. After the index's update
 * generator has finished, every entry in `lastUpdated` is marked with
 * IndexResultType.UpdateLastUpdated exactly once. This also happens when
 * update() yields nothing.
 *
 * @param workspaceDir - directory stored in the index tag
 * @param branch - branch stored in the index tag
 * @param repoName - forwarded to getComputeDeleteAddRemove and update()
 * @param filePaths - files whose last-modified stats drive this run
 */
private async* indexFiles(workspaceDir: string, branch: string, repoName: string | undefined, filePaths: string[]): AsyncGenerator<string> {
  const stats = await this.ide.getLastModified(filePaths);
  const indexesToBuild = await this.getIndexesToBuild();
  for (const codebaseIndex of indexesToBuild) {
    const tag: IndexTag = {
      directory: workspaceDir,
      branch,
      artifactId: codebaseIndex.artifactId,
    };
    const [results, lastUpdated, markComplete] = await getComputeDeleteAddRemove(
      tag,
      { ...stats },
      (filepath) => this.ide.readFile(filepath),
      repoName,
    );
    for await (const { desc } of codebaseIndex.update(tag, results, markComplete, repoName)) {
      yield desc;
    }
    // Mark once after the update completes, rather than on every progress yield.
    lastUpdated.forEach((item) => {
      markComplete([item], IndexResultType.UpdateLastUpdated);
    });
  }
}
243280

@@ -250,4 +287,10 @@ export class CodebaseIndexer {
250287
}
251288
return undefined;
252289
}
290+
291+
/**
 * Returns the last non-empty segment of `filepath`, split on the IDE's path separator.
 *
 * Empty segments are ignored so a trailing separator does not produce an
 * empty name. If there is no non-empty segment at all, the input is returned
 * unchanged.
 *
 * @param filepath - path to take the final segment of
 * @returns the final non-empty segment, or `filepath` when none exists
 */
private async basename(filepath: string): Promise<string> {
  const pathSep = await this.ide.pathSep();
  const segments = filepath.split(pathSep).filter((segment) => segment !== "");
  return segments[segments.length - 1] ?? filepath;
}
253296
}

core/indexing/walkDir.ts

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class DFSWalker {
4646

4747
// walk is a depth-first search implementation
4848
public async *walk(): AsyncGenerator<string> {
49+
const fixupFunc = await this.newPathFixupFunc(this.options.returnRelativePaths ? "" : this.path, this.ide);
4950
const root: WalkContext = {
5051
walkableEntry: {
5152
relPath: "",
@@ -73,10 +74,10 @@ class DFSWalker {
7374
});
7475
if (this.options.onlyDirs) {
7576
// when onlyDirs is enabled the walker will only return directory names
76-
yield w.relPath;
77+
yield fixupFunc(w.relPath);
7778
}
7879
} else {
79-
yield w.relPath;
80+
yield fixupFunc(w.relPath);
8081
}
8182
}
8283
}
@@ -155,6 +156,24 @@ class DFSWalker {
155156
/** Reports whether the entry's type field (index 1) equals 64, the value cast here to FileType.SymbolicLink. */
private entryIsSymlink(entry: Entry) {
  const symbolicLink = 64 as FileType.SymbolicLink;
  return entry[1] === symbolicLink;
}
159+
160+
// Builds a function that optionally prefixes a root path and rewrites the "/" separators of a
// relative path into the separator reported by the IDE (i.e. for Windows).
// The function is constructed once so that ide.pathSep() does not have to be called for every path.
private async newPathFixupFunc(rootPath: string, ide: IDE): Promise<(relPath: string) => string> {
  const pathSep = await ide.pathSep();
  const hasRoot = rootPath !== "";
  const prefix = hasRoot ? rootPath + pathSep : "";

  if (pathSep !== "/") {
    // this serves to 'fix-up' the path on Windows
    return (relPath: string) => prefix + relPath.split("/").join(pathSep);
  }
  if (hasRoot) {
    return (relPath: string) => prefix + relPath;
  }
  // nothing to prefix and nothing to convert: return a no-op to avoid unnecessary string concatenation
  return (relPath: string) => relPath;
}
158177
}
159178

160179
const defaultOptions: WalkerOptions = {
@@ -167,18 +186,18 @@ export async function walkDir(
167186
ide: IDE,
168187
_options?: WalkerOptions,
169188
): Promise<string[]> {
170-
let entries: string[] = [];
171-
const options = { ...defaultOptions, ..._options };
172-
const dfsWalker = new DFSWalker(path, ide, options);
173-
let relativePaths: string[] = [];
174-
for await (const e of dfsWalker.walk()) {
175-
relativePaths.push(e);
189+
let paths: string[] = [];
190+
for await (const p of walkDirAsync(path, ide, _options)) {
191+
paths.push(p);
176192
}
177-
const pathSep = await ide.pathSep();
178-
const prefix = options.returnRelativePaths ? "" : path + pathSep;
193+
return paths;
194+
}
179195

180-
if (pathSep === "/") {
181-
return relativePaths.map((p) => prefix + p);
182-
}
183-
return relativePaths.map((p) => prefix + p.split("/").join(pathSep));
196+
/**
 * Streaming variant of the directory walk: yields each path produced by
 * DFSWalker.walk() as it is discovered instead of collecting them into an array.
 *
 * @param path - root directory handed to the walker
 * @param ide - IDE instance handed to the walker
 * @param _options - overrides merged on top of `defaultOptions`
 */
export async function* walkDirAsync(
  path: string,
  ide: IDE,
  _options?: WalkerOptions,
): AsyncGenerator<string> {
  const mergedOptions = { ...defaultOptions, ..._options };
  const walker = new DFSWalker(path, ide, mergedOptions);
  yield* walker.walk();
}

0 commit comments

Comments
 (0)