Skip to content

Commit 9b33a2f

Browse files
Merge pull request #1978 from continuedev/pe/batch-embed-requests
feat: batch embedding requests
2 parents df63182 + b6fedb5 commit 9b33a2f

File tree

3 files changed

+100
-63
lines changed

3 files changed

+100
-63
lines changed

core/indexing/CodebaseIndexer.ts

+14-16
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ export class PauseToken {
2828
}
2929

3030
export class CodebaseIndexer {
31+
/**
32+
* We batch for two reasons:
33+
* - To limit memory usage for indexes that perform computations locally, e.g. FTS
34+
* - To make as few requests as possible to the embeddings providers
35+
*/
36+
filesPerBatch = 1000;
37+
3138
// Note that we exclude certain Sqlite errors that we do not want to clear the indexes on,
3239
// e.g. a `SQLITE_BUSY` error.
3340
errorsRegexesToClearIndexesOn = [
@@ -303,33 +310,27 @@ export class CodebaseIndexer {
303310
}
304311
}
305312

306-
private getBatchSize(workspaceSize: number): number {
307-
return 100;
308-
}
309-
310313
/*
311-
* enables the indexing operation to be completed in small batches, this is important in large
314+
* Enables the indexing operation to be completed in batches, this is important in large
312315
* repositories where indexing can quickly use up all the memory available
313316
*/
314317
private *batchRefreshIndexResults(
315318
results: RefreshIndexResults,
316-
workspaceSize: number,
317319
): Generator<RefreshIndexResults> {
318320
let curPos = 0;
319-
const batchSize = this.getBatchSize(workspaceSize);
320321
while (
321322
curPos < results.compute.length ||
322323
curPos < results.del.length ||
323324
curPos < results.addTag.length ||
324325
curPos < results.removeTag.length
325326
) {
326327
yield {
327-
compute: results.compute.slice(curPos, curPos + batchSize),
328-
del: results.del.slice(curPos, curPos + batchSize),
329-
addTag: results.addTag.slice(curPos, curPos + batchSize),
330-
removeTag: results.removeTag.slice(curPos, curPos + batchSize),
328+
compute: results.compute.slice(curPos, curPos + this.filesPerBatch),
329+
del: results.del.slice(curPos, curPos + this.filesPerBatch),
330+
addTag: results.addTag.slice(curPos, curPos + this.filesPerBatch),
331+
removeTag: results.removeTag.slice(curPos, curPos + this.filesPerBatch),
331332
};
332-
curPos += batchSize;
333+
curPos += this.filesPerBatch;
333334
}
334335
}
335336

@@ -367,10 +368,7 @@ export class CodebaseIndexer {
367368
results.addTag.length +
368369
results.removeTag.length;
369370
let completedOps = 0;
370-
for (const subResult of this.batchRefreshIndexResults(
371-
results,
372-
workspaceFiles.length,
373-
)) {
371+
for (const subResult of this.batchRefreshIndexResults(results)) {
374372
for await (const { desc } of codebaseIndex.update(
375373
tag,
376374
subResult,

core/indexing/LanceDbIndex.ts

+84-45
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ interface LanceDbRow {
3131
[key: string]: any;
3232
}
3333

34+
type ItemWithChunks = { item: PathAndCacheKey; chunks: Chunk[] };
35+
36+
type ChunkMap = Map<string, ItemWithChunks>;
37+
3438
export class LanceDbIndex implements CodebaseIndex {
3539
relativeExpectedTime: number = 13;
3640
get artifactId(): string {
@@ -85,77 +89,112 @@ export class LanceDbIndex implements CodebaseIndex {
8589
);
8690
}
8791

88-
private async packToRows(item: PathAndCacheKey): Promise<LanceDbRow[]> {
89-
const content = await this.readFile(item.path);
90-
if (!shouldChunk(this.pathSep, item.path, content)) {
91-
return [];
92+
private async computeRows(items: PathAndCacheKey[]): Promise<LanceDbRow[]> {
93+
const chunkMap = await this.collectChunks(items);
94+
const allChunks = Array.from(chunkMap.values()).flatMap(
95+
({ chunks }) => chunks,
96+
);
97+
const embeddings = await this.getEmbeddings(allChunks);
98+
99+
// Remove undefined embeddings and their corresponding chunks
100+
for (let i = embeddings.length - 1; i >= 0; i--) {
101+
if (embeddings[i] === undefined) {
102+
const chunk = allChunks[i];
103+
const chunks = chunkMap.get(chunk.filepath)?.chunks;
104+
105+
if (chunks) {
106+
const index = chunks.findIndex((c) => c === chunk);
107+
if (index !== -1) {
108+
chunks.splice(index, 1);
109+
}
110+
}
111+
112+
embeddings.splice(i, 1);
113+
}
92114
}
115+
116+
return this.createLanceDbRows(chunkMap, embeddings);
117+
}
118+
119+
private async collectChunks(items: PathAndCacheKey[]): Promise<ChunkMap> {
120+
const chunkMap: ChunkMap = new Map();
121+
122+
for (const item of items) {
123+
try {
124+
const content = await this.readFile(item.path);
125+
126+
if (!shouldChunk(this.pathSep, item.path, content)) {
127+
continue;
128+
}
129+
130+
const chunks = await this.getChunks(item, content);
131+
chunkMap.set(item.path, { item, chunks });
132+
} catch (err) {
133+
console.log(`LanceDBIndex, skipping ${item.path}: ${err}`);
134+
}
135+
}
136+
137+
return chunkMap;
138+
}
139+
140+
private async getChunks(
141+
item: PathAndCacheKey,
142+
content: string,
143+
): Promise<Chunk[]> {
93144
const chunks: Chunk[] = [];
145+
94146
const chunkParams = {
95147
filepath: item.path,
96148
contents: content,
97149
maxChunkSize: this.embeddingsProvider.maxChunkSize,
98150
digest: item.cacheKey,
99151
};
152+
100153
for await (const chunk of chunkDocument(chunkParams)) {
101154
if (chunk.content.length === 0) {
102-
// File did not chunk properly, let's skip it.
103155
throw new Error("did not chunk properly");
104156
}
157+
105158
chunks.push(chunk);
106159
}
107-
const embeddings = await this.chunkListToEmbedding(chunks);
108-
if (chunks.length !== embeddings.length) {
109-
throw new Error(
110-
`Unexpected lengths: chunks and embeddings do not match for ${item.path}`,
111-
);
112-
}
113-
const results = [];
114-
for (let i = 0; i < chunks.length; i++) {
115-
results.push({
116-
path: item.path,
117-
cachekey: item.cacheKey,
118-
uuid: uuidv4(),
119-
vector: embeddings[i],
120-
startLine: chunks[i].startLine,
121-
endLine: chunks[i].endLine,
122-
contents: chunks[i].content,
123-
});
124-
}
125-
return results;
160+
161+
return chunks;
126162
}
127163

128-
private async chunkListToEmbedding(chunks: Chunk[]): Promise<number[][]> {
129-
let embeddings: number[][];
164+
private async getEmbeddings(chunks: Chunk[]): Promise<number[][]> {
130165
try {
131-
embeddings = await this.embeddingsProvider.embed(
132-
chunks.map((c) => c.content),
133-
);
166+
return await this.embeddingsProvider.embed(chunks.map((c) => c.content));
134167
} catch (err) {
135168
throw new Error(
136-
`Failed to generate embedding for ${chunks[0]?.filepath} with provider: ${this.embeddingsProvider.id}: ${err}`,
169+
`Failed to generate embeddings for ${chunks.length} chunks with provider: ${this.embeddingsProvider.id}: ${err}`,
137170
{ cause: err },
138171
);
139172
}
140-
if (embeddings.some((emb) => emb === undefined)) {
141-
throw new Error(
142-
`Empty embedding returned for ${chunks[0]?.filepath} with provider: ${this.embeddingsProvider.id}`,
143-
);
144-
}
145-
return embeddings;
146173
}
147174

148-
private async computeRows(items: PathAndCacheKey[]): Promise<LanceDbRow[]> {
149-
const rowChunkPromises = items.map(this.packToRows.bind(this));
150-
const rowChunkLists = [];
151-
for (let i = 0; i < items.length; i++) {
152-
try {
153-
rowChunkLists.push(await rowChunkPromises[i]);
154-
} catch (err) {
155-
console.log(`LanceDBIndex, skipping ${items[i].path}: ${err}`);
175+
private createLanceDbRows(
176+
chunkMap: ChunkMap,
177+
embeddings: number[][],
178+
): LanceDbRow[] {
179+
const results: LanceDbRow[] = [];
180+
let embeddingIndex = 0;
181+
182+
for (const [path, { item, chunks }] of chunkMap) {
183+
for (const chunk of chunks) {
184+
results.push({
185+
path,
186+
cachekey: item.cacheKey,
187+
uuid: uuidv4(),
188+
vector: embeddings[embeddingIndex],
189+
startLine: chunk.startLine,
190+
endLine: chunk.endLine,
191+
contents: chunk.content,
192+
});
193+
embeddingIndex++;
156194
}
157195
}
158-
return rowChunkLists.flat();
196+
197+
return results;
159198
}
160199

161200
async *update(

extensions/vscode/package-lock.json

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)