@@ -31,6 +31,10 @@ interface LanceDbRow {
31
31
[ key : string ] : any ;
32
32
}
33
33
34
/** A file to index, paired with the chunks produced from its contents. */
interface ItemWithChunks {
  item: PathAndCacheKey;
  chunks: Chunk[];
}

/** Chunk collections for a batch of files; entries are stored under the file's path. */
type ChunkMap = Map<string, ItemWithChunks>;
37
+
34
38
export class LanceDbIndex implements CodebaseIndex {
35
39
relativeExpectedTime : number = 13 ;
36
40
get artifactId ( ) : string {
@@ -85,77 +89,112 @@ export class LanceDbIndex implements CodebaseIndex {
85
89
) ;
86
90
}
87
91
88
- private async packToRows ( item : PathAndCacheKey ) : Promise < LanceDbRow [ ] > {
89
- const content = await this . readFile ( item . path ) ;
90
- if ( ! shouldChunk ( this . pathSep , item . path , content ) ) {
91
- return [ ] ;
92
/**
 * Builds the LanceDB rows for a batch of files: chunks every file, embeds all
 * chunks in a single provider call, discards chunks whose embedding came back
 * `undefined`, and converts the remainder into rows.
 *
 * @param items Files (path + cache key) to turn into rows.
 * @returns One row per chunk that received an embedding; empty when nothing
 *          was chunked.
 * @throws Error if the provider call fails (propagated from `getEmbeddings`)
 *         or if it returns a different number of embeddings than chunks.
 */
private async computeRows(items: PathAndCacheKey[]): Promise<LanceDbRow[]> {
  const chunkMap = await this.collectChunks(items);
  const allChunks = Array.from(chunkMap.values()).flatMap(
    ({ chunks }) => chunks,
  );

  // Nothing to embed: skip the provider round-trip entirely.
  if (allChunks.length === 0) {
    return [];
  }

  const embeddings = await this.getEmbeddings(allChunks);

  // Rows are assembled positionally (n-th chunk <-> n-th embedding), so a
  // length mismatch would attach vectors to the wrong chunks or leave rows
  // without a vector. Fail loudly instead of writing corrupt rows.
  if (embeddings.length !== allChunks.length) {
    throw new Error(
      `Unexpected lengths: ${allChunks.length} chunks but ${embeddings.length} embeddings returned by provider: ${this.embeddingsProvider.id}`,
    );
  }

  // Split into usable embeddings and the chunks that must be dropped.
  const keptEmbeddings: number[][] = [];
  const droppedChunks = new Set<Chunk>();
  allChunks.forEach((chunk, i) => {
    const embedding = embeddings[i];
    if (embedding === undefined) {
      droppedChunks.add(chunk);
    } else {
      keptEmbeddings.push(embedding);
    }
  });

  // Remove dropped chunks by object identity. This deliberately does not look
  // entries up through `chunk.filepath`: if that ever differed from the map
  // key, the embedding would be removed while the chunk stayed, shifting every
  // later vector onto the wrong chunk.
  if (droppedChunks.size > 0) {
    for (const entry of chunkMap.values()) {
      entry.chunks = entry.chunks.filter((c) => !droppedChunks.has(c));
    }
  }

  return this.createLanceDbRows(chunkMap, keptEmbeddings);
}
118
+
119
/**
 * Reads and chunks each file in turn, gathering the results into a map whose
 * entries are stored under the file's path.
 *
 * Files rejected by `shouldChunk` are left out. A file whose read or chunking
 * throws is logged and skipped so the rest of the batch still gets processed.
 *
 * @param items Files (path + cache key) to read and chunk.
 * @returns Map holding one entry per successfully chunked file.
 */
private async collectChunks(items: PathAndCacheKey[]): Promise<ChunkMap> {
  const collected: ChunkMap = new Map();

  for (const entry of items) {
    try {
      const text = await this.readFile(entry.path);

      if (shouldChunk(this.pathSep, entry.path, text)) {
        const fileChunks = await this.getChunks(entry, text);
        collected.set(entry.path, { item: entry, chunks: fileChunks });
      }
    } catch (err) {
      // Best effort: one bad file must not abort the whole batch.
      console.log(`LanceDBIndex, skipping ${entry.path} : ${err} `);
    }
  }

  return collected;
}
139
+
140
/**
 * Splits one file's content into chunks via `chunkDocument`.
 *
 * @param item    The file being chunked; its path and cache key are passed to
 *                the chunker as `filepath` and `digest`.
 * @param content Full text of the file.
 * @returns The chunks in the order the chunker yielded them.
 * @throws Error ("did not chunk properly") as soon as a chunk with empty
 *         content is produced.
 */
private async getChunks(
  item: PathAndCacheKey,
  content: string,
): Promise<Chunk[]> {
  const collected: Chunk[] = [];

  const chunkStream = chunkDocument({
    filepath: item.path,
    contents: content,
    maxChunkSize: this.embeddingsProvider.maxChunkSize,
    digest: item.cacheKey,
  });

  for await (const piece of chunkStream) {
    // An empty chunk means the file did not chunk properly; reject the file.
    if (piece.content.length === 0) {
      throw new Error("did not chunk properly");
    }

    collected.push(piece);
  }

  return collected;
}
127
163
128
- private async chunkListToEmbedding ( chunks : Chunk [ ] ) : Promise < number [ ] [ ] > {
129
- let embeddings : number [ ] [ ] ;
164
/**
 * Requests embeddings for the content of every given chunk in a single call
 * to the embeddings provider.
 *
 * @param chunks Chunks whose `content` is sent to the provider.
 * @returns Whatever the provider returns for those contents.
 * @throws Error wrapping the underlying failure (kept as `cause`) when the
 *         provider call throws.
 */
private async getEmbeddings(chunks: Chunk[]): Promise<number[][]> {
  try {
    const contents = chunks.map(({ content }) => content);
    const vectors = await this.embeddingsProvider.embed(contents);
    return vectors;
  } catch (cause) {
    throw new Error(
      `Failed to generate embeddings for ${chunks.length} chunks with provider: ${this.embeddingsProvider.id} : ${cause} `,
      { cause },
    );
  }
}
147
174
148
- private async computeRows ( items : PathAndCacheKey [ ] ) : Promise < LanceDbRow [ ] > {
149
- const rowChunkPromises = items . map ( this . packToRows . bind ( this ) ) ;
150
- const rowChunkLists = [ ] ;
151
- for ( let i = 0 ; i < items . length ; i ++ ) {
152
- try {
153
- rowChunkLists . push ( await rowChunkPromises [ i ] ) ;
154
- } catch ( err ) {
155
- console . log ( `LanceDBIndex, skipping ${ items [ i ] . path } : ${ err } ` ) ;
175
/**
 * Converts chunks and their embeddings into LanceDB rows.
 *
 * Embeddings are consumed positionally: the n-th chunk visited (map iteration
 * order, then chunk order within each entry) receives `embeddings[n]`.
 *
 * @param chunkMap   Chunk collections to convert; the map key becomes the
 *                   row's `path`.
 * @param embeddings Vectors in the same order the chunks are visited.
 * @returns One row per chunk, each with a freshly generated uuid.
 */
private createLanceDbRows(
  chunkMap: ChunkMap,
  embeddings: number[][],
): LanceDbRow[] {
  const rows: LanceDbRow[] = [];
  let cursor = 0;

  for (const entry of chunkMap.entries()) {
    const [path, { item, chunks }] = entry;

    for (const chunk of chunks) {
      const row: LanceDbRow = {
        path,
        cachekey: item.cacheKey,
        uuid: uuidv4(),
        vector: embeddings[cursor],
        startLine: chunk.startLine,
        endLine: chunk.endLine,
        contents: chunk.content,
      };
      rows.push(row);
      cursor += 1;
    }
  }

  return rows;
}
160
199
161
200
async * update (
0 commit comments