@@ -7,7 +7,7 @@ import { LanceDbIndex } from "./LanceDbIndex.js";
7
7
import { ChunkCodebaseIndex } from "./chunk/ChunkCodebaseIndex.js" ;
8
8
import { getComputeDeleteAddRemove } from "./refreshIndex.js" ;
9
9
import { CodebaseIndex , IndexResultType } from "./types.js" ;
10
- import { walkDir } from "./walkDir.js" ;
10
+ import { walkDirAsync } from "./walkDir.js" ;
11
11
12
12
export class PauseToken {
13
13
constructor ( private _paused : boolean ) { }
@@ -62,25 +62,7 @@ export class CodebaseIndexer {
62
62
}
63
63
const branch = await this . ide . getBranch ( workspaceDir ) ;
64
64
const repoName = await this . ide . getRepoName ( workspaceDir ) ;
65
- const stats = await this . ide . getLastModified ( [ file ] ) ;
66
- const indexesToBuild = await this . getIndexesToBuild ( ) ;
67
- for ( const codebaseIndex of indexesToBuild ) {
68
- const tag : IndexTag = {
69
- directory : workspaceDir ,
70
- branch,
71
- artifactId : codebaseIndex . artifactId ,
72
- } ;
73
- const [ results , lastUpdated , markComplete ] = await getComputeDeleteAddRemove (
74
- tag ,
75
- { ...stats } ,
76
- ( filepath ) => this . ide . readFile ( filepath ) ,
77
- repoName ,
78
- ) ;
79
- for await ( const _ of codebaseIndex . update ( tag , results , markComplete , repoName ) ) {
80
- lastUpdated . forEach ( ( lastUpdated , path ) => {
81
- markComplete ( [ lastUpdated ] , IndexResultType . UpdateLastUpdated ) ;
82
- } ) ;
83
- }
65
+ for await ( const updateDesc of this . indexFiles ( workspaceDir , branch , repoName , [ file ] ) ) {
84
66
}
85
67
}
86
68
@@ -131,33 +113,40 @@ export class CodebaseIndexer {
131
113
desc : "Starting indexing..." ,
132
114
status : "loading" ,
133
115
} ;
116
+ const beginTime = Date . now ( ) ;
134
117
135
118
for ( const directory of workspaceDirs ) {
136
- const files = await walkDir ( directory , this . ide ) ;
137
- const stats = await this . ide . getLastModified ( files ) ;
119
+ const dirBasename = await this . basename ( directory ) ;
120
+ yield {
121
+ progress,
122
+ desc : `Discovering files in ${ dirBasename } ...` ,
123
+ status : "indexing"
124
+ } ;
125
+ // compute the number of files in this directory to display an accurate progress bar
126
+ let totalFileCount = 0 ;
127
+ for await ( const p of walkDirAsync ( directory , this . ide ) ) {
128
+ totalFileCount += 1 ;
129
+ if ( abortSignal . aborted ) {
130
+ yield {
131
+ progress : 1 ,
132
+ desc : "Indexing cancelled" ,
133
+ status : "disabled" ,
134
+ } ;
135
+ return ;
136
+ }
137
+ if ( this . pauseToken . paused ) {
138
+ yield * this . yieldUpdateAndPause ( ) ;
139
+ }
140
+ }
141
+
138
142
const branch = await this . ide . getBranch ( directory ) ;
139
143
const repoName = await this . ide . getRepoName ( directory ) ;
140
- let completedRelativeExpectedTime = 0 ;
141
-
142
- for ( const codebaseIndex of indexesToBuild ) {
143
- // TODO: IndexTag type should use repoName rather than directory
144
- const tag : IndexTag = {
145
- directory,
146
- branch,
147
- artifactId : codebaseIndex . artifactId ,
148
- } ;
149
- const [ results , lastUpdated , markComplete ] = await getComputeDeleteAddRemove (
150
- tag ,
151
- { ...stats } ,
152
- ( filepath ) => this . ide . readFile ( filepath ) ,
153
- repoName ,
154
- ) ;
144
+ const batchSize = this . getBatchSize ( totalFileCount ) ;
145
+ let completedFileCount = 0 ;
155
146
147
+ for await ( const files of this . walkDirInBatches ( directory , batchSize ) ) {
156
148
try {
157
- for await ( let {
158
- progress : indexProgress ,
159
- desc,
160
- } of codebaseIndex . update ( tag , results , markComplete , repoName ) ) {
149
+ for await ( const updateDesc of this . indexFiles ( directory , branch , repoName , files ) ) {
161
150
// Handle pausing in this loop because it's the only one really taking time
162
151
if ( abortSignal . aborted ) {
163
152
yield {
@@ -167,77 +156,125 @@ export class CodebaseIndexer {
167
156
} ;
168
157
return ;
169
158
}
170
-
171
159
if ( this . pauseToken . paused ) {
172
- yield {
173
- progress,
174
- desc : "Paused" ,
175
- status : "paused" ,
176
- } ;
177
- while ( this . pauseToken . paused ) {
178
- await new Promise ( ( resolve ) => setTimeout ( resolve , 100 ) ) ;
179
- }
160
+ yield * this . yieldUpdateAndPause ( ) ;
180
161
}
181
-
182
- progress =
183
- ( completedDirs +
184
- ( completedRelativeExpectedTime +
185
- Math . min ( 1.0 , indexProgress ) *
186
- codebaseIndex . relativeExpectedTime ) /
187
- totalRelativeExpectedTime ) /
188
- workspaceDirs . length ;
189
162
yield {
190
- progress,
191
- desc,
163
+ progress : progress ,
164
+ desc : updateDesc ,
192
165
status : "indexing" ,
193
166
} ;
194
167
}
168
+ } catch ( err ) {
169
+ yield this . handleErrorAndGetProgressUpdate ( err ) ;
170
+ return ;
171
+ }
172
+ completedFileCount += files . length ;
173
+ progress = completedFileCount / totalFileCount / workspaceDirs . length + completedDirs / workspaceDirs . length ;
174
+ this . logProgress ( beginTime , completedFileCount , progress ) ;
175
+ }
176
+ completedDirs += 1 ;
177
+ }
178
+ yield {
179
+ progress : 100 ,
180
+ desc : "Indexing Complete" ,
181
+ status : "done" ,
182
+ } ;
183
+ }
195
184
196
- lastUpdated . forEach ( ( lastUpdated , path ) => {
197
- markComplete ( [ lastUpdated ] , IndexResultType . UpdateLastUpdated ) ;
198
- } ) ;
185
/**
 * Logs a failure raised while indexing and turns it into a "failed" progress update.
 *
 * @param err The caught value; may be anything, since JavaScript allows throwing non-Error values.
 * @returns The update produced by errorToProgressUpdate for Error instances; otherwise an update
 *          with progress 0 whose description embeds the stringified value.
 */
private handleErrorAndGetProgressUpdate(err: unknown): IndexingProgressUpdate {
  console.log("error when indexing: ", err);
  // Only genuine Error objects carry a message that errorToProgressUpdate can inspect.
  return err instanceof Error
    ? this.errorToProgressUpdate(err)
    : {
        progress: 0,
        desc: `Indexing failed: ${err}`,
        status: "failed",
      };
}
199
196
200
- completedRelativeExpectedTime += codebaseIndex . relativeExpectedTime ;
201
- yield {
202
- progress :
203
- ( completedDirs +
204
- completedRelativeExpectedTime / totalRelativeExpectedTime ) /
205
- workspaceDirs . length ,
206
- desc : "Completed indexing " + codebaseIndex . artifactId ,
207
- status : "indexing" ,
208
- } ;
209
- } catch ( e : any ) {
210
- let errMsg = `${ e } ` ;
197
/**
 * Converts an Error thrown during indexing into a "failed" progress update.
 *
 * When the message matches the "Values length ... is less than the length ..." pattern, the
 * description is replaced by a hint that quotes both lengths and suggests deleting
 * ~/.continue/index. Any other error is stringified as-is.
 *
 * @param err The error caught while indexing.
 * @returns A progress update with progress 0 and status "failed".
 */
private errorToProgressUpdate(err: Error): IndexingProgressUpdate {
  // Capture group 1: actual values length; capture group 2: expected length.
  const errorRegex =
    /Invalid argument error: Values length (\d+) is less than the length \((\d+)\) multiplied by the value size \(\d+\)/;
  const match = err.message.match(errorRegex);
  let errMsg: string;
  if (match) {
    // Index 0 is the whole match; only the two captured lengths are needed.
    const [, valuesLength, expectedLength] = match;
    errMsg = `Generated embedding had length ${valuesLength} but was expected to be ${expectedLength}. This may be solved by deleting ~/.continue/index and refreshing the window to re-index.`;
  } else {
    errMsg = `${err}`;
  }
  return {
    progress: 0,
    desc: errMsg,
    status: "failed",
  };
}
211
214
212
- const errorRegex =
213
- / I n v a l i d a r g u m e n t e r r o r : V a l u e s l e n g t h ( \d + ) i s l e s s t h a n t h e l e n g t h \( ( \d + ) \) m u l t i p l i e d b y t h e v a l u e s i z e \( \d + \) / ;
214
- const match = e . message . match ( errorRegex ) ;
215
/**
 * Writes a one-line indexing throughput summary to the console.
 *
 * @param beginTime Timestamp (milliseconds, as returned by Date.now()) at which indexing started.
 * @param completedFileCount Number of files processed so far.
 * @param progress Overall progress as a fraction; it is multiplied by 100 for display.
 */
private logProgress(beginTime: number, completedFileCount: number, progress: number) {
  const elapsedMs = Date.now() - beginTime;
  const seconds = Math.round(elapsedMs / 1000);
  const progressPercentage = (progress * 100).toFixed(1);
  // Derive the rate from the unrounded elapsed time: the rounded second count is 0 during the
  // first half second, which previously produced "Infinity" or "NaN" file/sec.
  const filesPerSec =
    elapsedMs > 0 ? ((completedFileCount * 1000) / elapsedMs).toFixed(2) : "0.00";
  console.log(
    `Indexing: ${progressPercentage}% complete, elapsed time: ${seconds}s, ${filesPerSec} file/sec`,
  );
}
215
222
216
- if ( match ) {
217
- const [ _ , valuesLength , expectedLength ] = match ;
218
- errMsg = `Generated embedding had length ${ valuesLength } but was expected to be ${ expectedLength } . This may be solved by deleting ~/.continue/index and refreshing the window to re-index.` ;
219
- }
223
/**
 * Emits a single "paused" progress update, then waits until the pause token is released.
 *
 * The emitted update always carries progress 0. The pause flag is polled every 100 ms; the
 * generator finishes as soon as a poll observes that it is no longer set.
 */
private async *yieldUpdateAndPause(): AsyncGenerator<IndexingProgressUpdate> {
  const pausedUpdate: IndexingProgressUpdate = {
    progress: 0,
    desc: "Indexing Paused",
    status: "paused",
  };
  yield pausedUpdate;

  const pollIntervalMs = 100;
  const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
  for (; this.pauseToken.paused; ) {
    await sleep(pollIntervalMs);
  }
}
220
233
221
- yield {
222
- progress : 0 ,
223
- desc : errMsg ,
224
- status : "failed" ,
225
- } ;
234
/**
 * Chooses how many files to index per batch: one percent of the workspace (rounded down),
 * clamped to the range 10..100. The upper bound is reached at 10000 files.
 *
 * @param workspaceSize Total number of files in the workspace directory.
 * @returns The batch size to use.
 */
private getBatchSize(workspaceSize: number): number {
  const onePercent = Math.floor(workspaceSize / 100);
  if (onePercent < 10) {
    return 10;
  }
  if (onePercent > 100) {
    return 100;
  }
  return onePercent;
}
226
238
227
- console . warn (
228
- `Error updating the ${ codebaseIndex . artifactId } index: ${ e } ` ,
229
- ) ;
230
- return ;
231
- }
239
/**
 * Walks a directory and yields its file paths in arrays of at most `batchSize` entries.
 *
 * Enables the indexing operation to be completed in small batches; this is important in large
 * repositories where indexing can quickly use up all the memory available.
 *
 * @param directory Directory to walk with walkDirAsync.
 * @param batchSize Desired number of paths per batch. Fractional values are rounded down and
 *                  anything below 1 (or NaN) is treated as 1, so a bad value can no longer
 *                  cause the whole directory to be accumulated into a single batch.
 * @returns An async generator of path batches; the final batch may be shorter, and no empty
 *          batch is ever yielded.
 */
private async *walkDirInBatches(directory: string, batchSize: number): AsyncGenerator<string[]> {
  const limit = batchSize >= 1 ? Math.floor(batchSize) : 1;
  let batch: string[] = [];
  for await (const filePath of walkDirAsync(directory, this.ide)) {
    batch.push(filePath);
    if (batch.length >= limit) {
      yield batch;
      // Start a fresh array so the consumer keeps sole ownership of the yielded one.
      batch = [];
    }
  }
  // Flush whatever is left over after the walk ends.
  if (batch.length > 0) {
    yield batch;
  }
}
233
256
234
- completedDirs ++ ;
235
- progress = completedDirs / workspaceDirs . length ;
236
- yield {
237
- progress,
238
- desc : "Indexing Complete" ,
239
- status : "done" ,
257
/**
 * Updates every index returned by getIndexesToBuild() for the given files and yields the
 * description text of each progress event the index emits.
 *
 * For each index, the work list is computed with getComputeDeleteAddRemove from the files'
 * last-modified stats. Once an index's update() has been fully consumed, every entry that
 * getComputeDeleteAddRemove returned as `lastUpdated` is reported through markComplete with
 * IndexResultType.UpdateLastUpdated. This happens once per index, after the update loop, so it
 * is not repeated on every progress event and still runs when update() yields nothing.
 *
 * @param workspaceDir Workspace directory used as the tag's `directory`.
 * @param branch Branch name used in the tag.
 * @param repoName Repository name forwarded to getComputeDeleteAddRemove and update(); may be undefined.
 * @param filePaths Files to consider for (re)indexing.
 * @returns An async generator of progress descriptions.
 */
private async *indexFiles(
  workspaceDir: string,
  branch: string,
  repoName: string | undefined,
  filePaths: string[],
): AsyncGenerator<string> {
  const stats = await this.ide.getLastModified(filePaths);
  const indexesToBuild = await this.getIndexesToBuild();
  for (const codebaseIndex of indexesToBuild) {
    const tag: IndexTag = {
      directory: workspaceDir,
      branch,
      artifactId: codebaseIndex.artifactId,
    };
    const [results, lastUpdated, markComplete] = await getComputeDeleteAddRemove(
      tag,
      { ...stats },
      (filepath) => this.ide.readFile(filepath),
      repoName,
    );
    for await (const { desc } of codebaseIndex.update(tag, results, markComplete, repoName)) {
      yield desc;
    }
    // NOTE(review): if the consumer stops iterating early, this step is skipped for the
    // current index - confirm that is acceptable for cancellation.
    lastUpdated.forEach((entry) => {
      markComplete([entry], IndexResultType.UpdateLastUpdated);
    });
  }
}
243
280
@@ -250,4 +287,10 @@ export class CodebaseIndexer {
250
287
}
251
288
return undefined ;
252
289
}
290
+
291
/**
 * Returns the last non-empty segment of a path, using the separator reported by the IDE.
 *
 * Empty segments are ignored so that a trailing separator (e.g. "/repo/project/") still
 * resolves to "project" instead of an empty string. If the path contains no non-empty segment
 * at all (e.g. it is just the separator), the input is returned unchanged.
 *
 * @param filepath Path to take the final segment of.
 * @returns The final path segment, or `filepath` itself when there is none.
 */
private async basename(filepath: string): Promise<string> {
  const pathSep = await this.ide.pathSep();
  const segments = filepath.split(pathSep).filter((segment) => segment.length > 0);
  return segments.length > 0 ? segments[segments.length - 1] : filepath;
}
253
296
}
0 commit comments