1
1
import { omit , throttle } from "es-toolkit" ;
2
2
import defaultDocLoaderPlugin , {
3
+ DocLoaderInput ,
3
4
type DocLoader ,
4
5
type DocLoaderPlugin ,
5
6
} from "./DocLoader" ;
6
7
import { DocManager } from "./DocManager" ;
7
8
import defaultDocSplitterPlugin , {
8
9
type DocSplitterPlugin ,
9
10
} from "./DocSplitter" ;
10
- import type { PluginWithConfig } from "./Plugin" ;
11
+ import type { Content , PluginWithConfig } from "./Plugin" ;
11
12
import { createMeilisearchClient , getExtFromPath } from "./Utils" ;
12
- import { type Config as MeiliSearchConfig , type SearchParams } from "meilisearch" ;
13
+ import { MeiliSearch , type SearchParams } from "meilisearch" ;
13
14
import { basename } from "path" ;
14
15
import { FSLayer , Scanner , Watcher } from "./FSLayer" ;
15
16
import { AnyZodObject } from "zod" ;
17
+ import { DBLayer } from "./DBLayer" ;
16
18
17
19
export interface DifyKnowledgeRequest {
18
20
knowledge_id : string ;
@@ -34,15 +36,16 @@ export interface DifyKnowledgeResponseRecord {
34
36
* DocBase 初始化配置
35
37
*/
36
38
export interface DocBaseOptions {
39
+ db : DBLayer
37
40
/**
38
41
* MeiliSearch 配置
39
42
*/
40
- meiliSearchConfig : MeiliSearchConfig ;
43
+ // meiliSearchConfig: MeiliSearchConfig;
41
44
/**
42
45
* 初始化知识库目录
43
46
* @default []
44
47
*/
45
- initPaths ?: string [ ] ;
48
+ // initPaths?: string[];
46
49
/**
47
50
* 初始化插件列表
48
51
* @default
@@ -51,25 +54,27 @@ export interface DocBaseOptions {
51
54
* { plugin: defaultDocSplitterPlugin, params: { len: 1000 } }
52
55
* ]
53
56
*/
54
- initPlugins ?: PluginWithConfig < any > [ ] ;
57
+ // initPlugins?: PluginWithConfig<any>[];
55
58
/**
56
59
* 是否在初始化时扫描初始化知识库目录
57
60
* @default false
58
61
*/
59
- initscan ?: boolean ;
62
+ // initscan?: boolean;
60
63
/**
61
64
* 索引前缀
62
65
*/
63
- indexPrefix ?: string ;
66
+ // indexPrefix?: string;
64
67
/**
65
68
* 文件变动时间节流时段(毫秒),在该时段内每个文件最多执行一次嵌入更新操作
66
69
*/
67
- fileOpThrottleMs ?: number ;
70
+ // fileOpThrottleMs?: number;
68
71
}
69
72
70
73
export class DocBase {
74
+ #meiliSearch: MeiliSearch ;
75
+
71
76
/** 文档管理器 */
72
- #docManager ! : DocManager ;
77
+ #docManagers: Map < string , DocManager > = new Map ( ) ;
73
78
74
79
/** 文档加载指向器,映射文件扩展名到文档加载器名称 */
75
80
#docExtToLoaderName: Map < string , string > = new Map ( ) ;
@@ -87,41 +92,44 @@ export class DocBase {
87
92
#docWatcher! : Watcher ;
88
93
89
94
/** 任务缓存器 */
90
- #watcherTaskCache = new Map < string , "remove" | "upsert" > ( ) ;
91
- // 节流器默认 500 毫秒
92
- fileOpThrottleMs : number = 500 ;
95
+ #watcherTaskCache = new Map < string , {
96
+ docManagerId : string
97
+ type : "remove" | "upsert"
98
+ } > ( ) ;
93
99
94
100
// 执行任务缓存器中的任务, 每 watcherTaskThrottleMs 毫秒最多执行一次
95
101
#doWatcherTask = throttle (
96
102
async ( ) => {
97
103
console . debug ( "Starting to execute watcher tasks..." ) ;
98
104
const results = await Promise . allSettled (
99
- this . #watcherTaskCache. entries ( ) . map ( async ( [ path , type ] ) => {
105
+ this . #watcherTaskCache. entries ( ) . map ( async ( [ path , { docManagerId, type } ] ) => {
106
+ const docManager = this . #docManagers. get ( docManagerId ) ;
100
107
if ( type === "upsert" ) {
101
108
console . debug ( `Upserting document: ${ path } ` ) ;
102
- await this . # docManager. upsertDoc ( path ) ;
109
+ await docManager . upsertDoc ( path ) ;
103
110
console . debug ( `Document upserted: ${ path } ` ) ;
104
111
} else if ( type === "remove" ) {
105
112
console . debug ( `Deleting document: ${ path } ` ) ;
106
- await this . # docManager. deleteDocByPath ( path ) ;
113
+ await docManager . deleteDocByPath ( path ) ;
107
114
console . debug ( `Document deleted: ${ path } ` ) ;
108
115
}
109
116
} )
110
117
) ;
111
118
console . debug ( "Watcher tasks execution completed." ) ;
112
119
return results ;
113
120
} ,
114
- this . fileOpThrottleMs ,
121
+ // 最小执行间隔为半秒
122
+ 500 ,
115
123
{ edges : [ "trailing" ] }
116
124
) ;
117
125
118
126
/** 获取挂载的知识库目录 */
119
- get dirs ( ) {
120
- console . info ( "Fetching watched directories..." ) ;
121
- const watchedPaths = this . #docWatcher. getWatchedPaths ( ) ;
122
- console . info ( "Watched directories fetched successfully." ) ;
123
- return watchedPaths ;
124
- }
127
+ // get dirs() {
128
+ // console.info("Fetching watched directories...");
129
+ // const watchedPaths = this.#docWatcher.getWatchedPaths();
130
+ // console.info("Watched directories fetched successfully.");
131
+ // return watchedPaths;
132
+ // }
125
133
126
134
/** 获取支持的文档类型 */
127
135
get exts ( ) {
@@ -176,7 +184,8 @@ export class DocBase {
176
184
* 扫描指定目录中的文档
177
185
* @param dirs - 要扫描的目录数组
178
186
*/
179
- #scan = async ( dirs : string [ ] ) => {
187
+ #scan = async ( id : string , dirs : string [ ] ) => {
188
+ const docManager = this . #docManagers. get ( id )
180
189
await this . #docScanner( {
181
190
dirs,
182
191
exts : Array . from ( this . #docExtToLoaderName. keys ( ) ) ,
@@ -185,7 +194,7 @@ export class DocBase {
185
194
await Promise . all (
186
195
paths . map ( async ( path ) => {
187
196
console . info ( `Upserting document during scan: ${ path } ` ) ;
188
- await this . # docManager. upsertDoc ( path ) ;
197
+ await docManager . upsertDoc ( path ) ;
189
198
console . info ( `Document upserted during scan: ${ path } ` ) ;
190
199
} )
191
200
) ;
@@ -197,34 +206,42 @@ export class DocBase {
197
206
198
207
/** 启动 docbase */
199
208
start = async ( {
200
- meiliSearchConfig,
209
+ // meiliSearchConfig,
201
210
// TODO 为区分多知识库的参数
202
- indexPrefix,
211
+ // indexPrefix,
203
212
// TODO 改成后期增删查改
204
- initPaths = [ ] ,
205
- initPlugins = [
213
+ // initPaths = [],
214
+
215
+ db,
216
+ // fileOpThrottleMs,
217
+ } : DocBaseOptions ) => {
218
+ console . info ( "Starting DocBase..." ) ;
219
+ // TODO 初始化配置?
220
+ const { meiliSearchConfig } = await db . config . get ( ) ;
221
+ // TODO
222
+ // @ts -ignore
223
+ this . #meiliSearch = await createMeilisearchClient ( meiliSearchConfig )
224
+
225
+ // 初始化插件
226
+ console . info ( "Loading all plugins..." ) ;
227
+ const plugins = await db . plugin . get ( )
228
+ const initPlugins : PluginWithConfig [ ] = [
206
229
{
207
230
plugin : defaultDocLoaderPlugin ,
208
- params : { } ,
231
+ config : { } ,
209
232
} ,
210
233
{
211
234
plugin : defaultDocSplitterPlugin ,
212
- params : {
235
+ config : {
213
236
len : 1000 ,
214
237
} ,
215
238
} ,
216
- ] ,
217
- initscan = false ,
218
- fileOpThrottleMs,
219
- } : DocBaseOptions ) => {
220
- console . info ( "Starting DocBase..." ) ;
221
- this . fileOpThrottleMs = fileOpThrottleMs ;
222
- console . info ( "Loading all plugins..." ) ;
239
+ ...plugins
240
+ ]
223
241
// 加载所有插件
224
242
await Promise . all (
225
243
initPlugins . map ( ( initPlugin ) => this . loadPlugin ( initPlugin ) )
226
244
) ;
227
- console . info ( "All plugins loaded successfully." ) ;
228
245
229
246
const docSplitterExist = typeof this . #docSplitter. func === "function" ;
230
247
const docLoadersExist = this . #docLoaders. size > 0 ;
@@ -235,16 +252,7 @@ export class DocBase {
235
252
console . error ( "Loaded components: \n" + msg ) ;
236
253
throw new Error ( "Loaded components: \n" + msg ) ;
237
254
}
238
-
239
- // 初始化文档管理器
240
- console . info ( "Initializing document manager..." ) ;
241
- this . #docManager = new DocManager ( {
242
- indexPrefix,
243
- meiliSearch : await createMeilisearchClient ( meiliSearchConfig ) ,
244
- docLoader : ( path ) => this . #hyperDocLoader( path ) ,
245
- docSplitter : ( text ) => this . #docSplitter. func ( text ) ,
246
- } ) ;
247
- await this . #docManager. init ( ) ;
255
+ console . info ( "All plugins loaded successfully." ) ;
248
256
249
257
// 初始化监视器扫描器
250
258
console . info ( "Initializing watcher and scanner..." ) ;
@@ -266,24 +274,43 @@ export class DocBase {
266
274
this . #docScanner = scanner ;
267
275
console . info ( "Watcher and scanner initialized successfully." ) ;
268
276
277
+ // 初始化文档管理器
278
+ console . info ( "Initializing DocManager..." ) ;
279
+ const docLoader = ( input : DocLoaderInput ) => this . #hyperDocLoader( input )
280
+ const docSplitter = ( content : AsyncIterable < Content > ) => this . #docSplitter. func ( content )
281
+ const base = await db . base . get ( )
282
+
283
+ base . map ( async ( { path, id } ) => {
284
+ const docm = new DocManager ( {
285
+ indexPrefix : id ,
286
+ meiliSearch : this . #meiliSearch,
287
+ docLoader,
288
+ docSplitter,
289
+ } ) ;
290
+ await docm . init ( )
291
+ // 扫描目录
292
+ await this . #scan( [ path ] ) ;
293
+ // 监视目录
294
+ this . #docWatcher. watch ( path )
295
+ return docm
296
+ } )
297
+ console . info ( "DocManager initialized successfully." ) ;
298
+
269
299
// 扫描加载默认目录下文档
270
- if ( initscan ) {
271
- console . info ( "Scanning initial directories..." ) ;
272
- await this . #scan( initPaths ) ;
273
- console . info ( "Initial directories scanned successfully." ) ;
274
- }
300
+ // console.info("Scanning initial directories...");
301
+ // console.info("Initial directories scanned successfully.");
275
302
276
303
// 开启监视,同步变动文档
277
- console . info ( "Starting to watch directories..." ) ;
278
- initPaths . map ( ( initPath ) => this . #docWatcher. watch ( initPath ) ) ;
279
- console . info ( "Directories are being watched." ) ;
304
+ // console.info("Starting to watch directories...");
305
+ // initPaths.map((initPath) => this.#docWatcher.watch(initPath));
306
+ // console.info("Directories are being watched.");
280
307
console . info ( "DocBase started successfully." ) ;
281
308
} ;
282
309
283
310
/**
284
311
* 立即扫描所有目录
285
312
*/
286
- scanAllNow = async ( ) => {
313
+ scanAllNow = async ( id : string ) => {
287
314
console . info ( "Starting to scan all directories immediately..." ) ;
288
315
await this . #scan( this . dirs ) ;
289
316
console . info ( "All directories scanned immediately." ) ;
@@ -361,9 +388,9 @@ export class DocBase {
361
388
pluginWithConfig : PluginWithConfig < T >
362
389
) => {
363
390
console . info ( `Loading ${ pluginWithConfig . plugin . pluginType } plugin ${ pluginWithConfig . plugin . name } ` ) ;
364
- const { plugin, params } = pluginWithConfig ;
391
+ const { plugin, config } = pluginWithConfig ;
365
392
366
- plugin . init && await plugin . init ( params )
393
+ plugin . init && await plugin . init ( config )
367
394
368
395
switch ( plugin . pluginType ) {
369
396
case "DocLoader" :
@@ -432,8 +459,10 @@ export class DocBase {
432
459
* @param opt - meilisearch 搜索选项
433
460
* @returns 返回搜索结果
434
461
*/
435
- search = async ( query : string , opt ?: SearchParams ) => {
436
- console . info ( `Searching for documents with query: ${ query } ` ) ;
462
+ search = async ( params ?: SearchParams & {
463
+ knowledgeId : string ;
464
+ } ) => {
465
+ console . info ( `Searching for documents with query: ${ params . q } ` ) ;
437
466
const results = await this . #docManager. search ( query , opt ) ;
438
467
console . info ( `Search completed. Found ${ results . length } documents.` ) ;
439
468
return results ;
@@ -442,22 +471,18 @@ export class DocBase {
442
471
/**
443
472
* 作为 Dify 外部知识库搜索
444
473
* @param params - Dify 知识库搜索请求参数
445
- * @param params.query - 搜索查询字符串
446
- * @param params.retrieval_setting - 检索设置
447
- * @param params.retrieval_setting.top_k - 返回结果的最大数量
448
- * @param params.retrieval_setting.score_threshold - 相关性得分阈值
449
474
* @returns 返回符合 Dify 格式的搜索结果数组
450
475
*/
451
476
difySearch = async (
452
477
params : DifyKnowledgeRequest
453
478
) : Promise < DifyKnowledgeResponseRecord [ ] > => {
454
479
console . info ( "Performing Dify search..." ) ;
455
- // 等待多知识库支持
456
- // params.knowledge_id;
457
480
const q = params . query ;
458
481
const { top_k, score_threshold } = params . retrieval_setting ;
459
482
460
- const results = await this . search ( q , {
483
+ const results = await this . search ( {
484
+ q,
485
+ knowledgeId : params . knowledge_id ,
461
486
limit : top_k ,
462
487
rankingScoreThreshold : score_threshold ,
463
488
showRankingScore : true ,
0 commit comments