Skip to content

Commit d135677

Browse files
add vdb document id index (#16244)
Co-authored-by: crazywoola <[email protected]>
1 parent cade0f6 commit d135677

File tree

5 files changed

+23
-23
lines changed

5 files changed

+23
-23
lines changed

api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,8 @@ def create_collection(
196196
Field.METADATA_KEY.value: {
197197
"type": "object",
198198
"properties": {
199-
"doc_id": {"type": "keyword"} # Map doc_id to keyword type
199+
"doc_id": {"type": "keyword"}, # Map doc_id to keyword type
200+
"document_id": {"type": "keyword"}, # Map document_id to keyword type
200201
},
201202
},
202203
}

api/core/rag/datasource/vdb/field.py

+1
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ class Field(Enum):
1111
TEXT_KEY = "text"
1212
PRIMARY_KEY = "id"
1313
DOC_ID = "metadata.doc_id"
14+
DOCUMENT_ID = "metadata.document_id"

api/core/rag/datasource/vdb/qdrant/qdrant_vector.py

+4
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,10 @@ def create_collection(self, collection_name: str, vector_size: int):
134134
self._client.create_payload_index(
135135
collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD
136136
)
137+
# create document_id payload index
138+
self._client.create_payload_index(
139+
collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD
140+
)
137141
# create full text index
138142
text_index_params = TextIndexParams(
139143
type=TextIndexType.TEXT,

api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py

+14-22
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ def create_collection(self, collection_name: str, vector_size: int):
144144
self._client.create_payload_index(
145145
collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD
146146
)
147+
# create document_id payload index
148+
self._client.create_payload_index(
149+
collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD
150+
)
147151
# create full text index
148152
text_index_params = TextIndexParams(
149153
type=TextIndexType.TEXT,
@@ -318,23 +322,17 @@ def text_exists(self, id: str) -> bool:
318322
def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
319323
from qdrant_client.http import models
320324

321-
filter = models.Filter(
322-
must=[
323-
models.FieldCondition(
324-
key="group_id",
325-
match=models.MatchValue(value=self._group_id),
326-
),
327-
],
328-
)
325+
filter = None
329326
document_ids_filter = kwargs.get("document_ids_filter")
330327
if document_ids_filter:
331-
if filter.must:
332-
filter.must.append(
328+
filter = models.Filter(
329+
must=[
333330
models.FieldCondition(
334331
key="metadata.document_id",
335332
match=models.MatchAny(any=document_ids_filter),
336333
)
337-
)
334+
],
335+
)
338336
results = self._client.search(
339337
collection_name=self._collection_name,
340338
query_vector=query_vector,
@@ -369,23 +367,17 @@ def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
369367
"""
370368
from qdrant_client.http import models
371369

372-
scroll_filter = models.Filter(
373-
must=[
374-
models.FieldCondition(
375-
key="page_content",
376-
match=models.MatchText(text=query),
377-
)
378-
]
379-
)
370+
scroll_filter = None
380371
document_ids_filter = kwargs.get("document_ids_filter")
381372
if document_ids_filter:
382-
if scroll_filter.must:
383-
scroll_filter.must.append(
373+
scroll_filter = models.Filter(
374+
must=[
384375
models.FieldCondition(
385376
key="metadata.document_id",
386377
match=models.MatchAny(any=document_ids_filter),
387378
)
388-
)
379+
]
380+
)
389381
response = self._client.scroll(
390382
collection_name=self._collection_name,
391383
scroll_filter=scroll_filter,

api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py

+2
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,12 @@ def _create_collection(self, dimension: int):
105105
text TEXT NOT NULL,
106106
meta JSON NOT NULL,
107107
doc_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.doc_id'))) STORED,
108+
document_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.document_id'))) STORED,
108109
vector VECTOR<FLOAT>({dimension}) NOT NULL,
109110
create_time DATETIME DEFAULT CURRENT_TIMESTAMP,
110111
update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
111112
KEY (doc_id),
113+
KEY (document_id),
112114
VECTOR INDEX idx_vector (({tidb_dist_func}(vector))) USING HNSW
113115
);
114116
""")

0 commit comments

Comments
 (0)