Skip to content

Commit 12cdc11

Browse files
lambda-scienceanakin87Anush008
authored
feat(Qdrant): start to work on sparse vector integration (#578)
* feat(Qdrant): start working on sparse vector integration * Progress towards Sparse vector support with Fastembed * __init__.py * merge batch results for hybrid request * feat(Qdrant): missing comma * feat(Qdrant): making some test progress * feat(Qdrant): all current tests are fixed * feat(Qdrant): linting * feat(Qdrant): working sparse retriever hooray * feat(Qdrant): fix hybrid retriever * feat(Qdrant): modify PR for haystack 2.1.0 with proper sparse vectors * feat(Qdrant): fix lint * test w Haystack main * fix deps * Update integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py Co-authored-by: Anush <[email protected]> * feat(Qdrant): remove hybrid & old code, constant for vector field names * Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Stefano Fiorucci <[email protected]> * Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Stefano Fiorucci <[email protected]> * feat(Qdrant): reverting pop change, changing Dict to SparseEmbedding type * feat(Qdrant): fix lint * feat(Qdrant): remove old todo * simplify documents_to_batch * feat(Qdrant): SparseEmbedding instead of Dict * feat(Qdrant): introducing the `use_sparse_embeddings` parameter for the document store to make sparse embeddings a non-breaking change. 
Need more testing * feat(Qdrant): `use_sparse_embeddings` true by default + bugfix * feat(Qdrant): `use_sparse_embeddings` true by default + bugfix * feat(Qdrant): `use_sparse_embeddings` true by default + bugfix * feat(Qdrant): bugfix * Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Anush <[email protected]> * Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Anush <[email protected]> * Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py Co-authored-by: Anush <[email protected]> * Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py Co-authored-by: Anush <[email protected]> * Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py Co-authored-by: Anush <[email protected]> * Revert "Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/document_store.py" This reverts commit f7cf65e. * feat(Qdrant): fixing test * Update integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py Co-authored-by: Anush <[email protected]> * feat(Qdrant): fixing creation * feat(Qdrant): fixing creation * little fixes * make changes nonbreaking * refactoring --------- Co-authored-by: anakin87 <[email protected]> Co-authored-by: Anush <[email protected]>
1 parent 363c7b5 commit 12cdc11

File tree

9 files changed

+704
-313
lines changed

9 files changed

+704
-313
lines changed

integrations/qdrant/pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ classifiers = [
2424
"Programming Language :: Python :: Implementation :: CPython",
2525
"Programming Language :: Python :: Implementation :: PyPy",
2626
]
27-
dependencies = ["haystack-ai", "qdrant-client"]
27+
dependencies = ["haystack-ai>=2.0.1", "qdrant-client"]
2828

2929
[project.urls]
3030
Source = "https://github.com/deepset-ai/haystack-core-integrations"
@@ -103,6 +103,8 @@ ignore = [
103103
"B027",
104104
# Allow boolean positional values in function calls, like `dict.get(... True)`
105105
"FBT003",
106+
# Allow boolean arguments in function definition
107+
"FBT001", "FBT002",
106108
# Ignore checks for possible passwords
107109
"S105",
108110
"S106",

integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5-
from .retriever import QdrantEmbeddingRetriever
5+
from .retriever import QdrantEmbeddingRetriever, QdrantSparseRetriever
66

7-
__all__ = ("QdrantEmbeddingRetriever",)
7+
__all__ = ("QdrantEmbeddingRetriever", "QdrantSparseRetriever")

integrations/qdrant/src/haystack_integrations/components/retrievers/qdrant/retriever.py

+122-3
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
from typing import Any, Dict, List, Optional
22

33
from haystack import Document, component, default_from_dict, default_to_dict
4+
from haystack.dataclasses.sparse_embedding import SparseEmbedding
45
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
56

67

78
@component
89
class QdrantEmbeddingRetriever:
910
"""
10-
A component for retrieving documents from an QdrantDocumentStore.
11+
A component for retrieving documents from an QdrantDocumentStore using dense vectors.
1112
1213
Usage example:
1314
```python
@@ -32,8 +33,8 @@ def __init__(
3233
document_store: QdrantDocumentStore,
3334
filters: Optional[Dict[str, Any]] = None,
3435
top_k: int = 10,
35-
scale_score: bool = True, # noqa: FBT001, FBT002
36-
return_embedding: bool = False, # noqa: FBT001, FBT002
36+
scale_score: bool = True,
37+
return_embedding: bool = False,
3738
):
3839
"""
3940
Create a QdrantEmbeddingRetriever component.
@@ -120,3 +121,121 @@ def run(
120121
)
121122

122123
return {"documents": docs}
124+
125+
126+
@component
class QdrantSparseRetriever:
    """
    A component for retrieving documents from a QdrantDocumentStore using sparse vectors.

    Usage example:
    ```python
    from haystack_integrations.components.retrievers.qdrant import QdrantSparseRetriever
    from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
    from haystack.dataclasses.sparse_embedding import SparseEmbedding

    document_store = QdrantDocumentStore(
        ":memory:",
        recreate_index=True,
        return_embedding=True,
        wait_result_from_api=True,
    )
    retriever = QdrantSparseRetriever(document_store=document_store)
    sparse_embedding = SparseEmbedding(indices=[0, 1, 2, 3], values=[0.1, 0.8, 0.05, 0.33])
    retriever.run(query_sparse_embedding=sparse_embedding)
    ```
    """

    def __init__(
        self,
        document_store: QdrantDocumentStore,
        filters: Optional[Dict[str, Any]] = None,
        top_k: int = 10,
        scale_score: bool = True,
        return_embedding: bool = False,
    ):
        """
        Create a QdrantSparseRetriever component.

        :param document_store: An instance of QdrantDocumentStore.
        :param filters: A dictionary with filters to narrow down the search space. Default is None.
        :param top_k: The maximum number of documents to retrieve. Default is 10.
        :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
        :param return_embedding: Whether to return the sparse embedding of the retrieved Documents. Default is False.

        :raises ValueError: If 'document_store' is not an instance of QdrantDocumentStore.
        """

        if not isinstance(document_store, QdrantDocumentStore):
            msg = "document_store must be an instance of QdrantDocumentStore"
            raise ValueError(msg)

        self._document_store = document_store
        self._filters = filters
        self._top_k = top_k
        self._scale_score = scale_score
        self._return_embedding = return_embedding

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        d = default_to_dict(
            self,
            document_store=self._document_store,
            filters=self._filters,
            top_k=self._top_k,
            scale_score=self._scale_score,
            return_embedding=self._return_embedding,
        )
        # The document store is replaced by its own serialized form so the
        # resulting dictionary contains plain data only.
        d["init_parameters"]["document_store"] = self._document_store.to_dict()

        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "QdrantSparseRetriever":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from. Its `init_parameters.document_store`
            entry is replaced in place by the rebuilt QdrantDocumentStore.
        :returns:
            Deserialized component.
        """
        document_store = QdrantDocumentStore.from_dict(data["init_parameters"]["document_store"])
        data["init_parameters"]["document_store"] = document_store
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(
        self,
        query_sparse_embedding: SparseEmbedding,
        filters: Optional[Dict[str, Any]] = None,
        top_k: Optional[int] = None,
        scale_score: Optional[bool] = None,
        return_embedding: Optional[bool] = None,
    ) -> Dict[str, List[Document]]:
        """
        Run the Sparse Embedding Retriever on the given input data.

        Any argument left as None falls back to the value given at construction time.

        :param query_sparse_embedding: Sparse Embedding of the query.
        :param filters: A dictionary with filters to narrow down the search space.
        :param top_k: The maximum number of documents to return.
        :param scale_score: Whether to scale the scores of the retrieved documents or not.
        :param return_embedding: Whether to return the embedding of the retrieved Documents.
        :returns:
            A dictionary with the retrieved documents under the `documents` key.
        """
        # The boolean flags must be compared against None: with `value or default`
        # an explicit False passed here would be silently replaced by a True
        # default chosen in __init__.
        effective_scale_score = self._scale_score if scale_score is None else scale_score
        effective_return_embedding = self._return_embedding if return_embedding is None else return_embedding

        docs = self._document_store.query_by_sparse(
            query_sparse_embedding=query_sparse_embedding,
            # Falsy values (empty filters, top_k of 0) deliberately fall back
            # to the construction-time settings, as before.
            filters=filters or self._filters,
            top_k=top_k or self._top_k,
            scale_score=effective_scale_score,
            return_embedding=effective_return_embedding,
        )

        return {"documents": docs}

integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py

+60-50
Original file line numberDiff line numberDiff line change
@@ -7,64 +7,74 @@
77

88
logger = logging.getLogger(__name__)
99

10+
DENSE_VECTORS_NAME = "text-dense"
11+
SPARSE_VECTORS_NAME = "text-sparse"
1012

11-
class HaystackToQdrant:
12-
"""A converter from Haystack to Qdrant types."""
1313

14-
UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d")
14+
UUID_NAMESPACE = uuid.UUID("3896d314-1e95-4a3a-b45a-945f9f0b541d")
1515

16-
def documents_to_batch(
17-
self,
18-
documents: List[Document],
19-
*,
20-
embedding_field: str,
21-
) -> List[rest.PointStruct]:
22-
points = []
23-
for document in documents:
24-
payload = document.to_dict(flatten=False)
16+
17+
def convert_haystack_documents_to_qdrant_points(
    documents: List[Document],
    *,
    embedding_field: str,
    use_sparse_embeddings: bool,
) -> List[rest.PointStruct]:
    """
    Build one Qdrant point per Haystack document.

    Each document is serialized with `to_dict(flatten=False)`; the embedding
    entries are removed from that dictionary and used as the point's vector(s),
    and what remains becomes the point payload.

    :param documents: Documents to convert.
    :param embedding_field: Key of the dense embedding in the serialized document.
    :param use_sparse_embeddings: When true, the point carries a dictionary of
        named vectors (dense and/or sparse, whichever are present). When false,
        the dense embedding is used as the vector directly, or an empty dict if
        it is missing or empty.
    :returns: The list of points, in the same order as `documents`.
    """

    def _build_point(doc):
        fields = doc.to_dict(flatten=False)

        if not use_sparse_embeddings:
            vectors = fields.pop(embedding_field) or {}
        else:
            dense = fields.pop(embedding_field, None)
            sparse = fields.pop("sparse_embedding", None)

            # Only vectors that are actually present are sent under their names.
            vectors = {}
            if dense is not None:
                vectors[DENSE_VECTORS_NAME] = dense
            if sparse is not None:
                vectors[SPARSE_VECTORS_NAME] = rest.SparseVector(**sparse)

        # The point id is derived deterministically from the document id.
        return rest.PointStruct(
            payload=fields,
            vector=vectors,
            id=convert_id(fields.get("id")),
        )

    return [_build_point(doc) for doc in documents]
49+
50+
51+
def convert_id(_id: str) -> str:
    """
    Map an arbitrary string id to a UUID-shaped hex string, deterministically.

    Qdrant does not accept arbitrary strings as point ids, so an internal id is
    derived for each point. The same input always yields the same output because
    the value is a version-5 UUID computed inside the fixed `UUID_NAMESPACE`.
    """
    derived = uuid.uuid5(UUID_NAMESPACE, _id)
    return derived.hex
5559

5660

5761
QdrantPoint = Union[rest.ScoredPoint, rest.Record]
5862

5963

60-
class QdrantToHaystack:
61-
def __init__(self, content_field: str, name_field: str, embedding_field: str):
62-
self.content_field = content_field
63-
self.name_field = name_field
64-
self.embedding_field = embedding_field
64+
def convert_qdrant_point_to_haystack_document(point: QdrantPoint, use_sparse_embeddings: bool) -> Document:
    """
    Rebuild a Haystack document from a Qdrant point.

    The point payload is copied and extended with the point's score (None when
    the point has no `score` attribute) and with its embedding(s).

    :param point: A scored point or a record returned by Qdrant.
    :param use_sparse_embeddings: When false, the point's vector is stored as the
        `embedding` as is. When true, the vector is read as a mapping of named
        vectors: the dense one becomes `embedding`, and the sparse one, if
        present, becomes `sparse_embedding`.
    :returns: The document created with `Document.from_dict`.
    """
    doc_fields = dict(point.payload)
    doc_fields["score"] = getattr(point, "score", None)

    vectors = getattr(point, "vector", None)

    if not use_sparse_embeddings:
        doc_fields["embedding"] = vectors
        return Document.from_dict(doc_fields)

    # With named vectors, nothing is added when the point carries no vector.
    if vectors is not None:
        doc_fields["embedding"] = vectors.get(DENSE_VECTORS_NAME)

        if SPARSE_VECTORS_NAME in vectors:
            sparse = vectors[SPARSE_VECTORS_NAME]
            doc_fields["sparse_embedding"] = {
                "indices": sparse.indices,
                "values": sparse.values,
            }

    return Document.from_dict(doc_fields)

0 commit comments

Comments
 (0)