Skip to content

Commit b4ff369

Browse files
authored
Make Document Stores initially skip SparseEmbedding (#606)
1 parent 2195623 commit b4ff369

File tree

8 files changed

+94
-7
lines changed

8 files changed

+94
-7
lines changed

integrations/astra/src/haystack_integrations/document_stores/astra/document_store.py

+9
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,15 @@ def _convert_input_document(document: Union[dict, Document]):
195195
document_dict["dataframe"] = document_dict.pop("dataframe").to_json()
196196
if embedding := document_dict.pop("embedding", []):
197197
document_dict["$vector"] = embedding
198+
if "sparse_embedding" in document_dict:
199+
sparse_embedding = document_dict.pop("sparse_embedding", None)
200+
if sparse_embedding:
201+
logger.warning(
202+
"Document %s has the `sparse_embedding` field set,"
203+
"but storing sparse embeddings in Astra is not currently supported."
204+
"The `sparse_embedding` field will be ignored.",
205+
document_dict["_id"],
206+
)
198207

199208
return document_dict
200209

integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py

+8
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,14 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
189189
if doc.embedding is not None:
190190
data["embeddings"] = [doc.embedding]
191191

192+
if hasattr(doc, "sparse_embedding") and doc.sparse_embedding is not None:
193+
logger.warning(
194+
"Document %s has the `sparse_embedding` field set,"
195+
"but storing sparse embeddings in Chroma is not currently supported."
196+
"The `sparse_embedding` field will be ignored.",
197+
doc.id,
198+
)
199+
192200
self._collection.add(**data)
193201

194202
return len(documents)

integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -216,16 +216,30 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
216216
policy = DuplicatePolicy.FAIL
217217

218218
action = "index" if policy == DuplicatePolicy.OVERWRITE else "create"
219-
documents_written, errors = helpers.bulk(
220-
client=self._client,
221-
actions=(
219+
220+
elasticsearch_actions = []
221+
for doc in documents:
222+
doc_dict = doc.to_dict()
223+
if "sparse_embedding" in doc_dict:
224+
sparse_embedding = doc_dict.pop("sparse_embedding", None)
225+
if sparse_embedding:
226+
logger.warning(
227+
"Document %s has the `sparse_embedding` field set,"
228+
"but storing sparse embeddings in Elasticsearch is not currently supported."
229+
"The `sparse_embedding` field will be ignored.",
230+
doc.id,
231+
)
232+
elasticsearch_actions.append(
222233
{
223234
"_op_type": action,
224235
"_id": doc.id,
225-
"_source": doc.to_dict(),
236+
"_source": doc_dict,
226237
}
227-
for doc in documents
228-
),
238+
)
239+
240+
documents_written, errors = helpers.bulk(
241+
client=self._client,
242+
actions=elasticsearch_actions,
229243
refresh="wait_for",
230244
index=self._index,
231245
raise_on_error=False,

integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,19 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
170170
if policy == DuplicatePolicy.NONE:
171171
policy = DuplicatePolicy.FAIL
172172

173-
mongo_documents = [doc.to_dict(flatten=False) for doc in documents]
173+
mongo_documents = []
174+
for doc in documents:
175+
doc_dict = doc.to_dict(flatten=False)
176+
if "sparse_embedding" in doc_dict:
177+
sparse_embedding = doc_dict.pop("sparse_embedding", None)
178+
if sparse_embedding:
179+
logger.warning(
180+
"Document %s has the `sparse_embedding` field set,"
181+
"but storing sparse embeddings in MongoDB Atlas is not currently supported."
182+
"The `sparse_embedding` field will be ignored.",
183+
doc.id,
184+
)
185+
mongo_documents.append(doc_dict)
174186
operations: List[Union[UpdateOne, InsertOne, ReplaceOne]]
175187
written_docs = len(documents)
176188

integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py

+10
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,16 @@ def _from_haystack_to_pg_documents(documents: List[Document]) -> List[Dict[str,
415415
db_document["dataframe"] = Jsonb(db_document["dataframe"]) if db_document["dataframe"] else None
416416
db_document["meta"] = Jsonb(db_document["meta"])
417417

418+
if "sparse_embedding" in db_document:
419+
sparse_embedding = db_document.pop("sparse_embedding", None)
420+
if sparse_embedding:
421+
logger.warning(
422+
"Document %s has the `sparse_embedding` field set,"
423+
"but storing sparse embeddings in Pgvector is not currently supported."
424+
"The `sparse_embedding` field will be ignored.",
425+
db_document["id"],
426+
)
427+
418428
db_documents.append(db_document)
419429

420430
return db_documents

integrations/pinecone/src/haystack_integrations/document_stores/pinecone/document_store.py

+7
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,13 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li
292292
"objects in Pinecone is not supported. "
293293
"The content of the `blob` field will be ignored."
294294
)
295+
if hasattr(document, "sparse_embedding") and document.sparse_embedding is not None:
296+
logger.warning(
297+
"Document %s has the `sparse_embedding` field set,"
298+
"but storing sparse embeddings in Pinecone is not currently supported."
299+
"The `sparse_embedding` field will be ignored.",
300+
document.id,
301+
)
295302

296303
documents_for_pinecone.append(doc_for_pinecone)
297304
return documents_for_pinecone

integrations/qdrant/src/haystack_integrations/document_stores/qdrant/converters.py

+14
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
import logging
12
import uuid
23
from typing import List, Union
34

45
from haystack.dataclasses import Document
56
from qdrant_client.http import models as rest
67

8+
logger = logging.getLogger(__name__)
9+
710

811
class HaystackToQdrant:
912
"""A converter from Haystack to Qdrant types."""
@@ -22,6 +25,17 @@ def documents_to_batch(
2225
vector = payload.pop(embedding_field) or {}
2326
_id = self.convert_id(payload.get("id"))
2427

28+
# TODO: remove as soon as we introduce the support for sparse embeddings in Qdrant
29+
if "sparse_embedding" in payload:
30+
sparse_embedding = payload.pop("sparse_embedding", None)
31+
if sparse_embedding:
32+
logger.warning(
33+
"Document %s has the `sparse_embedding` field set,"
34+
"but storing sparse embeddings in Qdrant is not currently supported."
35+
"The `sparse_embedding` field will be ignored.",
36+
payload["id"],
37+
)
38+
2539
point = rest.PointStruct(
2640
payload=payload,
2741
vector=vector,

integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py

+13
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import base64
55
import datetime
66
import json
7+
import logging
78
from dataclasses import asdict
89
from typing import Any, Dict, List, Optional, Tuple, Union
910

@@ -21,6 +22,8 @@
2122
from ._filters import convert_filters
2223
from .auth import AuthCredentials
2324

25+
logger = logging.getLogger(__name__)
26+
2427
Number = Union[int, float]
2528
TimeoutType = Union[Tuple[Number, Number], Number]
2629

@@ -224,6 +227,16 @@ def _to_data_object(self, document: Document) -> Dict[str, Any]:
224227
# The embedding vector is stored separately from the rest of the data
225228
del data["embedding"]
226229

230+
if "sparse_embedding" in data:
231+
sparse_embedding = data.pop("sparse_embedding", None)
232+
if sparse_embedding:
233+
logger.warning(
234+
"Document %s has the `sparse_embedding` field set,"
235+
"but storing sparse embeddings in Weaviate is not currently supported."
236+
"The `sparse_embedding` field will be ignored.",
237+
data["_original_id"],
238+
)
239+
227240
return data
228241

229242
def _to_document(self, data: DataObject[Dict[str, Any], None]) -> Document:

0 commit comments

Comments (0)