Skip to content

fix: Weaviate - skip _split_overlap meta field #1173

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,14 @@ def _to_data_object(self, document: Document) -> Dict[str, Any]:
# The embedding vector is stored separately from the rest of the data
del data["embedding"]

# _split_overlap meta field is unsupported because of a bug
# https://github.com/deepset-ai/haystack-core-integrations/issues/1172
if "_split_overlap" in data:
data.pop("_split_overlap")
logger.warning(
"Document %s has the unsupported `_split_overlap` meta field. It will be ignored.", data["_original_id"]
)

if "sparse_embedding" in data:
sparse_embedding = data.pop("sparse_embedding", None)
if sparse_embedding:
Expand Down
24 changes: 24 additions & 0 deletions integrations/weaviate/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,30 @@ def test_comparison_less_than_equal_with_iso_date(self, document_store, filterab
# Thin override: calls the inherited test of the same name with the same
# `document_store` and `filterable_docs` arguments and returns its result.
# NOTE(review): any decorator on this override lies above the visible hunk — confirm.
def test_comparison_not_equal_with_dataframe(self, document_store, filterable_docs):
return super().test_comparison_not_equal_with_dataframe(document_store, filterable_docs)

def test_meta_split_overlap_is_skipped(self, document_store):
    """Check that a document carrying `_split_overlap` meta is stored without it.

    A single document whose meta includes a `_split_overlap` entry is written
    to the store and read back through `filter_documents()`. The content and
    the remaining meta fields must be returned, and `_split_overlap` must be
    absent from the returned meta.
    """
    text = "The moonlight shimmered "
    source_id = "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0"
    overlap_entry = {
        "doc_id": "68ed48ba830048c5d7815874ed2de794722e6d10866b6c55349a914fd9a0df65",
        "range": (0, 20),
    }
    metadata = {
        "source_id": source_id,
        "page_number": 1,
        "split_id": 0,
        "split_idx_start": 0,
        "_split_overlap": [overlap_entry],
    }

    document_store.write_documents([Document(content=text, meta=metadata)])

    # Only one document was written by this test, so the first result is inspected.
    stored = document_store.filter_documents()[0]

    assert stored.content == text
    assert stored.meta["source_id"] == source_id
    # Numeric meta is compared against float literals
    # (presumably the store returns numbers as floats — TODO confirm).
    assert stored.meta["page_number"] == 1.0
    assert stored.meta["split_id"] == 0.0
    assert stored.meta["split_idx_start"] == 0.0
    assert "_split_overlap" not in stored.meta

def test_bm25_retrieval(self, document_store):
document_store.write_documents(
[
Expand Down