Skip to content

Commit b7de9f1

Browse files
Bug: Fix issue with Pinecone custom namespaces not being created automatically (#38336)
1 parent 9e2b057 commit b7de9f1

File tree

6 files changed

+73
-6
lines changed

6 files changed

+73
-6
lines changed

airbyte-integrations/connectors/destination-pinecone/destination_pinecone/indexer.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,9 @@ def index(self, document_chunks, namespace, streamName):
137137
for batch in serial_batches:
138138
async_results = []
139139
for ids_vectors_chunk in create_chunks(batch, batch_size=PINECONE_BATCH_SIZE):
140-
async_result = self.pinecone_index.upsert(vectors=ids_vectors_chunk, async_req=True, show_progress=False)
140+
async_result = self.pinecone_index.upsert(
141+
vectors=ids_vectors_chunk, async_req=True, show_progress=False, namespace=namespace
142+
)
141143
async_results.append(async_result)
142144
# Wait for and retrieve responses (this raises in case of error)
143145
[async_result.result() for async_result in async_results]

airbyte-integrations/connectors/destination-pinecone/integration_tests/pinecone_integration_test.py

+61-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,18 @@
99

1010
from airbyte_cdk.destinations.vector_db_based.embedder import OPEN_AI_VECTOR_SIZE
1111
from airbyte_cdk.destinations.vector_db_based.test_utils import BaseIntegrationTest
12-
from airbyte_cdk.models import DestinationSyncMode, Status
12+
from airbyte_cdk.models import (
13+
AirbyteMessage,
14+
AirbyteRecordMessage,
15+
AirbyteStateMessage,
16+
AirbyteStream,
17+
ConfiguredAirbyteCatalog,
18+
ConfiguredAirbyteStream,
19+
DestinationSyncMode,
20+
Status,
21+
SyncMode,
22+
Type,
23+
)
1324
from destination_pinecone.destination import DestinationPinecone
1425
from langchain.embeddings import OpenAIEmbeddings
1526
from langchain.vectorstores import Pinecone
@@ -47,7 +58,14 @@ def tearDown(self):
4758
if "Namespace not found" not in str(e):
4859
raise(e)
4960
else :
50-
print("Noting to delete. No data in the index/namespace.")
61+
print("Nothing to delete in default namespace. No data in the index/namespace.")
62+
try:
63+
self.pinecone_index.delete(delete_all=True, namespace="ns1")
64+
except PineconeException as e:
65+
if "Namespace not found" not in str(e):
66+
raise(e)
67+
else :
68+
print("Nothing to delete in ns1 namespace. No data in the index/namespace.")
5169

5270
def test_integration_test_flag_is_set(self):
5371
assert "PYTEST_CURRENT_TEST" in os.environ
@@ -107,3 +125,44 @@ def test_write(self):
107125
vector_store = Pinecone(self.pinecone_index_rest, embeddings.embed_query, "text")
108126
result = vector_store.similarity_search("feline animals", 1)
109127
assert result[0].metadata["_ab_record_id"] == "mystream_2"
128+
129+
def test_write_with_namespace(self):
130+
catalog = self._get_configured_catalog_with_namespace(DestinationSyncMode.overwrite)
131+
first_state_message = self._state({"state": "1"})
132+
first_record_chunk = [self._record_with_namespace("mystream", f"Dogs are number {i}", i) for i in range(5)]
133+
134+
# initial sync
135+
destination = DestinationPinecone()
136+
list(destination.write(self.config, catalog, [*first_record_chunk, first_state_message]))
137+
138+
self._wait()
139+
assert self.pinecone_index.describe_index_stats().total_vector_count == 5
140+
141+
142+
def _get_configured_catalog_with_namespace(self, destination_mode: DestinationSyncMode) -> ConfiguredAirbyteCatalog:
143+
stream_schema = {"type": "object", "properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}, "random_col": {"type": "integer"}}}
144+
145+
overwrite_stream = ConfiguredAirbyteStream(
146+
stream=AirbyteStream(
147+
name="mystream",
148+
namespace="ns1",
149+
json_schema=stream_schema,
150+
supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh]
151+
),
152+
primary_key=[["int_col"]],
153+
sync_mode=SyncMode.incremental,
154+
destination_sync_mode=destination_mode,
155+
)
156+
157+
return ConfiguredAirbyteCatalog(streams=[overwrite_stream])
158+
159+
def _record_with_namespace(self, stream: str, str_value: str, int_value: int) -> AirbyteMessage:
160+
return AirbyteMessage(
161+
type=Type.RECORD, record=AirbyteRecordMessage(stream=stream,
162+
namespace="ns1",
163+
data={"str_col": str_value, "int_col": int_value},
164+
emitted_at=0)
165+
)
166+
167+
168+

airbyte-integrations/connectors/destination-pinecone/metadata.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ data:
1313
connectorSubtype: vectorstore
1414
connectorType: destination
1515
definitionId: 3d2b6f84-7f0d-4e3f-a5e5-7c7d4b50eabd
16-
dockerImageTag: 0.1.1
16+
dockerImageTag: 0.1.2
1717
dockerRepository: airbyte/destination-pinecone
1818
documentationUrl: https://docs.airbyte.com/integrations/destinations/pinecone
1919
githubIssueLabel: destination-pinecone

airbyte-integrations/connectors/destination-pinecone/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
44

55
[tool.poetry]
66
name = "airbyte-destination-pinecone"
7-
version = "0.1.1"
7+
version = "0.1.2"
88
description = "Airbyte destination implementation for Pinecone."
99
authors = ["Airbyte <[email protected]>"]
1010
license = "MIT"

airbyte-integrations/connectors/destination-pinecone/unit_tests/pinecone_indexer_test.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ def test_pinecone_index_upsert_and_delete(mock_describe_index):
105105
(ANY, [4, 5, 6], {"_ab_stream": "abc", "text": "test2"}),
106106
),
107107
async_req=True,
108-
show_progress=False
108+
show_progress=False,
109+
namespace="ns1",
109110
)
110111

111112

@@ -139,6 +140,7 @@ def test_pinecone_index_upsert_and_delete_starter(mock_describe_index, mock_dete
139140
),
140141
async_req=True,
141142
show_progress=False,
143+
namespace="ns1",
142144
)
143145

144146
def test_pinecone_index_upsert_and_delete_pod(mock_describe_index, mock_determine_spec_type):
@@ -168,6 +170,7 @@ def test_pinecone_index_upsert_and_delete_pod(mock_describe_index, mock_determin
168170
),
169171
async_req=True,
170172
show_progress=False,
173+
namespace="ns1",
171174
)
172175

173176
def test_pinecone_index_upsert_and_delete_serverless(mock_describe_index, mock_determine_spec_type):
@@ -197,6 +200,7 @@ def test_pinecone_index_upsert_and_delete_serverless(mock_describe_index, mock_d
197200
),
198201
async_req=True,
199202
show_progress=False,
203+
namespace="ns1",
200204
)
201205

202206

@@ -356,4 +360,5 @@ def test_metadata_normalization():
356360
vectors=((ANY, [1, 2, 3], {"_ab_stream": "abc", "text": "test", "small": "a", "id": 1}),),
357361
async_req=True,
358362
show_progress=False,
363+
namespace=None,
359364
)

docs/integrations/destinations/pinecone.md

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ OpenAI and Fake embeddings produce vectors with 1536 dimensions, and the Cohere
7676

7777
| Version | Date | Pull Request | Subject |
7878
| :------ | :--------- | :-------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------- |
79+
| 0.1.2 | 2023-05-17 | [#38336](https://github.com/airbytehq/airbyte/pull/38336) | Fix for regression: Custom namespaces not created automatically |
7980
| 0.1.1 | 2023-05-14 | [#38151](https://github.com/airbytehq/airbyte/pull/38151) | Add airbyte source tag for attribution |
8081
| 0.1.0 | 2023-05-06 | [#37756](https://github.com/airbytehq/airbyte/pull/37756) | Add support for Pinecone Serverless |
8182
| 0.0.24 | 2023-04-15 | [#37333](https://github.com/airbytehq/airbyte/pull/37333) | Update CDK & pytest version to fix security vulnerabilities. |

0 commit comments

Comments
 (0)