5
5
import uuid
6
6
from typing import Optional
7
7
8
- import pinecone
9
8
import urllib3
10
9
from airbyte_cdk .destinations .vector_db_based .document_processor import METADATA_RECORD_ID_FIELD , METADATA_STREAM_FIELD
11
10
from airbyte_cdk .destinations .vector_db_based .indexer import Indexer
12
11
from airbyte_cdk .destinations .vector_db_based .utils import create_chunks , create_stream_identifier , format_exception
12
+ from airbyte_cdk .models import AirbyteConnectionStatus , Status
13
13
from airbyte_cdk .models .airbyte_protocol import ConfiguredAirbyteCatalog , DestinationSyncMode
14
14
from destination_pinecone .config import PineconeIndexingModel
15
+ from pinecone import PineconeException
16
+ from pinecone .grpc import PineconeGRPC
15
17
16
18
# large enough to speed up processing, small enough to not hit pinecone request limits
17
19
PINECONE_BATCH_SIZE = 40
@@ -29,32 +31,54 @@ class PineconeIndexer(Indexer):
29
31
30
32
def __init__(self, config: PineconeIndexingModel, embedding_dimensions: int):
    """Create the indexer and open a gRPC connection to the configured Pinecone index.

    Args:
        config: Pinecone connector configuration (API key and target index name).
        embedding_dimensions: Dimensionality the embedding model produces; stored
            for later validation against the index and for zero-vector queries.

    Raises:
        PineconeException: if the Pinecone gRPC client cannot be created.
    """
    super().__init__(config)
    # BUG FIX: the previous code did `return AirbyteConnectionStatus(...)` inside
    # an except block here. Returning a non-None value from __init__ raises
    # TypeError, which would have masked the real connection error. Let the
    # PineconeException propagate instead; connection problems are reported
    # through check(), where a status object is actually consumable.
    self.pc = PineconeGRPC(api_key=config.pinecone_key, threaded=True)

    self.pinecone_index = self.pc.Index(config.index)
    self.embedding_dimensions = embedding_dimensions
36
41
42
def determine_spec_type(self, index_name):
    """Classify the given index as "pod" or "serverless".

    Reads the "spec" section of Pinecone's index description and returns
    whichever known spec key is present.

    Raises:
        ValueError: when the description contains neither spec key.
    """
    spec = self.pc.describe_index(index_name).get("spec", {})
    for spec_type in ("pod", "serverless"):
        if spec_type in spec:
            return spec_type
    raise ValueError("Unknown index specification type.")
51
+
37
52
def pre_sync(self, catalog: ConfiguredAirbyteCatalog):
    """Prepare the index for a sync run.

    Records the index spec type (pod/serverless/starter drives deletion
    strategy later) and, for every stream configured in overwrite mode,
    removes that stream's previously indexed vectors.
    """
    self._pod_type = self.determine_spec_type(self.config.index)

    for configured_stream in catalog.streams:
        identifier = create_stream_identifier(configured_stream.stream)
        if configured_stream.destination_sync_mode != DestinationSyncMode.overwrite:
            continue
        self.delete_vectors(
            filter={METADATA_STREAM_FIELD: identifier},
            namespace=configured_stream.stream.namespace,
            prefix=identifier,
        )
45
61
46
62
def post_sync(self):
    """No post-sync cleanup is needed; emit no AirbyteMessages."""
    return []
48
64
49
def delete_vectors(self, filter, namespace=None, prefix=None):
    """Delete vectors matching *filter*, dispatching on the index type.

    Starter pods cannot delete by metadata filter directly, so matching ids
    are resolved via bounded queries first; serverless indexes only support
    deletion by id prefix; pod-based indexes delete by metadata filter natively.

    Args:
        filter: metadata filter identifying the vectors to remove.
        namespace: optional Pinecone namespace to operate in.
        prefix: vector-id prefix; required when the index is serverless.

    Raises:
        ValueError: if the index is serverless and no prefix was supplied.
    """
    if self._pod_type == "starter":
        # Starter pod types have a maximum of 100000 rows
        top_k = 10000
        self.delete_by_metadata(filter, top_k, namespace)
    elif self._pod_type == "serverless":
        # BUG FIX (idiom): compare to None with `is`, not `==` (PEP 8).
        if prefix is None:
            raise ValueError("Prefix is required for a serverless index.")
        self.delete_by_prefix(prefix=prefix, namespace=namespace)
    else:
        # Pod spec supports metadata-filter deletes directly.
        self.pinecone_index.delete(filter=filter, namespace=namespace)
56
77
57
78
def delete_by_metadata (self , filter , top_k , namespace = None ):
79
+ """
80
+ Applicable to Starter implementation only. Deletes all vectors that match the given metadata filter.
81
+ """
58
82
zero_vector = [0.0 ] * self .embedding_dimensions
59
83
query_result = self .pinecone_index .query (vector = zero_vector , filter = filter , top_k = top_k , namespace = namespace )
60
84
while len (query_result .matches ) > 0 :
@@ -66,6 +90,13 @@ def delete_by_metadata(self, filter, top_k, namespace=None):
66
90
self .pinecone_index .delete (ids = list (batch ), namespace = namespace )
67
91
query_result = self .pinecone_index .query (vector = zero_vector , filter = filter , top_k = top_k , namespace = namespace )
68
92
93
def delete_by_prefix(self, prefix, namespace=None):
    """
    Applicable to Serverless implementation only. Deletes all vectors whose
    id starts with *prefix*. Pinecone's list() yields pages of matching ids;
    each page is deleted in turn.
    """
    index = self.pinecone_index
    for id_page in index.list(prefix=prefix, namespace=namespace):
        index.delete(ids=id_page, namespace=namespace)
99
+
69
100
def _truncate_metadata (self , metadata : dict ) -> dict :
70
101
"""
71
102
Normalize metadata to ensure it is within the size limit and doesn't contain complex objects.
@@ -85,34 +116,45 @@ def _truncate_metadata(self, metadata: dict) -> dict:
85
116
86
117
return result
87
118
88
def index(self, document_chunks, namespace, streamName):
    """Upsert embedded document chunks into the Pinecone index.

    Each chunk becomes one vector whose id is "<streamName>#<uuid4>" — the
    stream-name prefix is what delete_by_prefix relies on for serverless
    deletes. Upserts are issued asynchronously in chunks of
    PINECONE_BATCH_SIZE, PARALLELISM_LIMIT chunks per serial batch.

    Args:
        document_chunks: chunks exposing .embedding, .metadata and .page_content.
        namespace: Pinecone namespace to write into.
        streamName: stream identifier used as the vector-id prefix.
    """
    pinecone_docs = []
    for chunk in document_chunks:
        metadata = self._truncate_metadata(chunk.metadata)
        if chunk.page_content is not None:
            metadata["text"] = chunk.page_content
        pinecone_docs.append((f"{streamName}#{uuid.uuid4()}", chunk.embedding, metadata))
    serial_batches = create_chunks(pinecone_docs, batch_size=PINECONE_BATCH_SIZE * PARALLELISM_LIMIT)
    for batch in serial_batches:
        async_results = [
            # BUG FIX: `namespace` was accepted but no longer forwarded to
            # upsert(), so every vector was silently written to the default
            # namespace. Restore the pass-through.
            self.pinecone_index.upsert(vectors=ids_vectors_chunk, async_req=True, show_progress=False, namespace=namespace)
            for ids_vectors_chunk in create_chunks(batch, batch_size=PINECONE_BATCH_SIZE)
        ]
        # Wait for and retrieve responses (this raises in case of error)
        for async_result in async_results:
            async_result.result()
104
136
105
137
def delete(self, delete_ids, namespace, stream):
    """Remove the vectors for the given record ids, dispatching on index type.

    Serverless indexes delete directly by id; starter pods must resolve the
    ids through metadata queries; pod-based indexes delete by metadata filter.
    """
    if len(delete_ids) == 0:
        return
    # Filter on the record-id metadata field (named to avoid shadowing builtin filter()).
    record_filter = {METADATA_RECORD_ID_FIELD: {"$in": delete_ids}}
    if self._pod_type == "starter":
        # Starter pod types have a maximum of 100000 rows
        self.delete_by_metadata(filter=record_filter, top_k=10000, namespace=namespace)
    elif self._pod_type == "serverless":
        self.pinecone_index.delete(ids=delete_ids, namespace=namespace)
    else:
        # Pod spec
        self.pinecone_index.delete(filter=record_filter, namespace=namespace)
108
149
109
150
def check (self ) -> Optional [str ]:
110
151
try :
111
- indexes = pinecone .list_indexes ()
112
- if self .config .index not in indexes :
152
+ list = self .pc .list_indexes ()
153
+ index_names = [index ["name" ] for index in list .indexes ]
154
+ if self .config .index not in index_names :
113
155
return f"Index { self .config .index } does not exist in environment { self .config .pinecone_environment } ."
114
156
115
- description = pinecone .describe_index (self .config .index )
157
+ description = self . pc .describe_index (self .config .index )
116
158
actual_dimension = int (description .dimension )
117
159
if actual_dimension != self .embedding_dimensions :
118
160
return f"Your embedding configuration will produce vectors with dimension { self .embedding_dimensions :d} , but your index is configured with dimension { actual_dimension :d} . Make sure embedding and indexing configurations match."
@@ -121,7 +163,7 @@ def check(self) -> Optional[str]:
121
163
if f"Failed to resolve 'controller.{ self .config .pinecone_environment } .pinecone.io'" in str (e .reason ):
122
164
return f"Failed to resolve environment, please check whether { self .config .pinecone_environment } is correct."
123
165
124
- if isinstance (e , pinecone . exceptions . UnauthorizedException ):
166
+ if isinstance (e , PineconeException ):
125
167
if e .body :
126
168
return e .body
127
169
0 commit comments