@@ -3,9 +3,10 @@
 #
 
 import logging
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 from typing import Any, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union
 
+import isodate
 import pendulum
 import requests
 from airbyte_cdk import AirbyteLogger
@@ -29,6 +30,7 @@
 
 from .api import PARENT_SALESFORCE_OBJECTS, UNSUPPORTED_BULK_API_SALESFORCE_OBJECTS, UNSUPPORTED_FILTERING_STREAMS, Salesforce
 from .streams import (
+    LOOKBACK_SECONDS,
     BulkIncrementalSalesforceStream,
     BulkSalesforceStream,
     BulkSalesforceSubStream,
@@ -172,9 +174,8 @@ def prepare_stream(cls, stream_name: str, json_schema, sobject_options, sf_objec
 
         return stream_class, stream_kwargs
 
-    @classmethod
     def generate_streams(
-        cls,
+        self,
         config: Mapping[str, Any],
         stream_objects: Mapping[str, Any],
         sf_object: Salesforce,
@@ -184,69 +185,86 @@ def generate_streams(
         schemas = sf_object.generate_schemas(stream_objects)
         default_args = [sf_object, authenticator, config]
         streams = []
+        state_manager = ConnectorStateManager(stream_instance_map={s.name: s for s in streams}, state=self.state)
         for stream_name, sobject_options in stream_objects.items():
             json_schema = schemas.get(stream_name, {})
 
-            stream_class, kwargs = cls.prepare_stream(stream_name, json_schema, sobject_options, *default_args)
+            stream_class, kwargs = self.prepare_stream(stream_name, json_schema, sobject_options, *default_args)
 
             parent_name = PARENT_SALESFORCE_OBJECTS.get(stream_name, {}).get("parent_name")
             if parent_name:
                 # get minimal schema required for getting proper class name full_refresh/incremental, rest/bulk
                 parent_schema = PARENT_SALESFORCE_OBJECTS.get(stream_name, {}).get("schema_minimal")
-                parent_class, parent_kwargs = cls.prepare_stream(parent_name, parent_schema, sobject_options, *default_args)
+                parent_class, parent_kwargs = self.prepare_stream(parent_name, parent_schema, sobject_options, *default_args)
                 kwargs["parent"] = parent_class(**parent_kwargs)
 
             stream = stream_class(**kwargs)
 
-            api_type = cls._get_api_type(stream_name, json_schema, config.get("force_use_bulk_api", False))
+            api_type = self._get_api_type(stream_name, json_schema, config.get("force_use_bulk_api", False))
             if api_type == "rest" and not stream.primary_key and stream.too_many_properties:
                 logger.warning(
                     f"Can not instantiate stream {stream_name}. It is not supported by the BULK API and can not be "
                     "implemented via REST because the number of its properties exceeds the limit and it lacks a primary key."
                 )
                 continue
-            streams.append(stream)
+
+            streams.append(self._wrap_for_concurrency(config, stream, state_manager))
+        streams.append(self._wrap_for_concurrency(config, Describe(sf_api=sf_object, catalog=self.catalog), state_manager))
         return streams
 
+    def _wrap_for_concurrency(self, config, stream, state_manager):
+        stream_slicer_cursor = None
+        if stream.cursor_field:
+            stream_slicer_cursor = self._create_stream_slicer_cursor(config, state_manager, stream)
+            if hasattr(stream, "set_cursor"):
+                stream.set_cursor(stream_slicer_cursor)
+            if hasattr(stream, "parent") and hasattr(stream.parent, "set_cursor"):
+                stream_slicer_cursor = self._create_stream_slicer_cursor(config, state_manager, stream)
+                stream.parent.set_cursor(stream_slicer_cursor)
+
+        if not stream_slicer_cursor or self._get_sync_mode_from_catalog(stream) == SyncMode.full_refresh:
+            cursor = FinalStateCursor(
+                stream_name=stream.name, stream_namespace=stream.namespace, message_repository=self.message_repository
+            )
+            state = None
+        else:
+            cursor = stream_slicer_cursor
+            state = cursor.state
+        return StreamFacade.create_from_stream(stream, self, logger, state, cursor)
+
     def streams(self, config: Mapping[str, Any]) -> List[Stream]:
         if not config.get("start_date"):
             config["start_date"] = (datetime.now() - relativedelta(years=self.START_DATE_OFFSET_IN_YEARS)).strftime(self.DATETIME_FORMAT)
         sf = self._get_sf_object(config)
         stream_objects = sf.get_validated_streams(config=config, catalog=self.catalog)
         streams = self.generate_streams(config, stream_objects, sf)
-        streams.append(Describe(sf_api=sf, catalog=self.catalog))
-        state_manager = ConnectorStateManager(stream_instance_map={s.name: s for s in streams}, state=self.state)
-
-        configured_streams = []
-
-        for stream in streams:
-            sync_mode = self._get_sync_mode_from_catalog(stream)
-            if sync_mode == SyncMode.full_refresh:
-                cursor = FinalStateCursor(
-                    stream_name=stream.name, stream_namespace=stream.namespace, message_repository=self.message_repository
-                )
-                state = None
-            else:
-                cursor_field_key = stream.cursor_field or ""
-                if not isinstance(cursor_field_key, str):
-                    raise AssertionError(f"A string cursor field key is required, but got {cursor_field_key}.")
-                cursor_field = CursorField(cursor_field_key)
-                legacy_state = state_manager.get_stream_state(stream.name, stream.namespace)
-                cursor = ConcurrentCursor(
-                    stream.name,
-                    stream.namespace,
-                    legacy_state,
-                    self.message_repository,
-                    state_manager,
-                    stream.state_converter,
-                    cursor_field,
-                    self._get_slice_boundary_fields(stream, state_manager),
-                    config["start_date"],
-                )
-                state = cursor.state
+        return streams
 
-            configured_streams.append(StreamFacade.create_from_stream(stream, self, logger, state, cursor))
-        return configured_streams
+    def _create_stream_slicer_cursor(
+        self, config: Mapping[str, Any], state_manager: ConnectorStateManager, stream: Stream
+    ) -> ConcurrentCursor:
+        """
+        We have moved the generation of stream slices to the concurrent CDK cursor
+        """
+        cursor_field_key = stream.cursor_field or ""
+        if not isinstance(cursor_field_key, str):
+            raise AssertionError(f"Nested cursor fields are not supported, hence type str is expected, but got {cursor_field_key}.")
+        cursor_field = CursorField(cursor_field_key)
+        stream_state = state_manager.get_stream_state(stream.name, stream.namespace)
+        return ConcurrentCursor(
+            stream.name,
+            stream.namespace,
+            stream_state,
+            self.message_repository,
+            state_manager,
+            stream.state_converter,
+            cursor_field,
+            self._get_slice_boundary_fields(stream, state_manager),
+            datetime.fromtimestamp(pendulum.parse(config["start_date"]).timestamp(), timezone.utc),
+            stream.state_converter.get_end_provider(),
+            timedelta(seconds=LOOKBACK_SECONDS),
+            isodate.parse_duration(config["stream_slice_step"]) if "stream_slice_step" in config else timedelta(days=30),
+        )
 
     def _get_slice_boundary_fields(self, stream: Stream, state_manager: ConnectorStateManager) -> Optional[Tuple[str, str]]:
         return ("start_date", "end_date")
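
Note (not part of the diff): a minimal sketch of how the two new constructor arguments for ConcurrentCursor are derived in _create_stream_slicer_cursor above, assuming a config dict with the "start_date" and optional "stream_slice_step" keys used in this connector; the concrete values below are hypothetical.

# Illustration only: timezone-aware start date plus ISO 8601 slice step with a 30-day fallback.
from datetime import datetime, timedelta, timezone

import isodate
import pendulum

config = {"start_date": "2021-01-01T00:00:00Z", "stream_slice_step": "P7D"}  # hypothetical example values

# Start date converted to a tz-aware datetime, as in _create_stream_slicer_cursor
start = datetime.fromtimestamp(pendulum.parse(config["start_date"]).timestamp(), timezone.utc)

# Slice step parsed from an ISO 8601 duration, defaulting to 30 days when the key is absent
step = isodate.parse_duration(config["stream_slice_step"]) if "stream_slice_step" in config else timedelta(days=30)

print(start)  # 2021-01-01 00:00:00+00:00
print(step)   # 7 days, 0:00:00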