8
8
from collections import Counter
9
9
from typing import Any , Iterator , List , Mapping , MutableMapping , Optional , Tuple , Type , Union
10
10
11
- from airbyte_cdk .models import AirbyteMessage , AirbyteStateMessage , ConfiguredAirbyteCatalog , ConnectorSpecification
12
- from airbyte_cdk .sources import AbstractSource
11
+ from airbyte_cdk .logger import AirbyteLogFormatter , init_logger
12
+ from airbyte_cdk .models import (
13
+ AirbyteMessage ,
14
+ AirbyteStateMessage ,
15
+ ConfiguredAirbyteCatalog ,
16
+ ConnectorSpecification ,
17
+ FailureType ,
18
+ Level ,
19
+ SyncMode ,
20
+ )
21
+ from airbyte_cdk .sources .concurrent_source .concurrent_source import ConcurrentSource
22
+ from airbyte_cdk .sources .concurrent_source .concurrent_source_adapter import ConcurrentSourceAdapter
13
23
from airbyte_cdk .sources .file_based .availability_strategy import AbstractFileBasedAvailabilityStrategy , DefaultFileBasedAvailabilityStrategy
14
24
from airbyte_cdk .sources .file_based .config .abstract_file_based_spec import AbstractFileBasedSpec
15
25
from airbyte_cdk .sources .file_based .config .file_based_stream_config import FileBasedStreamConfig , ValidationPolicy
20
30
from airbyte_cdk .sources .file_based .file_types .file_type_parser import FileTypeParser
21
31
from airbyte_cdk .sources .file_based .schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES , AbstractSchemaValidationPolicy
22
32
from airbyte_cdk .sources .file_based .stream import AbstractFileBasedStream , DefaultFileBasedStream
33
+ from airbyte_cdk .sources .file_based .stream .concurrent .adapters import FileBasedStreamFacade
34
+ from airbyte_cdk .sources .file_based .stream .concurrent .cursor import FileBasedNoopCursor
23
35
from airbyte_cdk .sources .file_based .stream .cursor import AbstractFileBasedCursor
24
36
from airbyte_cdk .sources .file_based .stream .cursor .default_file_based_cursor import DefaultFileBasedCursor
37
+ from airbyte_cdk .sources .message .repository import InMemoryMessageRepository , MessageRepository
38
+ from airbyte_cdk .sources .source import TState
25
39
from airbyte_cdk .sources .streams import Stream
26
40
from airbyte_cdk .utils .analytics_message import create_analytics_message
41
+ from airbyte_cdk .utils .traced_exception import AirbyteTracedException
27
42
from pydantic .error_wrappers import ValidationError
28
43
44
# Concurrency tuning constants for the file-based source.
# NOTE(review): DEFAULT_CONCURRENCY is not referenced anywhere in the visible
# part of this module - confirm where it is consumed.
+ DEFAULT_CONCURRENCY = 100
45
# Passed as the first positional argument to ConcurrentSource.create() in
# FileBasedSource.__init__.
+ MAX_CONCURRENCY = 100
46
# Half of MAX_CONCURRENCY (integer division); passed as the second positional
# argument to ConcurrentSource.create() in FileBasedSource.__init__.
+ INITIAL_N_PARTITIONS = MAX_CONCURRENCY // 2
47
+
48
+
49
+ class FileBasedSource (ConcurrentSourceAdapter , ABC ):
50
+ # We make each source override the concurrency level to give control over when they are upgraded.
51
+ _concurrency_level = None
29
52
30
- class FileBasedSource (AbstractSource , ABC ):
31
53
def __init__ (
32
54
self ,
33
55
stream_reader : AbstractFileBasedStreamReader ,
34
56
spec_class : Type [AbstractFileBasedSpec ],
35
- catalog_path : Optional [str ] = None ,
57
+ catalog : Optional [ConfiguredAirbyteCatalog ],
58
+ config : Optional [Mapping [str , Any ]],
59
+ state : Optional [TState ],
36
60
availability_strategy : Optional [AbstractFileBasedAvailabilityStrategy ] = None ,
37
61
discovery_policy : AbstractDiscoveryPolicy = DefaultDiscoveryPolicy (),
38
62
parsers : Mapping [Type [Any ], FileTypeParser ] = default_parsers ,
@@ -41,15 +65,29 @@ def __init__(
41
65
):
42
66
self .stream_reader = stream_reader
43
67
self .spec_class = spec_class
68
+ self .config = config
69
+ self .catalog = catalog
70
+ self .state = state
44
71
self .availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy (stream_reader )
45
72
self .discovery_policy = discovery_policy
46
73
self .parsers = parsers
47
74
self .validation_policies = validation_policies
48
- catalog = self .read_catalog (catalog_path ) if catalog_path else None
49
75
self .stream_schemas = {s .stream .name : s .stream .json_schema for s in catalog .streams } if catalog else {}
50
76
self .cursor_cls = cursor_cls
51
- self .logger = logging . getLogger (f"airbyte.{ self .name } " )
77
+ self .logger = init_logger (f"airbyte.{ self .name } " )
52
78
self .errors_collector : FileBasedErrorsCollector = FileBasedErrorsCollector ()
79
+ self ._message_repository : Optional [MessageRepository ] = None
80
+ concurrent_source = ConcurrentSource .create (
81
+ MAX_CONCURRENCY , INITIAL_N_PARTITIONS , self .logger , self ._slice_logger , self .message_repository
82
+ )
83
+ self ._state = None
84
+ super ().__init__ (concurrent_source )
85
+
86
@property
def message_repository(self) -> MessageRepository:
    """Return this source's message repository, building it on first access.

    If ``self._message_repository`` is already set it is returned as-is.
    Otherwise an ``InMemoryMessageRepository`` is created, stored on the
    instance and returned. Its ``Level`` is obtained by looking up
    ``self.logger.level`` in ``AirbyteLogFormatter.level_mapping``.
    """
    repository = self._message_repository
    if repository is not None:
        return repository

    # Translate the logger's level through the formatter's mapping before
    # handing it to the repository.
    mapped_level = AirbyteLogFormatter.level_mapping[self.logger.level]
    repository = InMemoryMessageRepository(Level(mapped_level))
    self._message_repository = repository
    return repository
53
91
54
92
def check_connection (self , logger : logging .Logger , config : Mapping [str , Any ]) -> Tuple [bool , Optional [Any ]]:
55
93
"""
@@ -61,7 +99,15 @@ def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) ->
61
99
62
100
Otherwise, the "error" object should describe what went wrong.
63
101
"""
64
- streams = self .streams (config )
102
+ try :
103
+ streams = self .streams (config )
104
+ except Exception as config_exception :
105
+ raise AirbyteTracedException (
106
+ internal_message = "Please check the logged errors for more information." ,
107
+ message = FileBasedSourceError .CONFIG_VALIDATION_ERROR .value ,
108
+ exception = AirbyteTracedException (exception = config_exception ),
109
+ failure_type = FailureType .config_error ,
110
+ )
65
111
if len (streams ) == 0 :
66
112
return (
67
113
False ,
@@ -80,7 +126,7 @@ def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) ->
80
126
reason ,
81
127
) = stream .availability_strategy .check_availability_and_parsability (stream , logger , self )
82
128
except Exception :
83
- errors .append (f"Unable to connect to stream { stream } - { '' .join (traceback .format_exc ())} " )
129
+ errors .append (f"Unable to connect to stream { stream . name } - { '' .join (traceback .format_exc ())} " )
84
130
else :
85
131
if not stream_is_available and reason :
86
132
errors .append (reason )
@@ -91,10 +137,26 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
91
137
"""
92
138
Return a list of this source's streams.
93
139
"""
140
+ file_based_streams = self ._get_file_based_streams (config )
141
+
142
+ configured_streams : List [Stream ] = []
143
+
144
+ for stream in file_based_streams :
145
+ sync_mode = self ._get_sync_mode_from_catalog (stream )
146
+ if sync_mode == SyncMode .full_refresh and hasattr (self , "_concurrency_level" ) and self ._concurrency_level is not None :
147
+ configured_streams .append (
148
+ FileBasedStreamFacade .create_from_stream (stream , self , self .logger , None , FileBasedNoopCursor (stream .config ))
149
+ )
150
+ else :
151
+ configured_streams .append (stream )
152
+
153
+ return configured_streams
154
+
155
+ def _get_file_based_streams (self , config : Mapping [str , Any ]) -> List [AbstractFileBasedStream ]:
94
156
try :
95
157
parsed_config = self ._get_parsed_config (config )
96
158
self .stream_reader .config = parsed_config
97
- streams : List [Stream ] = []
159
+ streams : List [AbstractFileBasedStream ] = []
98
160
for stream_config in parsed_config .streams :
99
161
self ._validate_input_schema (stream_config )
100
162
streams .append (
@@ -115,6 +177,14 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
115
177
except ValidationError as exc :
116
178
raise ConfigValidationError (FileBasedSourceError .CONFIG_VALIDATION_ERROR ) from exc
117
179
180
def _get_sync_mode_from_catalog(self, stream: Stream) -> Optional[SyncMode]:
    """Look up the sync mode configured for ``stream`` in ``self.catalog``.

    :param stream: the stream whose ``name`` is matched against the
        ``stream.name`` of each entry in ``self.catalog.streams``.
    :return: the ``sync_mode`` of the first catalog entry with a matching
        name, or ``None`` when ``self.catalog`` is falsy.
    :raises RuntimeError: when a catalog is present but no entry matches.
    """
    if not self.catalog:
        return None

    # A private sentinel distinguishes "no matching entry" from an entry
    # whose sync_mode value might itself be falsy.
    not_found = object()
    sync_mode = next(
        (entry.sync_mode for entry in self.catalog.streams if stream.name == entry.stream.name),
        not_found,
    )
    if sync_mode is not_found:
        raise RuntimeError(f"No sync mode was found for {stream.name}.")
    return sync_mode
187
+
118
188
def read (
119
189
self ,
120
190
logger : logging .Logger ,
0 commit comments