import random
import string
import tempfile
+ import time
from datetime import datetime
from pathlib import Path
- from typing import Any, Dict
+ from typing import Any, Callable, Dict, Generator, Iterable
from unittest.mock import MagicMock

import duckdb
)
from destination_duckdb import DestinationDuckdb
from destination_duckdb.destination import CONFIG_MOTHERDUCK_API_KEY
+ from faker import Faker

CONFIG_PATH = "integration_tests/config.json"
SECRETS_CONFIG_PATH = (
@@ -96,6 +98,12 @@ def test_table_name() -> str:
    return f"airbyte_integration_{rand_string}"


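+ # Randomized table name for the high-volume write test below.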
+ @pytest.fixture
+ def test_large_table_name() -> str:
+     letters = string.ascii_lowercase
+     rand_string = "".join(random.choice(letters) for _ in range(10))
+     return f"airbyte_integration_{rand_string}"
+

@pytest.fixture
def table_schema() -> str:
    schema = {"type": "object", "properties": {"column1": {"type": ["null", "string"]}}}
@@ -104,7 +112,7 @@ def table_schema() -> str:

@pytest.fixture
def configured_catalogue(
-     test_table_name: str, table_schema: str
+     test_table_name: str, test_large_table_name: str, table_schema: str,
) -> ConfiguredAirbyteCatalog:
    append_stream = ConfiguredAirbyteStream(
        stream=AirbyteStream(
@@ -115,7 +123,16 @@ def configured_catalogue(
        sync_mode=SyncMode.incremental,
        destination_sync_mode=DestinationSyncMode.append,
    )
-     return ConfiguredAirbyteCatalog(streams=[append_stream])
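+     # Second stream targeting the large-write test table, with the same schema and sync modes.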
+     append_stream_large = ConfiguredAirbyteStream(
+         stream=AirbyteStream(
+             name=test_large_table_name,
+             json_schema=table_schema,
+             supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
+         ),
+         sync_mode=SyncMode.incremental,
+         destination_sync_mode=DestinationSyncMode.append,
+     )
+     return ConfiguredAirbyteCatalog(streams=[append_stream, append_stream_large])


@pytest.fixture
@@ -206,3 +223,101 @@ def test_write(
    assert len(result) == 2
    assert result[0][2] == json.dumps(airbyte_message1.record.data)
    assert result[1][2] == json.dumps(airbyte_message2.record.data)
+
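+ # Yields n messages for the given stream: fake RECORD messages, with a STATE
+ # message emitted in place of every batch_size-th one.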
+ def _airbyte_messages(n: int, batch_size: int, table_name: str) -> Generator[AirbyteMessage, None, None]:
+     fake = Faker()
+     Faker.seed(0)
+
+     for i in range(n):
+         if i != 0 and i % batch_size == 0:
+             yield AirbyteMessage(
+                 type=Type.STATE, state=AirbyteStateMessage(data={"state": str(i // batch_size)})
+             )
+         else:
+             message = AirbyteMessage(
+                 type=Type.RECORD,
+                 record=AirbyteRecordMessage(
+                     stream=table_name,
+                     data={"key1": fake.first_name(), "key2": fake.ssn()},
+                     emitted_at=int(datetime.now().timestamp()) * 1000,
+                 ),
+             )
+             yield message
+
+
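+ # Like _airbyte_messages, but record payloads vary between strings and integers
+ # and include empty or nested objects, to exercise handling of inconsistent JSON schemas.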
+ def _airbyte_messages_with_inconsistent_json_fields(n: int, batch_size: int, table_name: str) -> Generator[AirbyteMessage, None, None]:
+     fake = Faker()
+     Faker.seed(0)
+     random.seed(0)
+
+     for i in range(n):
+         if i != 0 and i % batch_size == 0:
+             yield AirbyteMessage(
+                 type=Type.STATE, state=AirbyteStateMessage(data={"state": str(i // batch_size)})
+             )
+         else:
+             message = AirbyteMessage(
+                 type=Type.RECORD,
+                 record=AirbyteRecordMessage(
+                     stream=table_name,
+                     # Throw in empty nested objects and see how pyarrow deals with them.
+                     data={
+                         "key1": fake.first_name(),
+                         "key2": fake.ssn() if random.random() < 0.5 else random.randrange(1000, 9999999999999),
+                         "nested1": {} if random.random() < 0.1 else {
+                             "key3": fake.first_name(),
+                             "key4": fake.ssn() if random.random() < 0.5 else random.randrange(1000, 9999999999999),
+                             "dictionary1": {} if random.random() < 0.1 else {
+                                 "key3": fake.first_name(),
+                                 "key4": "True" if random.random() < 0.5 else True,
+                             },
+                         },
+                     }
+                     if random.random() < 0.9
+                     else {},
+                     emitted_at=int(datetime.now().timestamp()) * 1000,
+                 ),
+             )
+             yield message
+
+
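+ # Sizing for the large-write test: total messages generated, and how many records
+ # separate consecutive STATE messages.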
+ TOTAL_RECORDS = 5_000
+ BATCH_WRITE_SIZE = 1000
+
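+ # Streams TOTAL_RECORDS generated messages through DestinationDuckdb.write(), then
+ # verifies the yielded STATE messages and the row count in the destination's raw table.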
+ @pytest.mark.slow
+ @pytest.mark.parametrize(
+     "airbyte_message_generator,explanation",
+     [
+         (_airbyte_messages, "Test writing a large number of simple json objects."),
+         (_airbyte_messages_with_inconsistent_json_fields, "Test writing a large number of json messages with inconsistent schema."),
+     ],
+ )
+ def test_large_number_of_writes(
+     config: Dict[str, str],
+     request,
+     configured_catalogue: ConfiguredAirbyteCatalog,
+     test_large_table_name: str,
+     test_schema_name: str,
+     airbyte_message_generator: Callable[[int, int, str], Iterable[AirbyteMessage]],
+     explanation: str,
+ ):
+     destination = DestinationDuckdb()
+     generator = destination.write(
+         config,
+         configured_catalogue,
+         airbyte_message_generator(TOTAL_RECORDS, BATCH_WRITE_SIZE, test_large_table_name),
+     )
+
+     result = list(generator)
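+     # Only STATE messages are yielded back by write(); the generator emits one
+     # in place of every BATCH_WRITE_SIZE-th record.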
+     assert len(result) == TOTAL_RECORDS // (BATCH_WRITE_SIZE + 1)
+     motherduck_api_key = str(config.get(CONFIG_MOTHERDUCK_API_KEY, ""))
+     duckdb_config = {}
+     if motherduck_api_key:
+         duckdb_config["motherduck_token"] = motherduck_api_key
+         duckdb_config["custom_user_agent"] = "airbyte_intg_test"
+
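+     # Connect to the destination database directly and count the rows that landed in the raw table.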
+     con = duckdb.connect(
+         database=config.get("destination_path"), read_only=False, config=duckdb_config
+     )
+     with con:
+         cursor = con.execute(
+             "SELECT count(1) "
+             f"FROM {test_schema_name}._airbyte_raw_{test_large_table_name}"
+         )
+         result = cursor.fetchall()
+     assert result[0][0] == TOTAL_RECORDS - TOTAL_RECORDS // (BATCH_WRITE_SIZE + 1)