
Commit 690479d

Authored by girarda, maxi297, and brianjlai
✨Source S3 (v4): Set decimal_as_float to True for parquet files (#29342)
* [ISSUE #28893] infer csv schema
* [ISSUE #28893] align with pyarrow
* Automated Commit - Formatting Changes
* [ISSUE #28893] legacy inference and infer only when needed
* [ISSUE #28893] fix scenario tests
* [ISSUE #28893] using discovered schema as part of read
* [ISSUE #28893] self-review + cleanup
* [ISSUE #28893] fix test
* [ISSUE #28893] code review part #1
* [ISSUE #28893] code review part #2
* Fix test
* format cdk
* [ISSUE #28893] code review
* Fix test log level
* Re-adding failing tests
* [ISSUE #28893] improve inference to consider multiple types per value
* set decimal_as_float to True
* update
* Automated Commit - Formatting Changes
* add file adapters for avro, csv, jsonl, and parquet
* fix try catch
* update
* format
* pr feedback with a few additional default options set

---------

Co-authored-by: maxi297 <[email protected]>
Co-authored-by: maxi297 <[email protected]>
Co-authored-by: brianjlai <[email protected]>
1 parent cde2c1b commit 690479d

File tree: 2 files changed (+40, −43 lines)


airbyte-integrations/connectors/source-s3/source_s3/v4/legacy_config_transformer.py

Lines changed: 2 additions & 1 deletion
@@ -96,10 +96,11 @@ def _transform_file_format(cls, format_options: Union[CsvFormat, ParquetFormat,
             if "autogenerate_column_names" in advanced_options:
                 csv_options["autogenerate_column_names"] = advanced_options["autogenerate_column_names"]
             return csv_options
+
         elif isinstance(format_options, JsonlFormat):
             return {"filetype": "jsonl"}
         elif isinstance(format_options, ParquetFormat):
-            return {"filetype": "parquet"}
+            return {"filetype": "parquet", "decimal_as_float": True}
         else:
             # This should never happen because it would fail schema validation
             raise ValueError(f"Format filetype {format_options} is not a supported file type")

airbyte-integrations/connectors/source-s3/unit_tests/v4/test_legacy_config_transformer.py

Lines changed: 38 additions & 42 deletions
@@ -20,14 +20,13 @@
                     "aws_secret_access_key": "some_secret",
                     "endpoint": "https://external-s3.com",
                     "path_prefix": "a_folder/",
-                    "start_date": "2022-01-01T01:02:03Z"
-
+                    "start_date": "2022-01-01T01:02:03Z",
                 },
                 "format": {
                     "filetype": "avro",
                 },
                 "path_pattern": "**/*.avro",
-                "schema": '{"col1": "string", "col2": "integer"}'
+                "schema": '{"col1": "string", "col2": "integer"}',
             },
             {
                 "bucket": "test_bucket",
@@ -42,13 +41,11 @@
                         "globs": ["a_folder/**/*.avro"],
                         "validation_policy": "Emit Record",
                         "input_schema": '{"col1": "string", "col2": "integer"}',
-                        "format": {
-                            "filetype": "avro"
-                        }
+                        "format": {"filetype": "avro"},
                     }
-                ]
-            }
-            , id="test_convert_legacy_config"
+                ],
+            },
+            id="test_convert_legacy_config",
         ),
         pytest.param(
             {
@@ -70,15 +67,13 @@
                         "file_type": "avro",
                         "globs": ["**/*.avro"],
                         "validation_policy": "Emit Record",
-                        "format": {
-                            "filetype": "avro"
-                        }
+                        "format": {"filetype": "avro"},
                     }
-                ]
-            }
-            , id="test_convert_no_optional_fields"
+                ],
+            },
+            id="test_convert_no_optional_fields",
         ),
-    ]
+    ],
 )
 def test_convert_legacy_config(legacy_config, expected_config):
     parsed_legacy_config = SourceS3Spec(**legacy_config)
@@ -101,8 +96,8 @@ def test_convert_legacy_config(legacy_config, expected_config):
                 "encoding": "ansi",
                 "double_quote": False,
                 "newlines_in_values": True,
-                "additional_reader_options": "{\"strings_can_be_null\": true}",
-                "advanced_options": "{\"skip_rows\": 3, \"skip_rows_after_names\": 5, \"autogenerate_column_names\": true}",
+                "additional_reader_options": '{"strings_can_be_null": true}',
+                "advanced_options": '{"skip_rows": 3, "skip_rows_after_names": 5, "autogenerate_column_names": true}',
                 "blocksize": 20000,
             },
             {
@@ -122,7 +117,8 @@ def test_convert_legacy_config(legacy_config, expected_config):
                 "autogenerate_column_names": True,
             },
             None,
-            id="test_csv_all_legacy_options_set"),
+            id="test_csv_all_legacy_options_set",
+        ),
         pytest.param(
             "csv",
             {
@@ -145,14 +141,15 @@ def test_convert_legacy_config(legacy_config, expected_config):
                 "strings_can_be_null": False,
             },
             None,
-            id="test_csv_only_required_options"),
+            id="test_csv_only_required_options",
+        ),
         pytest.param(
             "csv",
             {},
             {
                 "filetype": "csv",
                 "delimiter": ",",
-                "quote_char": "\"",
+                "quote_char": '"',
                 "encoding": "utf8",
                 "double_quote": True,
                 "null_values": ["", "null", "NULL", "N/A", "NA", "NaN", "None"],
@@ -162,23 +159,26 @@ def test_convert_legacy_config(legacy_config, expected_config):
                 "strings_can_be_null": False,
             },
             None,
-            id="test_csv_empty_format"),
+            id="test_csv_empty_format",
+        ),
         pytest.param(
             "csv",
             {
-                "additional_reader_options": "{\"not_valid\": \"at all}",
+                "additional_reader_options": '{"not_valid": "at all}',
             },
             None,
             ValueError,
-            id="test_malformed_additional_reader_options"),
+            id="test_malformed_additional_reader_options",
+        ),
         pytest.param(
             "csv",
             {
-                "advanced_options": "{\"not_valid\": \"at all}",
+                "advanced_options": '{"not_valid": "at all}',
             },
             None,
             ValueError,
-            id="test_malformed_advanced_options"),
+            id="test_malformed_advanced_options",
+        ),
         pytest.param(
             "jsonl",
             {
@@ -187,11 +187,10 @@ def test_convert_legacy_config(legacy_config, expected_config):
                 "unexpected_field_behavior": "ignore",
                 "block_size": 0,
             },
-            {
-                "filetype": "jsonl"
-            },
+            {"filetype": "jsonl"},
             None,
-            id="test_jsonl_format"),
+            id="test_jsonl_format",
+        ),
         pytest.param(
             "parquet",
             {
@@ -200,22 +199,20 @@ def test_convert_legacy_config(legacy_config, expected_config):
                 "batch_size": 65536,
                 "buffer_size": 100,
             },
-            {
-                "filetype": "parquet"
-            },
+            {"filetype": "parquet", "decimal_as_float": True},
             None,
-            id="test_parquet_format"),
+            id="test_parquet_format",
+        ),
         pytest.param(
             "avro",
             {
                 "filetype": "avro",
             },
-            {
-                "filetype": "avro"
-            },
+            {"filetype": "avro"},
             None,
-            id="test_avro_format"),
-    ]
+            id="test_avro_format",
+        ),
+    ],
 )
 def test_convert_file_format(file_type, legacy_format_config, expected_format_config, expected_error):
     legacy_config = {
@@ -225,7 +222,6 @@ def test_convert_file_format(file_type, legacy_format_config, expected_format_co
             "bucket": "test_bucket",
             "aws_access_key_id": "some_access_key",
             "aws_secret_access_key": "some_secret",
-
         },
         "format": legacy_format_config,
         "path_pattern": f"**/*.{file_type}",
@@ -241,9 +237,9 @@ def test_convert_file_format(file_type, legacy_format_config, expected_format_co
                 "file_type": file_type,
                 "globs": [f"**/*.{file_type}"],
                 "validation_policy": "Emit Record",
-                "format": expected_format_config
+                "format": expected_format_config,
             }
-        ]
+        ],
     }
 
     parsed_legacy_config = SourceS3Spec(**legacy_config)
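For context, a hypothetical end-to-end sketch of the conversion these tests exercise. The LegacyConfigTransformer.convert entry point and the SourceS3Spec import path are assumptions inferred from this test module, not shown in the diff.

# Assumed imports; actual module paths may differ in the connector.
from source_s3.source_files_abstract.spec import SourceS3Spec
from source_s3.v4.legacy_config_transformer import LegacyConfigTransformer

legacy_config = {
    "dataset": "test_data",
    "provider": {
        "bucket": "test_bucket",
        "aws_access_key_id": "some_access_key",
        "aws_secret_access_key": "some_secret",
    },
    "format": {"filetype": "parquet", "batch_size": 65536},
    "path_pattern": "**/*.parquet",
}

parsed_legacy_config = SourceS3Spec(**legacy_config)
converted = LegacyConfigTransformer.convert(parsed_legacy_config)  # assumed entry point

# With this commit, parquet streams are expected to carry decimal_as_float=True.
assert converted["streams"][0]["format"] == {"filetype": "parquet", "decimal_as_float": True}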
