Skip to content

Commit 18c9ebc

Browse files
authored
[airbyte-cdk] Increase the maximum parseable field size for CSV files (#36320)
1 parent 7f70ac4 commit 18c9ebc

File tree

2 files changed

+24
-1
lines changed

2 files changed

+24
-1
lines changed

airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,11 @@ def _skip_rows(fp: IOBase, rows_to_skip: int) -> None:
131131
class CsvParser(FileTypeParser):
132132
_MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
133133

134-
def __init__(self, csv_reader: Optional[_CsvReader] = None):
134+
def __init__(self, csv_reader: Optional[_CsvReader] = None, csv_field_max_bytes: int = 2**31):
    """
    Create the parser and raise the csv module's maximum field size.

    :param csv_reader: reader to store on the parser; a new _CsvReader is created when none is given
    :param csv_field_max_bytes: value passed to csv.field_size_limit. Note that this limit is global to the csv module, so it
        affects every csv reader in the process, not only this parser.
    """
    # Increase the maximum length of data that can be parsed in a single CSV field. The default is 128k, which is typically sufficient
    # but given the use of Airbyte in loading a large variety of data it is best to allow for a larger maximum field size to avoid
    # skipping data on load. https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
    try:
        csv.field_size_limit(csv_field_max_bytes)
    except OverflowError:
        # The csv module stores the limit in a C long. Where a C long is 32 bits wide (e.g. Windows) the default of 2**31 does not
        # fit and field_size_limit raises OverflowError, which would make the parser impossible to construct. Fall back to the
        # largest value a 32-bit signed long can hold, which is valid on every platform.
        csv.field_size_limit(2**31 - 1)
    self._csv_reader = csv_reader if csv_reader else _CsvReader()
136140

137141
def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:

airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py

+19
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,25 @@ def test_given_too_few_values_for_columns_when_read_data_then_raise_exception_an
501501
next(data_generator)
502502
assert new_dialect not in csv.list_dialects()
503503

504+
def test_parse_field_size_larger_than_default_python_maximum(self) -> None:
    """A quoted field larger than the csv module's default 128k limit is read back intact."""
    # The field size for the csv module will be set as a side-effect of initializing the CsvParser class,
    # so the raised limit is expected to be in place before any data is read here.
    assert csv.field_size_limit() == 2**31

    # Build a value that is comfortably above the 128k default limit.
    oversized_value = "a" * (130 * 1024)
    assert len(oversized_value.encode("utf-8")) > (128 * 1024)

    csv_lines = [
        "header1,header2",
        f'1,"{oversized_value}"',
    ]
    self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(csv_lines).build()

    expected_records = [{"header1": "1", "header2": oversized_value}]
    assert list(self._read_data()) == expected_records
522+
504523
def _read_data(self) -> Generator[Dict[str, str], None, None]:
505524
data_generator = self._csv_reader.read_data(
506525
self._config,

0 commit comments

Comments
 (0)