Skip to content

Commit fc8cd5a

Browse files
strosek and girarda authored
fix(python-cdk): add user friendly message for encoding errors (#44438)
Co-authored-by: Alexandre Girard <[email protected]>
1 parent dc6a1cc commit fc8cd5a

File tree

9 files changed

+67
-16
lines changed

9 files changed

+67
-16
lines changed

airbyte-cdk/python/airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import traceback
77
from typing import TYPE_CHECKING, Optional, Tuple
88

9+
from airbyte_cdk import AirbyteTracedException
910
from airbyte_cdk.sources import Source
1011
from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
1112
from airbyte_cdk.sources.file_based.exceptions import CheckAvailabilityError, CustomFileBasedException, FileBasedSourceError
@@ -66,6 +67,8 @@ def check_availability_and_parsability(
6667
# If the parser is set to not check parsability, we still want to check that we can open the file.
6768
handle = stream.stream_reader.open_file(file, parser.file_read_mode, None, logger)
6869
handle.close()
70+
except AirbyteTracedException as ate:
71+
raise ate
6972
except CheckAvailabilityError:
7073
return False, "".join(traceback.format_exc())
7174

@@ -98,6 +101,8 @@ def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFil
98101
# consider the connection check successful even though it means
99102
# we skip the schema validation check.
100103
return
104+
except AirbyteTracedException as ate:
105+
raise ate
101106
except Exception as exc:
102107
raise CheckAvailabilityError(FileBasedSourceError.ERROR_READING_FILE, stream=stream.name, file=file.uri) from exc
103108

airbyte-cdk/python/airbyte_cdk/sources/file_based/exceptions.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class FileBasedSourceError(Enum):
1414
GLOB_PARSE_ERROR = (
1515
"Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
1616
)
17+
ENCODING_ERROR = "File encoding error. The configured encoding must match file encoding."
1718
ERROR_CASTING_VALUE = "Could not cast the value to the expected type."
1819
ERROR_CASTING_VALUE_UNRECOGNIZED_TYPE = "Could not cast the value to the expected type because the type is not recognized. Valid types are null, array, boolean, integer, number, object, and string."
1920
ERROR_DECODING_VALUE = "Expected a JSON-decodeable value but could not decode record."

airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_source.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) ->
122122
)
123123

124124
errors = []
125+
tracebacks = []
125126
for stream in streams:
126127
if not isinstance(stream, AbstractFileBasedStream):
127128
raise ValueError(f"Stream {stream} is not a file-based stream.")
@@ -130,12 +131,34 @@ def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) ->
130131
stream_is_available,
131132
reason,
132133
) = stream.availability_strategy.check_availability_and_parsability(stream, logger, self)
134+
except AirbyteTracedException as ate:
135+
errors.append(f"Unable to connect to stream {stream.name} - {ate.message}")
136+
tracebacks.append(traceback.format_exc())
133137
except Exception:
134-
errors.append(f"Unable to connect to stream {stream.name} - {''.join(traceback.format_exc())}")
138+
errors.append(f"Unable to connect to stream {stream.name}")
139+
tracebacks.append(traceback.format_exc())
135140
else:
136141
if not stream_is_available and reason:
137142
errors.append(reason)
138143

144+
if len(errors) == 1 and len(tracebacks) == 1:
145+
raise AirbyteTracedException(
146+
internal_message=tracebacks[0],
147+
message=f"{errors[0]}",
148+
failure_type=FailureType.config_error,
149+
)
150+
if len(errors) == 1 and len(tracebacks) == 0:
151+
raise AirbyteTracedException(
152+
message=f"{errors[0]}",
153+
failure_type=FailureType.config_error,
154+
)
155+
elif len(errors) > 1:
156+
raise AirbyteTracedException(
157+
internal_message="\n".join(tracebacks),
158+
message=f"{len(errors)} streams with errors: {', '.join(error for error in errors)}",
159+
failure_type=FailureType.config_error,
160+
)
161+
139162
return not bool(errors), (errors or None)
140163

141164
def streams(self, config: Mapping[str, Any]) -> List[Stream]:

airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/csv_parser.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,12 @@ def read_data(
5252
quoting=csv.QUOTE_MINIMAL,
5353
)
5454
with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
55-
headers = self._get_headers(fp, config_format, dialect_name)
55+
try:
56+
headers = self._get_headers(fp, config_format, dialect_name)
57+
except UnicodeError:
58+
raise AirbyteTracedException(
59+
message=f"{FileBasedSourceError.ENCODING_ERROR.value} Expected encoding: {config_format.encoding}",
60+
)
5661

5762
rows_to_skip = (
5863
config_format.skip_rows_before_header
@@ -274,7 +279,7 @@ def _pre_propcess_property_types(property_types: Dict[str, Any]) -> Mapping[str,
274279

275280
@staticmethod
276281
def _cast_types(
277-
row: Dict[str, str], deduped_property_types: Dict[str, str], config_format: CsvFormat, logger: logging.Logger
282+
row: Dict[str, str], deduped_property_types: Mapping[str, str], config_format: CsvFormat, logger: logging.Logger
278283
) -> Dict[str, Any]:
279284
"""
280285
Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.

airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def state(self, value: MutableMapping[str, Any]) -> None:
5454
"""State setter, accept state serialized by state getter."""
5555
self._cursor.set_initial_state(value)
5656

57-
@property
57+
@property # type: ignore # mypy complains wrong type, but AbstractFileBasedCursor is parent of file-based cursors
5858
def cursor(self) -> Optional[AbstractFileBasedCursor]:
5959
return self._cursor
6060

@@ -172,13 +172,14 @@ def get_json_schema(self) -> JsonSchema:
172172
try:
173173
schema = self._get_raw_json_schema()
174174
except InvalidSchemaError as config_exception:
175-
self.logger.exception(FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value, exc_info=config_exception)
176175
raise AirbyteTracedException(
177176
internal_message="Please check the logged errors for more information.",
178177
message=FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value,
179178
exception=AirbyteTracedException(exception=config_exception),
180179
failure_type=FailureType.config_error,
181180
)
181+
except AirbyteTracedException as ate:
182+
raise ate
182183
except Exception as exc:
183184
raise SchemaInferenceError(FileBasedSourceError.SCHEMA_INFERENCE_ERROR, stream=self.name) from exc
184185
else:
@@ -279,6 +280,8 @@ async def _infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
279280
for task in done:
280281
try:
281282
base_schema = merge_schemas(base_schema, task.result())
283+
except AirbyteTracedException as ate:
284+
raise ate
282285
except Exception as exc:
283286
self.logger.error(f"An error occurred inferring the schema. \n {traceback.format_exc()}", exc_info=exc)
284287

@@ -287,6 +290,8 @@ async def _infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
287290
async def _infer_file_schema(self, file: RemoteFile) -> SchemaType:
288291
try:
289292
return await self.get_parser().infer_schema(self.config, file, self.stream_reader, self.logger)
293+
except AirbyteTracedException as ate:
294+
raise ate
290295
except Exception as exc:
291296
raise SchemaInferenceError(
292297
FileBasedSourceError.SCHEMA_INFERENCE_ERROR,

airbyte-cdk/python/unit_tests/sources/file_based/file_types/test_csv_parser.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,17 @@ def test_parse_field_size_larger_than_default_python_maximum(self) -> None:
520520
data_generator = self._read_data()
521521
assert list(data_generator) == [{"header1": "1", "header2": long_string}]
522522

523+
def test_read_data_with_encoding_error(self) -> None:
524+
self._stream_reader.open_file.return_value = CsvFileBuilder().with_data(["something"]).build()
525+
self._csv_reader._get_headers = Mock(side_effect=UnicodeDecodeError("encoding", b"", 0, 1, "reason"))
526+
527+
with pytest.raises(AirbyteTracedException) as ate:
528+
data_generator = self._read_data()
529+
assert len(list(data_generator)) == 0
530+
531+
assert "encoding" in ate.value.message
532+
assert self._csv_reader._get_headers.called
533+
523534
def _read_data(self) -> Generator[Dict[str, str], None, None]:
524535
data_generator = self._csv_reader.read_data(
525536
self._config,

airbyte-cdk/python/unit_tests/sources/file_based/scenarios/check_scenarios.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#
22
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
33
#
4-
4+
from airbyte_cdk import AirbyteTracedException
55
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError
66
from unit_tests.sources.file_based.helpers import (
77
FailingSchemaValidationPolicy,
@@ -130,7 +130,7 @@
130130
_base_failure_scenario.copy()
131131
.set_name("error_empty_stream_scenario")
132132
.set_source_builder(_base_failure_scenario.copy().source_builder.copy().set_files({}))
133-
.set_expected_check_error(None, FileBasedSourceError.EMPTY_STREAM.value)
133+
.set_expected_check_error(AirbyteTracedException, FileBasedSourceError.EMPTY_STREAM.value)
134134
).build()
135135

136136

@@ -142,7 +142,7 @@
142142
TestErrorListMatchingFilesInMemoryFilesStreamReader(files=_base_failure_scenario.source_builder._files, file_type="csv")
143143
)
144144
)
145-
.set_expected_check_error(None, FileBasedSourceError.ERROR_LISTING_FILES.value)
145+
.set_expected_check_error(AirbyteTracedException, FileBasedSourceError.ERROR_LISTING_FILES.value)
146146
).build()
147147

148148

@@ -154,7 +154,7 @@
154154
TestErrorOpenFileInMemoryFilesStreamReader(files=_base_failure_scenario.source_builder._files, file_type="csv")
155155
)
156156
)
157-
.set_expected_check_error(None, FileBasedSourceError.ERROR_READING_FILE.value)
157+
.set_expected_check_error(AirbyteTracedException, FileBasedSourceError.ERROR_READING_FILE.value)
158158
).build()
159159

160160

@@ -216,5 +216,5 @@
216216
],
217217
}
218218
)
219-
.set_expected_check_error(None, FileBasedSourceError.ERROR_READING_FILE.value)
219+
.set_expected_check_error(AirbyteTracedException, FileBasedSourceError.ERROR_READING_FILE.value)
220220
).build()

airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3240,6 +3240,7 @@
32403240
}
32413241
)
32423242
.set_expected_records(None)
3243+
.set_expected_check_error(AirbyteTracedException, None)
32433244
).build()
32443245

32453246
csv_no_records_scenario: TestScenario[InMemoryFilesSource] = (
@@ -3343,4 +3344,5 @@
33433344
}
33443345
)
33453346
.set_expected_records(None)
3347+
.set_expected_check_error(AirbyteTracedException, None)
33463348
).build()

airbyte-cdk/python/unit_tests/sources/file_based/test_scenarios.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -182,12 +182,11 @@ def verify_check(capsys: CaptureFixture[str], tmp_path: PosixPath, scenario: Tes
182182
expected_exc, expected_msg = scenario.expected_check_error
183183

184184
if expected_exc:
185-
with pytest.raises(expected_exc):
186-
output = check(capsys, tmp_path, scenario)
187-
if expected_msg:
188-
# expected_msg is a string. what's the expected value field?
189-
assert expected_msg in output["message"] # type: ignore
190-
assert output["status"] == scenario.expected_check_status
185+
with pytest.raises(expected_exc) as exc:
186+
check(capsys, tmp_path, scenario)
187+
188+
if expected_msg:
189+
assert expected_msg in exc.value.message
191190

192191
else:
193192
output = check(capsys, tmp_path, scenario)

0 commit comments

Comments
 (0)