
Commit 9132674

lazebnyi, davydov-d, and octavia-squidington-iii authored
🎉Source Amazon S3: increase unit test coverage at least 90% (#11967)
* Increased unittest coverage
* #11676 test coverage 85%
* #11676 unit tests 90%
* #11676 two more unit tests
* #11676 bump version
* auto-bump connector version

Co-authored-by: Denys Davydov <[email protected]>
Co-authored-by: Octavia Squidington III <[email protected]>
1 parent 34cd942 commit 9132674

File tree

11 files changed, +497 -52 lines


airbyte-config/init/src/main/resources/seed/source_definitions.yaml (+1 -1)

@@ -779,7 +779,7 @@
 - name: S3
   sourceDefinitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2
   dockerRepository: airbyte/source-s3
-  dockerImageTag: 0.1.13
+  dockerImageTag: 0.1.14
   documentationUrl: https://docs.airbyte.io/integrations/sources/s3
   icon: s3.svg
   sourceType: file

airbyte-config/init/src/main/resources/seed/source_specs.yaml (+1 -1)

@@ -7158,7 +7158,7 @@
         path_in_connector_config:
           - "credentials"
           - "client_secret"
-- dockerImage: "airbyte/source-s3:0.1.13"
+- dockerImage: "airbyte/source-s3:0.1.14"
  spec:
    documentationUrl: "https://docs.airbyte.io/integrations/sources/s3"
    changelogUrl: "https://docs.airbyte.io/integrations/sources/s3"

airbyte-integrations/connectors/source-s3/Dockerfile (+1 -1)

@@ -17,5 +17,5 @@ COPY source_s3 ./source_s3
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

-LABEL io.airbyte.version=0.1.13
+LABEL io.airbyte.version=0.1.14
 LABEL io.airbyte.name=airbyte/source-s3

airbyte-integrations/connectors/source-s3/source_s3/s3file.py (+1 -1)

@@ -27,7 +27,7 @@ def _setup_boto_session(self) -> None:
         Currently grabbing last_modified across multiple files asynchronously and may implement more multi-threading in future.
         See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/resources.html (anchor link broken, scroll to bottom)
         """
-        if self.use_aws_account:
+        if self.use_aws_account(self._provider):
             self._boto_session = boto3session.Session(
                 aws_access_key_id=self._provider.get("aws_access_key_id"),
                 aws_secret_access_key=self._provider.get("aws_secret_access_key"),
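This one-line change fixes a real bug: the old condition tested the method object self.use_aws_account itself, and a method object is always truthy, so the credentialed branch ran even for anonymous access. A minimal sketch of the pitfall (the class here is illustrative, not the connector's):

class Example:
    @staticmethod
    def use_aws_account(provider):
        return bool(provider.get("aws_access_key_id"))

e = Example()
if e.use_aws_account:  # bug: a function object is always truthy
    print("always taken, even with no credentials")
if e.use_aws_account({"aws_access_key_id": None}):  # fix: actually call it
    print("never taken for empty credentials")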

airbyte-integrations/connectors/source-s3/source_s3/source_files_abstract/stream.py (+7 -6)

@@ -109,7 +109,7 @@ def fileformatparser_class(self) -> type:
         :return: reference to the relevant fileformatparser class e.g. CsvParser
         """
         filetype = self._format.get("filetype")
-        file_reader = self.fileformatparser_map.get(self._format.get("filetype"))
+        file_reader = self.fileformatparser_map.get(filetype)
         if not file_reader:
             raise RuntimeError(
                 f"Detected mismatched file format '{filetype}'. Available values: '{list(self.fileformatparser_map.keys())}''."

@@ -221,23 +221,24 @@ def _get_master_schema(self, min_datetime: datetime = None) -> Dict[str, Any]:
                 # this compares datatype of every column that the two schemas have in common
                 for col in column_superset:
                     if (col in master_schema.keys()) and (col in this_schema.keys()) and (master_schema[col] != this_schema[col]):
-                        # if this column exists in a provided schema or schema state, we'll WARN here rather than throw an error
-                        # this is to allow more leniency as we may be able to coerce this datatype mismatch on read according to provided schema state
-                        # if not, then the read will error anyway
+                        # If this column exists in a provided schema or schema state, we'll WARN here rather than throw an error
+                        # this is to allow more leniency as we may be able to coerce this datatype mismatch on read according to
+                        # provided schema state. If not, then the read will error anyway
                         if col in self._schema.keys():
                             LOGGER.warn(
                                 f"Detected mismatched datatype on column '{col}', in file '{storagefile.url}'. "
                                 + f"Should be '{master_schema[col]}', but found '{this_schema[col]}'. "
                                 + f"Airbyte will attempt to coerce this to {master_schema[col]} on read."
                             )
-                        # else we're inferring the schema (or at least this column) from scratch and therefore throw an error on mismatching datatypes
+                        # else we're inferring the schema (or at least this column) from scratch and therefore
+                        # throw an error on mismatching datatypes
                         else:
                             raise RuntimeError(
                                 f"Detected mismatched datatype on column '{col}', in file '{storagefile.url}'. "
                                 + f"Should be '{master_schema[col]}', but found '{this_schema[col]}'."
                             )

-                # missing columns in this_schema doesn't affect our master_schema so we don't check for it here
+                # missing columns in this_schema doesn't affect our master_schema, so we don't check for it here

                 # add to master_schema any columns from this_schema that aren't already present
                 for col, datatype in this_schema.items():
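The comment rewrap above doesn't change behavior, but the logic it documents is the heart of schema merging: a datatype conflict between the master schema and a newly scanned file is tolerated (logged as a warning) only when the user provided that column's type; an inferred column with conflicting types is a hard error. A self-contained sketch of that decision, under illustrative names:

from typing import Dict, Set

def merge_schema(master: Dict[str, str], this: Dict[str, str], provided_cols: Set[str]) -> Dict[str, str]:
    # compare datatypes of every column the two schemas have in common
    for col in set(master) & set(this):
        if master[col] != this[col]:
            if col in provided_cols:
                # user-provided type: warn, attempt coercion on read
                print(f"WARN: column '{col}' is {this[col]}, will coerce to {master[col]}")
            else:
                # inferred from scratch: conflicting types are unrecoverable
                raise RuntimeError(f"mismatched datatype on column '{col}'")
    # columns only in this_schema extend the master schema; missing ones are ignored
    for col, dtype in this.items():
        master.setdefault(col, dtype)
    return master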

airbyte-integrations/connectors/source-s3/unit_tests/conftest.py (+54 -1)

@@ -2,10 +2,21 @@
 # Copyright (c) 2021 Airbyte, Inc., all rights reserved.
 #

+import json
 import os
 import shutil
 import tempfile
-from typing import Any
+from pathlib import Path
+from typing import Any, List, Mapping
+
+import requests  # noqa
+from airbyte_cdk import AirbyteLogger
+from netifaces import AF_INET, ifaddresses, interfaces
+from pytest import fixture
+from requests.exceptions import ConnectionError  # noqa
+from source_s3 import SourceS3
+
+logger = AirbyteLogger()

 TMP_FOLDER = os.path.join(tempfile.gettempdir(), "test_generated")

@@ -22,3 +33,45 @@ def pytest_generate_tests(metafunc: Any) -> None:
 def pytest_sessionfinish(session: Any, exitstatus: Any) -> None:
     """whole test run finishes."""
     shutil.rmtree(TMP_FOLDER, ignore_errors=True)
+
+
+@fixture(name="config")
+def config_fixture(tmp_path):
+    config_file = tmp_path / "config.json"
+    with open(config_file, "w") as fp:
+        json.dump(
+            {
+                "dataset": "dummy",
+                "provider": {"bucket": "test-test", "endpoint": "test", "use_ssl": "test", "verify_ssl_cert": "test"},
+                "path_pattern": "",
+                "format": {"delimiter": "\\t"},
+            },
+            fp,
+        )
+    source = SourceS3()
+    config = source.read_config(config_file)
+    return config
+
+
+def get_local_ip() -> str:
+    all_interface_ips: List[str] = []
+    for iface_name in interfaces():
+        all_interface_ips += [i["addr"] for i in ifaddresses(iface_name).setdefault(AF_INET, [{"addr": None}]) if i["addr"]]
+    logger.info(f"detected interface IPs: {all_interface_ips}")
+    for ip in sorted(all_interface_ips):
+        if not ip.startswith("127."):
+            return ip
+
+    assert False, "no non-localhost interface found"
+
+
+@fixture(scope="session")
+def minio_credentials() -> Mapping[str, Any]:
+    config_template = Path(__file__).parent / "config_minio.template.json"
+    assert config_template.is_file(), f"config template not found: {config_template}"
+    config_file = Path(__file__).parent / "config_minio.json"
+    config_file.write_text(config_template.read_text().replace("<local_ip>", get_local_ip()))
+
+    with open(str(config_file)) as f:
+        credentials = json.load(f)
+    return credentials
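The new config fixture is consumed by naming it as a test parameter; pytest injects the dict returned above. Note that SourceS3.read_config also normalizes the escaped "\\t" delimiter into a real tab, which is what test_transform_backslash_t_to_tab in test_source.py verifies. A hypothetical test using the fixture might look like:

def test_config_fixture_delimiter(config):
    # read_config has already turned the escaped "\\t" into a real tab
    assert config["format"]["delimiter"] == "\t"
    assert config["provider"]["bucket"] == "test-test"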

airbyte-integrations/connectors/source-s3/unit_tests/test_parquet_parser.py (+9)

@@ -13,6 +13,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
+import pytest
 from source_s3.source_files_abstract.formats.parquet_parser import PARQUET_TYPES, ParquetParser

 from .abstract_test_parser import AbstractTestParser

@@ -239,3 +240,11 @@ def cases(cls) -> Mapping[str, Any]:
             "fails": [],
         }
         return cases
+
+    def test_parse_field_type(self):
+        with pytest.raises(TypeError):
+            assert ParquetParser.parse_field_type(needed_logical_type="", need_physical_type="")
+
+    def test_convert_field_data(self):
+        with pytest.raises(TypeError):
+            ParquetParser.convert_field_data(logical_type="", field_value="")
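A side note on the new tests: the assert wrapped around the parse_field_type call is a no-op, since pytest.raises fails the test by itself when no TypeError is thrown, and the assert never executes when one is. A slightly tighter equivalent would be:

    def test_parse_field_type(self):
        with pytest.raises(TypeError):
            ParquetParser.parse_field_type(needed_logical_type="", need_physical_type="")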

airbyte-integrations/connectors/source-s3/unit_tests/test_s3file.py (+14)

@@ -4,8 +4,10 @@


 from typing import Mapping
+from unittest.mock import MagicMock

 import pytest
+import smart_open
 from airbyte_cdk import AirbyteLogger
 from source_s3.s3file import S3File

@@ -30,3 +32,15 @@ class TestS3File:
     )
     def test_use_aws_account(self, provider: Mapping[str, str], return_true: bool) -> None:
         assert S3File.use_aws_account(provider) is return_true
+
+    @pytest.mark.parametrize(  # passing in full provider to emulate real usage (dummy values are unused by func)
+        "provider",
+        [
+            ({"storage": "S3", "bucket": "dummy", "aws_access_key_id": "id", "aws_secret_access_key": "key", "path_prefix": "dummy"}),
+            ({"storage": "S3", "bucket": "dummy", "aws_access_key_id": None, "aws_secret_access_key": None, "path_prefix": "dummy"}),
+        ],
+    )
+    def test_s3_file_contextmanager(self, provider):
+        smart_open.open = MagicMock()
+        with S3File(file_info=MagicMock(), provider=provider).open("rb") as s3_file:
+            assert s3_file
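A note on the mocking style here: assigning smart_open.open = MagicMock() swaps the function for the rest of the test session. A scoped alternative, sketched with unittest.mock.patch (same intent, not the committed code), restores the original automatically:

from unittest.mock import MagicMock, patch

def test_s3_file_contextmanager_scoped(provider):
    # patch undoes the replacement when the with-block exits
    with patch("smart_open.open", MagicMock()):
        with S3File(file_info=MagicMock(), provider=provider).open("rb") as s3_file:
            assert s3_file
    # smart_open.open is back to the real function here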

airbyte-integrations/connectors/source-s3/unit_tests/test_source.py (+51)

@@ -3,8 +3,15 @@
 #

 import json
+from unittest.mock import MagicMock, patch

+import pytest
+from airbyte_cdk.logger import AirbyteLogger
+from airbyte_cdk.models import ConnectorSpecification
 from source_s3 import SourceS3
+from source_s3.source_files_abstract.spec import SourceFilesAbstractSpec
+
+logger = AirbyteLogger()


 def test_transform_backslash_t_to_tab(tmp_path):

@@ -14,3 +21,47 @@ def test_transform_backslash_t_to_tab(tmp_path):
     source = SourceS3()
     config = source.read_config(config_file)
     assert config["format"]["delimiter"] == "\t"
+
+
+def test_check_connection_empty_config():
+    config = {}
+    ok, error_msg = SourceS3().check_connection(logger, config=config)
+
+    assert not ok
+    assert error_msg
+
+
+def test_check_connection_exception(config):
+    ok, error_msg = SourceS3().check_connection(logger, config=config)
+
+    assert not ok
+    assert error_msg
+
+
+def test_check_connection(config):
+    instance = SourceS3()
+    with patch.object(instance.stream_class, "filepath_iterator", MagicMock()):
+        ok, error_msg = instance.check_connection(logger, config=config)
+
+    assert ok
+    assert not error_msg
+
+
+def test_streams(config):
+    instance = SourceS3()
+    assert len(instance.streams(config)) == 1
+
+
+def test_spec():
+    spec = SourceS3().spec()
+
+    assert isinstance(spec, ConnectorSpecification)
+
+
+def test_check_provider_added():
+    with pytest.raises(Exception):
+        SourceFilesAbstractSpec.check_provider_added({"properties": []})
+
+
+def test_change_format_to_oneOf():
+    assert SourceFilesAbstractSpec.change_format_to_oneOf({"properties": {"format": {"oneOf": ""}}})
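The happy-path test above works because patch.object stubs out the one method that would touch S3: with stream_class.filepath_iterator mocked, check_connection can iterate zero files and succeed offline. The pattern generalizes to any connector method that does I/O; a self-contained sketch under illustrative names:

from unittest.mock import MagicMock, patch

class ExampleStream:
    def filepath_iterator(self):
        raise ConnectionError("would hit S3")  # stands in for real network I/O

def check(stream):
    try:
        next(iter(stream.filepath_iterator()), None)  # touch the iterator once
        return True, None
    except Exception as e:
        return False, str(e)

# patch.object replaces the method on the class only inside the with-block
with patch.object(ExampleStream, "filepath_iterator", MagicMock(return_value=iter([]))):
    ok, err = check(ExampleStream())
    assert ok and err is None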
