Skip to content

Commit 3d49955

Browse files
sivankumar86alafanecherePhlairoctavia-squidington-iii
authored
source-S3: Support JSON format (#14213)
* json format support added * json format support added * code formatted * format convertion changed * format naming convertion changed * test cased issue fixed * test case issued resolved * sample file and config added for integration tests * Json doc added Json doc added * update * sample file and config added for integration tests * sample file and config added for integration tests * update jsonl files * review 1 * review 1 * review 1 * pyarrow version upgrade * clean integration test folder architecture * add timestamp record to simple_test.jsonl * fixed integration test and parser review change * simplify table read * doc update * fix specs * user sample files * fix sample files * add newlines at end of files * rename json parser * rename jsonfile to jsonlfile * schema inference added * patch review fix * Update docs/integrations/sources/s3.md doc update Co-authored-by: George Claireaux <[email protected]> * changing the version * changing the title to sync with other type * fix expected csv records * fix expected records for avro and parquet * review fix * fixed master schema handling * remove sample configs * fix expected records * json doc update added more details on json parser * fixed api name * bump version * auto-bump connector version [ci skip] Co-authored-by: alafanechere <[email protected]> Co-authored-by: George Claireaux <[email protected]> Co-authored-by: George Claireaux <[email protected]> Co-authored-by: Octavia Squidington III <[email protected]>
1 parent c5a98f3 commit 3d49955

38 files changed

+556
-29
lines changed

airbyte-config/init/src/main/resources/seed/source_definitions.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -833,7 +833,7 @@
833833
- name: S3
834834
sourceDefinitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2
835835
dockerRepository: airbyte/source-s3
836-
dockerImageTag: 0.1.17
836+
dockerImageTag: 0.1.18
837837
documentationUrl: https://docs.airbyte.io/integrations/sources/s3
838838
icon: s3.svg
839839
sourceType: file

airbyte-config/init/src/main/resources/seed/source_specs.yaml

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7892,7 +7892,7 @@
78927892
supportsNormalization: false
78937893
supportsDBT: false
78947894
supported_destination_sync_modes: []
7895-
- dockerImage: "airbyte/source-s3:0.1.17"
7895+
- dockerImage: "airbyte/source-s3:0.1.18"
78967896
spec:
78977897
documentationUrl: "https://docs.airbyte.io/integrations/sources/s3"
78987898
changelogUrl: "https://docs.airbyte.io/integrations/sources/s3"
@@ -8071,6 +8071,51 @@
80718071
title: "Filetype"
80728072
const: "avro"
80738073
type: "string"
8074+
- title: "Jsonl"
8075+
description: "This connector uses <a href=\"https://arrow.apache.org/docs/python/json.html\"\
8076+
\ target=\"_blank\">PyArrow</a> for JSON Lines (jsonl) file parsing."
8077+
type: "object"
8078+
properties:
8079+
filetype:
8080+
title: "Filetype"
8081+
const: "jsonl"
8082+
type: "string"
8083+
newlines_in_values:
8084+
title: "Allow newlines in values"
8085+
description: "Whether newline characters are allowed in JSON values.\
8086+
\ Turning this on may affect performance. Leave blank to default\
8087+
\ to False."
8088+
default: false
8089+
order: 0
8090+
type: "boolean"
8091+
unexpected_field_behavior:
8092+
title: "Unexpected field behavior"
8093+
description: "How JSON fields outside of explicit_schema (if given)\
8094+
\ are treated. Check <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.json.ParseOptions.html\"\
8095+
\ target=\"_blank\">PyArrow documentation</a> for details"
8096+
default: "infer"
8097+
examples:
8098+
- "ignore"
8099+
- "infer"
8100+
- "error"
8101+
order: 1
8102+
allOf:
8103+
- title: "UnexpectedFieldBehaviorEnum"
8104+
description: "An enumeration."
8105+
enum:
8106+
- "ignore"
8107+
- "infer"
8108+
- "error"
8109+
type: "string"
8110+
block_size:
8111+
title: "Block Size"
8112+
description: "The chunk size in bytes to process at a time in memory\
8113+
\ from each file. If your data is particularly wide and failing\
8114+
\ during schema detection, increasing this should solve it. Beware\
8115+
\ of raising this too high as you could hit OOM errors."
8116+
default: 10000
8117+
order: 2
8118+
type: "integer"
80748119
schema:
80758120
title: "Manually enforced data schema (Optional)"
80768121
description: "Optionally provide a schema to enforce, as a valid JSON string.\

airbyte-integrations/connectors/source-s3/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ COPY source_s3 ./source_s3
1717
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
1818
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
1919

20-
LABEL io.airbyte.version=0.1.17
20+
LABEL io.airbyte.version=0.1.18
2121
LABEL io.airbyte.name=airbyte/source-s3

airbyte-integrations/connectors/source-s3/acceptance-test-config.yml

Lines changed: 57 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@ tests:
1313
status: "succeed"
1414
# for Avro format
1515
- config_path: "secrets/avro_config.json"
16+
status:
17+
"succeed"
18+
# for JSONL format
19+
- config_path: "secrets/jsonl_config.json"
20+
status: "succeed"
21+
- config_path: "secrets/jsonl_newlines_config.json"
1622
status: "succeed"
1723
# for custom server
1824
- config_path: "integration_tests/config_minio.json"
@@ -24,65 +30,92 @@ tests:
2430
- config_path: "secrets/config.json"
2531
# for Parquet format
2632
- config_path: "secrets/parquet_config.json"
27-
# # for Avro format
33+
# for Avro format
2834
- config_path: "secrets/avro_config.json"
35+
# for JSONL format
36+
- config_path: "secrets/jsonl_config.json"
37+
- config_path: "secrets/jsonl_newlines_config.json"
2938
# for custom server
3039
- config_path: "integration_tests/config_minio.json"
3140
basic_read:
3241
# for CSV format
3342
- config_path: "secrets/config.json"
3443
timeout_seconds: 1800
35-
configured_catalog_path: "integration_tests/configured_catalog.json"
44+
configured_catalog_path: "integration_tests/configured_catalogs/csv.json"
3645
expect_records:
37-
path: "integration_tests/expected_records.txt"
46+
path: "integration_tests/expected_records/csv.txt"
3847
# for Parquet format
3948
- config_path: "secrets/parquet_config.json"
4049
timeout_seconds: 1800
41-
configured_catalog_path: "integration_tests/parquet_configured_catalog.json"
50+
configured_catalog_path: "integration_tests/configured_catalogs/parquet.json"
4251
expect_records:
43-
path: "integration_tests/parquet_expected_records.txt"
52+
path: "integration_tests/expected_records/parquet.txt"
4453
# for Avro format
4554
- config_path: "secrets/avro_config.json"
4655
timeout_seconds: 1800
47-
configured_catalog_path: "integration_tests/configured_catalog.json"
56+
configured_catalog_path: "integration_tests/configured_catalogs/avro.json"
57+
expect_records:
58+
path: "integration_tests/expected_records/avro.txt"
59+
# for JSONL format
60+
- config_path: "secrets/jsonl_config.json"
61+
timeout_seconds: 1800
62+
configured_catalog_path: "integration_tests/configured_catalogs/jsonl.json"
4863
expect_records:
49-
path: "integration_tests/expected_records_avro.txt"
64+
path: "integration_tests/expected_records/jsonl.txt"
65+
- config_path: "secrets/jsonl_newlines_config.json"
66+
timeout_seconds: 1800
67+
configured_catalog_path: "integration_tests/configured_catalogs/jsonl.json"
68+
expect_records:
69+
path: "integration_tests/expected_records/jsonl_newlines.txt"
5070
# for custom server
5171
- config_path: "integration_tests/config_minio.json"
5272
timeout_seconds: 1800
53-
configured_catalog_path: "integration_tests/configured_catalog.json"
73+
configured_catalog_path: "integration_tests/configured_catalogs/csv.json"
5474
# expected records contain the _ab_source_file_last_modified property, which
5575
# changes every time the S3 file is modified; for a custom server it is
5676
# the file creation date, so it is always new. Uncomment these lines when SAT
5777
# is able to ignore specific fields from expected records.
5878
# expect_records:
59-
# path: "integration_tests/expected_records_custom_server.txt.txt"
79+
# path: "integration_tests/expected_records/custom_server.txt"
6080
incremental:
6181
# for CSV format
6282
- config_path: "secrets/config.json"
6383
timeout_seconds: 1800
64-
configured_catalog_path: "integration_tests/configured_catalog.json"
84+
configured_catalog_path: "integration_tests/configured_catalogs/csv.json"
6585
cursor_paths:
6686
test: ["_ab_source_file_last_modified"]
6787
future_state_path: "integration_tests/abnormal_state.json"
6888
# for Parquet format
6989
- config_path: "secrets/parquet_config.json"
7090
timeout_seconds: 1800
71-
configured_catalog_path: "integration_tests/parquet_configured_catalog.json"
91+
configured_catalog_path: "integration_tests/configured_catalogs/parquet.json"
7292
cursor_paths:
7393
test: ["_ab_source_file_last_modified"]
7494
future_state_path: "integration_tests/abnormal_state.json"
7595
# for Avro format
7696
- config_path: "secrets/avro_config.json"
7797
timeout_seconds: 1800
78-
configured_catalog_path: "integration_tests/configured_catalog.json"
98+
configured_catalog_path: "integration_tests/configured_catalogs/avro.json"
99+
cursor_paths:
100+
test: ["_ab_source_file_last_modified"]
101+
future_state_path: "integration_tests/abnormal_state.json"
102+
# for JSONL format
103+
- config_path: "secrets/jsonl_config.json"
104+
timeout_seconds: 1800
105+
configured_catalog_path: "integration_tests/configured_catalogs/jsonl.json"
106+
cursor_paths:
107+
test: ["_ab_source_file_last_modified"]
108+
future_state_path: "integration_tests/abnormal_state.json"
109+
- config_path: "secrets/jsonl_newlines_config.json"
110+
timeout_seconds: 1800
111+
configured_catalog_path: "integration_tests/configured_catalogs/jsonl.json"
79112
cursor_paths:
80113
test: ["_ab_source_file_last_modified"]
81114
future_state_path: "integration_tests/abnormal_state.json"
82115
# for custom server
83116
- config_path: "integration_tests/config_minio.json"
84117
timeout_seconds: 1800
85-
configured_catalog_path: "integration_tests/configured_catalog.json"
118+
configured_catalog_path: "integration_tests/configured_catalogs/csv.json"
86119
cursor_paths:
87120
test: ["_ab_source_file_last_modified"]
88121
future_state_path: "integration_tests/abnormal_state.json"
@@ -91,16 +124,23 @@ tests:
91124
# for CSV format
92125
- config_path: "secrets/config.json"
93126
timeout_seconds: 1800
94-
configured_catalog_path: "integration_tests/configured_catalog.json"
127+
configured_catalog_path: "integration_tests/configured_catalogs/csv.json"
95128
# for Parquet format
96129
- config_path: "secrets/parquet_config.json"
97130
timeout_seconds: 1800
98-
configured_catalog_path: "integration_tests/parquet_configured_catalog.json"
131+
configured_catalog_path: "integration_tests/configured_catalogs/parquet.json"
99132
# for Avro format
100133
- config_path: "secrets/avro_config.json"
101134
timeout_seconds: 1800
102-
configured_catalog_path: "integration_tests/configured_catalog.json"
135+
configured_catalog_path: "integration_tests/configured_catalogs/avro.json"
136+
# for JSONL format
137+
- config_path: "secrets/jsonl_config.json"
138+
timeout_seconds: 1800
139+
configured_catalog_path: "integration_tests/configured_catalogs/jsonl.json"
140+
- config_path: "secrets/jsonl_newlines_config.json"
141+
timeout_seconds: 1800
142+
configured_catalog_path: "integration_tests/configured_catalogs/jsonl.json"
103143
# for custom server
104144
- config_path: "integration_tests/config_minio.json"
105145
timeout_seconds: 1800
106-
configured_catalog_path: "integration_tests/configured_catalog.json"
146+
configured_catalog_path: "integration_tests/configured_catalogs/csv.json"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"dataset": "test",
3+
"provider": {
4+
"storage": "S3",
5+
"bucket": "test-bucket",
6+
"aws_access_key_id": "123456",
7+
"aws_secret_access_key": "123456key",
8+
"path_prefix": "",
9+
"endpoint": "http://10.0.3.185:9000"
10+
},
11+
"format": {
12+
"filetype": "csv"
13+
},
14+
"path_pattern": "*.csv",
15+
"schema": "{}"
16+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"streams": [
3+
{
4+
"stream": {
5+
"name": "test",
6+
"json_schema": {},
7+
"supported_sync_modes": ["full_refresh", "incremental"],
8+
"source_defined_cursor": true,
9+
"default_cursor_field": ["_ab_source_file_last_modified"]
10+
},
11+
"sync_mode": "incremental",
12+
"destination_sync_mode": "append"
13+
}
14+
]
15+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"streams": [
3+
{
4+
"stream": {
5+
"name": "test",
6+
"json_schema": {},
7+
"supported_sync_modes": ["full_refresh", "incremental"],
8+
"source_defined_cursor": true,
9+
"default_cursor_field": ["_ab_source_file_last_modified"]
10+
},
11+
"sync_mode": "incremental",
12+
"destination_sync_mode": "append"
13+
}
14+
]
15+
}

airbyte-integrations/connectors/source-s3/integration_tests/expected_records_avro.txt renamed to airbyte-integrations/connectors/source-s3/integration_tests/expected_records/avro.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
{"stream": "test", "data": {"id": 6, "fullname_and_valid": {"fullname": "MRNMXFkXZo", "valid": true}, "_ab_additional_properties": {}, "_ab_source_file_last_modified": "2022-05-11T11:54:11+0000", "_ab_source_file_url": "test_sample.avro"}, "emitted_at": 10000000}
88
{"stream": "test", "data": {"id": 7, "fullname_and_valid": {"fullname": "MXvEWMgnIr", "valid": true}, "_ab_additional_properties": {}, "_ab_source_file_last_modified": "2022-05-11T11:54:11+0000", "_ab_source_file_url": "test_sample.avro"}, "emitted_at": 10000000}
99
{"stream": "test", "data": {"id": 8, "fullname_and_valid": {"fullname": "rqmFGqZqdF", "valid": true}, "_ab_additional_properties": {}, "_ab_source_file_last_modified": "2022-05-11T11:54:11+0000", "_ab_source_file_url": "test_sample.avro"}, "emitted_at": 10000000}
10-
{"stream": "test", "data": {"id": 9, "fullname_and_valid": {"fullname": "lmPpQTcPFM", "valid": true}, "_ab_additional_properties": {}, "_ab_source_file_last_modified": "2022-05-11T11:54:11+0000", "_ab_source_file_url": "test_sample.avro"}, "emitted_at": 10000000}
10+
{"stream": "test", "data": {"id": 9, "fullname_and_valid": {"fullname": "lmPpQTcPFM", "valid": true}, "_ab_additional_properties": {}, "_ab_source_file_last_modified": "2022-05-11T11:54:11+0000", "_ab_source_file_url": "test_sample.avro"}, "emitted_at": 10000000}

0 commit comments

Comments
 (0)