
Commit 1d3a17a

Authored by Phlair, jzhuan-icims, and davinchia

🎉 Source S3 - memory & performance optimisations + advanced CSV options (#6615)

* memory & performance optimisations
* address comments
* version bump
* added advanced_options for reading csv without header, and more custom pyarrow ReadOptions
* updated to use the latest airbyte-cdk
* updated docs
* bump source-s3 to 0.1.6
* remove unneeded lines
* Use the all dep ami for python builds.
* ec2-instance-id should be ec2-image-id
* ec2-instance-id should be ec2-image-id

Co-authored-by: Jingkun Zhuang <[email protected]>
Co-authored-by: Davin Chia <[email protected]>

1 parent 25110c1 commit 1d3a17a

File tree

15 files changed: +168, -70 lines

.github/workflows/publish-command.yml (+1)

@@ -34,6 +34,7 @@ jobs:
           aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }}
           github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }}
+          ec2-image-id: ami-0d648081937c75a73
   publish-image:
     needs: start-publish-image-runner
     runs-on: ${{ needs.start-publish-image-runner.outputs.label }}

.github/workflows/test-command.yml (+1)

@@ -33,6 +33,7 @@ jobs:
           aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }}
           github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }}
+          ec2-image-id: ami-0d648081937c75a73
   integration-test:
     timeout-minutes: 240
     needs: start-test-runner

airbyte-config/init/src/main/resources/config/STANDARD_SOURCE_DEFINITION/69589781-7828-43c5-9f63-8925b1c1ccc2.json (+1, -1)

@@ -2,6 +2,6 @@
   "sourceDefinitionId": "69589781-7828-43c5-9f63-8925b1c1ccc2",
   "name": "S3",
   "dockerRepository": "airbyte/source-s3",
-  "dockerImageTag": "0.1.5",
+  "dockerImageTag": "0.1.6",
   "documentationUrl": "https://docs.airbyte.io/integrations/sources/s3"
 }

airbyte-config/init/src/main/resources/seed/source_definitions.yaml (+1, -1)

@@ -85,7 +85,7 @@
 - sourceDefinitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2
   name: S3
   dockerRepository: airbyte/source-s3
-  dockerImageTag: 0.1.5
+  dockerImageTag: 0.1.6
   documentationUrl: https://docs.airbyte.io/integrations/sources/s3
   sourceType: file
 - sourceDefinitionId: fbb5fbe2-16ad-4cf4-af7d-ff9d9c316c87

airbyte-integrations/connectors/source-s3/Dockerfile (+1, -1)

@@ -17,7 +17,7 @@ COPY source_s3 ./source_s3
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.1.5
+LABEL io.airbyte.version=0.1.6
 LABEL io.airbyte.name=airbyte/source-s3

airbyte-integrations/connectors/source-s3/integration_tests/spec.json (+9)

@@ -93,6 +93,15 @@
             "{\"timestamp_parsers\": [\"%m/%d/%Y %H:%M\", \"%Y/%m/%d %H:%M\"], \"strings_can_be_null\": true, \"null_values\": [\"NA\", \"NULL\"]}"
           ],
           "type": "string"
+        },
+        "advanced_options": {
+          "title": "Advanced Options",
+          "description": "Optionally add a valid JSON string here to provide additional <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions\" target=\"_blank\">Pyarrow ReadOptions</a>. Specify 'column_names' here if your CSV doesn't have a header, or if you want to use custom column names. 'block_size' and 'encoding' are already used above; specifying them again here will override the values above.",
+          "default": "{}",
+          "examples": [
+            "{\"column_names\": [\"column1\", \"column2\"]}"
+          ],
+          "type": "string"
         }
       }
     },

airbyte-integrations/connectors/source-s3/setup.py (+1, -1)

@@ -6,7 +6,7 @@
 from setuptools import find_packages, setup
 
 MAIN_REQUIREMENTS = [
-    "airbyte-cdk~=0.1.7",
+    "airbyte-cdk~=0.1.28",
     "pyarrow==4.0.1",
     "smart-open[s3]==5.1.0",
     "wcmatch==8.2",

airbyte-integrations/connectors/source-s3/source_s3/source_files_abstract/formats/csv_parser.py (+4, -1)

@@ -28,7 +28,10 @@ def _read_options(self):
         https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
         build ReadOptions object like: pa.csv.ReadOptions(**self._read_options())
         """
-        return {"block_size": self._format.get("block_size", 10000), "encoding": self._format.get("encoding", "utf8")}
+        return {
+            **{"block_size": self._format.get("block_size", 10000), "encoding": self._format.get("encoding", "utf8")},
+            **json.loads(self._format.get("advanced_options", "{}")),
+        }
 
     def _parse_options(self):
         """

airbyte-integrations/connectors/source-s3/source_s3/source_files_abstract/formats/csv_spec.py (+5)

@@ -50,3 +50,8 @@ class Config:
             '{"timestamp_parsers": ["%m/%d/%Y %H:%M", "%Y/%m/%d %H:%M"], "strings_can_be_null": true, "null_values": ["NA", "NULL"]}'
         ],
     )
+    advanced_options: str = Field(
+        default="{}",
+        description="Optionally add a valid JSON string here to provide additional <a href=\"https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions\" target=\"_blank\">Pyarrow ReadOptions</a>. Specify 'column_names' here if your CSV doesn't have a header, or if you want to use custom column names. 'block_size' and 'encoding' are already used above; specifying them again here will override the values above.",
+        examples=['{"column_names": ["column1", "column2"]}'],
+    )

airbyte-integrations/connectors/source-s3/source_s3/source_files_abstract/stream.py (+96, -58)

Large diffs are not rendered by default.

airbyte-integrations/connectors/source-s3/source_s3/stream.py (+1, -1)

@@ -36,7 +36,7 @@ def _list_bucket(self, accept_key=lambda k: True) -> Iterator[str]:
         else:
             session = boto3session.Session()
             client_config = Config(signature_version=UNSIGNED)
-            client = make_s3_client(self._provider, config=client_config, session=session)
+            client = make_s3_client(provider, config=client_config, session=session)
 
         ctoken = None
         while True:
airbyte-integrations/connectors/source-s3/…/csv/test_file_8_no_header.csv (new file, +8)

@@ -0,0 +1,8 @@
+1,PVdhmjb1,False,12,-31.3,2021-07-14,2021-07-14 15:30:09.224125
+2,j4DyXTS7,True,-8,41.6,2021-07-14,2021-07-14 15:30:09.224383
+3,v0w8fTME,False,7,-27.5,2021-07-14,2021-07-14 15:30:09.224527
+4,1q6jD8Np,False,-8,-6.7,2021-07-14,2021-07-14 15:30:09.224741
+5,77h4aiMP,True,-15,-13.7,2021-07-14,2021-07-14 15:30:09.224907
+6,Le35Wyic,True,3,35.3,2021-07-14,2021-07-14 15:30:09.225033
+7,xZhh1Kyl,False,10,-9.2,2021-07-14,2021-07-14 15:30:09.225145
+8,M2t286iJ,False,4,-3.5,2021-07-14,2021-07-14 15:30:09.225320

airbyte-integrations/connectors/source-s3/unit_tests/test_csv_parser.py (+27)

@@ -2,6 +2,7 @@
 # Copyright (c) 2021 Airbyte, Inc., all rights reserved.
 #
 
+import json
 import os
 from pathlib import Path
 from typing import Any, List, Mapping

@@ -249,4 +250,30 @@ def test_files(self) -> List[Mapping[str, Any]]:
                 "line_checks": {},
                 "fails": ["test_get_inferred_schema", "test_stream_records"],
             },
+            {
+                # no header test
+                "test_alias": "no header csv file",
+                "AbstractFileParser": CsvParser(
+                    format={
+                        "filetype": "csv",
+                        "advanced_options": json.dumps(
+                            {"column_names": ["id", "name", "valid", "code", "degrees", "birthday", "last_seen"]}
+                        ),
+                    },
+                    master_schema={},
+                ),
+                "filepath": os.path.join(SAMPLE_DIRECTORY, "csv/test_file_8_no_header.csv"),
+                "num_records": 8,
+                "inferred_schema": {
+                    "id": "integer",
+                    "name": "string",
+                    "valid": "boolean",
+                    "code": "integer",
+                    "degrees": "number",
+                    "birthday": "string",
+                    "last_seen": "string",
+                },
+                "line_checks": {},
+                "fails": [],
+            },
         ]

airbyte-webapp/src/App.tsx (+3)

@@ -44,6 +44,9 @@ const Features: Feature[] = [
   {
     id: FeatureItem.AllowUpdateConnectors,
   },
+  {
+    id: FeatureItem.AllowOAuthConnector,
+  },
 ];
 
 const StyleProvider: React.FC = ({ children }) => (

docs/integrations/sources/s3.md (+9, -6)

@@ -181,14 +181,16 @@ Since CSV files are effectively plain text, providing specific reader options is
 * `double_quote` : Whether two quotes in a quoted CSV value denote a single quote in the data.
 * `newlines_in_values` : Sometimes referred to as `multiline`. In most cases, newline characters signal the end of a row in a CSV, however text data may contain newline characters within it. Setting this to True allows correct parsing in this case.
 * `block_size` : This is the number of bytes to process in memory at a time while reading files. The default value here is usually fine but if your table is particularly wide \(lots of columns / data in fields is large\) then raising this might solve failures on detecting schema. Since this defines how much data to read into memory, raising this too high could cause Out Of Memory issues so use with caution.
+* `additional_reader_options` : This allows for editing the less commonly required CSV [ConvertOptions](https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions). The value must be a valid JSON string, e.g.:
 
-The final setting in the UI is `additional_reader_options`. This is a catch-all to allow for editing the less commonly required CSV parsing options. The value must be a valid JSON string, e.g.:
+```text
+{"timestamp_parsers": ["%m/%d/%Y %H:%M", "%Y/%m/%d %H:%M"], "strings_can_be_null": true, "null_values": ["NA", "NULL"]}
+```
+* `advanced_options` : This allows for editing the less commonly required CSV [ReadOptions](https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions). The value must be a valid JSON string. One use case for this is when your CSV has no header, or when you want to use custom column names; you can specify `column_names` using this option, e.g.:
 
-```text
-{"timestamp_parsers": ["%m/%d/%Y %H:%M", "%Y/%m/%d %H:%M"], "strings_can_be_null": true, "null_values": ["NA", "NULL"]}
-```
-
-You can find details on [available options here](https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions).
+```text
+{"column_names": ["column1", "column2", "column3"]}
+```
 
 #### Parquet

@@ -204,6 +206,7 @@ You can find details on [here](https://arrow.apache.org/docs/python/generated/py
 
 | Version | Date | Pull Request | Subject |
 | :--- | :--- | :--- | :--- |
+| 0.1.6 | 2021-10-15 | [6615](https://github.com/airbytehq/airbyte/pull/6615) & [7058](https://github.com/airbytehq/airbyte/pull/7058) | Memory and performance optimisation. Advanced options for CSV parsing. |
 | 0.1.5 | 2021-09-24 | [6398](https://github.com/airbytehq/airbyte/pull/6398) | Support custom non Amazon S3 services |
 | 0.1.4 | 2021-08-13 | [5305](https://github.com/airbytehq/airbyte/pull/5305) | Support of Parquet format |
 | 0.1.3 | 2021-08-04 | [5197](https://github.com/airbytehq/airbyte/pull/5197) | Fixed bug where sync could hang indefinitely on schema inference |
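To make the `column_names` use case above concrete, here is a minimal pyarrow-only sketch of what the connector effectively does with that setting when reading a headerless CSV; the file name below is illustrative, not from this commit:

```python
import pyarrow.csv as pa_csv

# With column_names set, pyarrow treats the first line of the file as data
# rather than as a header row, and names the columns as given.
read_options = pa_csv.ReadOptions(column_names=["column1", "column2", "column3"])

table = pa_csv.read_csv("data_no_header.csv", read_options=read_options)
print(table.schema)  # column types are still inferred per column
```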
