Skip to content

feat(source-gcs): Enable all file types for GCS #44015

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ acceptance_tests:
tests:
- spec_path: integration_tests/spec.json
backward_compatibility_tests_config:
disable_for_version: 0.2.0
disable_for_version: 0.5.0
connection:
tests:
- config_path: "secrets/config.json"
Expand All @@ -17,16 +17,24 @@ acceptance_tests:
tests:
- config_path: "secrets/config.json"
timeout_seconds: 2400
- config_path: "secrets/config_jsonl.json"
timeout_seconds: 2400
basic_read:
tests:
- config_path: "secrets/config.json"
expect_trace_message_on_failure: false
- config_path: "secrets/config_jsonl.json"
expect_trace_message_on_failure: false
incremental:
tests:
- config_path: "secrets/config.json"
configured_catalog_path: "integration_tests/configured_catalog.json"
future_state:
future_state_path: "integration_tests/abnormal_state.json"
- config_path: "secrets/config_jsonl.json"
configured_catalog_path: "integration_tests/configured_catalog_jsonl.json"
future_state:
future_state_path: "integration_tests/abnormal_state_jsonl.json"
full_refresh:
tests:
- config_path: "secrets/config.json"
Expand All @@ -38,3 +46,9 @@ acceptance_tests:
example_2:
- name: _ab_source_file_url
bypass_reason: "Uri has autogenerated token in query params"
- config_path: "secrets/config_jsonl.json"
configured_catalog_path: "integration_tests/configured_catalog_jsonl.json"
ignored_fields:
example_1:
- name: _ab_source_file_url
bypass_reason: "Uri has autogenerated token in query params"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[
{
"type": "STREAM",
"stream": {
"stream_state": {
"_ab_source_file_last_modified": "2094-03-21T16:13:20.571000Z_https://storage.googleapis.com/airbyte-integration-test-source-gcs/test_folder/test_data_1.jsonl",
"history": {
"https://storage.googleapis.com/airbyte-integration-test-source-gcs/test_folder/test_data_1.jsonl": "2094-03-21T16:13:20.571000Z"
}
},
"stream_descriptor": {
"name": "example_1_jsonl"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"streams": [
{
"stream": {
"name": "example_1_jsonl",
"json_schema": {},
"supported_sync_modes": ["full_refresh", "incremental"]
},
"sync_mode": "incremental",
"destination_sync_mode": "overwrite"
}
]
}
197 changes: 191 additions & 6 deletions airbyte-integrations/connectors/source-gcs/integration_tests/spec.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,22 @@
},
"streams": {
"title": "The list of streams to sync",
"description": "Each instance of this configuration defines a <a href=https://docs.airbyte.com/cloud/core-concepts#stream>stream</a>. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.",
"order": 3,
"description": "Each instance of this configuration defines a <a href=\"https://docs.airbyte.com/cloud/core-concepts#stream\">stream</a>. Use this to define which files belong in the stream, their format, and how they should be parsed and validated. When sending data to warehouse destination such as Snowflake or BigQuery, each stream is a separate table.",
"order": 10,
"type": "array",
"items": {
"title": "SourceGCSStreamConfig",
"title": "FileBasedStreamConfig",
"type": "object",
"properties": {
"name": {
"title": "Name",
"description": "The name of the stream.",
"order": 0,
"type": "string"
},
"globs": {
"title": "Globs",
"description": "The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look <a href=\"https://en.wikipedia.org/wiki/Glob_(programming)\">here</a>.",
"default": ["**"],
"order": 1,
"type": "array",
"items": {
Expand All @@ -41,7 +41,7 @@
},
"legacy_prefix": {
"title": "Legacy Prefix",
"description": "The path prefix configured in previous versions of the GCS connector. This option is deprecated in favor of a single glob.",
"description": "The path prefix configured in v3 versions of the S3 connector. This option is deprecated in favor of a single glob.",
"airbyte_hidden": true,
"type": "string"
},
Expand Down Expand Up @@ -71,9 +71,27 @@
"format": {
"title": "Format",
"description": "The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.",
"order": 2,
"type": "object",
"oneOf": [
{
"title": "Avro Format",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"default": "avro",
"const": "avro",
"type": "string"
},
"double_as_string": {
"title": "Convert Double Fields to Strings",
"description": "Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss precision when handling floating point numbers.",
"default": false,
"type": "boolean"
}
},
"required": ["filetype"]
},
{
"title": "CSV Format",
"type": "object",
Expand Down Expand Up @@ -233,6 +251,173 @@
}
},
"required": ["filetype"]
},
{
"title": "Jsonl Format",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"default": "jsonl",
"const": "jsonl",
"type": "string"
}
},
"required": ["filetype"]
},
{
"title": "Parquet Format",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"default": "parquet",
"const": "parquet",
"type": "string"
},
"decimal_as_float": {
"title": "Convert Decimal Fields to Floats",
"description": "Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.",
"default": false,
"type": "boolean"
}
},
"required": ["filetype"]
},
{
"title": "Unstructured Document Format",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"default": "unstructured",
"const": "unstructured",
"type": "string"
},
"skip_unprocessable_files": {
"title": "Skip Unprocessable Files",
"description": "If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.",
"default": true,
"always_show": true,
"type": "boolean"
},
"strategy": {
"title": "Parsing Strategy",
"description": "The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
"default": "auto",
"always_show": true,
"order": 0,
"enum": ["auto", "fast", "ocr_only", "hi_res"],
"type": "string"
},
"processing": {
"title": "Processing",
"description": "Processing configuration",
"default": {
"mode": "local"
},
"type": "object",
"discriminator": {
"propertyName": "mode",
"mapping": {
"local": "#/definitions/LocalProcessingConfigModel",
"api": "#/definitions/APIProcessingConfigModel"
}
},
"oneOf": [
{
"title": "Local",
"type": "object",
"properties": {
"mode": {
"title": "Mode",
"default": "local",
"const": "local",
"enum": ["local"],
"type": "string"
}
},
"description": "Process files locally, supporting `fast` and `ocr` modes. This is the default option.",
"required": ["mode"]
},
{
"title": "via API",
"type": "object",
"properties": {
"mode": {
"title": "Mode",
"default": "api",
"const": "api",
"enum": ["api"],
"type": "string"
},
"api_key": {
"title": "API Key",
"description": "The API key to use matching the environment",
"default": "",
"always_show": true,
"airbyte_secret": true,
"type": "string"
},
"api_url": {
"title": "API URL",
"description": "The URL of the unstructured API to use",
"default": "https://api.unstructured.io",
"always_show": true,
"examples": ["https://api.unstructured.com"],
"type": "string"
},
"parameters": {
"title": "Additional URL Parameters",
"description": "List of parameters send to the API",
"default": [],
"always_show": true,
"type": "array",
"items": {
"title": "APIParameterConfigModel",
"type": "object",
"properties": {
"name": {
"title": "Parameter name",
"description": "The name of the unstructured API parameter to use",
"examples": [
"combine_under_n_chars",
"languages"
],
"type": "string"
},
"value": {
"title": "Value",
"description": "The value of the parameter",
"examples": ["true", "hi_res"],
"type": "string"
}
},
"required": ["name", "value"]
}
}
},
"description": "Process files via an API, using the `hi_res` mode. This option is useful for increased performance and accuracy, but requires an API key and a hosted instance of unstructured.",
"required": ["mode"]
}
]
}
},
"description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file.",
"required": ["filetype"]
},
{
"title": "Excel Format",
"type": "object",
"properties": {
"filetype": {
"title": "Filetype",
"default": "excel",
"const": "excel",
"type": "string"
}
},
"required": ["filetype"]
}
]
},
Expand Down
7 changes: 6 additions & 1 deletion airbyte-integrations/connectors/source-gcs/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: 2a8c41ae-8c23-4be0-a73f-2ab10ca1a820
dockerImageTag: 0.5.0
dockerImageTag: 0.6.0
dockerRepository: airbyte/source-gcs
documentationUrl: https://docs.airbyte.com/integrations/sources/gcs
githubIssueLabel: source-gcs
Expand Down Expand Up @@ -47,4 +47,9 @@ data:
secretStore:
type: GSM
alias: airbyte-connector-testing-secret-store
- name: SECRET_SOURCE-GCS_JSONL__CREDS
fileName: config_jsonl.json
secretStore:
type: GSM
alias: airbyte-connector-testing-secret-store
metadataSpecVersion: "1.0"
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-gcs/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requires = [ "poetry-core>=1.0.0",]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
version = "0.5.0"
version = "0.6.0"
name = "source-gcs"
description = "Source implementation for Gcs."
authors = [ "Airbyte <[email protected]>",]
Expand Down
Loading
Loading