Skip to content

Commit 8d6f159

Browse files
authored
Source azure blob storage: certification (#37504)
Signed-off-by: Artem Inzhyyants <[email protected]>
1 parent 005e12c commit 8d6f159

File tree

8 files changed

+97
-40
lines changed

8 files changed

+97
-40
lines changed

airbyte-integrations/connectors/source-azure-blob-storage/integration_tests/spec.json

+9-9
Original file line numberDiff line numberDiff line change
@@ -351,18 +351,11 @@
351351
"required": ["name", "format"]
352352
}
353353
},
354-
"azure_blob_storage_account_name": {
355-
"title": "Azure Blob Storage account name",
356-
"description": "The account's name of the Azure Blob Storage.",
357-
"examples": ["airbyte5storage"],
358-
"order": 2,
359-
"type": "string"
360-
},
361354
"credentials": {
362355
"title": "Authentication",
363356
"description": "Credentials for connecting to the Azure Blob Storage",
364357
"type": "object",
365-
"order": 3,
358+
"order": 2,
366359
"oneOf": [
367360
{
368361
"title": "Authenticate via Oauth2",
@@ -434,6 +427,13 @@
434427
}
435428
]
436429
},
430+
"azure_blob_storage_account_name": {
431+
"title": "Azure Blob Storage account name",
432+
"description": "The account's name of the Azure Blob Storage.",
433+
"examples": ["airbyte5storage"],
434+
"order": 3,
435+
"type": "string"
436+
},
437437
"azure_blob_storage_container_name": {
438438
"title": "Azure blob storage container (Bucket) Name",
439439
"description": "The name of the Azure blob storage container.",
@@ -451,8 +451,8 @@
451451
},
452452
"required": [
453453
"streams",
454-
"azure_blob_storage_account_name",
455454
"credentials",
455+
"azure_blob_storage_account_name",
456456
"azure_blob_storage_container_name"
457457
]
458458
},

airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,19 @@
11
data:
22
ab_internal:
3-
ql: 100
4-
sl: 100
3+
ql: 400
4+
sl: 200
55
connectorBuildOptions:
66
baseImage: docker.io/airbyte/python-connector-base:1.2.0@sha256:c22a9d97464b69d6ef01898edf3f8612dc11614f05a84984451dde195f337db9
77
connectorSubtype: file
88
connectorType: source
99
definitionId: fdaaba68-4875-4ed9-8fcd-4ae1e0a25093
10-
dockerImageTag: 0.4.1
10+
dockerImageTag: 0.4.2
1111
dockerRepository: airbyte/source-azure-blob-storage
1212
documentationUrl: https://docs.airbyte.com/integrations/sources/azure-blob-storage
1313
githubIssueLabel: source-azure-blob-storage
1414
icon: azureblobstorage.svg
1515
license: MIT
16+
maxSecondsBetweenMessages: 1
1617
name: Azure Blob Storage
1718
remoteRegistries:
1819
pypi:
@@ -23,8 +24,8 @@ data:
2324
enabled: true
2425
oss:
2526
enabled: true
26-
releaseStage: alpha
27-
supportLevel: community
27+
releaseStage: generally_available
28+
supportLevel: certified
2829
tags:
2930
- language:python
3031
- cdk:python-file-based

airbyte-integrations/connectors/source-azure-blob-storage/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ requires = [ "poetry-core>=1.0.0",]
33
build-backend = "poetry.core.masonry.api"
44

55
[tool.poetry]
6-
version = "0.4.1"
6+
version = "0.4.2"
77
name = "source-azure-blob-storage"
88
description = "Source implementation for Azure Blob Storage."
99
authors = [ "Airbyte <[email protected]>",]

airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/source.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
33
#
44

5-
from typing import Any, Mapping
5+
from typing import Any
66

7-
from airbyte_cdk.config_observation import emit_configuration_as_airbyte_control_message
87
from airbyte_cdk.sources.declarative.models import OAuthConfigSpecification
98
from airbyte_cdk.sources.file_based.file_based_source import FileBasedSource
109
from airbyte_protocol.models import AdvancedAuth, ConnectorSpecification

airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/spec.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -59,17 +59,17 @@ class SourceAzureBlobStorageSpec(AbstractFileBasedSpec):
5959
def documentation_url(cls) -> AnyUrl:
6060
return AnyUrl("https://docs.airbyte.com/integrations/sources/azure-blob-storage", scheme="https")
6161

62-
azure_blob_storage_account_name: str = Field(
63-
title="Azure Blob Storage account name",
64-
description="The account's name of the Azure Blob Storage.",
65-
examples=["airbyte5storage"],
66-
order=2,
67-
)
6862
credentials: Union[Oauth2, StorageAccountKey] = Field(
6963
title="Authentication",
7064
description="Credentials for connecting to the Azure Blob Storage",
7165
discriminator="auth_type",
7266
type="object",
67+
order=2,
68+
)
69+
azure_blob_storage_account_name: str = Field(
70+
title="Azure Blob Storage account name",
71+
description="The account's name of the Azure Blob Storage.",
72+
examples=["airbyte5storage"],
7373
order=3,
7474
)
7575
azure_blob_storage_container_name: str = Field(

airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/stream_reader.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
99
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
1010
from airbyte_cdk.sources.streams.http.requests_native_auth import Oauth2Authenticator
11+
from airbyte_cdk.utils import AirbyteTracedException
12+
from airbyte_protocol.models import FailureType
1113
from azure.core.credentials import AccessToken
14+
from azure.core.exceptions import ResourceNotFoundError
1215
from azure.storage.blob import BlobServiceClient, ContainerClient
1316
from smart_open import open
1417

@@ -80,10 +83,13 @@ def get_matching_files(
8083
) -> Iterable[RemoteFile]:
8184
prefixes = [prefix] if prefix else self.get_prefixes_from_globs(globs)
8285
prefixes = prefixes or [None]
83-
for prefix in prefixes:
84-
for blob in self.azure_container_client.list_blobs(name_starts_with=prefix):
85-
remote_file = RemoteFile(uri=blob.name, last_modified=blob.last_modified.astimezone(pytz.utc).replace(tzinfo=None))
86-
yield from self.filter_files_by_globs_and_start_date([remote_file], globs)
86+
try:
87+
for prefix in prefixes:
88+
for blob in self.azure_container_client.list_blobs(name_starts_with=prefix):
89+
remote_file = RemoteFile(uri=blob.name, last_modified=blob.last_modified.astimezone(pytz.utc).replace(tzinfo=None))
90+
yield from self.filter_files_by_globs_and_start_date([remote_file], globs)
91+
except ResourceNotFoundError as e:
92+
raise AirbyteTracedException(failure_type=FailureType.config_error, internal_message=e.message, message=e.reason or e.message)
8793

8894
def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
8995
try:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2+
3+
4+
import dpath.util
5+
from source_azure_blob_storage import SourceAzureBlobStorageSpec
6+
7+
8+
def test_spec():
9+
config = SourceAzureBlobStorageSpec(
10+
azure_blob_storage_endpoint="https://teststorage.blob.core.windows.net",
11+
azure_blob_storage_account_name="account1",
12+
azure_blob_storage_container_name="airbyte-source-azure-blob-storage-test",
13+
credentials={"auth_type": "storage_account_key", "azure_blob_storage_account_key": "key1"},
14+
streams=[],
15+
start_date="2024-01-01T00:00:00.000000Z",
16+
)
17+
18+
assert config.documentation_url() == "https://docs.airbyte.com/integrations/sources/azure-blob-storage"
19+
assert len(dpath.util.get(config.schema(), "properties/streams/items/properties/format/oneOf/4/properties/processing/oneOf")) == 1

docs/integrations/sources/azure-blob-storage.md

+45-13
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,33 @@ This page contains the setup guide and reference information for the Azure Blob
66
Cloud storage may incur egress costs. Egress refers to data that is transferred out of the cloud storage system, such as when you download files or access them from a different location. For more information, see the [Azure Blob Storage pricing guide](https://azure.microsoft.com/en-us/pricing/details/storage/blobs/).
77
:::
88

9+
## Prerequisites
10+
11+
- Tenant ID of the Microsoft Azure Application user
12+
- Azure Blob Storage account name
13+
- Azure blob storage container (Bucket) Name
14+
15+
<details>
16+
<summary>
17+
Minimum permissions (role [Storage Blob Data Reader](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles/storage#storage-blob-data-reader) ):
18+
</summary>
19+
```json
20+
[
21+
{
22+
"actions": [
23+
"Microsoft.Storage/storageAccounts/blobServices/containers/read",
24+
"Microsoft.Storage/storageAccounts/blobServices/generateUserDelegationKey/action"
25+
],
26+
"notActions": [],
27+
"dataActions": [
28+
"Microsoft.Storage/storageAccounts/blobServices/containers/blobs/read"
29+
],
30+
"notDataActions": []
31+
}
32+
]
33+
```
34+
</details>
35+
936
## Setup guide
1037

1138
### Step 1: Set up Azure Blob Storage
@@ -20,7 +47,7 @@ to use role [Storage Blob Data Reader](https://learn.microsoft.com/en-gb/azure/s
2047

2148
<details>
2249
<summary>
23-
Follow this steps to setup IAM role:
50+
Follow these steps to set up an IAM role:
2451
</summary>
2552

2653
1. Go to Azure portal, select the Storage (or Container) you'd like to sync from and get to Access Control(IAM) -> Role Assignment ![Access Control (IAM)](../../.gitbook/assets/source/azure-blob-storage/access_control_iam.png)
@@ -38,19 +65,19 @@ Follow this steps to setup IAM role:
3865
2. In the left navigation bar, click **Sources**. In the top-right corner, click **+ New source**.
3966
3. Find and select **Azure Blob Storage** from the list of available sources.
4067
4. Enter the name of your Azure **Account**.
41-
5. Click **Authenticate your Azure Blob Storage account**.
68+
5. Enter your Tenant ID and click **Authenticate your Azure Blob Storage account**.
4269
6. Log in and authorize the Azure Blob Storage account.
4370
7. Enter the name of the **Container** containing your files to replicate.
4471
8. Add a stream
4572
1. Write the **File Type**
4673
2. In the **Format** box, use the dropdown menu to select the format of the files you'd like to replicate. The supported formats are **CSV**, **Parquet**, **Avro** and **JSONL**. Toggling the **Optional fields** button within the **Format** box will allow you to enter additional configurations based on the selected format. For a detailed breakdown of these settings, refer to the [File Format section](#file-format-settings) below.
4774
3. Give a **Name** to the stream
48-
4. (Optional) - If you want to enforce a specific schema, you can enter a **Input schema**. By default, this value is set to `{}` and will automatically infer the schema from the file\(s\) you are replicating. For details on providing a custom schema, refer to the [User Schema section](#user-schema).
75+
4. (Optional) If you want to enforce a specific schema, you can enter an **Input schema**. By default, this value is set to `{}` and will automatically infer the schema from the file\(s\) you are replicating. For details on providing a custom schema, refer to the [User Schema section](#user-schema).
4976
5. Optionally, enter the **Globs** which dictates which files to be synced. This is a regular expression that allows Airbyte to pattern match the specific files to replicate. If you are replicating all the files within your bucket, use `**` as the pattern. For more precise pattern matching options, refer to the [Path Patterns section](#path-patterns) below.
5077
9. (Optional) Enter the endpoint to use for the data replication.
5178
10. (Optional) Enter the desired start date from which to begin replicating data.
5279

53-
## Supported sync modes
80+
## Supported Streams
5481

5582
The Azure Blob Storage source connector supports the following [sync modes](https://docs.airbyte.com/cloud/core-concepts#connection-sync-modes):
5683

@@ -63,7 +90,7 @@ The Azure Blob Storage source connector supports the following [sync modes](http
6390
| Replicate Multiple Streams \(distinct tables\) | Yes |
6491
| Namespaces | No |
6592

66-
## File Compressions
93+
### File Compressions
6794

6895
| Compression | Supported? |
6996
|:------------|:-----------|
@@ -76,7 +103,7 @@ The Azure Blob Storage source connector supports the following [sync modes](http
76103

77104
Please let us know any specific compressions you'd like to see support for next!
78105

79-
## Path Patterns
106+
### Path Patterns
80107

81108
\(tl;dr -&gt; path pattern syntax using [wcmatch.glob](https://facelessuser.github.io/wcmatch/glob/). GLOBSTAR and SPLIT flags are enabled.\)
82109

@@ -126,7 +153,7 @@ We want to pick up part1.csv, part2.csv and part3.csv \(excluding another_part1.
126153

127154
As you can probably tell, there are many ways to achieve the same goal with path patterns. We recommend using a pattern that ensures clarity and is robust against future additions to the directory structure.
128155

129-
## User Schema
156+
### User Schema
130157

131158
Providing a schema allows for more control over the output of this stream. Without a provided schema, columns and datatypes will be inferred from the first created file in the bucket matching your path pattern and suffix. This will probably be fine in most cases but there may be situations you want to enforce a schema instead, e.g.:
132159

@@ -150,9 +177,9 @@ For example:
150177
- `{"id": "integer", "location": "string", "longitude": "number", "latitude": "number"}`
151178
- `{"username": "string", "friends": "array", "information": "object"}`
152179

153-
## File Format Settings
180+
### File Format Settings
154181

155-
### CSV
182+
#### CSV
156183

157184
Since CSV files are effectively plain text, providing specific reader options is often required for correct parsing of the files. These settings are applied when a CSV is created or exported so please ensure that this process happens consistently over time.
158185

@@ -180,24 +207,24 @@ Leaving this field blank (default option) will disallow escaping.
180207
- **True Values**: A set of case-sensitive strings that should be interpreted as true values.
181208

182209

183-
### Parquet
210+
#### Parquet
184211

185212
Apache Parquet is a column-oriented data storage format of the Apache Hadoop ecosystem. It provides efficient data compression and encoding schemes with enhanced performance to handle complex data in bulk. At the moment, partitioned parquet datasets are unsupported. The following settings are available:
186213

187214
- **Convert Decimal Fields to Floats**: Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.
188215

189-
### Avro
216+
#### Avro
190217

191218
The Avro parser uses the [Fastavro library](https://fastavro.readthedocs.io/en/latest/). The following settings are available:
192219
- **Convert Double Fields to Strings**: Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss of precision when handling floating point numbers.
193220

194-
### JSONL
221+
#### JSONL
195222

196223
There are currently no options for JSONL parsing.
197224

198225
<FieldAnchor field="streams.0.format[unstructured],streams.1.format[unstructured],streams.2.format[unstructured]">
199226

200-
### Document File Type Format (Experimental)
227+
#### Document File Type Format (Experimental)
201228

202229
:::warning
203230
The Document File Type Format is currently an experimental feature and not subject to SLAs. Use at your own risk.
@@ -213,10 +240,15 @@ This connector utilizes the open source [Unstructured](https://unstructured-io.g
213240

214241
</FieldAnchor>
215242

243+
## Performance considerations
244+
245+
The Azure Blob Storage connector should not encounter any [Microsoft API limitations](https://learn.microsoft.com/en-us/azure/storage/blobs/scalability-targets#scale-targets-for-blob-storage) under normal usage.
246+
216247
## Changelog
217248

218249
| Version | Date | Pull Request | Subject |
219250
|:--------|:-----------|:---------------------------------------------------------|:---------------------------------------------------------------------------------------------|
251+
| 0.4.2 | 2024-04-23 | [37504](https://github.com/airbytehq/airbyte/pull/37504) | Update specification |
220252
| 0.4.1 | 2024-04-22 | [37467](https://github.com/airbytehq/airbyte/pull/37467) | Fix start date filter |
221253
| 0.4.0 | 2024-04-05 | [36825](https://github.com/airbytehq/airbyte/pull/36825) | Add oauth 2.0 support |
222254
| 0.3.6 | 2024-04-03 | [36542](https://github.com/airbytehq/airbyte/pull/36542) | Use Latest CDK; add integration tests |

0 commit comments

Comments
 (0)