Skip to content

Commit 8771ba9

Browse files
🎉 BigQuery destination mlp (#11238)
* fix bug with anyOf and allOf json blocks * updated spec and documentation * rollback last changes * updated spec * updated spec * updated bigquery permissions in doc * updated spec * updated spec * updated spec * updated tests * fixed remarks * updated tests * updated tests * added new bigquery version * updated spec * updated version * updated definitions
1 parent 656ccd5 commit 8771ba9

File tree

5 files changed

+153
-85
lines changed

5 files changed

+153
-85
lines changed

airbyte-config/init/src/main/resources/seed/destination_definitions.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
- name: BigQuery
2020
destinationDefinitionId: 22f6c74f-5699-40ff-833c-4a879ea40133
2121
dockerRepository: airbyte/destination-bigquery
22-
dockerImageTag: 0.6.12
22+
dockerImageTag: 1.0.0
2323
documentationUrl: https://docs.airbyte.io/integrations/destinations/bigquery
2424
icon: bigquery.svg
2525
resourceRequirements:

airbyte-config/init/src/main/resources/seed/destination_specs.yaml

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@
188188
supportsDBT: false
189189
supported_destination_sync_modes:
190190
- "append"
191-
- dockerImage: "airbyte/destination-bigquery:0.6.12"
191+
- dockerImage: "airbyte/destination-bigquery:1.0.0"
192192
spec:
193193
documentationUrl: "https://docs.airbyte.io/integrations/destinations/bigquery"
194194
connectionSpecification:
@@ -201,14 +201,13 @@
201201
additionalProperties: true
202202
properties:
203203
big_query_client_buffer_size_mb:
204-
title: "Google BigQuery client chunk size"
205-
description: "Google BigQuery client's chunk(buffer) size (MIN=1, MAX =\
204+
title: "Google BigQuery Client Chunk Size (Optional)"
205+
description: "Google BigQuery client's chunk (buffer) size (MIN=1, MAX =\
206206
\ 15) for each table. The size that will be written by a single RPC. Written\
207207
\ data will be buffered and only flushed upon reaching this size or closing\
208-
\ the channel. The default 15MiB value is used if not set explicitly.\
209-
\ It's recommended to decrease value for big data sets migration for less\
210-
\ HEAP memory consumption and avoiding crashes. For more details refer\
211-
\ to https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html"
208+
\ the channel. The default 15MB value is used if not set explicitly. Read\
209+
\ more <a href=\"https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html\"\
210+
>here</a>."
212211
type: "integer"
213212
minimum: 1
214213
maximum: 15
@@ -218,18 +217,22 @@
218217
project_id:
219218
type: "string"
220219
description: "The GCP project ID for the project containing the target BigQuery\
221-
\ dataset."
220+
\ dataset. Read more <a href=\"https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating\"\
221+
>here</a>."
222222
title: "Project ID"
223223
dataset_id:
224224
type: "string"
225-
description: "Default BigQuery Dataset ID tables are replicated to if the\
226-
\ source does not specify a namespace."
225+
description: "The default BigQuery Dataset ID that tables are replicated\
226+
\ to if the source does not specify a namespace. Read more <a href=\"\
227+
https://cloud.google.com/bigquery/docs/datasets#create-dataset\">here</a>."
227228
title: "Default Dataset ID"
228229
dataset_location:
229230
type: "string"
230231
description: "The location of the dataset. Warning: Changes made after creation\
231-
\ will not be applied."
232-
title: "Dataset Location"
232+
\ will not be applied. The default \"US\" value is used if not set explicitly.\
233+
\ Read more <a href=\"https://cloud.google.com/bigquery/docs/locations\"\
234+
>here</a>."
235+
title: "Dataset Location (Optional)"
233236
default: "US"
234237
enum:
235238
- "US"
@@ -266,20 +269,22 @@
266269
credentials_json:
267270
type: "string"
268271
description: "The contents of the JSON service account key. Check out the\
269-
\ <a href=\"https://docs.airbyte.io/integrations/destinations/bigquery\"\
272+
\ <a href=\"https://docs.airbyte.com/integrations/destinations/bigquery#service-account-key\"\
270273
>docs</a> if you need help generating this key. Default credentials will\
271274
\ be used if this field is left empty."
272-
title: "Credentials JSON"
275+
title: "Credentials JSON (Optional)"
273276
airbyte_secret: true
274277
transformation_priority:
275278
type: "string"
276279
description: "Interactive run type means that the query is executed as soon\
277280
\ as possible, and these queries count towards concurrent rate limit and\
278-
\ daily limit. Batch queries are queued and started as soon as idle resources\
281+
\ daily limit. Read more about interactive run type <a href=\"https://cloud.google.com/bigquery/docs/running-queries#queries\"\
282+
>here</a>. Batch queries are queued and started as soon as idle resources\
279283
\ are available in the BigQuery shared resource pool, which usually occurs\
280284
\ within a few minutes. Batch queries don’t count towards your concurrent\
281-
\ rate limit."
282-
title: "Transformation Query Run Type"
285+
\ rate limit. Read more about batch queries <a href=\"https://cloud.google.com/bigquery/docs/running-queries#batch\"\
286+
>here</a>. The default \"interactive\" value is used if not set explicitly."
287+
title: "Transformation Query Run Type (Optional)"
283288
default: "interactive"
284289
enum:
285290
- "interactive"
@@ -288,11 +293,17 @@
288293
type: "object"
289294
title: "Loading Method"
290295
description: "Loading method used to select the way data will be uploaded\
291-
\ to BigQuery."
296+
\ to BigQuery. <br><b>Standard Inserts</b> - Direct uploading using SQL\
297+
\ INSERT statements. This method is extremely inefficient and provided\
298+
\ only for quick testing. In almost all cases, you should use staging.\
299+
\ <br><b>GCS Staging</b> - Writes large batches of records to a file,\
300+
\ uploads the file to GCS, then uses <b>COPY INTO table</b> to upload\
301+
\ the file. Recommended for most workloads for better speed and scalability.\
302+
\ Read more about GCS Staging <a href=\"https://docs.airbyte.com/integrations/destinations/bigquery#gcs-staging\"\
303+
>here</a>."
292304
oneOf:
293305
- title: "Standard Inserts"
294306
additionalProperties: false
295-
description: "Direct uploading using streams."
296307
required:
297308
- "method"
298309
properties:
@@ -301,9 +312,6 @@
301312
const: "Standard"
302313
- title: "GCS Staging"
303314
additionalProperties: false
304-
description: "Writes large batches of records to a file, uploads the file\
305-
\ to GCS, then uses <pre>COPY INTO table</pre> to upload the file. Recommended\
306-
\ for large production workloads for better speed and scalability."
307315
required:
308316
- "method"
309317
- "gcs_bucket_name"
@@ -316,16 +324,18 @@
316324
gcs_bucket_name:
317325
title: "GCS Bucket Name"
318326
type: "string"
319-
description: "The name of the GCS bucket."
327+
description: "The name of the GCS bucket. Read more <a href=\"https://cloud.google.com/storage/docs/naming-buckets\"\
328+
>here</a>."
320329
examples:
321330
- "airbyte_sync"
322331
gcs_bucket_path:
332+
title: "GCS Bucket Path"
323333
description: "Directory under the GCS bucket where data will be written."
324334
type: "string"
325335
examples:
326336
- "data_sync/test"
327337
part_size_mb:
328-
title: "Block Size (MB) for GCS multipart upload"
338+
title: "Block Size (MB) for GCS Multipart Upload (Optional)"
329339
description: "This is the size of a \"Part\" being buffered in memory.\
330340
\ It limits the memory usage when writing. Larger values will allow\
331341
\ to upload bigger files and improve the speed, but consume more\
@@ -340,14 +350,19 @@
340350
type: "string"
341351
description: "This upload method is supposed to temporarily store records\
342352
\ in GCS bucket. What do you want to do with data in GCS bucket\
343-
\ when migration has finished?"
344-
title: "GCS tmp files afterward processing"
353+
\ when migration has finished? The default \"Delete all tmp files\
354+
\ from GCS\" value is used if not set explicitly."
355+
title: "GCS Tmp Files Afterward Processing (Optional)"
345356
default: "Delete all tmp files from GCS"
346357
enum:
347358
- "Delete all tmp files from GCS"
348359
- "Keep all tmp files in GCS"
349360
credential:
350361
title: "Credential"
362+
description: "An HMAC key is a type of credential and can be associated\
363+
\ with a service account or a user account in Cloud Storage. Read\
364+
\ more <a href=\"https://cloud.google.com/storage/docs/authentication/hmackeys\"\
365+
>here</a>."
351366
type: "object"
352367
oneOf:
353368
- title: "HMAC key"

airbyte-integrations/connectors/destination-bigquery/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ ENV ENABLE_SENTRY true
1717

1818
COPY --from=build /airbyte /airbyte
1919

20-
LABEL io.airbyte.version=0.6.12
20+
LABEL io.airbyte.version=1.0.0
2121
LABEL io.airbyte.name=airbyte/destination-bigquery

airbyte-integrations/connectors/destination-bigquery/src/main/resources/spec.json

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
"additionalProperties": true,
1313
"properties": {
1414
"big_query_client_buffer_size_mb": {
15-
"title": "Google BigQuery client chunk size",
16-
"description": "Google BigQuery client's chunk(buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MiB value is used if not set explicitly. It's recommended to decrease value for big data sets migration for less HEAP memory consumption and avoiding crashes. For more details refer to https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html",
15+
"title": "Google BigQuery Client Chunk Size (Optional)",
16+
"description": "Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more <a href=\"https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html\">here</a>.",
1717
"type": "integer",
1818
"minimum": 1,
1919
"maximum": 15,
@@ -22,18 +22,18 @@
2222
},
2323
"project_id": {
2424
"type": "string",
25-
"description": "The GCP project ID for the project containing the target BigQuery dataset.",
25+
"description": "The GCP project ID for the project containing the target BigQuery dataset. Read more <a href=\"https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating\">here</a>.",
2626
"title": "Project ID"
2727
},
2828
"dataset_id": {
2929
"type": "string",
30-
"description": "Default BigQuery Dataset ID tables are replicated to if the source does not specify a namespace.",
30+
"description": "The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more <a href=\"https://cloud.google.com/bigquery/docs/datasets#create-dataset\">here</a>.",
3131
"title": "Default Dataset ID"
3232
},
3333
"dataset_location": {
3434
"type": "string",
35-
"description": "The location of the dataset. Warning: Changes made after creation will not be applied.",
36-
"title": "Dataset Location",
35+
"description": "The location of the dataset. Warning: Changes made after creation will not be applied. The default \"US\" value is used if not set explicitly. Read more <a href=\"https://cloud.google.com/bigquery/docs/locations\">here</a>.",
36+
"title": "Dataset Location (Optional)",
3737
"default": "US",
3838
"enum": [
3939
"US",
@@ -71,26 +71,25 @@
7171
},
7272
"credentials_json": {
7373
"type": "string",
74-
"description": "The contents of the JSON service account key. Check out the <a href=\"https://docs.airbyte.io/integrations/destinations/bigquery\">docs</a> if you need help generating this key. Default credentials will be used if this field is left empty.",
75-
"title": "Credentials JSON",
74+
"description": "The contents of the JSON service account key. Check out the <a href=\"https://docs.airbyte.com/integrations/destinations/bigquery#service-account-key\">docs</a> if you need help generating this key. Default credentials will be used if this field is left empty.",
75+
"title": "Credentials JSON (Optional)",
7676
"airbyte_secret": true
7777
},
7878
"transformation_priority": {
7979
"type": "string",
80-
"description": "Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don’t count towards your concurrent rate limit.",
81-
"title": "Transformation Query Run Type",
80+
"description": "Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Read more about interactive run type <a href=\"https://cloud.google.com/bigquery/docs/running-queries#queries\">here</a>. Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don’t count towards your concurrent rate limit. Read more about batch queries <a href=\"https://cloud.google.com/bigquery/docs/running-queries#batch\">here</a>. The default \"interactive\" value is used if not set explicitly.",
81+
"title": "Transformation Query Run Type (Optional)",
8282
"default": "interactive",
8383
"enum": ["interactive", "batch"]
8484
},
8585
"loading_method": {
8686
"type": "object",
8787
"title": "Loading Method",
88-
"description": "Loading method used to send select the way data will be uploaded to BigQuery.",
88+
"description": "Loading method used to select the way data will be uploaded to BigQuery. <br><b>Standard Inserts</b> - Direct uploading using SQL INSERT statements. This method is extremely inefficient and provided only for quick testing. In almost all cases, you should use staging. <br><b>GCS Staging</b> - Writes large batches of records to a file, uploads the file to GCS, then uses <b>COPY INTO table</b> to upload the file. Recommended for most workloads for better speed and scalability. Read more about GCS Staging <a href=\"https://docs.airbyte.com/integrations/destinations/bigquery#gcs-staging\">here</a>.",
8989
"oneOf": [
9090
{
9191
"title": "Standard Inserts",
9292
"additionalProperties": false,
93-
"description": "Direct uploading using streams.",
9493
"required": ["method"],
9594
"properties": {
9695
"method": {
@@ -102,7 +101,6 @@
102101
{
103102
"title": "GCS Staging",
104103
"additionalProperties": false,
105-
"description": "Writes large batches of records to a file, uploads the file to GCS, then uses <pre>COPY INTO table</pre> to upload the file. Recommended for large production workloads for better speed and scalability.",
106104
"required": [
107105
"method",
108106
"gcs_bucket_name",
@@ -117,16 +115,17 @@
117115
"gcs_bucket_name": {
118116
"title": "GCS Bucket Name",
119117
"type": "string",
120-
"description": "The name of the GCS bucket.",
118+
"description": "The name of the GCS bucket. Read more <a href=\"https://cloud.google.com/storage/docs/naming-buckets\">here</a>.",
121119
"examples": ["airbyte_sync"]
122120
},
123121
"gcs_bucket_path": {
122+
"title": "GCS Bucket Path",
124123
"description": "Directory under the GCS bucket where data will be written.",
125124
"type": "string",
126125
"examples": ["data_sync/test"]
127126
},
128127
"part_size_mb": {
129-
"title": "Block Size (MB) for GCS multipart upload",
128+
"title": "Block Size (MB) for GCS Multipart Upload (Optional)",
130129
"description": "This is the size of a \"Part\" being buffered in memory. It limits the memory usage when writing. Larger values will allow to upload bigger files and improve the speed, but consume more memory. Allowed values: min=5MB, max=525MB. Default: 5MB.",
131130
"type": "integer",
132131
"default": 5,
@@ -136,8 +135,8 @@
136135
},
137136
"keep_files_in_gcs-bucket": {
138137
"type": "string",
139-
"description": "This upload method is supposed to temporary store records in GCS bucket. What do you want to do with data in GCS bucket when migration has finished?",
140-
"title": "GCS tmp files afterward processing",
138+
"description": "This upload method is supposed to temporarily store records in GCS bucket. What do you want to do with data in GCS bucket when migration has finished? The default \"Delete all tmp files from GCS\" value is used if not set explicitly.",
139+
"title": "GCS Tmp Files Afterward Processing (Optional)",
141140
"default": "Delete all tmp files from GCS",
142141
"enum": [
143142
"Delete all tmp files from GCS",
@@ -146,6 +145,7 @@
146145
},
147146
"credential": {
148147
"title": "Credential",
148+
"description": "An HMAC key is a type of credential and can be associated with a service account or a user account in Cloud Storage. Read more <a href=\"https://cloud.google.com/storage/docs/authentication/hmackeys\">here</a>.",
149149
"type": "object",
150150
"oneOf": [
151151
{

0 commit comments

Comments
 (0)