
Commit a3f03ab

edgaogisripa authored and committed
Destination bigquery: upgrade cdk (#35315)
Signed-off-by: Gireesh Sreepathi <[email protected]>
Co-authored-by: Gireesh Sreepathi <[email protected]>
1 parent 323adf2 commit a3f03ab


11 files changed: +350 −281 lines changed


airbyte-integrations/connectors/destination-bigquery/build.gradle

Lines changed: 2 additions & 1 deletion
@@ -1,9 +1,10 @@
 plugins {
     id 'airbyte-java-connector'
+    id 'org.jetbrains.kotlin.jvm' version '1.9.22'
 }
 
 airbyteJavaConnector {
-    cdkVersionRequired = '0.20.9'
+    cdkVersionRequired = '0.23.11'
     features = [
         'db-destinations',
         'datastore-bigquery',

airbyte-integrations/connectors/destination-bigquery/metadata.yaml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ data:
   connectorSubtype: database
   connectorType: destination
   definitionId: 22f6c74f-5699-40ff-833c-4a879ea40133
-  dockerImageTag: 2.4.11
+  dockerImageTag: 2.4.12
   dockerRepository: airbyte/destination-bigquery
   documentationUrl: https://docs.airbyte.com/integrations/destinations/bigquery
   githubIssueLabel: destination-bigquery

airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryDestination.java

Lines changed: 20 additions & 11 deletions
@@ -61,6 +61,7 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.UUID;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
@@ -233,9 +234,11 @@ public SerializedAirbyteMessageConsumer getSerializedMessageConsumer(final JsonN
     final boolean disableTypeDedupe = BigQueryUtils.getDisableTypeDedupFlag(config);
     final String datasetLocation = BigQueryUtils.getDatasetLocation(config);
     final BigQuerySqlGenerator sqlGenerator = new BigQuerySqlGenerator(config.get(BigQueryConsts.CONFIG_PROJECT_ID).asText(), datasetLocation);
-    final ParsedCatalog parsedCatalog = parseCatalog(config, catalog, datasetLocation);
+    final Optional<String> rawNamespaceOverride = TypingAndDedupingFlag.getRawNamespaceOverride(RAW_DATA_DATASET);
+    final ParsedCatalog parsedCatalog = parseCatalog(config, catalog, datasetLocation, rawNamespaceOverride);
     final BigQuery bigquery = getBigQuery(config);
-    final TyperDeduper typerDeduper = buildTyperDeduper(sqlGenerator, parsedCatalog, bigquery, datasetLocation, disableTypeDedupe);
+    final TyperDeduper typerDeduper =
+        buildTyperDeduper(sqlGenerator, parsedCatalog, bigquery, datasetLocation, disableTypeDedupe);
 
     AirbyteExceptionHandler.addAllStringsInConfigForDeinterpolation(config);
     final JsonNode serviceAccountKey = config.get(BigQueryConsts.CONFIG_CREDS);
@@ -360,7 +363,6 @@ private SerializedAirbyteMessageConsumer getStandardRecordConsumer(final BigQuer
                                                                      final Consumer<AirbyteMessage> outputRecordCollector,
                                                                      final TyperDeduper typerDeduper)
       throws Exception {
-    typerDeduper.prepareTables();
     final Supplier<ConcurrentMap<AirbyteStreamNameNamespacePair, AbstractBigQueryUploader<?>>> writeConfigs = getUploaderMap(
         bigquery,
         config,
@@ -372,6 +374,8 @@ private SerializedAirbyteMessageConsumer getStandardRecordConsumer(final BigQuer
     return new BigQueryRecordStandardConsumer(
         outputRecordCollector,
         () -> {
+          typerDeduper.prepareSchemasAndRunMigrations();
+
           // Set up our raw tables
           writeConfigs.get().forEach((streamId, uploader) -> {
             final StreamConfig stream = parsedCatalog.getStream(streamId);
@@ -390,6 +394,8 @@ private SerializedAirbyteMessageConsumer getStandardRecordConsumer(final BigQuer
               uploader.createRawTable();
             }
           });
+
+          typerDeduper.prepareFinalTables();
         },
         (hasFailed, streamSyncSummaries) -> {
           try {
@@ -424,11 +430,13 @@ private void setDefaultStreamNamespace(final ConfiguredAirbyteCatalog catalog, f
     }
   }
 
-  private ParsedCatalog parseCatalog(final JsonNode config, final ConfiguredAirbyteCatalog catalog, final String datasetLocation) {
+  private ParsedCatalog parseCatalog(final JsonNode config,
+                                     final ConfiguredAirbyteCatalog catalog,
+                                     final String datasetLocation,
+                                     final Optional<String> rawNamespaceOverride) {
     final BigQuerySqlGenerator sqlGenerator = new BigQuerySqlGenerator(config.get(BigQueryConsts.CONFIG_PROJECT_ID).asText(), datasetLocation);
-    final CatalogParser catalogParser = TypingAndDedupingFlag.getRawNamespaceOverride(RAW_DATA_DATASET).isPresent()
-        ? new CatalogParser(sqlGenerator, TypingAndDedupingFlag.getRawNamespaceOverride(RAW_DATA_DATASET).get())
-        : new CatalogParser(sqlGenerator);
+    final CatalogParser catalogParser = rawNamespaceOverride.map(s -> new CatalogParser(sqlGenerator, s))
+        .orElseGet(() -> new CatalogParser(sqlGenerator));
 
     return catalogParser.parseCatalog(catalog);
   }
@@ -440,11 +448,13 @@ private TyperDeduper buildTyperDeduper(final BigQuerySqlGenerator sqlGenerator,
                                          final boolean disableTypeDedupe) {
     final BigQueryV1V2Migrator migrator = new BigQueryV1V2Migrator(bigquery, namingResolver);
     final BigQueryV2TableMigrator v2RawTableMigrator = new BigQueryV2TableMigrator(bigquery);
-    final BigQueryDestinationHandler destinationHandler = new BigQueryDestinationHandler(bigquery, datasetLocation);
+    final BigQueryDestinationHandler destinationHandler = new BigQueryDestinationHandler(
+        bigquery,
+        datasetLocation);
 
     if (disableTypeDedupe) {
       return new NoOpTyperDeduperWithV1V2Migrations<>(
-          sqlGenerator, destinationHandler, parsedCatalog, migrator, v2RawTableMigrator, 8);
+          sqlGenerator, destinationHandler, parsedCatalog, migrator, v2RawTableMigrator, List.of());
     }
 
     return new DefaultTyperDeduper<>(
@@ -453,8 +463,7 @@ private TyperDeduper buildTyperDeduper(final BigQuerySqlGenerator sqlGenerator,
         parsedCatalog,
         migrator,
         v2RawTableMigrator,
-        8);
-
+        List.of());
   }
 
   @Override
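
The substantive change in this file is that the old single typerDeduper.prepareTables() call is split into two phases that bracket raw-table creation, and the typer-deduper constructors now take a list of migrations (List.of()) where they previously took a thread count (8). A minimal sketch of the resulting start-up ordering, using only the calls visible in this diff (the surrounding scaffolding is condensed and assumed, not the connector's exact code):

    // Phase 1: create datasets/schemas and run one-time migrations
    // (e.g. the v1 -> v2 raw table migration) before touching per-stream tables.
    typerDeduper.prepareSchemasAndRunMigrations();

    // Phase 2: the connector creates (or truncates) its raw tables.
    writeConfigs.get().forEach((streamId, uploader) -> uploader.createRawTable());

    // Phase 3: prepare the typed final tables only after the raw tables exist.
    typerDeduper.prepareFinalTables();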

airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/BigQueryStagingConsumerFactory.java

Lines changed: 4 additions & 1 deletion
@@ -135,7 +135,8 @@ private OnStartFunction onStartFunction(final BigQueryStagingOperations bigQuery
                                           final TyperDeduper typerDeduper) {
     return () -> {
       LOGGER.info("Preparing airbyte_raw tables in destination started for {} streams", writeConfigs.size());
-      typerDeduper.prepareTables();
+      typerDeduper.prepareSchemasAndRunMigrations();
+
       for (final BigQueryWriteConfig writeConfig : writeConfigs.values()) {
         LOGGER.info("Preparing staging are in destination for schema: {}, stream: {}, target table: {}, stage: {}",
             writeConfig.tableSchema(), writeConfig.streamName(), writeConfig.targetTableId(), writeConfig.streamName());
@@ -156,6 +157,8 @@ private OnStartFunction onStartFunction(final BigQueryStagingOperations bigQuery
           bigQueryGcsOperations.truncateTableIfExists(rawDatasetId, writeConfig.targetTableId(), writeConfig.tableSchema());
         }
       }
+
+      typerDeduper.prepareFinalTables();
       LOGGER.info("Preparing tables in destination completed.");
     };
   }
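
The GCS staging path picks up the same two-phase split. Condensed, the new onStartFunction body follows this shape (a sketch abbreviated from the diff above; the loop body is elided):

    return () -> {
      // Dataset/schema creation and migrations run once, up front.
      typerDeduper.prepareSchemasAndRunMigrations();

      for (final BigQueryWriteConfig writeConfig : writeConfigs.values()) {
        // ... create staging areas and raw tables, truncating raw tables
        // for overwrite syncs ...
      }

      // Typed final tables are prepared last, after the raw tables are settled.
      typerDeduper.prepareFinalTables();
    };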

airbyte-integrations/connectors/destination-bigquery/src/main/java/io/airbyte/integrations/destination/bigquery/typing_deduping/BigQueryDestinationHandler.java

Lines changed: 165 additions & 17 deletions
@@ -4,8 +4,17 @@
 
 package io.airbyte.integrations.destination.bigquery.typing_deduping;
 
+import static io.airbyte.integrations.base.destination.typing_deduping.CollectionUtils.containsAllIgnoreCase;
+import static io.airbyte.integrations.base.destination.typing_deduping.CollectionUtils.containsIgnoreCase;
+import static io.airbyte.integrations.base.destination.typing_deduping.CollectionUtils.matchingKey;
+import static io.airbyte.integrations.destination.bigquery.typing_deduping.BigQuerySqlGenerator.QUOTE;
+import static io.airbyte.integrations.destination.bigquery.typing_deduping.BigQuerySqlGenerator.clusteringColumns;
+import static io.airbyte.integrations.destination.bigquery.typing_deduping.BigQuerySqlGenerator.toDialectType;
+import static java.util.stream.Collectors.toMap;
+
 import com.google.cloud.bigquery.BigQuery;
 import com.google.cloud.bigquery.BigQueryException;
+import com.google.cloud.bigquery.Field;
 import com.google.cloud.bigquery.FieldValue;
 import com.google.cloud.bigquery.Job;
 import com.google.cloud.bigquery.JobConfiguration;
@@ -14,28 +23,46 @@
 import com.google.cloud.bigquery.JobStatistics;
 import com.google.cloud.bigquery.JobStatus;
 import com.google.cloud.bigquery.QueryJobConfiguration;
+import com.google.cloud.bigquery.StandardSQLTypeName;
+import com.google.cloud.bigquery.StandardTableDefinition;
 import com.google.cloud.bigquery.Table;
 import com.google.cloud.bigquery.TableDefinition;
 import com.google.cloud.bigquery.TableId;
+import com.google.cloud.bigquery.TimePartitioning;
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Streams;
 import io.airbyte.cdk.integrations.base.AirbyteExceptionHandler;
+import io.airbyte.cdk.integrations.base.JavaBaseConstants;
+import io.airbyte.integrations.base.destination.typing_deduping.AlterTableReport;
+import io.airbyte.integrations.base.destination.typing_deduping.ColumnId;
 import io.airbyte.integrations.base.destination.typing_deduping.DestinationHandler;
+import io.airbyte.integrations.base.destination.typing_deduping.DestinationInitialStatus;
+import io.airbyte.integrations.base.destination.typing_deduping.InitialRawTableStatus;
 import io.airbyte.integrations.base.destination.typing_deduping.Sql;
+import io.airbyte.integrations.base.destination.typing_deduping.StreamConfig;
 import io.airbyte.integrations.base.destination.typing_deduping.StreamId;
+import io.airbyte.integrations.base.destination.typing_deduping.TableNotMigratedException;
+import io.airbyte.integrations.base.destination.typing_deduping.migrators.MinimumDestinationState;
+import io.airbyte.integrations.base.destination.typing_deduping.migrators.MinimumDestinationState.Impl;
 import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.Comparator;
-import java.util.LinkedHashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
+import java.util.Set;
 import java.util.UUID;
+import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.commons.text.StringSubstitutor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 // TODO this stuff almost definitely exists somewhere else in our codebase.
-public class BigQueryDestinationHandler implements DestinationHandler<TableDefinition> {
+public class BigQueryDestinationHandler implements DestinationHandler<MinimumDestinationState.Impl> {
 
   private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryDestinationHandler.class);
 
@@ -47,32 +74,24 @@ public BigQueryDestinationHandler(final BigQuery bq, final String datasetLocatio
     this.datasetLocation = datasetLocation;
   }
 
-  @Override
   public Optional<TableDefinition> findExistingTable(final StreamId id) {
     final Table table = bq.getTable(id.finalNamespace(), id.finalName());
     return Optional.ofNullable(table).map(Table::getDefinition);
   }
 
-  @Override
-  public LinkedHashMap<String, TableDefinition> findExistingFinalTables(List<StreamId> streamIds) throws Exception {
-    return null;
-  }
-
-  @Override
   public boolean isFinalTableEmpty(final StreamId id) {
     return BigInteger.ZERO.equals(bq.getTable(TableId.of(id.finalNamespace(), id.finalName())).getNumRows());
   }
 
-  @Override
-  public InitialRawTableState getInitialRawTableState(final StreamId id) throws Exception {
+  public InitialRawTableStatus getInitialRawTableState(final StreamId id) throws Exception {
     final Table rawTable = bq.getTable(TableId.of(id.rawNamespace(), id.rawName()));
     if (rawTable == null) {
       // Table doesn't exist. There are no unprocessed records, and no timestamp.
-      return new InitialRawTableState(false, Optional.empty());
+      return new InitialRawTableStatus(false, false, Optional.empty());
     }
 
     final FieldValue unloadedRecordTimestamp = bq.query(QueryJobConfiguration.newBuilder(new StringSubstitutor(Map.of(
-        "raw_table", id.rawTableId(BigQuerySqlGenerator.QUOTE))).replace(
+        "raw_table", id.rawTableId(QUOTE))).replace(
         // bigquery timestamps have microsecond precision
         """
         SELECT TIMESTAMP_SUB(MIN(_airbyte_extracted_at), INTERVAL 1 MICROSECOND)
@@ -84,11 +103,11 @@ SELECT TIMESTAMP_SUB(MIN(_airbyte_extracted_at), INTERVAL 1 MICROSECOND)
     // If it's not null, then we can return immediately - we've found some unprocessed records and their
     // timestamp.
     if (!unloadedRecordTimestamp.isNull()) {
-      return new InitialRawTableState(true, Optional.of(unloadedRecordTimestamp.getTimestampInstant()));
+      return new InitialRawTableStatus(true, true, Optional.of(unloadedRecordTimestamp.getTimestampInstant()));
     }
 
     final FieldValue loadedRecordTimestamp = bq.query(QueryJobConfiguration.newBuilder(new StringSubstitutor(Map.of(
-        "raw_table", id.rawTableId(BigQuerySqlGenerator.QUOTE))).replace(
+        "raw_table", id.rawTableId(QUOTE))).replace(
         """
         SELECT MAX(_airbyte_extracted_at)
         FROM ${raw_table}
@@ -98,10 +117,10 @@
     // So we just need to get the timestamp of the most recent record.
     if (loadedRecordTimestamp.isNull()) {
       // Null timestamp because the table is empty. T+D can process the entire raw table during this sync.
-      return new InitialRawTableState(false, Optional.empty());
+      return new InitialRawTableStatus(true, false, Optional.empty());
     } else {
       // The raw table already has some records. T+D can skip all records with timestamp <= this value.
-      return new InitialRawTableState(false, Optional.of(loadedRecordTimestamp.getTimestampInstant()));
+      return new InitialRawTableStatus(true, false, Optional.of(loadedRecordTimestamp.getTimestampInstant()));
     }
   }
 
@@ -172,4 +191,133 @@ public void execute(final Sql sql) throws InterruptedException {
     }
   }
 
+  @Override
+  public List<DestinationInitialStatus<Impl>> gatherInitialState(List<StreamConfig> streamConfigs) throws Exception {
+    final List<DestinationInitialStatus<MinimumDestinationState.Impl>> initialStates = new ArrayList<>();
+    for (final StreamConfig streamConfig : streamConfigs) {
+      final StreamId id = streamConfig.id();
+      final Optional<TableDefinition> finalTable = findExistingTable(id);
+      final InitialRawTableStatus rawTableState = getInitialRawTableState(id);
+      initialStates.add(new DestinationInitialStatus<>(
+          streamConfig,
+          finalTable.isPresent(),
+          rawTableState,
+          finalTable.isPresent() && !existingSchemaMatchesStreamConfig(streamConfig, finalTable.get()),
+          finalTable.isEmpty() || isFinalTableEmpty(id),
+          // Return a default state blob since we don't actually track state.
+          new MinimumDestinationState.Impl(false)));
+    }
+    return initialStates;
+  }
+
+  @Override
+  public void commitDestinationStates(Map<StreamId, MinimumDestinationState.Impl> destinationStates) throws Exception {
+    // Intentionally do nothing. Bigquery doesn't actually support destination states.
+  }
+
+  private boolean existingSchemaMatchesStreamConfig(final StreamConfig stream,
+                                                    final TableDefinition existingTable)
+      throws TableNotMigratedException {
+    final var alterTableReport = buildAlterTableReport(stream, existingTable);
+    boolean tableClusteringMatches = false;
+    boolean tablePartitioningMatches = false;
+    if (existingTable instanceof final StandardTableDefinition standardExistingTable) {
+      tableClusteringMatches = clusteringMatches(stream, standardExistingTable);
+      tablePartitioningMatches = partitioningMatches(standardExistingTable);
+    }
+    LOGGER.info("Alter Table Report {} {} {}; Clustering {}; Partitioning {}",
+        alterTableReport.columnsToAdd(),
+        alterTableReport.columnsToRemove(),
+        alterTableReport.columnsToChangeType(),
+        tableClusteringMatches,
+        tablePartitioningMatches);
+
+    return alterTableReport.isNoOp() && tableClusteringMatches && tablePartitioningMatches;
+  }
+
+  public AlterTableReport buildAlterTableReport(final StreamConfig stream, final TableDefinition existingTable) {
+    final Set<String> pks = getPks(stream);
+
+    final Map<String, StandardSQLTypeName> streamSchema = stream.columns().entrySet().stream()
+        .collect(toMap(
+            entry -> entry.getKey().name(),
+            entry -> toDialectType(entry.getValue())));
+
+    final Map<String, StandardSQLTypeName> existingSchema = existingTable.getSchema().getFields().stream()
+        .collect(toMap(
+            field -> field.getName(),
+            field -> field.getType().getStandardType()));
+
+    // Columns in the StreamConfig that don't exist in the TableDefinition
+    final Set<String> columnsToAdd = streamSchema.keySet().stream()
+        .filter(name -> !containsIgnoreCase(existingSchema.keySet(), name))
+        .collect(Collectors.toSet());
+
+    // Columns in the current schema that are no longer in the StreamConfig
+    final Set<String> columnsToRemove = existingSchema.keySet().stream()
+        .filter(name -> !containsIgnoreCase(streamSchema.keySet(), name) && !containsIgnoreCase(
+            JavaBaseConstants.V2_FINAL_TABLE_METADATA_COLUMNS, name))
+        .collect(Collectors.toSet());
+
+    // Columns that are typed differently than the StreamConfig
+    final Set<String> columnsToChangeType = Stream.concat(
+        streamSchema.keySet().stream()
+            // If it's not in the existing schema, it should already be in the columnsToAdd Set
+            .filter(name -> {
+              // Big Query Columns are case-insensitive, first find the correctly cased key if it exists
+              return matchingKey(existingSchema.keySet(), name)
+                  // if it does exist, only include it in this set if the type (the value in each respective map)
+                  // is different between the stream and existing schemas
+                  .map(key -> !existingSchema.get(key).equals(streamSchema.get(name)))
+                  // if there is no matching key, then don't include it because it is probably already in columnsToAdd
+                  .orElse(false);
+            }),
+
+        // OR columns that used to have a non-null constraint and shouldn't
+        // (https://github.com/airbytehq/airbyte/pull/31082)
+        existingTable.getSchema().getFields().stream()
+            .filter(field -> pks.contains(field.getName()))
+            .filter(field -> field.getMode() == Field.Mode.REQUIRED)
+            .map(Field::getName))
+        .collect(Collectors.toSet());
+
+    final boolean isDestinationV2Format = schemaContainAllFinalTableV2AirbyteColumns(existingSchema.keySet());
+
+    return new AlterTableReport(columnsToAdd, columnsToRemove, columnsToChangeType, isDestinationV2Format);
+  }
+
+  @VisibleForTesting
+  public static boolean clusteringMatches(final StreamConfig stream, final StandardTableDefinition existingTable) {
+    return existingTable.getClustering() != null
+        && containsAllIgnoreCase(
+            new HashSet<>(existingTable.getClustering().getFields()),
+            clusteringColumns(stream));
+  }
+
+  @VisibleForTesting
+  public static boolean partitioningMatches(final StandardTableDefinition existingTable) {
+    return existingTable.getTimePartitioning() != null
+        && existingTable.getTimePartitioning()
+            .getField()
+            .equalsIgnoreCase("_airbyte_extracted_at")
+        && TimePartitioning.Type.DAY.equals(existingTable.getTimePartitioning().getType());
+  }
+
+  /**
+   * Checks the schema to determine whether the table contains all expected final table airbyte
+   * columns
+   *
+   * @param columnNames the column names of the schema to check
+   * @return whether all the {@link JavaBaseConstants#V2_FINAL_TABLE_METADATA_COLUMNS} are present
+   */
+  @VisibleForTesting
+  public static boolean schemaContainAllFinalTableV2AirbyteColumns(final Collection<String> columnNames) {
+    return JavaBaseConstants.V2_FINAL_TABLE_METADATA_COLUMNS.stream()
+        .allMatch(column -> containsIgnoreCase(columnNames, column));
+  }
+
+  private static Set<String> getPks(final StreamConfig stream) {
+    return stream.primaryKey() != null ? stream.primaryKey().stream().map(ColumnId::name).collect(Collectors.toSet()) : Collections.emptySet();
+  }
+
 }
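
The new gatherInitialState method is the batch replacement for the removed per-stream @Override lookups (findExistingFinalTables is gone entirely): for each stream it reports whether the final table exists, the raw table's status, whether the existing schema has drifted from the catalog, and whether the final table is empty, plus a placeholder state blob. As a rough illustration of how a caller might act on one of these DestinationInitialStatus objects, here is a hedged sketch; the accessor names (isFinalTablePresent(), isSchemaMismatch()) are assumptions for illustration, not confirmed CDK API:

    for (final DestinationInitialStatus<MinimumDestinationState.Impl> status
        : destinationHandler.gatherInitialState(streamConfigs)) {
      if (!status.isFinalTablePresent()) { // hypothetical accessor
        // No final table yet: create it from the stream's schema.
      } else if (status.isSchemaMismatch()) { // hypothetical accessor
        // Columns, clustering, or partitioning drifted from the catalog:
        // rebuild the final table, e.g. via a soft reset.
      }
    }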
