Commit 36e150c

perf: optimize according to review comments on airbytehq#18836:
1. Create the temp table in startTracked().
2. Extract the Iceberg Spark SQL operations out of IcebergConsumer and IcebergCatalogConfig into IcebergOperations.
3. In check(), create a temp table, write a row, read it back, and finally drop the table.
1 parent: f53bbb3

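The extracted IcebergOperations class is one of the five changed files but does not appear in this excerpt. As a reading aid, here is a minimal Java sketch of its API surface, inferred solely from the call sites visible in the two diffs below; every name and signature here is an assumption, and the real class is presumably a concrete wrapper around a SparkSession rather than an interface.

// Hypothetical sketch, inferred from call sites only; see IcebergOperations.java
// in the full commit for the real definition.
package io.airbyte.integrations.destination.iceberg;

import io.airbyte.integrations.destination.iceberg.config.catalog.IcebergCatalogConfig;
import java.util.List;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public interface IcebergOperations extends AutoCloseable {

  // replaces spark.sql("CREATE DATABASE IF NOT EXISTS " + namespace)
  void createDatabase(String database);

  // creates a table with Airbyte's raw columns (id, emitted_at, data)
  void createAirbyteRawTable(String fullTableName);

  // replaces spark.createDataFrame(rows, schema).write().mode(Append).saveAsTable(...)
  void appendRowsToTable(String fullTableName, List<Row> rows, String formatName);

  // replaces spark.sql("SELECT * FROM ...").write().mode(saveMode).saveAsTable(...)
  void copyFullTable(String sourceTableName, String targetTableName, SaveMode saveMode);

  // replaces icebergCatalog.dropTable(TableIdentifier.of(database, tableName), purge)
  void dropTable(String database, String tableName, boolean purge);

  // replaces SparkActions.get().rewriteDataFiles(...).option("target-file-size-bytes", ...)
  void compactTable(String database, String tableName, int targetFileSizeBytes);

  void showTableContent(String fullTableName, int limit);

  // check() casts the result to Row[], so the return type is left loose here
  Object collectTableContent(String fullTableName, int limit);

  // closes the underlying SparkSession
  @Override
  void close();

  class IcebergOperationsFactory {

    public IcebergOperations getInstance(IcebergCatalogConfig config, String sparkAppName) {
      throw new UnsupportedOperationException("sketch only");
    }
  }
}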
5 files changed: +282 −226 lines

airbyte-integrations/connectors/destination-iceberg/src/main/java/io/airbyte/integrations/destination/iceberg/IcebergConsumer.java

Lines changed: 37 additions & 60 deletions
@@ -4,19 +4,16 @@
 
 package io.airbyte.integrations.destination.iceberg;
 
-import static io.airbyte.integrations.base.JavaBaseConstants.COLUMN_NAME_AB_ID;
-import static io.airbyte.integrations.base.JavaBaseConstants.COLUMN_NAME_DATA;
-import static io.airbyte.integrations.base.JavaBaseConstants.COLUMN_NAME_EMITTED_AT;
 import static org.apache.logging.log4j.util.Strings.isNotBlank;
 
 import io.airbyte.commons.json.Jsons;
-import io.airbyte.integrations.base.AirbyteStreamNameNamespacePair;
 import io.airbyte.integrations.base.CommitOnStateAirbyteMessageConsumer;
 import io.airbyte.integrations.destination.iceberg.config.WriteConfig;
 import io.airbyte.integrations.destination.iceberg.config.catalog.IcebergCatalogConfig;
 import io.airbyte.protocol.models.AirbyteMessage;
 import io.airbyte.protocol.models.AirbyteMessage.Type;
 import io.airbyte.protocol.models.AirbyteRecordMessage;
+import io.airbyte.protocol.models.AirbyteStreamNameNamespacePair;
 import io.airbyte.protocol.models.ConfiguredAirbyteCatalog;
 import io.airbyte.protocol.models.ConfiguredAirbyteStream;
 import io.airbyte.protocol.models.DestinationSyncMode;
@@ -31,41 +28,30 @@
 import java.util.function.Consumer;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.iceberg.catalog.Catalog;
-import org.apache.iceberg.catalog.TableIdentifier;
-import org.apache.iceberg.spark.actions.SparkActions;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.catalyst.expressions.GenericRow;
-import org.apache.spark.sql.types.StringType$;
-import org.apache.spark.sql.types.StructType;
-import org.apache.spark.sql.types.TimestampType$;
 
 /**
  * @author Leibniz on 2022/10/26.
  */
 @Slf4j
 public class IcebergConsumer extends CommitOnStateAirbyteMessageConsumer {
 
-  private final SparkSession spark;
+  private final IcebergOperations operations;
   private final ConfiguredAirbyteCatalog catalog;
   private final IcebergCatalogConfig catalogConfig;
 
   private Map<AirbyteStreamNameNamespacePair, WriteConfig> writeConfigs;
 
-  private final StructType normalizationSchema;
-
-  public IcebergConsumer(SparkSession spark,
-                         Consumer<AirbyteMessage> outputRecordCollector,
-                         ConfiguredAirbyteCatalog catalog,
-                         IcebergCatalogConfig catalogConfig) {
+  public IcebergConsumer(IcebergOperations operations,
+                         Consumer<AirbyteMessage> outputRecordCollector,
+                         ConfiguredAirbyteCatalog catalog,
+                         IcebergCatalogConfig catalogConfig) {
     super(outputRecordCollector);
-    this.spark = spark;
+    this.operations = operations;
     this.catalog = catalog;
     this.catalogConfig = catalogConfig;
-    this.normalizationSchema = new StructType().add(COLUMN_NAME_AB_ID, StringType$.MODULE$)
-        .add(COLUMN_NAME_EMITTED_AT, TimestampType$.MODULE$)
-        .add(COLUMN_NAME_DATA, StringType$.MODULE$);
   }
 
   /**
@@ -77,31 +63,37 @@ protected void startTracked() throws Exception {
     Map<AirbyteStreamNameNamespacePair, WriteConfig> configs = new HashMap<>();
     Set<String> namespaceSet = new HashSet<>();
     for (final ConfiguredAirbyteStream stream : catalog.getStreams()) {
+      final DestinationSyncMode syncMode = stream.getDestinationSyncMode();
+      if (syncMode == null) {
+        throw new IllegalStateException("Undefined destination sync mode");
+      }
+      final boolean isAppendMode = syncMode != DestinationSyncMode.OVERWRITE;
+
       final String streamName = stream.getStream().getName().toLowerCase();
       String namespace = (isNotBlank(stream.getStream().getNamespace()) ? stream.getStream().getNamespace()
-          : catalogConfig.defaultOutputDatabase()).toLowerCase();
+        : catalogConfig.defaultOutputDatabase()).toLowerCase();
       if (!namespaceSet.contains(namespace)) {
         namespaceSet.add(namespace);
         try {
-          spark.sql("CREATE DATABASE IF NOT EXISTS " + namespace);
+          operations.createDatabase(namespace);
         } catch (Exception e) {
           log.warn("Create non-existed database failed: {}", e.getMessage(), e);
         }
       }
-      final DestinationSyncMode syncMode = stream.getDestinationSyncMode();
-      if (syncMode == null) {
-        throw new IllegalStateException("Undefined destination sync mode");
-      }
-      final boolean isAppendMode = syncMode != DestinationSyncMode.OVERWRITE;
-      AirbyteStreamNameNamespacePair nameNamespacePair = AirbyteStreamNameNamespacePair.fromAirbyteSteam(stream.getStream());
+      AirbyteStreamNameNamespacePair nameNamespacePair = AirbyteStreamNameNamespacePair.fromAirbyteStream(stream.getStream());
       Integer flushBatchSize = catalogConfig.getFormatConfig().getFlushBatchSize();
       WriteConfig writeConfig = new WriteConfig(namespace, streamName, isAppendMode, flushBatchSize);
       configs.put(nameNamespacePair, writeConfig);
+
+      // drop temp table
       try {
-        spark.sql("DROP TABLE IF EXISTS " + writeConfig.getFullTempTableName());
+        operations.dropTable(writeConfig.getNamespace(), writeConfig.getTempTableName(), true);
       } catch (Exception e) {
         log.warn("Drop existed temp table failed: {}", e.getMessage(), e);
       }
+
+      // create temp table; don't catch exceptions, let them throw and fail fast
+      operations.createAirbyteRawTable(writeConfig.getFullTempTableName());
     }
     this.writeConfigs = configs;
   }
@@ -143,11 +135,8 @@ private void appendToTempTable(WriteConfig writeConfig) {
     // saveAsTable even if rows is empty, to ensure table is created.
     // otherwise the table would be missing, and throws exception in close()
     log.info("=> Flushing {} rows into {}", rows.size(), tableName);
-    spark.createDataFrame(rows, normalizationSchema).write()
-        // append data to temp table
-        .mode(SaveMode.Append)
-        // TODO compression config
-        .option("write-format", catalogConfig.getFormatConfig().getFormat().getFormatName()).saveAsTable(tableName);
+    String formatName = catalogConfig.getFormatConfig().getFormat().getFormatName();
+    operations.appendRowsToTable(tableName, rows, formatName);
   }
 
   /**
@@ -168,15 +157,13 @@ protected void close(boolean hasFailed) throws Exception {
       String tempTableName = writeConfig.getFullTempTableName();
       String finalTableName = writeConfig.getFullTableName();
       log.info("=> Migration({}) data from {} to {}",
-          writeConfig.isAppendMode() ? "append" : "overwrite",
-          tempTableName,
-          finalTableName);
-      spark.sql("SELECT * FROM %s".formatted(tempTableName))
-          .write()
-          .mode(writeConfig.isAppendMode() ? SaveMode.Append : SaveMode.Overwrite)
-          .saveAsTable(finalTableName);
+        writeConfig.isAppendMode() ? "append" : "overwrite",
+        tempTableName,
+        finalTableName);
+      SaveMode saveMode = writeConfig.isAppendMode() ? SaveMode.Append : SaveMode.Overwrite;
+      operations.copyFullTable(tempTableName, finalTableName, saveMode);
       if (catalogConfig.getFormatConfig().isAutoCompact()) {
-        tryCompactTable(icebergCatalog, writeConfig);
+        tryCompactTable(writeConfig);
       }
     }
     log.info("==> Copy temp tables finished...");
@@ -186,38 +173,28 @@ protected void close(boolean hasFailed) throws Exception {
     } finally {
       log.info("Removing temp tables...");
       for (Entry<AirbyteStreamNameNamespacePair, WriteConfig> entry : writeConfigs.entrySet()) {
-        tryDropTempTable(icebergCatalog, entry.getValue());
+        tryDropTempTable(entry.getValue());
       }
       log.info("Closing Spark Session...");
-      this.spark.close();
+      operations.close();
       log.info("Finishing destination process...completed");
     }
   }
 
-  private void tryDropTempTable(Catalog icebergCatalog, WriteConfig writeConfig) {
+  private void tryDropTempTable(WriteConfig writeConfig) {
     try {
-      log.info("Trying to drop temp table: {}", writeConfig.getFullTempTableName());
-      TableIdentifier tempTableIdentifier = TableIdentifier.of(writeConfig.getNamespace(),
-          writeConfig.getTempTableName());
-      boolean dropSuccess = icebergCatalog.dropTable(tempTableIdentifier, true);
-      log.info("Drop temp table: {}", writeConfig.getFullTempTableName());
+      operations.dropTable(writeConfig.getNamespace(), writeConfig.getTempTableName(), true);
     } catch (Exception e) {
       String errMsg = e.getMessage();
       log.error("Drop temp table caught exception:{}", errMsg, e);
     }
   }
 
-  private void tryCompactTable(Catalog icebergCatalog, WriteConfig writeConfig) {
+  private void tryCompactTable(WriteConfig writeConfig) {
     log.info("=> Auto-Compact is enabled, try compact Iceberg data files");
-    int compactTargetFileSizeBytes =
-        catalogConfig.getFormatConfig().getCompactTargetFileSizeInMb() * 1024 * 1024;
+    int compactTargetFileSizeBytes = catalogConfig.getFormatConfig().getCompactTargetFileSizeInMb() * 1024 * 1024;
     try {
-      TableIdentifier tableIdentifier = TableIdentifier.of(writeConfig.getNamespace(),
-          writeConfig.getTableName());
-      SparkActions.get()
-          .rewriteDataFiles(icebergCatalog.loadTable(tableIdentifier))
-          .option("target-file-size-bytes", String.valueOf(compactTargetFileSizeBytes))
-          .execute();
+      operations.compactTable(writeConfig.getNamespace(), writeConfig.getTableName(), compactTargetFileSizeBytes);
     } catch (Exception e) {
       log.warn("Compact Iceberg data files failed: {}", e.getMessage(), e);
     }
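
The temp tables created in startTracked() keep Airbyte's raw-table shape, which can be read off the normalizationSchema deleted above: an id string, an emitted-at timestamp, and a data string. Below is a hedged sketch of the DDL that createAirbyteRawTable() plausibly issues; the column names assume Airbyte's standard JavaBaseConstants values (_airbyte_ab_id, _airbyte_emitted_at, _airbyte_data), and the actual statement lives in IcebergOperations, which is not shown here.

// Sketch only: DDL implied by the deleted normalizationSchema; the real
// implementation is in IcebergOperations.createAirbyteRawTable.
import org.apache.spark.sql.SparkSession;

class RawTableDdlSketch {

  static void createAirbyteRawTable(SparkSession spark, String fullTableName) {
    spark.sql("CREATE TABLE IF NOT EXISTS " + fullTableName + " ("
        + "_airbyte_ab_id STRING, "
        + "_airbyte_emitted_at TIMESTAMP, "
        + "_airbyte_data STRING)");
  }
}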

airbyte-integrations/connectors/destination-iceberg/src/main/java/io/airbyte/integrations/destination/iceberg/IcebergDestination.java

Lines changed: 71 additions & 29 deletions
@@ -4,38 +4,47 @@
 
 package io.airbyte.integrations.destination.iceberg;
 
+import static io.airbyte.integrations.destination.iceberg.config.format.DataFileFormat.PARQUET;
+
 import com.fasterxml.jackson.databind.JsonNode;
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import io.airbyte.commons.json.Jsons;
 import io.airbyte.integrations.BaseConnector;
 import io.airbyte.integrations.base.AirbyteMessageConsumer;
 import io.airbyte.integrations.base.Destination;
 import io.airbyte.integrations.base.IntegrationRunner;
+import io.airbyte.integrations.destination.iceberg.IcebergOperations.IcebergOperationsFactory;
 import io.airbyte.integrations.destination.iceberg.config.catalog.IcebergCatalogConfig;
 import io.airbyte.integrations.destination.iceberg.config.catalog.IcebergCatalogConfigFactory;
 import io.airbyte.protocol.models.AirbyteConnectionStatus;
 import io.airbyte.protocol.models.AirbyteConnectionStatus.Status;
 import io.airbyte.protocol.models.AirbyteMessage;
 import io.airbyte.protocol.models.ConfiguredAirbyteCatalog;
+import java.sql.Timestamp;
+import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.UUID;
 import java.util.function.Consumer;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.SparkSession.Builder;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.expressions.GenericRow;
 
 @Slf4j
 public class IcebergDestination extends BaseConnector implements Destination {
 
-  private final IcebergCatalogConfigFactory icebergCatalogConfigFactory;
+  private final IcebergCatalogConfigFactory configFactory = new IcebergCatalogConfigFactory();
+  private final IcebergOperationsFactory operationsFactory;
 
   public IcebergDestination() {
-    this.icebergCatalogConfigFactory = new IcebergCatalogConfigFactory();
+    this.operationsFactory = new IcebergOperationsFactory();
   }
 
   @VisibleForTesting
-  public IcebergDestination(IcebergCatalogConfigFactory icebergCatalogConfigFactory) {
-    this.icebergCatalogConfigFactory = Objects.requireNonNullElseGet(icebergCatalogConfigFactory,
-        IcebergCatalogConfigFactory::new);
+  public IcebergDestination(IcebergOperationsFactory operationsFactory) {
+    this.operationsFactory = Objects.requireNonNullElseGet(operationsFactory,
+        IcebergOperationsFactory::new);
   }
 
   public static void main(String[] args) throws Exception {
@@ -44,24 +53,63 @@ public static void main(String[] args) throws Exception {
 
   @Override
   public AirbyteConnectionStatus check(JsonNode config) {
+    // parse configuration
+    IcebergCatalogConfig icebergCatalogConfig;
     try {
-      IcebergCatalogConfig icebergCatalogConfig = icebergCatalogConfigFactory.fromJsonNodeConfig(config);
-      icebergCatalogConfig.check();
+      icebergCatalogConfig = configFactory.fromJsonNodeConfig(config);
+    } catch (Exception e) {
+      return handleCheckException(e, "parse the Iceberg configuration");
+    }
+
+    // check the configuration
+    String database = icebergCatalogConfig.defaultOutputDatabase();
+    String tempTableName = "temp_" + System.currentTimeMillis();
+    String tempTableFullName = "%s.`%s`.`%s`".formatted(IcebergConstants.CATALOG_NAME, database, tempTableName);
+    try (IcebergOperations operations = operationsFactory.getInstance(icebergCatalogConfig,
+        "Iceberg-Config-Check")) {
+      operations.createAirbyteRawTable(tempTableFullName);
+      Row tempRow = generateCheckRow(Map.of("testKey", "testValue"));
+      operations.appendRowsToTable(tempTableFullName, List.of(tempRow), PARQUET.getFormatName());
+      operations.showTableContent(tempTableFullName, 1);
+      var selectRows = (Row[]) operations.collectTableContent(tempTableFullName, 1);
+      operations.dropTable(database, tempTableName, true);
+
+      Preconditions.checkState(selectRows.length == 1,
+          "Size of temp table for checking is not 1: " + selectRows.length);
+      assertRowEquals(tempRow, selectRows[0]);
 
-      // getting here means Iceberg catalog check success
+      //getting here means Iceberg catalog check success
       return new AirbyteConnectionStatus().withStatus(Status.SUCCEEDED);
-    } catch (final Exception e) {
-      log.error("Exception attempting to access the Iceberg catalog: ", e);
-      Throwable rootCause = getRootCause(e);
-      String errMessage =
-          "Could not connect to the Iceberg catalog with the provided configuration. \n" + e.getMessage()
-              + ", root cause: " + rootCause.getClass().getSimpleName() + "(" + rootCause.getMessage() + ")";
-      return new AirbyteConnectionStatus()
-          .withStatus(AirbyteConnectionStatus.Status.FAILED)
-          .withMessage(errMessage);
+    } catch (Exception e) {
+      return handleCheckException(e, "access the Iceberg catalog");
+    }
+  }
+
+  private void assertRowEquals(Row expect, Row actual) {
+    Preconditions.checkState(expect.size() == actual.size(),
+        "Inserted row and selected Row has different size:" + expect.size() + " vs " + actual.size());
+    for (int i = 0; i < expect.size(); i++) {
+      Preconditions.checkState(expect.get(i).equals(actual.get(i)),
+          "Inserted row and selected Row has different value in column " + i + ": " + expect.get(i) + " vs "
+              + actual.get(i));
     }
   }
 
+  private Row generateCheckRow(Map<String, Object> data) {
+    String tempDataId = UUID.randomUUID().toString();
+    long tempDataTs = System.currentTimeMillis();
+    return new GenericRow(new Object[]{tempDataId, new Timestamp(tempDataTs), Jsons.serialize(data)});
+  }
+
+  private AirbyteConnectionStatus handleCheckException(Exception exception, String action) {
+    log.error("Exception attempting to {}: {}", action, exception.getMessage(), exception);
+    Throwable rootCause = getRootCause(exception);
+    String errMessage =
+        "Could not " + action + " with the provided configuration. \n" + exception.getMessage() + ", root cause: "
+            + rootCause.getClass().getSimpleName() + "(" + rootCause.getMessage() + ")";
+    return new AirbyteConnectionStatus().withStatus(Status.FAILED).withMessage(errMessage);
+  }
+
   private Throwable getRootCause(Throwable exp) {
     Throwable curCause = exp.getCause();
     if (curCause == null) {
@@ -75,16 +123,10 @@ private Throwable getRootCause(Throwable exp) {
   public AirbyteMessageConsumer getConsumer(JsonNode config,
                                             ConfiguredAirbyteCatalog catalog,
                                             Consumer<AirbyteMessage> outputRecordCollector) {
-    final IcebergCatalogConfig icebergCatalogConfig = this.icebergCatalogConfigFactory.fromJsonNodeConfig(config);
-    Map<String, String> sparkConfMap = icebergCatalogConfig.sparkConfigMap();
-
-    Builder sparkBuilder = SparkSession.builder()
-        .master("local")
-        .appName("Airbyte->Iceberg-" + System.currentTimeMillis());
-    sparkConfMap.forEach(sparkBuilder::config);
-    SparkSession spark = sparkBuilder.getOrCreate();
-
-    return new IcebergConsumer(spark, outputRecordCollector, catalog, icebergCatalogConfig);
+    final IcebergCatalogConfig icebergCatalogConfig = this.configFactory.fromJsonNodeConfig(config);
+    String sparkAppName = "Airbyte->Iceberg-" + System.currentTimeMillis();
+    IcebergOperations operations = this.operationsFactory.getInstance(icebergCatalogConfig, sparkAppName);
+    return new IcebergConsumer(operations, outputRecordCollector, catalog, icebergCatalogConfig);
   }
 
 }
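
The SparkSession bootstrap deleted from getConsumer() presumably moves into IcebergOperationsFactory.getInstance(), which now serves both check() and getConsumer(). A minimal sketch of that bootstrap, reconstructed from the deleted lines above; the method name and class placement are assumptions.

// Sketch reconstructed from the code this commit removes from getConsumer();
// the real factory lives in IcebergOperations.java, not shown in this excerpt.
import java.util.Map;
import org.apache.spark.sql.SparkSession;

class SparkSessionBootstrapSketch {

  static SparkSession build(Map<String, String> sparkConfMap, String appName) {
    SparkSession.Builder builder = SparkSession.builder()
        .master("local")                   // same local master as before
        .appName(appName);                 // e.g. "Airbyte->Iceberg-<ts>" or "Iceberg-Config-Check"
    sparkConfMap.forEach(builder::config); // catalog-derived Spark config entries
    return builder.getOrCreate();
  }
}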
