🎉 Destination snowflake: reduce memory consumption #10297


Merged (10 commits) on Feb 15, 2022
BufferedStreamConsumer.java
@@ -23,10 +23,8 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
 import java.util.Set;
 import java.util.function.Consumer;
-import java.util.stream.Collectors;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -81,13 +79,13 @@ public class BufferedStreamConsumer extends FailureTrackingAirbyteMessageConsumer
   private final RecordWriter recordWriter;
   private final CheckedConsumer<Boolean, Exception> onClose;
   private final Set<AirbyteStreamNameNamespacePair> streamNames;
-  private final List<AirbyteMessage> buffer;
   private final ConfiguredAirbyteCatalog catalog;
   private final CheckedFunction<JsonNode, Boolean, Exception> isValidRecord;
-  private final Map<AirbyteStreamNameNamespacePair, Long> pairToIgnoredRecordCount;
+  private final Map<AirbyteStreamNameNamespacePair, Long> streamToIgnoredRecordCount;
   private final Consumer<AirbyteMessage> outputRecordCollector;
   private final long maxQueueSizeInBytes;
   private long bufferSizeInBytes;
+  private Map<AirbyteStreamNameNamespacePair, List<AirbyteRecordMessage>> streamBuffer;

   private boolean hasStarted;
   private boolean hasClosed;
@@ -112,9 +110,9 @@ public BufferedStreamConsumer(final Consumer<AirbyteMessage> outputRecordCollector,
     this.catalog = catalog;
     this.streamNames = AirbyteStreamNameNamespacePair.fromConfiguredCatalog(catalog);
     this.isValidRecord = isValidRecord;
-    this.buffer = new ArrayList<>(10_000);
     this.bufferSizeInBytes = 0;
-    this.pairToIgnoredRecordCount = new HashMap<>();
+    this.streamToIgnoredRecordCount = new HashMap<>();
+    this.streamBuffer = new HashMap<>();
   }

   @Override
@@ -123,7 +121,7 @@ protected void startTracked() throws Exception {
     Preconditions.checkState(!hasStarted, "Consumer has already been started.");
     hasStarted = true;

-    pairToIgnoredRecordCount.clear();
+    streamToIgnoredRecordCount.clear();
     LOGGER.info("{} started.", BufferedStreamConsumer.class);

     onStart.call();
@@ -141,7 +139,7 @@ protected void acceptTracked(final AirbyteMessage message) throws Exception {
       }

       if (!isValidRecord.apply(message.getRecord().getData())) {
-        pairToIgnoredRecordCount.put(stream, pairToIgnoredRecordCount.getOrDefault(stream, 0L) + 1L);
+        streamToIgnoredRecordCount.put(stream, streamToIgnoredRecordCount.getOrDefault(stream, 0L) + 1L);
         return;
       }

@@ -151,15 +149,12 @@ protected void acceptTracked(final AirbyteMessage message) throws Exception {
       final long messageSizeInBytes = ByteUtils.getSizeInBytesForUTF8CharSet(Jsons.serialize(recordMessage.getData()));
       if (bufferSizeInBytes + messageSizeInBytes > maxQueueSizeInBytes) {
         LOGGER.info("Flushing buffer...");
-        AirbyteSentry.executeWithTracing("FlushBuffer",
-            this::flushQueueToDestination,
-            Map.of("stream", stream.getName(),
-                "namespace", Objects.requireNonNullElse(stream.getNamespace(), "null"),
-                "bufferSizeInBytes", bufferSizeInBytes));
+        flushQueueToDestination(bufferSizeInBytes);
         bufferSizeInBytes = 0;
       }

-      buffer.add(message);
+      final List<AirbyteRecordMessage> bufferedRecords = streamBuffer.computeIfAbsent(stream, k -> new ArrayList<>());
+      bufferedRecords.add(message.getRecord());
       bufferSizeInBytes += messageSizeInBytes;

     } else if (message.getType() == Type.STATE) {
@@ -170,16 +165,13 @@ protected void acceptTracked(final AirbyteMessage message) throws Exception {

   }

-  private void flushQueueToDestination() throws Exception {
-    final Map<AirbyteStreamNameNamespacePair, List<AirbyteRecordMessage>> recordsByStream = buffer.stream()
-        .map(AirbyteMessage::getRecord)
-        .collect(Collectors.groupingBy(AirbyteStreamNameNamespacePair::fromRecordMessage));
-
-    buffer.clear();
-
-    for (final Map.Entry<AirbyteStreamNameNamespacePair, List<AirbyteRecordMessage>> entry : recordsByStream.entrySet()) {
-      recordWriter.accept(entry.getKey(), entry.getValue());
-    }
+  private void flushQueueToDestination(long bufferSizeInBytes) throws Exception {
+    AirbyteSentry.executeWithTracing("FlushBuffer", () -> {
+      for (final Map.Entry<AirbyteStreamNameNamespacePair, List<AirbyteRecordMessage>> entry : streamBuffer.entrySet()) {
+        recordWriter.accept(entry.getKey(), entry.getValue());
+      }
+    }, Map.of("bufferSizeInBytes", bufferSizeInBytes));
+    streamBuffer = new HashMap<>();

     if (pendingState != null) {
       lastFlushedState = pendingState;
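The two state fields seen here implement the usual checkpointing rule: a state message only becomes safe to report once every record received before it has actually been written. A minimal sketch of that contract, with a hypothetical StateMessage stand-in (the real consumer holds AirbyteMessage values):

```java
// Simplified sketch of the pendingState/lastFlushedState handshake above.
class StateCheckpointer {

  record StateMessage(String payload) {} // stand-in for AirbyteMessage

  private StateMessage pendingState;     // newest state seen; records before it may still be buffered
  private StateMessage lastFlushedState; // newest state whose preceding records are all persisted

  void onState(final StateMessage state) {
    pendingState = state;
  }

  // Called after the buffer has been handed to the destination successfully.
  void onFlushCompleted() {
    if (pendingState != null) {
      lastFlushedState = pendingState;
      pendingState = null;
    }
  }
}
```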
@@ -199,13 +191,13 @@ protected void close(final boolean hasFailed) throws Exception {
     Preconditions.checkState(!hasClosed, "Has already closed.");
     hasClosed = true;

-    pairToIgnoredRecordCount
-        .forEach((pair, count) -> LOGGER.warn("A total of {} record(s) of data from stream {} were invalid and were ignored.", count, pair));
+    streamToIgnoredRecordCount.forEach((pair, count) ->
+        LOGGER.warn("A total of {} record(s) of data from stream {} were invalid and were ignored.", count, pair));
     if (hasFailed) {
       LOGGER.error("executing on failed close procedure.");
     } else {
       LOGGER.info("executing on success close procedure.");
-      flushQueueToDestination();
+      flushQueueToDestination(bufferSizeInBytes);
     }

     try {
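Taken together, this file's change replaces a single List<AirbyteMessage> that was re-grouped by stream at flush time (the old Collectors.groupingBy pass, which briefly held a second grouped copy of every buffered record) with a map that groups records as they arrive, and it buffers only the record payload rather than the whole message wrapper. A self-contained sketch of that pattern, using simplified stand-in types (StreamId and String records) rather than Airbyte's real classes:

```java
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Records are grouped per stream on arrival, so flushing just iterates the
// map; no second grouping pass (and no second copy of the data) is needed.
class GroupedBuffer {

  record StreamId(String namespace, String name) {}

  private final long maxBufferSizeInBytes;
  private long bufferSizeInBytes = 0;
  private Map<StreamId, List<String>> streamBuffer = new HashMap<>();

  GroupedBuffer(final long maxBufferSizeInBytes) {
    this.maxBufferSizeInBytes = maxBufferSizeInBytes;
  }

  void accept(final StreamId stream, final String serializedRecord) {
    final long size = serializedRecord.getBytes(StandardCharsets.UTF_8).length;
    if (bufferSizeInBytes + size > maxBufferSizeInBytes) {
      flush();
    }
    streamBuffer.computeIfAbsent(stream, k -> new ArrayList<>()).add(serializedRecord);
    bufferSizeInBytes += size;
  }

  void flush() {
    for (final Map.Entry<StreamId, List<String>> entry : streamBuffer.entrySet()) {
      writeBatch(entry.getKey(), entry.getValue());
    }
    // Replacing the map (rather than clearing each list) drops every
    // reference at once, letting flushed batches be garbage-collected.
    streamBuffer = new HashMap<>();
    bufferSizeInBytes = 0;
  }

  void writeBatch(final StreamId stream, final List<String> records) {
    // stand-in for RecordWriter.accept(pair, records)
  }
}
```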
DataAdapter.java
@@ -8,13 +8,9 @@
 import com.fasterxml.jackson.databind.node.ObjectNode;
 import java.util.function.Function;
 import java.util.function.Predicate;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 public class DataAdapter {

-  private static final Logger LOGGER = LoggerFactory.getLogger(DataAdapter.class);
-
   private final Predicate<JsonNode> filterValueNode;
   private final Function<JsonNode, JsonNode> valueNodeAdapter;
JdbcSqlOperations.java
@@ -19,15 +19,13 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.UUID;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVPrinter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 public abstract class JdbcSqlOperations implements SqlOperations {

-  private static final Logger LOGGER = LoggerFactory.getLogger(JdbcSqlOperations.class);
   protected static final String SHOW_SCHEMAS = "show schemas;";
   protected static final String NAME = "name";
@@ -63,21 +61,14 @@ public String createTableQuery(final JdbcDatabase database, final String schemaName,
   }

   protected void writeBatchToFile(final File tmpFile, final List<AirbyteRecordMessage> records) throws Exception {
-    PrintWriter writer = null;
-    try {
-      writer = new PrintWriter(tmpFile, StandardCharsets.UTF_8);
-      final var csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT);
-
+    try (final PrintWriter writer = new PrintWriter(tmpFile, StandardCharsets.UTF_8);
+        final CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT)) {
       for (final AirbyteRecordMessage record : records) {
         final var uuid = UUID.randomUUID().toString();
         final var jsonData = Jsons.serialize(formatData(record.getData()));
         final var emittedAt = Timestamp.from(Instant.ofEpochMilli(record.getEmittedAt()));
         csvPrinter.printRecord(uuid, jsonData, emittedAt);
       }
-    } finally {
-      if (writer != null) {
-        writer.close();
-      }
     }
   }
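The rewritten writeBatchToFile relies on try-with-resources: resources declared in the header are closed in reverse declaration order even when the loop body throws, which removes the null-checked finally block and also closes the CSVPrinter, something the old code never did. A minimal standalone illustration of the shape (the column values are made-up placeholders):

```java
import java.io.File;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;

public class CsvWriteSketch {

  public static void main(final String[] args) throws Exception {
    final File tmpFile = File.createTempFile("records", ".csv");
    // Both resources are closed automatically (printer first, then writer),
    // even if printRecord throws mid-loop.
    try (final PrintWriter writer = new PrintWriter(tmpFile, StandardCharsets.UTF_8);
        final CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT)) {
      csvPrinter.printRecord("some-uuid", "{\"col\":1}", "2022-02-15T00:00:00Z");
    }
  }
}
```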

@@ -137,7 +128,8 @@ public final void insertRecords(final JdbcDatabase database,
       throws Exception {
     AirbyteSentry.executeWithTracing("InsertRecords",
         () -> {
-          records.forEach(airbyteRecordMessage -> getDataAdapter().adapt(airbyteRecordMessage.getData()));

[Review thread on the removed line above]
Contributor (author): This is bad. For every record message, a new data adapter object is created (in the getDataAdapter method).
@edgao (Feb 15, 2022): obviously this is already way better, but could we just dump the data adapter into a field rather than constructing one per record batch?
Contributor (author): Ah, good point! Done.

+          final Optional<DataAdapter> dataAdapter = getDataAdapter();
+          dataAdapter.ifPresent(adapter -> records.forEach(airbyteRecordMessage -> adapter.adapt(airbyteRecordMessage.getData())));
           insertRecordsInternal(database, records, schemaName, tableName);
         },
         Map.of("schema", Objects.requireNonNullElse(schemaName, "null"), "table", tableName, "recordCount", records.size()));
@@ -149,8 +141,8 @@ protected abstract void insertRecordsInternal(JdbcDatabase database,
                                                  String tableName)
       throws Exception;

-  protected DataAdapter getDataAdapter() {
-    return new DataAdapter(j -> false, c -> c);
+  protected Optional<DataAdapter> getDataAdapter() {
+    return Optional.empty();
   }

 }
PostgresSqlOperations.java
@@ -15,15 +15,12 @@
 import java.nio.file.Files;
 import java.sql.SQLException;
 import java.util.List;
+import java.util.Optional;
 import org.postgresql.copy.CopyManager;
 import org.postgresql.core.BaseConnection;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 public class PostgresSqlOperations extends JdbcSqlOperations {

-  private static final Logger LOGGER = LoggerFactory.getLogger(PostgresSqlOperations.class);
-
   @Override
   public void insertRecordsInternal(final JdbcDatabase database,
                                     final List<AirbyteRecordMessage> records,
@@ -59,8 +56,8 @@ public void insertRecordsInternal(final JdbcDatabase database,
   }

   @Override
-  protected DataAdapter getDataAdapter() {
-    return new PostgresDataAdapter();
+  protected Optional<DataAdapter> getDataAdapter() {
+    return Optional.of(new PostgresDataAdapter());
   }

 }
SnowflakeDestination.java
@@ -10,13 +10,9 @@
 import io.airbyte.integrations.base.IntegrationRunner;
 import io.airbyte.integrations.destination.jdbc.copy.SwitchingDestination;
 import java.util.Map;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 public class SnowflakeDestination extends SwitchingDestination<SnowflakeDestination.DestinationType> {

-  private static final Logger LOGGER = LoggerFactory.getLogger(SnowflakeDestination.class);
-
   enum DestinationType {
     COPY_S3,
     COPY_GCS,
SnowflakeInternalStagingConsumerFactory.java
@@ -46,7 +46,7 @@ public class SnowflakeInternalStagingConsumerFactory {

   private static final Logger LOGGER = LoggerFactory.getLogger(SnowflakeInternalStagingConsumerFactory.class);

-  private static final long MAX_BATCH_SIZE_BYTES = 1024 * 1024 * 1024 / 4; // 256mb
+  private static final long MAX_BATCH_SIZE_BYTES = 128 * 1024 * 1024; // 128mb
   private final String CURRENT_SYNC_PATH = UUID.randomUUID().toString();

   public AirbyteMessageConsumer create(final Consumer<AirbyteMessage> outputRecordCollector,
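Both constants are easy to sanity-check: the old expression already evaluated to 256 MiB (1 GiB / 4, matching its comment), so this change halves the flush threshold, and with it the peak record-buffer footprint, to 128 MiB. A quick check:

```java
public class BatchSizeCheck {

  public static void main(final String[] args) {
    final long oldMax = 1024 * 1024 * 1024 / 4; // int arithmetic is safe: 2^30 fits in an int
    final long newMax = 128 * 1024 * 1024;
    System.out.println(oldMax / (1024 * 1024) + " MiB -> " + newMax / (1024 * 1024) + " MiB");
    // prints: 256 MiB -> 128 MiB
  }
}
```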