S3 destination: Updating processing data types for Avro/Parquet formats #13483

Merged: 20 commits, Jun 14, 2022

Commits
8aac27f
S3 destination: Updating processing data types for Avro/Parquet formats
sashaNeshcheret Jun 6, 2022
fe984ac
S3 destination: handle comparing data types
sashaNeshcheret Jun 6, 2022
7603028
S3 destination: clean code
sashaNeshcheret Jun 6, 2022
0022e62
S3 destination: clean code
sashaNeshcheret Jun 6, 2022
5c10dcd
S3 destination: handle case with unexpected json schema type
sashaNeshcheret Jun 6, 2022
b7e007d
S3 destination: clean code
sashaNeshcheret Jun 6, 2022
34fa32c
S3 destination: Extract the same logic for Avro/Parquet formats to se…
sashaNeshcheret Jun 7, 2022
66403b5
S3 destination: clean code
sashaNeshcheret Jun 7, 2022
fe93cb0
S3 destination: clean code
sashaNeshcheret Jun 7, 2022
e6bee9d
GCS destination: Update data types processing for Avro/Parquet formats
sashaNeshcheret Jun 7, 2022
924cdee
GCS destination: clean redundant code
sashaNeshcheret Jun 7, 2022
ec7528d
S3 destination: handle case with numbers inside array
sashaNeshcheret Jun 10, 2022
7a303e2
S3 destination: clean code
sashaNeshcheret Jun 10, 2022
127f64a
S3 destination: add unit test
sashaNeshcheret Jun 10, 2022
ac0bc53
S3 destination: update unit test cases with number types.
sashaNeshcheret Jun 10, 2022
6710a41
S3 destination: update unit tests.
sashaNeshcheret Jun 14, 2022
6a33d7a
S3 destination: bump version for s3 and gcs
sashaNeshcheret Jun 14, 2022
17110bc
auto-bump connector version
octavia-squidington-iii Jun 14, 2022
34171fa
auto-bump connector version
octavia-squidington-iii Jun 14, 2022
c493117
Merge branch 'omneshcheret/11627-procesing-number-data-types' of http…
octavia-squidington-iii Jun 14, 2022

GcsAvroDestinationAcceptanceTest.java
@@ -14,15 +14,18 @@
import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater;
import io.airbyte.integrations.destination.s3.util.AvroRecordHelper;
import io.airbyte.integrations.standardtest.destination.comparator.TestDataComparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableByteArrayInput;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;

-public class GcsAvroDestinationAcceptanceTest extends GcsDestinationAcceptanceTest {
+public class GcsAvroDestinationAcceptanceTest extends GcsAvroParquetDestinationAcceptanceTest {

protected GcsAvroDestinationAcceptanceTest() {
super(S3Format.AVRO);
@@ -71,4 +74,25 @@ protected List<JsonNode> retrieveRecords(final TestDestinationEnv testEnv,
return jsonRecords;
}

@Override
protected Set<Type> retrieveDataTypesFromPersistedFiles(final String streamName, final String namespace) throws Exception {

final List<S3ObjectSummary> objectSummaries = getAllSyncedObjects(streamName, namespace);
Set<Type> dataTypes = new HashSet<>();

for (final S3ObjectSummary objectSummary : objectSummaries) {
final S3Object object = s3Client.getObject(objectSummary.getBucketName(), objectSummary.getKey());
try (final DataFileReader<Record> dataFileReader = new DataFileReader<>(
new SeekableByteArrayInput(object.getObjectContent().readAllBytes()),
new GenericDatumReader<>())) {
while (dataFileReader.hasNext()) {
final GenericData.Record record = dataFileReader.next();
Set<Type> actualDataTypes = getTypes(record);
dataTypes.addAll(actualDataTypes);
}
}
}
return dataTypes;
}

}

GcsAvroParquetDestinationAcceptanceTest.java (new file)
@@ -0,0 +1,99 @@
package io.airbyte.integrations.destination.gcs;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import com.fasterxml.jackson.databind.JsonNode;
import io.airbyte.commons.json.Jsons;
import io.airbyte.commons.resources.MoreResources;
import io.airbyte.integrations.destination.s3.S3Format;
import io.airbyte.integrations.destination.s3.avro.JsonSchemaType;
import io.airbyte.protocol.models.AirbyteCatalog;
import io.airbyte.protocol.models.AirbyteMessage;
import io.airbyte.protocol.models.AirbyteStream;
import io.airbyte.protocol.models.CatalogHelpers;
import io.airbyte.protocol.models.ConfiguredAirbyteCatalog;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData.Record;
import org.junit.jupiter.api.Test;

public abstract class GcsAvroParquetDestinationAcceptanceTest extends GcsDestinationAcceptanceTest {

protected GcsAvroParquetDestinationAcceptanceTest(S3Format s3Format) {
super(s3Format);
}

@Test
public void testNumberDataType() throws Exception {
final AirbyteCatalog catalog = readCatalogFromFile("number_data_type_test_catalog.json");
final ConfiguredAirbyteCatalog configuredCatalog = CatalogHelpers.toDefaultConfiguredCatalog(catalog);
final List<AirbyteMessage> messages = readMessagesFromFile("number_data_type_test_messages.txt");

final JsonNode config = getConfig();
final String defaultSchema = getDefaultSchema(config);
runSyncAndVerifyStateOutput(config, messages, configuredCatalog, false);

for (final AirbyteStream stream : catalog.getStreams()) {
final String streamName = stream.getName();
final String schema = stream.getNamespace() != null ? stream.getNamespace() : defaultSchema;

Set<Type> actualSchemaTypes = retrieveDataTypesFromPersistedFiles(streamName, schema);
Optional<Type> actualSchemaTypesWithoutNull = actualSchemaTypes.stream().filter(type -> !type.equals(Type.NULL)).findAny();

JsonNode fieldDefinition = stream.getJsonSchema().get("properties").get("data");
List<Type> expectedTypeList = getExpectedSchemaType(fieldDefinition);
assertEquals(1, expectedTypeList.size(), "Multiple non-null data types are not supported for a single stream");
assertTrue(actualSchemaTypesWithoutNull.isPresent());
assertEquals(expectedTypeList.get(0), actualSchemaTypesWithoutNull.get());
}
}

private List<Type> getExpectedSchemaType(JsonNode fieldDefinition) {
final JsonNode typeProperty = fieldDefinition.get("type");
final JsonNode airbyteTypeProperty = fieldDefinition.get("airbyte_type");
final String airbyteTypePropertyText = airbyteTypeProperty == null ? null : airbyteTypeProperty.asText();
return Arrays.stream(JsonSchemaType.values())
.filter(
value -> value.getJsonSchemaType().equals(typeProperty.asText()) && compareAirbyteTypes(airbyteTypePropertyText, value))
.map(JsonSchemaType::getAvroType)
.toList();
}

private boolean compareAirbyteTypes(String airbyteTypePropertyText, JsonSchemaType value) {
if (airbyteTypePropertyText == null){
return value.getJsonSchemaAirbyteType() == null;
}
return airbyteTypePropertyText.equals(value.getJsonSchemaAirbyteType());
}

private AirbyteCatalog readCatalogFromFile(final String catalogFilename) throws IOException {
return Jsons.deserialize(MoreResources.readResource(catalogFilename), AirbyteCatalog.class);
}

private List<AirbyteMessage> readMessagesFromFile(final String messagesFilename) throws IOException {
return MoreResources.readResource(messagesFilename).lines()
.map(record -> Jsons.deserialize(record, AirbyteMessage.class)).collect(Collectors.toList());
}

protected abstract Set<Type> retrieveDataTypesFromPersistedFiles(final String streamName, final String namespace) throws Exception;

protected Set<Type> getTypes(Record record) {
List<Schema> listAvroTypes = record
.getSchema()
.getField("data")
.schema()
.getTypes();

return listAvroTypes
.stream()
.map(Schema::getType)
.collect(Collectors.toSet());
}
}
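
For a nullable field, the persisted Avro schema of "data" is a union, so getTypes surfaces every branch type. A minimal sketch of that behavior against a hand-built ["null", "long"] union (the record and field construction below is illustrative, not code from this PR):

// Sketch only: assumes org.apache.avro.SchemaBuilder is imported; run from a
// subclass of GcsAvroParquetDestinationAcceptanceTest so getTypes is in scope.
final Schema dataUnion = Schema.createUnion(
    Schema.create(Schema.Type.NULL),
    Schema.create(Schema.Type.LONG));
final Schema recordSchema = SchemaBuilder.record("sketch").fields()
    .name("data").type(dataUnion).noDefault()
    .endRecord();
final Record record = new Record(recordSchema);
final Set<Type> types = getTypes(record); // -> {NULL, LONG}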

GcsParquetDestinationAcceptanceTest.java
@@ -13,20 +13,25 @@
import io.airbyte.integrations.destination.s3.S3Format;
import io.airbyte.integrations.destination.s3.avro.AvroConstants;
import io.airbyte.integrations.destination.s3.avro.JsonFieldNameUpdater;
import io.airbyte.integrations.destination.s3.parquet.S3ParquetWriter;
import io.airbyte.integrations.destination.s3.util.AvroRecordHelper;
import io.airbyte.integrations.standardtest.destination.comparator.TestDataComparator;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;

-public class GcsParquetDestinationAcceptanceTest extends GcsDestinationAcceptanceTest {
+public class GcsParquetDestinationAcceptanceTest extends GcsAvroParquetDestinationAcceptanceTest {

protected GcsParquetDestinationAcceptanceTest() {
super(S3Format.PARQUET);
@@ -78,4 +83,30 @@ protected List<JsonNode> retrieveRecords(final TestDestinationEnv testEnv,
return jsonRecords;
}

@Override
protected Set<Type> retrieveDataTypesFromPersistedFiles(final String streamName, final String namespace) throws Exception {

final List<S3ObjectSummary> objectSummaries = getAllSyncedObjects(streamName, namespace);
final Set<Type> dataTypes = new HashSet<>();

for (final S3ObjectSummary objectSummary : objectSummaries) {
final S3Object object = s3Client.getObject(objectSummary.getBucketName(), objectSummary.getKey());
final URI uri = new URI(String.format("s3a://%s/%s", object.getBucketName(), object.getKey()));
final var path = new org.apache.hadoop.fs.Path(uri);
final Configuration hadoopConfig = S3ParquetWriter.getHadoopConfig(config);

try (final ParquetReader<Record> parquetReader = ParquetReader.<GenericData.Record>builder(new AvroReadSupport<>(), path)
.withConf(hadoopConfig)
.build()) {
GenericData.Record record;
while ((record = parquetReader.read()) != null) {
Set<Type> actualDataTypes = getTypes(record);
dataTypes.addAll(actualDataTypes);
}
}
}

return dataTypes;
}

}

number_data_type_test_catalog.json (new file)
@@ -0,0 +1,47 @@
{
"streams": [
{
"name": "int_test",
"json_schema": {
"properties": {
"data": {
"type": "number",
"airbyte_type": "integer"
}
}
}
},
{
"name": "big_integer_test",
"json_schema": {
"properties": {
"data": {
"type": "number",
"airbyte_type": "big_integer"
}
}
}
},
{
"name": "float_test",
"json_schema": {
"properties": {
"data": {
"type": "number",
"airbyte_type": "float"
}
}
}
},
{
"name": "default_number_test",
"json_schema": {
"properties": {
"data": {
"type": "number"
}
}
}
}
]
}

number_data_type_test_messages.txt (new file)
@@ -0,0 +1,13 @@
{"type": "RECORD", "record": {"stream": "int_test", "emitted_at": 1602637589100, "data": { "data" : 42 }}}
{"type": "RECORD", "record": {"stream": "int_test", "emitted_at": 1602637589200, "data": { "data" : 0 }}}
{"type": "RECORD", "record": {"stream": "int_test", "emitted_at": 1602637589300, "data": { "data" : -12345 }}}
{"type": "RECORD", "record": {"stream": "big_integer_test", "emitted_at": 1602637589100, "data": { "data" : 1231123412412314 }}}
{"type": "RECORD", "record": {"stream": "big_integer_test", "emitted_at": 1602637589200, "data": { "data" : 0 }}}
{"type": "RECORD", "record": {"stream": "big_integer_test", "emitted_at": 1602637589300, "data": { "data" : -1234 }}}
{"type": "RECORD", "record": {"stream": "float_test", "emitted_at": 1602637589100, "data": { "data" : 56.78 }}}
{"type": "RECORD", "record": {"stream": "float_test", "emitted_at": 1602637589200, "data": { "data" : 0 }}}
{"type": "RECORD", "record": {"stream": "float_test", "emitted_at": 1602637589300, "data": { "data" : -12345.678 }}}
{"type": "RECORD", "record": {"stream": "default_number_test", "emitted_at": 1602637589100, "data": { "data" : 10000000000000000000000.1234 }}}
{"type": "RECORD", "record": {"stream": "default_number_test", "emitted_at": 1602637589200, "data": { "data" : 0 }}}
{"type": "RECORD", "record": {"stream": "default_number_test", "emitted_at": 1602637589300, "data": { "data" : -12345.678 }}}
{"type": "STATE", "state": { "data": {"start_date": "2022-02-14"}}}

JsonSchemaType.java
@@ -4,39 +4,70 @@

package io.airbyte.integrations.destination.s3.avro;

import java.util.Arrays;
import java.util.List;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import org.apache.avro.Schema;

/**
* Mapping of JsonSchema types to Avro types.
*/
public enum JsonSchemaType {

STRING("string", true, Schema.Type.STRING),
NUMBER("number", true, Schema.Type.DOUBLE),
INTEGER("integer", true, Schema.Type.INT),
BOOLEAN("boolean", true, Schema.Type.BOOLEAN),
NULL("null", true, Schema.Type.NULL),
OBJECT("object", false, Schema.Type.RECORD),
ARRAY("array", false, Schema.Type.ARRAY),
COMBINED("combined", false, Schema.Type.UNION);
STRING("string", true, null, Schema.Type.STRING),
NUMBER_INT("number", true, "integer", Schema.Type.INT),
NUMBER_LONG("number", true, "big_integer", Schema.Type.LONG),
Review comment (Contributor): this doesn't fully align with https://docs.airbyte.com/understanding-airbyte/supported-data-types#the-types, is that intentional? I.e. airbyte_type: big_integer => long, rather than an arbitrary-precision avro datatype

+NUMBER_FLOAT("number", true, "float", Schema.Type.FLOAT),
+NUMBER("number", true, null, Schema.Type.DOUBLE),
+INTEGER("integer", true, null, Schema.Type.INT),
+BOOLEAN("boolean", true, null, Schema.Type.BOOLEAN),
+NULL("null", true, null, Schema.Type.NULL),
+OBJECT("object", false, null, Schema.Type.RECORD),
+ARRAY("array", false, null, Schema.Type.ARRAY),
+COMBINED("combined", false, null, Schema.Type.UNION);

private final String jsonSchemaType;
private final boolean isPrimitive;
private final Schema.Type avroType;
private final String jsonSchemaAirbyteType;

-JsonSchemaType(final String jsonSchemaType, final boolean isPrimitive, final Schema.Type avroType) {
+JsonSchemaType(final String jsonSchemaType, final boolean isPrimitive, final String jsonSchemaAirbyteType, final Schema.Type avroType) {
this.jsonSchemaType = jsonSchemaType;
this.jsonSchemaAirbyteType = jsonSchemaAirbyteType;
this.isPrimitive = isPrimitive;
this.avroType = avroType;
}

-public static JsonSchemaType fromJsonSchemaType(final String value) {
-for (final JsonSchemaType type : values()) {
-if (value.equals(type.jsonSchemaType)) {
-return type;
-}
+public static JsonSchemaType fromJsonSchemaType(final String jsonSchemaType) {
+return fromJsonSchemaType(jsonSchemaType, null);
+}

public static JsonSchemaType fromJsonSchemaType(final @Nonnull String jsonSchemaType, final @Nullable String jsonSchemaAirbyteType) {
List<JsonSchemaType> matchSchemaType = null;
// Match by Type + airbyteType
if (jsonSchemaAirbyteType != null) {
matchSchemaType = Arrays.stream(values())
.filter(type -> jsonSchemaType.equals(type.jsonSchemaType))
.filter(type -> jsonSchemaAirbyteType.equals(type.jsonSchemaAirbyteType))
.toList();
}

// Fall back to matching by type only if there were no results
if (matchSchemaType == null || matchSchemaType.isEmpty()) {
matchSchemaType =
Arrays.stream(values()).filter(format -> jsonSchemaType.equals(format.jsonSchemaType) && format.jsonSchemaAirbyteType == null).toList();
}

if (matchSchemaType.isEmpty()) {
throw new IllegalArgumentException("Unexpected json schema type: " + jsonSchemaType);
} else if (matchSchemaType.size() > 1) {
throw new RuntimeException(
"Matched more than one JSON schema type! Matched types: " + matchSchemaType + ", input jsonSchemaType: " + jsonSchemaType
+ ", jsonSchemaAirbyteType: " + jsonSchemaAirbyteType);
} else {
return matchSchemaType.get(0);
}
-throw new IllegalArgumentException("Unexpected json schema type: " + value);
}

public String getJsonSchemaType() {
@@ -56,4 +87,7 @@ public String toString() {
return jsonSchemaType;
}

public String getJsonSchemaAirbyteType() {
return jsonSchemaAirbyteType;
}
}
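
To make the mapping the review comment above questions concrete, here is a minimal sketch of how the new two-argument resolution behaves (illustrative calls, not test code from the PR):

// Sketch only: a type + airbyte_type match wins; otherwise fall back to the
// type-only constant; no match at all is an error.
JsonSchemaType.fromJsonSchemaType("number", "big_integer"); // -> NUMBER_LONG (Avro LONG, not an arbitrary-precision type)
JsonSchemaType.fromJsonSchemaType("number", "float");       // -> NUMBER_FLOAT (Avro FLOAT)
JsonSchemaType.fromJsonSchemaType("number");                // -> NUMBER (Avro DOUBLE)
JsonSchemaType.fromJsonSchemaType("number", "unknown");     // -> NUMBER (falls back to the type-only match)
JsonSchemaType.fromJsonSchemaType("whatever");              // -> IllegalArgumentException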

JsonToAvroSchemaConverter.java
@@ -36,6 +36,8 @@
*/
public class JsonToAvroSchemaConverter {

private static final String TYPE = "type";
private static final String AIRBYTE_TYPE = "airbyte_type";
private static final Schema UUID_SCHEMA = LogicalTypes.uuid()
.addToSchema(Schema.create(Schema.Type.STRING));
private static final Schema NULL_SCHEMA = Schema.create(Schema.Type.NULL);
@@ -60,7 +62,9 @@ static List<JsonSchemaType> getTypes(final String fieldName, final JsonNode fiel
return Collections.singletonList(JsonSchemaType.COMBINED);
}

-final JsonNode typeProperty = fieldDefinition.get("type");
+final JsonNode typeProperty = fieldDefinition.get(TYPE);
final JsonNode airbyteTypeProperty = fieldDefinition.get(AIRBYTE_TYPE);
final String airbyteType = airbyteTypeProperty == null ? null : airbyteTypeProperty.asText();
if (typeProperty == null || typeProperty.isNull()) {
LOGGER.warn("Field \"{}\" has no type specification. It will default to string", fieldName);
return Collections.singletonList(JsonSchemaType.STRING);
@@ -73,7 +77,7 @@
}

if (typeProperty.isTextual()) {
-return Collections.singletonList(JsonSchemaType.fromJsonSchemaType(typeProperty.asText()));
+return Collections.singletonList(JsonSchemaType.fromJsonSchemaType(typeProperty.asText(), airbyteType));
}

LOGGER.warn("Field \"{}\" has unexpected type {}. It will default to string.", fieldName, typeProperty);
@@ -214,7 +218,7 @@ Schema parseSingleType(final String fieldName,

final Schema fieldSchema;
switch (fieldType) {
-case NUMBER, INTEGER, BOOLEAN -> fieldSchema = Schema.create(fieldType.getAvroType());
+case INTEGER, NUMBER, NUMBER_INT, NUMBER_LONG, NUMBER_FLOAT, BOOLEAN -> fieldSchema = Schema.create(fieldType.getAvroType());
case STRING -> {
if (fieldDefinition.has("format")) {
final String format = fieldDefinition.get("format").asText();