
Commit 9c5f949

Merge branch 'main' into vz--copy_planner_2
2 parents 38623a9 + 742958d commit 9c5f949

File tree

10 files changed, +203 -171 lines changed


api/python/ai/chronon/repo/gcp.py

Lines changed: 1 addition & 1 deletion

@@ -58,7 +58,7 @@ def __init__(self, args):
             if args["mode"] == "fetch"
             else gcp_jar_path
         )
-
+
         self._args = args

         super().__init__(args, os.path.expanduser(jar_path))
Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+from group_bys.gcp.purchases import v1_dev, v1_test
+
+from ai.chronon.api.ttypes import EventSource, Source
+from ai.chronon.join import Join, JoinPart
+from ai.chronon.query import Query, selects
+
+"""
+This is the "left side" of the join that will comprise our training set. It is responsible for providing the primary keys
+and timestamps for which features will be computed.
+"""
+source = Source(
+    events=EventSource(
+        table="data.checkouts",
+        query=Query(
+            selects=selects(
+                "user_id"
+            ),  # The primary key used to join various GroupBys together
+            time_column="ts",
+        ),  # The event time used to compute feature values as-of
+    )
+)
+
+v1_test = Join(
+    left=source,
+    right_parts=[
+        JoinPart(group_by=v1_test)
+    ],
+)
+
+v1_dev = Join(
+    left=source,
+    right_parts=[
+        JoinPart(group_by=v1_dev)
+    ],
+)

api/src/main/scala/ai/chronon/api/DataType.scala

Lines changed: 6 additions & 0 deletions

@@ -167,6 +167,12 @@ case class StructField(name: String, fieldType: DataType)
 case object DateType extends DataType

 // maps to java.sql.Timestamp
+// maps to java.time.Instant if DATETIME_JAVA8API_ENABLED is true for java8. See spark doc:
+// ```
+// If the configuration property is set to true, java.time.Instant and java.time.LocalDate classes of Java
+// 8 API are used as external types for Catalyst's TimestampType and DateType. If it is set to false,
+// java.sql.Timestamp and java.sql.Date are used for the same purpose.
+// ```
 case object TimestampType extends DataType

 // maps to Array[Any]
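
The new comment quotes the Spark SQL option spark.sql.datetime.java8API.enabled. A minimal sketch of the behavior it describes, mirroring the pattern used in the new BigQueryCatalogTest case further down (assumes a plain local SparkSession; not part of this commit):

  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.functions.col

  val spark = SparkSession.builder().master("local[1]").appName("java8api-sketch").getOrCreate()
  val input = spark.createDataFrame(Seq((1, "2023-11-30 12:00:00"))).toDF("id", "ts")

  // With the flag on, collected TimestampType values come back as java.time.Instant ...
  spark.conf.set("spark.sql.datetime.java8API.enabled", "true")
  val java8Value = input.select(col("ts").cast("timestamp")).collect().head.get(0)
  assert(java8Value.isInstanceOf[java.time.Instant])

  // ... and with it off, as java.sql.Timestamp.
  spark.conf.set("spark.sql.datetime.java8API.enabled", "false")
  val legacyValue = input.select(col("ts").cast("timestamp")).collect().head.get(0)
  assert(legacyValue.isInstanceOf[java.sql.Timestamp])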

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/BigQueryCatalogTest.scala

Lines changed: 28 additions & 20 deletions

@@ -34,7 +34,7 @@ class BigQueryCatalogTest extends AnyFlatSpec with MockitoSugar {
     "spark.chronon.partition.column" -> "ds",
     "spark.hadoop.fs.gs.impl" -> classOf[GoogleHadoopFileSystem].getName,
     "spark.hadoop.fs.AbstractFileSystem.gs.impl" -> classOf[GoogleHadoopFS].getName,
-    "spark.sql.catalogImplementation" -> "in-memory",
+    "spark.sql.catalogImplementation" -> "in-memory"

     // Uncomment to test
     // "spark.sql.defaultCatalog" -> "default_iceberg",

@@ -116,6 +116,19 @@ class BigQueryCatalogTest extends AnyFlatSpec with MockitoSugar {
     SparkBigQueryUtil.sparkDateToBigQuery(nonJava8Date)
   }

+  it should "bigquery connector converts spark timestamp regardless of setting" in {
+    val input = spark.createDataFrame(Seq((1, "2025-04-28 12:30:45"))).toDF("id", "ts")
+    spark.conf.set(SQLConf.DATETIME_JAVA8API_ENABLED.key, true)
+    val java8Timestamp = input.select(col("id"), col("ts").cast("timestamp")).collect.take(1).head.get(1)
+    assert(java8Timestamp.isInstanceOf[java.time.Instant])
+    SparkBigQueryUtil.sparkTimestampToBigQuery(java8Timestamp)
+
+    spark.conf.set(SQLConf.DATETIME_JAVA8API_ENABLED.key, false)
+    val nonJava8Timestamp = input.select(col("id"), col("ts").cast("timestamp")).collect.take(1).head.get(1)
+    assert(nonJava8Timestamp.isInstanceOf[java.sql.Timestamp])
+    SparkBigQueryUtil.sparkTimestampToBigQuery(nonJava8Timestamp)
+  }
+
   it should "integration testing bigquery native table" ignore {
     val nativeTable = "data.checkouts"
     val table = tableUtils.loadTable(nativeTable)

@@ -141,9 +154,8 @@ class BigQueryCatalogTest extends AnyFlatSpec with MockitoSugar {

     val singleFilter = tableUtils.loadTable(iceberg, List("ds = '2023-11-30'"))
     val multiFilter = tableUtils.loadTable(iceberg, List("ds = '2023-11-30'", "ds = '2023-11-30'"))
-    assertEquals(
-      singleFilter.select("user_id", "ds").as[(String, String)].collect.toList,
-      multiFilter.select("user_id", "ds").as[(String, String)].collect.toList)
+    assertEquals(singleFilter.select("user_id", "ds").as[(String, String)].collect.toList,
+                 multiFilter.select("user_id", "ds").as[(String, String)].collect.toList)
   }

   it should "integration testing formats" ignore {

@@ -180,37 +192,34 @@ class BigQueryCatalogTest extends AnyFlatSpec with MockitoSugar {
     assertTrue(dneFormat.isEmpty)
   }

-
   it should "integration testing bigquery partitions" ignore {
     // TODO(tchow): This test is ignored because it requires a running instance of the bigquery. Need to figure out stubbing locally.
     // to run, set `GOOGLE_APPLICATION_CREDENTIALS=<path_to_application_default_credentials.json>
     val externalPartitions = tableUtils.partitions("data.checkouts_parquet_partitioned")
-    assertEquals(Seq("2023-11-30"), externalPartitions)
+    assertEquals(Seq("2023-11-30"), externalPartitions)
     val nativePartitions = tableUtils.partitions("data.purchases")
     assertEquals(
-      Set(20231118, 20231122, 20231125, 20231102, 20231123, 20231119, 20231130, 20231101, 20231117, 20231110, 20231108, 20231112, 20231115, 20231116, 20231113, 20231104, 20231103, 20231106, 20231121, 20231124, 20231128, 20231109, 20231127, 20231129, 20231126, 20231114, 20231107, 20231111, 20231120, 20231105).map(_.toString), nativePartitions.toSet)
+      Set(20231118, 20231122, 20231125, 20231102, 20231123, 20231119, 20231130, 20231101, 20231117, 20231110, 20231108,
+          20231112, 20231115, 20231116, 20231113, 20231104, 20231103, 20231106, 20231121, 20231124, 20231128, 20231109,
+          20231127, 20231129, 20231126, 20231114, 20231107, 20231111, 20231120, 20231105).map(_.toString),
+      nativePartitions.toSet
+    )

     val df = tableUtils.loadTable("`canary-443022.data`.purchases")
     df.show

-    tableUtils.insertPartitions(
-      df,
-      "data.tchow_test_iceberg",
-      Map(
-        "file_format" -> "PARQUET",
-        "table_type" -> "iceberg"),
-      List("ds"))
-
+    tableUtils.insertPartitions(df,
+                                "data.tchow_test_iceberg",
+                                Map("file_format" -> "PARQUET", "table_type" -> "iceberg"),
+                                List("ds"))

     val icebergCols = spark.catalog.listColumns("data.tchow_test_iceberg")
     val externalCols = spark.catalog.listColumns("data.checkouts_parquet_partitioned")
     val nativeCols = spark.catalog.listColumns("data.purchases")

     val icebergPartitions = spark.sql("SELECT * FROM data.tchow_test_iceberg.partitions")

-
-    val sqlDf = tableUtils.sql(
-      s"""
+    val sqlDf = tableUtils.sql(s"""
       |SELECT ds FROM data.checkouts_parquet_partitioned -- external parquet
       |UNION ALL
       |SELECT ds FROM data.purchases -- bigquery native

@@ -272,8 +281,7 @@ class BigQueryCatalogTest extends AnyFlatSpec with MockitoSugar {
     input.close();

     assertNotNull("Deserialized object should not be null", deserializedObj);
-    assertTrue("Deserialized object should be an instance of GCSFileIO",
-      deserializedObj.isInstanceOf[GCSFileIO]);
+    assertTrue("Deserialized object should be an instance of GCSFileIO", deserializedObj.isInstanceOf[GCSFileIO]);
     assertEquals(original.properties(), deserializedObj.asInstanceOf[GCSFileIO].properties())
   }
 }

online/src/main/scala/ai/chronon/online/serde/AvroConversions.scala

Lines changed: 18 additions & 9 deletions

@@ -35,7 +35,12 @@ object AvroConversions {
   def toAvroValue(value: AnyRef, schema: Schema): Object =
     schema.getType match {
       case Schema.Type.UNION => toAvroValue(value, schema.getTypes.get(1))
-      case Schema.Type.LONG => value.asInstanceOf[Long].asInstanceOf[Object]
+      case Schema.Type.LONG
+          if Option(schema.getLogicalType).map(_.getName).getOrElse("") == LogicalTypes.timestampMillis().getName =>
+        // because we're setting spark.sql.datetime.java8API.enabled to True https://github.com/zipline-ai/chronon/blob/main/spark/src/main/scala/ai/chronon/spark/submission/SparkSessionBuilder.scala#L132,
+        // we'll convert to java.time.Instant
+        value.asInstanceOf[java.time.Instant].asInstanceOf[Object]
+      case Schema.Type.LONG => value.asInstanceOf[Long].asInstanceOf[Object]
       case Schema.Type.INT
           if Option(schema.getLogicalType).map(_.getName).getOrElse("") == LogicalTypes.date().getName =>
         // Avro represents as java.time.LocalDate: https://github.com/apache/avro/blob/fe0261deecf22234bbd09251764152d4bf9a9c4a/lang/java/avro/src/main/java/org/apache/avro/data/TimeConversions.java#L38

@@ -59,7 +64,10 @@ object AvroConversions {
       case Schema.Type.INT
           if Option(schema.getLogicalType).map(_.getName).getOrElse("") == LogicalTypes.date().getName =>
         DateType
-      case Schema.Type.INT => IntType
+      case Schema.Type.INT => IntType
+      case Schema.Type.LONG
+          if Option(schema.getLogicalType).map(_.getName).getOrElse("") == LogicalTypes.timestampMillis().getName =>
+        TimestampType
       case Schema.Type.LONG => LongType
       case Schema.Type.FLOAT => FloatType
       case Schema.Type.DOUBLE => DoubleType

@@ -109,13 +117,14 @@ object AvroConversions {
         assert(keyType == StringType, "Avro only supports string keys for a map")
         Schema.createMap(fromChrononSchema(valueType, nameSet))
       }
-      case StringType  => Schema.create(Schema.Type.STRING)
-      case IntType     => Schema.create(Schema.Type.INT)
-      case LongType    => Schema.create(Schema.Type.LONG)
-      case FloatType   => Schema.create(Schema.Type.FLOAT)
-      case DoubleType  => Schema.create(Schema.Type.DOUBLE)
-      case BinaryType  => Schema.create(Schema.Type.BYTES)
-      case BooleanType => Schema.create(Schema.Type.BOOLEAN)
+      case StringType    => Schema.create(Schema.Type.STRING)
+      case IntType       => Schema.create(Schema.Type.INT)
+      case LongType      => Schema.create(Schema.Type.LONG)
+      case FloatType     => Schema.create(Schema.Type.FLOAT)
+      case DoubleType    => Schema.create(Schema.Type.DOUBLE)
+      case BinaryType    => Schema.create(Schema.Type.BYTES)
+      case BooleanType   => Schema.create(Schema.Type.BOOLEAN)
+      case TimestampType => LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))
       case DateType =>
         LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT))
       case _ =>
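
A small standalone sketch of the mapping these changes introduce, using only the Avro Java API (no Chronon imports); it is illustrative only, not part of this commit:

  import org.apache.avro.{LogicalTypes, Schema}

  // TimestampType now serializes as an Avro long tagged with the timestamp-millis logical type.
  val tsSchema: Schema = LogicalTypes.timestampMillis().addToSchema(Schema.create(Schema.Type.LONG))
  assert(tsSchema.getType == Schema.Type.LONG)
  assert(tsSchema.getLogicalType.getName == LogicalTypes.timestampMillis().getName)

  // A plain long carries no logical type, which is what lets the new guarded
  // `case Schema.Type.LONG if ...` branches tell timestamps apart from ordinary longs.
  val plainLong: Schema = Schema.create(Schema.Type.LONG)
  assert(plainLong.getLogicalType == null)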

scripts/distribution/run_gcp_quickstart.sh

Lines changed: 15 additions & 4 deletions

@@ -71,10 +71,12 @@ if [[ "$ENVIRONMENT" == "canary" ]]; then
   bq rm -f -t canary-443022:data.gcp_purchases_v1_test
   bq rm -f -t canary-443022:data.gcp_purchases_v1_view_test
   bq rm -f -t canary-443022:data.gcp_purchases_v1_test_upload
+  bq rm -f -t canary-443022:data.gcp_training_set_v1_test
 else
   bq rm -f -t canary-443022:data.gcp_purchases_v1_dev
   bq rm -f -t canary-443022:data.gcp_purchases_v1_view_dev
   bq rm -f -t canary-443022:data.gcp_purchases_v1_dev_upload
+  bq rm -f -t canary-443022:data.gcp_training_set_v1_dev
 fi
 #TODO: delete bigtable rows

@@ -127,18 +129,27 @@ zipline compile --chronon-root=$CHRONON_ROOT

 echo -e "${GREEN}<<<<<.....................................BACKFILL.....................................>>>>>\033[0m"
 if [[ "$ENVIRONMENT" == "canary" ]]; then
-  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/group_bys/gcp/purchases.v1_test
+  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/group_bys/gcp/purchases.v1_test --start-ds 2023-11-01 --end-ds 2023-12-01
 else
-  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/group_bys/gcp/purchases.v1_dev
+  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/group_bys/gcp/purchases.v1_dev --start-ds 2023-11-01 --end-ds 2023-12-01
 fi

 fail_if_bash_failed $?

 echo -e "${GREEN}<<<<<.....................................BACKFILL-VIEW.....................................>>>>>\033[0m"
 if [[ "$ENVIRONMENT" == "canary" ]]; then
-  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/group_bys/gcp/purchases.v1_view_test
+  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/group_bys/gcp/purchases.v1_view_test --start-ds 2023-11-01 --end-ds 2023-12-01
 else
-  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/group_bys/gcp/purchases.v1_view_dev
+  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/group_bys/gcp/purchases.v1_view_dev --start-ds 2023-11-01 --end-ds 2023-12-01
+fi
+
+fail_if_bash_failed $?
+
+echo -e "${GREEN}<<<<<.....................................BACKFILL-JOIN.....................................>>>>>\033[0m"
+if [[ "$ENVIRONMENT" == "canary" ]]; then
+  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/joins/gcp/training_set.v1_test --start-ds 2023-11-01 --end-ds 2023-12-01
+else
+  zipline run --repo=$CHRONON_ROOT --version $VERSION --mode backfill --conf compiled/joins/gcp/training_set.v1_dev --start-ds 2023-11-01 --end-ds 2023-12-01
 fi

 fail_if_bash_failed $?

spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala

Lines changed: 5 additions & 3 deletions

@@ -95,11 +95,13 @@ class GroupByUpload(endPartition: String, groupBy: GroupBy) extends Serializable
     val irSchema = SparkConversions.fromChrononSchema(sawtoothOnlineAggregator.batchIrSchema)
     val keyBuilder = FastHashing.generateKeyBuilder(groupBy.keyColumns.toArray, groupBy.inputDf.schema)

-    logger.info(s"""
-         |BatchIR Element Size: ${SparkEnv.get.serializer
+    val batchIrElementSize = SparkEnv.get.serializer
       .newInstance()
       .serialize(sawtoothOnlineAggregator.init)
-      .capacity()}
+      .capacity()
+
+    logger.info(s"""
+         |BatchIR Element Size: $batchIrElementSize
          |""".stripMargin)

     val outputRdd = tableUtils
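
This change only hoists the serialized-size measurement out of the interpolated log string. As a hedged sketch (assuming an active SparkEnv, i.e. running inside a Spark job), the same measurement factored into a reusable helper might look like:

  import java.nio.ByteBuffer
  import org.apache.spark.SparkEnv

  // Serialize an object with the job's configured serializer and report its size in bytes.
  def serializedSizeBytes(obj: AnyRef): Int = {
    val buf: ByteBuffer = SparkEnv.get.serializer.newInstance().serialize(obj)
    buf.capacity()
  }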
