Commit 08e3315

tile codec code cleanup
1 parent 4604e61

File tree (3 files changed: +36 −20 lines)

  spark/src/main/scala/ai/chronon/spark/utils/InMemoryStream.scala
  spark/src/test/scala/ai/chronon/spark/test/OnlineUtils.scala
  spark/src/test/scala/ai/chronon/spark/test/groupby/GroupByUploadTest.scala


spark/src/main/scala/ai/chronon/spark/utils/InMemoryStream.scala

Lines changed: 14 additions & 8 deletions

```diff
@@ -23,6 +23,7 @@ import ai.chronon.api.StructType
 import ai.chronon.online.AvroConversions
 import ai.chronon.online.SparkConversions
 import ai.chronon.online.TileCodec
+import ai.chronon.spark.utils.InMemoryStream.TileUpdate
 import ai.chronon.spark.{FastHashing, GenericRowHandler, KeyWithHash, TableUtils}
 import org.apache.avro.data.TimeConversions
 import org.apache.avro.generic.GenericData
@@ -39,6 +40,10 @@ import org.apache.spark.sql.execution.streaming.MemoryStream
 import org.slf4j.Logger
 import org.slf4j.LoggerFactory
 
+object InMemoryStream {
+  case class TileUpdate(keys: Array[Any], ir: Array[Any], tileTimestamp: Long, updateTimestamp: Long)
+}
+
 class InMemoryStream {
   @transient lazy val logger: Logger = LoggerFactory.getLogger(getClass)
 
@@ -119,7 +124,7 @@
     */
   def getInMemoryTiledStreamArray(spark: SparkSession,
                                   inputDf: Dataset[Row],
-                                  groupBy: GroupBy): Array[(Array[Any], Long, Array[Byte])] = {
+                                  groupBy: GroupBy): (Array[TileUpdate], TileCodec) = {
 
     val chrononSchema: StructType = StructType.from("input", SparkConversions.toChrononSchema(inputDf.schema))
     val schema = chrononSchema.iterator.map { field =>
@@ -155,7 +160,10 @@
       (keyWithHash, row.getLong(tsIndex))
     })
 
-    entityTimestampGroupedRows.toArray.map { keyedRow =>
+    val tileCodec = new TileCodec(groupBy, schema)
+    // val preAgg: Array[Byte] = tileCodec.makeTileIr(aggIr, isComplete = false)
+
+    val updates = entityTimestampGroupedRows.toArray.flatMap { keyedRow =>
       val ((KeyWithHash(keys, _, _), tileTimestamp), rows) = keyedRow
 
       val rowAggregator = TileCodec.buildRowAggregator(groupBy, schema)
@@ -174,13 +182,11 @@
         } else {
           rowAggregator.delete(aggIr, chrononRow)
         }
-
+        val updateTimestamp = row.getLong(tsIndex)
+        TileUpdate(keys, aggIr, tileTimestamp, updateTimestamp)
       }
-
-      val tileCodec = new TileCodec(groupBy, schema)
-      val preAgg: Array[Byte] = tileCodec.makeTileIr(aggIr, true)
-
-      (keys, tileTimestamp, preAgg)
     }
+
+    updates -> tileCodec
   }
 }
```
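
With this refactor, `getInMemoryTiledStreamArray` stops serializing tile IRs itself: it emits one `TileUpdate` per input row (the raw intermediate aggregation state plus tile and update timestamps) and returns the single `TileCodec` it built, leaving serialization to the caller. A minimal sketch of the new call shape, assuming only the signatures shown in the diff; the wrapper function and its arguments are hypothetical stand-ins:

```scala
import ai.chronon.api.GroupBy
import ai.chronon.online.TileCodec
import ai.chronon.spark.utils.InMemoryStream
import ai.chronon.spark.utils.InMemoryStream.TileUpdate
import org.apache.spark.sql.{Dataset, Row, SparkSession}

// Sketch only, assuming the signatures in the diff above; `groupBy` is the
// api-level GroupBy config, as in the method signature.
def writeTilesSketch(spark: SparkSession, inputDf: Dataset[Row], groupBy: GroupBy): Unit = {
  val stream = new InMemoryStream
  val (updates: Array[TileUpdate], tileCodec: TileCodec) =
    stream.getInMemoryTiledStreamArray(spark, inputDf, groupBy)

  updates.foreach { update =>
    // Serialization is now deferred to the caller; isComplete = false marks
    // the tile IR as still open to further streaming updates.
    val tileIrBytes: Array[Byte] = tileCodec.makeTileIr(update.ir, isComplete = false)
    println(s"tile=${update.tileTimestamp} update=${update.updateTimestamp} bytes=${tileIrBytes.length}")
  }
}
```

One consequence visible in the diff: the old code constructed a `TileCodec` per key group inside the loop, while the new code constructs it once per stream and hands it back for reuse.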

spark/src/test/scala/ai/chronon/spark/test/OnlineUtils.scala

Lines changed: 21 additions & 11 deletions

```diff
@@ -25,8 +25,7 @@ import ai.chronon.api.Extensions.GroupByOps
 import ai.chronon.api.Extensions.MetadataOps
 import ai.chronon.api.Extensions.SourceOps
 import ai.chronon.api.TilingUtils
-import ai.chronon.online.AvroConversions
-import ai.chronon.online.KVStore
+import ai.chronon.online.{AvroConversions, KVStore, TileCodec}
 import ai.chronon.spark.GenericRowHandler
 import ai.chronon.spark.GroupByUpload
 import ai.chronon.spark.SparkSessionBuilder
@@ -35,6 +34,7 @@ import ai.chronon.spark.streaming.GroupBy
 import ai.chronon.spark.streaming.JoinSourceRunner
 import ai.chronon.spark.utils.InMemoryKvStore
 import ai.chronon.spark.utils.InMemoryStream
+import ai.chronon.spark.utils.InMemoryStream.TileUpdate
 import ai.chronon.spark.utils.MockApi
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.streaming.Trigger
@@ -52,12 +52,16 @@ object OnlineUtils {
                      debug: Boolean,
                      dropDsOnWrite: Boolean,
                      isTiled: Boolean): Unit = {
+
     val inputStreamDf = groupByConf.dataModel match {
+
       case DataModel.Entities =>
         val entity = groupByConf.streamingSource.get
         val df = tableUtils.sql(s"SELECT * FROM ${entity.getEntities.mutationTable} WHERE ds = '$ds'")
+
         df.withColumnRenamed(entity.query.reversalColumn, Constants.ReversalColumn)
           .withColumnRenamed(entity.query.mutationTimeColumn, Constants.MutationTimeColumn)
+
       case DataModel.Events =>
         val table = groupByConf.streamingSource.get.table
         tableUtils.sql(s"SELECT * FROM $table WHERE ds >= '$ds'")
@@ -66,6 +70,7 @@
     val inputStream = new InMemoryStream
     val mockApi = new MockApi(kvStore, namespace)
     var inputModified = inputStreamDf
+
     if (dropDsOnWrite && inputStreamDf.schema.fieldNames.contains(tableUtils.partitionColumn)) {
       inputModified = inputStreamDf.drop(tableUtils.partitionColumn)
     }
@@ -79,43 +84,48 @@
     }
 
     if (isTiled) {
-      val memoryStream: Array[(Array[Any], Long, Array[Byte])] =
+
+      val (memoryStream: Array[TileUpdate], tileCodec: TileCodec) =
         inputStream.getInMemoryTiledStreamArray(session, inputModified, groupByConf)
       val inMemoryKvStore: KVStore = kvStore()
 
-      val fetcher = mockApi.buildFetcher(false)
+      val fetcher = mockApi.buildFetcher(debug = false)
       val groupByServingInfo = fetcher.metadataStore.getGroupByServingInfo(groupByConf.getMetaData.getName).get
 
       val keyZSchema: api.StructType = groupByServingInfo.keyChrononSchema
       val keyToBytes = AvroConversions.encodeBytes(keyZSchema, GenericRowHandler.func)
 
-      val putRequests = memoryStream.map { entry =>
-        val keys = entry._1
-        val timestamp = entry._2
-        val tileBytes = entry._3
+      val putRequests = memoryStream.map { entry: TileUpdate =>
+        val keyBytes = keyToBytes(entry.keys)
+        val tileIrBytes = tileCodec.makeTileIr(entry.ir, isComplete = false)
 
-        val keyBytes = keyToBytes(keys)
         val tileKey = TilingUtils.buildTileKey(
           groupByConf.streamingDataset,
           keyBytes,
           Some(ResolutionUtils.getSmallestWindowResolutionInMillis(groupByServingInfo.groupBy)),
           None)
+
         KVStore.PutRequest(TilingUtils.serializeTileKey(tileKey),
-                           tileBytes,
+                           tileIrBytes,
                            groupByConf.streamingDataset,
-                           Some(timestamp))
+                           Some(entry.tileTimestamp))
       }
+
       inMemoryKvStore.multiPut(putRequests)
+
     } else {
+
       val groupByStreaming =
         new GroupBy(inputStream.getInMemoryStreamDF(session, inputModified),
                     session,
                     groupByConf,
                     mockApi,
                     debug = debug)
+
       // We modify the arguments for running to make sure all data gets into the KV Store before fetching.
       val dataStream = groupByStreaming.buildDataStream()
       val query = dataStream.trigger(Trigger.Once()).start()
+
       query.awaitTermination()
     }
   }
```
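
Combined with the `InMemoryStream` change, the tiled branch now serializes each update's IR at the point where the `PutRequest` is built, instead of receiving pre-serialized tile bytes. A condensed sketch of that write path; `updates`, `tileCodec`, `keyToBytes`, and `inMemoryKvStore` mirror the diff, while the dataset name and the 5-minute resolution are hypothetical placeholders:

```scala
import ai.chronon.api.TilingUtils
import ai.chronon.online.KVStore

// Condensed sketch of the tiled write path above; `updates`, `tileCodec`,
// `keyToBytes`, and `inMemoryKvStore` are assumed in scope as in the diff.
val dataset = "MY_GROUPBY_STREAMING" // hypothetical streaming dataset name
val putRequests = updates.map { update =>
  val tileKey = TilingUtils.buildTileKey(
    dataset,
    keyToBytes(update.keys),  // Avro-encoded entity keys
    Some(5 * 60 * 1000L),     // placeholder tile resolution, in millis
    None)
  KVStore.PutRequest(TilingUtils.serializeTileKey(tileKey),
                     tileCodec.makeTileIr(update.ir, isComplete = false),
                     dataset,
                     Some(update.tileTimestamp))
}
inMemoryKvStore.multiPut(putRequests)
```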

spark/src/test/scala/ai/chronon/spark/test/groupby/GroupByUploadTest.scala

Lines changed: 1 addition & 1 deletion

```diff
@@ -314,7 +314,7 @@ class GroupByUploadTest extends AnyFlatSpec {
     def cRating(location: Double, cleanliness: Double): java.util.Map[String, Double] =
       Map("location" -> location, "cleanliness" -> cleanliness).toJava
     val gson = new Gson()
-    assertEquals(results, requestResponse.map(_._2))
+    assertEquals(requestResponse.map(_._2), results)
 
     val expectedCategoryRatings = Array(
       cRating(4.5, 4.0),
```
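
The one-line fix swaps the arguments into JUnit's `assertEquals(expected, actual)` order; with them reversed, a failing test transposes the values in its message. A small illustration with made-up values:

```scala
import org.junit.Assert.assertEquals

// Made-up values standing in for the test's data.
val expected = Seq(4.5, 4.0)
val actual = Seq(4.5, 3.0)

// JUnit's contract is assertEquals(expected, actual). On failure this prints:
//   java.lang.AssertionError: expected:<List(4.5, 4.0)> but was:<List(4.5, 3.0)>
// With the arguments reversed, the "expected" and "but was" values would be
// swapped, making the failure message misleading.
assertEquals(expected, actual)
```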
