Commit 8548a63

rebase

1 parent 76f83d5
2 files changed: +21 −10

spark/src/main/scala/ai/chronon/spark/GroupBy.scala (+18 −10)
@@ -165,12 +165,13 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     // add 1 day to the end times to include data [ds 00:00:00.000, ds + 1 00:00:00.000)
     val shiftedEndTimes = endTimes.map(_ + tableUtils.partitionSpec.spanMillis)
     val sawtoothAggregator = new SawtoothAggregator(aggregations, selectedSchema, resolution)
+    val sawtoothAggregatorBroadcast = sparkSession.sparkContext.broadcast(sawtoothAggregator)
     val hops = hopsAggregate(endTimes.min, resolution)
 
     hops
       .flatMap { case (keys, hopsArrays) =>
         // filter out if the all the irs are nulls
-        val irs = sawtoothAggregator.computeWindows(hopsArrays, shiftedEndTimes)
+        val irs = sawtoothAggregatorBroadcast.value.computeWindows(hopsArrays, shiftedEndTimes)
         irs.indices.flatMap { i =>
           val result = normalizeOrFinalize(irs(i))
           if (result.forall(_ == null)) None
@@ -230,16 +231,21 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     val snapshotKeyHashFx = FastHashing.generateKeyBuilder(keyColumns.toArray, expandedInputDf.schema)
     val sawtoothAggregator =
       new SawtoothMutationAggregator(aggregations, SparkConversions.toChrononSchema(expandedInputDf.schema), resolution)
+
+    val sawtoothAggregatorBroadcast = sparkSession.sparkContext.broadcast(sawtoothAggregator)
     val updateFunc = (ir: BatchIr, row: Row) => {
-      sawtoothAggregator.update(row.getLong(shiftedColumnIndexTs), ir, SparkConversions.toChrononRow(row, tsIndex))
+      sawtoothAggregatorBroadcast.value.update(row.getLong(shiftedColumnIndexTs),
+                                               ir,
+                                               SparkConversions.toChrononRow(row, tsIndex))
       ir
     }
 
     // end of day IR
     val snapshotByKeys = expandedInputDf.rdd
       .keyBy(row => (snapshotKeyHashFx(row), row.getString(shiftedColumnIndex)))
-      .aggregateByKey(sawtoothAggregator.init)(seqOp = updateFunc, combOp = sawtoothAggregator.merge)
-      .mapValues(sawtoothAggregator.finalizeSnapshot)
+      .aggregateByKey(sawtoothAggregatorBroadcast.value.init)(seqOp = updateFunc,
+                                                              combOp = sawtoothAggregatorBroadcast.value.merge)
+      .mapValues(sawtoothAggregatorBroadcast.value.finalizeSnapshot)
 
     // Preprocess for mutations: Add a ds of mutation ts column, collect sorted mutations by keys and ds of mutation.
     val mutationDf = mutationDfFn()
@@ -270,10 +276,10 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
         val sortedQueries = timeQueries.map { TimeTuple.getTs }
         val finalizedEodIr = eodIr.orNull
 
-        val irs = sawtoothAggregator.lambdaAggregateIrMany(tableUtils.partitionSpec.epochMillis(ds),
-                                                           finalizedEodIr,
-                                                           dayMutations.orNull,
-                                                           sortedQueries)
+        val irs = sawtoothAggregatorBroadcast.value.lambdaAggregateIrMany(tableUtils.partitionSpec.epochMillis(ds),
+                                                                          finalizedEodIr,
+                                                                          dayMutations.orNull,
+                                                                          sortedQueries)
         ((keyWithHash, ds), (timeQueries, sortedQueries.indices.map(i => normalizeOrFinalize(irs(i)))))
       }
 
@@ -329,14 +335,16 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
     val sawtoothAggregator =
       new SawtoothAggregator(aggregations, selectedSchema, resolution)
 
+    val sawtoothBroadcast = sparkSession.sparkContext.broadcast(sawtoothAggregator)
+
     // create the IRs up to minHop accuracy
     val headStartsWithIrs = queriesByHeadStarts.keys
       .groupByKey()
       .leftOuterJoin(hopsRdd)
       .flatMap { case (keys, (headStarts, hopsOpt)) =>
         val headStartsArray = headStarts.toArray
         util.Arrays.sort(headStartsArray)
-        val headStartIrs = sawtoothAggregator.computeWindows(hopsOpt.orNull, headStartsArray)
+        val headStartIrs = sawtoothBroadcast.value.computeWindows(hopsOpt.orNull, headStartsArray)
         headStartsArray.indices.map { i => (keys, headStartsArray(i)) -> headStartIrs(i) }
       }
 
@@ -361,7 +369,7 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
           eventsOpt.map(_.map(SparkConversions.toChrononRow(_, tsIndex)).iterator).orNull
         }
         val queries = queriesWithPartition.map { TimeTuple.getTs }
-        val irs = sawtoothAggregator.cumulate(inputsIt, queries, headStartIrOpt.orNull)
+        val irs = sawtoothBroadcast.value.cumulate(inputsIt, queries, headStartIrOpt.orNull)
         queries.indices.map { i =>
          (keys.data ++ queriesWithPartition(i).toArray, normalizeOrFinalize(irs(i)))
         }
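
Note: the edits above wrap the sawtooth aggregators in Spark broadcast variables, so executors fetch one shared copy instead of serializing the aggregator into every task closure, and tasks always dereference it through `.value`. Below is a minimal self-contained sketch of that pattern, using a hypothetical `HeavyAggregator` class and made-up window values (illustrative only, not Chronon code):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical heavyweight, serializable aggregator -- illustrative only.
class HeavyAggregator(val windowsMillis: Seq[Long]) extends Serializable {
  def aggregate(ts: Long): Long = windowsMillis.map(w => ts % w).sum
}

object BroadcastSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("broadcast-sketch").getOrCreate()

    val agg = new HeavyAggregator(Seq(3600000L, 86400000L)) // built once on the driver

    // Ship a single read-only copy to each executor instead of embedding
    // the aggregator in every task closure.
    val aggBroadcast = spark.sparkContext.broadcast(agg)

    val total = spark.sparkContext
      .parallelize(1L to 1000L)
      .map(ts => aggBroadcast.value.aggregate(ts)) // dereference via .value inside the task
      .sum()

    println(total)
    spark.stop()
  }
}
```

Dereferencing the broadcast with `.value` inside the closure is what keeps the driver-side object out of each serialized task; capturing `agg` directly would fall back to per-task closure serialization.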

spark/src/main/scala/ai/chronon/spark/submission/ChrononKryoRegistrator.scala (+3 −0)
@@ -15,6 +15,8 @@
  */
 package ai.chronon.spark.submission
 
+import ai.chronon.aggregator.windowing.SawtoothAggregator
+import com.esotericsoftware.kryo.serializers.JavaSerializer
 import ai.chronon.aggregator.base.FrequentItemType.{DoubleItemType, LongItemType, StringItemType}
 import ai.chronon.aggregator.base.FrequentItemsFriendly._
 import ai.chronon.aggregator.base.{FrequentItemType, FrequentItemsFriendly, ItemsSketchIR}
@@ -216,6 +218,7 @@ class ChrononKryoRegistrator extends KryoRegistrator {
       try {
         kryo.register(Class.forName(name))
         kryo.register(Class.forName(s"[L$name;")) // represents array of a type to jvm
+        kryo.register(classOf[SawtoothAggregator], new JavaSerializer)
       } catch {
         case _: ClassNotFoundException => // do nothing
       }
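
Note: this change pairs `SawtoothAggregator` with Kryo's `JavaSerializer`, the usual escape hatch when a class does not round-trip cleanly through Kryo's default field serializer. Below is a minimal sketch of that wiring with a hypothetical `NonKryoFriendly` class (only the Kryo and Spark registrator APIs shown are real):

```scala
import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.serializers.JavaSerializer
import org.apache.spark.serializer.KryoRegistrator

// Hypothetical class that Kryo's default FieldSerializer handles poorly
// (e.g. it captures a function value) -- illustrative only.
class NonKryoFriendly(val shift: Long => Long) extends Serializable

class SketchKryoRegistrator extends KryoRegistrator {
  override def registerClasses(kryo: Kryo): Unit = {
    // Route just this class through plain Java serialization;
    // everything else keeps Kryo's default serializers.
    kryo.register(classOf[NonKryoFriendly], new JavaSerializer)
  }
}
```

A registrator like this only takes effect when the job runs with `spark.serializer=org.apache.spark.serializer.KryoSerializer` and `spark.kryo.registrator` pointing at the registrator class.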
