
Summary upload #50


Merged
merged 60 commits into from
Nov 7, 2024
Commits
fba5c42
[WIP] Anomaly detection prototype with generated data
nikhil-zlai Sep 11, 2024
9b0c82a
discord -> slack + drift stuff + avro decoding benchmark
nikhil-zlai Sep 21, 2024
0aba15d
changes
nikhil-zlai Sep 21, 2024
dbe401a
overwatch
nikhil-zlai Oct 7, 2024
fbb7ffd
compile
nikhil-zlai Oct 8, 2024
c503295
pctile to pmf
nikhil-zlai Oct 14, 2024
2f11cb0
test v0
nikhil-zlai Oct 15, 2024
2712d19
more test fixes
nikhil-zlai Oct 15, 2024
8eb6042
adding incremental compute
nikhil-zlai Oct 15, 2024
8b0d460
printing queries in color
nikhil-zlai Oct 15, 2024
f7ff3c3
Partition Runner
nikhil-zlai Oct 17, 2024
c50c873
summary metrics thrift
nikhil-zlai Oct 18, 2024
5d59bbf
packing data into keybytes and value bytes
nikhil-zlai Oct 18, 2024
0956227
wiring up with partition runner + porting fraud data
nikhil-zlai Oct 21, 2024
697afbc
Merge branch 'main' of https://github.com/zipline-ai/chronon into sum…
chewys1024 Oct 22, 2024
4848221
Fix merge
chewys1024 Oct 22, 2024
cb20bb1
[WIP] Summary Uploader
chewys1024 Oct 22, 2024
425adb9
Added a mockKVStore
chewys1024 Oct 22, 2024
9b231ee
scalafixAll
chewys1024 Oct 22, 2024
90c157d
Add a check to only upload each partition once.
chewys1024 Oct 23, 2024
54cda0c
coderabbit changes
chewy-zlai Oct 23, 2024
5f1b30b
Summarizer (#17)
nikhil-zlai Oct 29, 2024
db22935
Restoring aggregator files to main
chewy-zlai Oct 29, 2024
4f9882a
Merge branch 'main' of https://github.com/zipline-ai/chronon into sum…
chewy-zlai Oct 29, 2024
24375e6
Merged with main to simplify change
chewy-zlai Oct 29, 2024
f15ab1f
revert build.sbt
chewy-zlai Oct 29, 2024
b8c0d8c
Merge branch 'main' of https://github.com/zipline-ai/chronon into sum…
chewy-zlai Oct 31, 2024
c1bbe1f
Add create to MockKVStore
chewy-zlai Oct 31, 2024
835ab23
Create map correctly
chewy-zlai Oct 31, 2024
673cb1e
Attempt to make it serializable
chewy-zlai Oct 31, 2024
f966868
Make MockKVStore and SummaryUploader serializable.
chewy-zlai Oct 31, 2024
3c50b70
Fix serialization by passing in a function to create the KVStore
chewy-zlai Oct 31, 2024
bc494fc
sbt scalafmt
chewy-zlai Oct 31, 2024
1b9f275
coderabbit suggested use of a variable
chewy-zlai Oct 31, 2024
9c46dbc
Clean up unintended changes to DriftTest
chewy-zlai Oct 31, 2024
27fdc18
Clean up unintended changes to DriftTest
chewy-zlai Oct 31, 2024
1405872
Batch putrequest defualting at 100 at a time
chewy-zlai Oct 31, 2024
3fe25fe
Store putRequests as an ArrayBuffer for better performance
chewy-zlai Oct 31, 2024
df6ccc1
Add a semaphore to limit puts
chewy-zlai Nov 4, 2024
2e4d2df
Add backoff to DynamoDBKVStoreImpl in case of ProvisionedThroughputEx…
chewy-zlai Nov 4, 2024
57c0291
Simplify KVStoreSemaphore via coderabbit suggestions
chewy-zlai Nov 4, 2024
a6447c8
Fixes error handling issue coderabbit noticed in DynamoDBKVStoreImpl
chewy-zlai Nov 4, 2024
d7860ee
Merge branch 'main' into summary-upload
chewy-zlai Nov 4, 2024
4e18f18
Switch semaphore to a ratelimiter per dataset for DynamoDBKVStoreImpl
chewy-zlai Nov 4, 2024
b5d08b8
Add ratelimiter to multiget as well for DDBKVSImpl
chewy-zlai Nov 4, 2024
23e710f
Merge branch 'summary-upload' of https://github.com/zipline-ai/chrono…
chewy-zlai Nov 4, 2024
f4d27fc
Remove parameter from creation of KVStore
chewy-zlai Nov 4, 2024
254e6c4
remove parameter from SummaryUploader
chewy-zlai Nov 4, 2024
9a9ac67
scalafixAll
chewy-zlai Nov 4, 2024
cd7cd8e
Handle partial errors from multiput
chewy-zlai Nov 4, 2024
ba9ea0c
Specificity in error handling
chewy-zlai Nov 4, 2024
6e75bcd
Fix guava dependency so RateLimiter will work.
chewy-zlai Nov 5, 2024
013c7d6
Update DynamoDBKVStore rateLimiters to use a TrieMap
chewy-zlai Nov 5, 2024
fb74aba
Switch to ConcurrentHashMap for RateLimiters
chewy-zlai Nov 5, 2024
f920ef2
remove semaphor
chewy-zlai Nov 5, 2024
436a063
remove use of semaphore
chewy-zlai Nov 5, 2024
1acf3b5
remove manual retry logic as AWS handles that for us
chewy-zlai Nov 5, 2024
4a8e5f9
more idiomatic use of concurrent maps
chewy-zlai Nov 5, 2024
40c52df
Move stats table name to Constants
chewy-zlai Nov 5, 2024
591c5ae
clean up redundant code
chewy-zlai Nov 7, 2024
@@ -18,16 +18,16 @@ package ai.chronon.aggregator.base

import ai.chronon.aggregator.base.FrequentItemType.ItemType
import ai.chronon.api._
-import com.yahoo.memory.Memory
-import com.yahoo.sketches.ArrayOfDoublesSerDe
-import com.yahoo.sketches.ArrayOfItemsSerDe
-import com.yahoo.sketches.ArrayOfLongsSerDe
-import com.yahoo.sketches.ArrayOfStringsSerDe
-import com.yahoo.sketches.cpc.CpcSketch
-import com.yahoo.sketches.cpc.CpcUnion
-import com.yahoo.sketches.frequencies.ErrorType
-import com.yahoo.sketches.frequencies.ItemsSketch
-import com.yahoo.sketches.kll.KllFloatsSketch
+import org.apache.datasketches.common.ArrayOfDoublesSerDe
+import org.apache.datasketches.common.ArrayOfItemsSerDe
+import org.apache.datasketches.common.ArrayOfLongsSerDe
+import org.apache.datasketches.common.ArrayOfStringsSerDe
+import org.apache.datasketches.cpc.CpcSketch
+import org.apache.datasketches.cpc.CpcUnion
+import org.apache.datasketches.frequencies.ErrorType
+import org.apache.datasketches.frequencies.ItemsSketch
+import org.apache.datasketches.kll.KllFloatsSketch
+import org.apache.datasketches.memory.Memory

import java.io.ByteArrayInputStream
import java.io.ByteArrayOutputStream
@@ -336,6 +336,13 @@ object CpcFriendly {
implicit val stringIsCpcFriendly: CpcFriendly[String] = new CpcFriendly[String] {
override def update(sketch: CpcSketch, input: String): Unit = sketch.update(input)
}
implicit val intIsCpcFriendly: CpcFriendly[Int] = new CpcFriendly[Int] {
override def update(sketch: CpcSketch, input: Int): Unit = sketch.update(input.toLong)
}

implicit val floatIsCpcFriendly: CpcFriendly[Float] = new CpcFriendly[Float] {
override def update(sketch: CpcSketch, input: Float): Unit = sketch.update(input.toDouble)
}

implicit val longIsCpcFriendly: CpcFriendly[Long] = new CpcFriendly[Long] {
override def update(sketch: CpcSketch, input: Long): Unit = sketch.update(input)
@@ -344,6 +351,10 @@ object CpcFriendly {
override def update(sketch: CpcSketch, input: Double): Unit = sketch.update(input)
}

implicit val decimalIsCpcFriendly: CpcFriendly[java.math.BigDecimal] = new CpcFriendly[java.math.BigDecimal] {
override def update(sketch: CpcSketch, input: java.math.BigDecimal): Unit = sketch.update(input.toPlainString)
}

implicit val BinaryIsCpcFriendly: CpcFriendly[Array[Byte]] = new CpcFriendly[Array[Byte]] {
override def update(sketch: CpcSketch, input: Array[Byte]): Unit = sketch.update(input)
}
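The new implicits extend the CpcFriendly typeclass so CPC sketches can ingest Int, Float, and BigDecimal inputs by normalizing them to a type the sketch already accepts (Int widened to Long, Float to Double, BigDecimal rendered via toPlainString). A minimal, dependency-free sketch of the same typeclass pattern — ToySketch, ToyFriendly, and ingest are illustrative stand-ins, not Chronon APIs:

```scala
// Toy stand-in for CpcSketch: just records distinct normalized inputs.
final class ToySketch {
  val seen = scala.collection.mutable.Set[String]()
  def update(s: String): Unit = seen += s
}

trait ToyFriendly[T] { def update(sketch: ToySketch, input: T): Unit }

object ToyFriendly {
  // Widen the same way the PR does: Int -> Long, BigDecimal -> plain string.
  implicit val intFriendly: ToyFriendly[Int] =
    (s, i) => s.update(i.toLong.toString)
  implicit val decimalFriendly: ToyFriendly[java.math.BigDecimal] =
    (s, d) => s.update(d.toPlainString)

  // The implicit resolution mirrors how CpcFriendly instances are picked up.
  def ingest[T](sketch: ToySketch, xs: Seq[T])(implicit ev: ToyFriendly[T]): Unit =
    xs.foreach(ev.update(sketch, _))
}
```

The benefit of the typeclass shape is that adding a new supported input type is one implicit value, with no change to the aggregator that consumes it.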
@@ -666,7 +677,7 @@ class ApproxPercentiles(k: Int = 128, percentiles: Array[Double] = Array(0.5))
override def irType: DataType = BinaryType

override def prepare(input: Float): KllFloatsSketch = {
-val sketch = new KllFloatsSketch(k)
+val sketch = KllFloatsSketch.newHeapInstance(k)
sketch.update(input)
sketch
}
Expand All @@ -682,7 +693,7 @@ class ApproxPercentiles(k: Int = 128, percentiles: Array[Double] = Array(0.5))
}

override def bulkMerge(irs: Iterator[KllFloatsSketch]): KllFloatsSketch = {
-val result = new KllFloatsSketch(k)
+val result = KllFloatsSketch.newHeapInstance(k)
irs.foreach(result.merge)
result
}
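The constructor-to-factory change reflects the yahoo-to-Apache DataSketches migration in this PR: heap KLL sketches are now obtained from a factory method rather than a public constructor. A sketch of the round-trip pattern the diff uses (assumes the org.apache.datasketches artifact is on the classpath; not runnable standalone):

```scala
import org.apache.datasketches.kll.KllFloatsSketch
import org.apache.datasketches.memory.Memory

val sketch = KllFloatsSketch.newHeapInstance(200) // was: new KllFloatsSketch(200)
(1 to 1000).foreach(i => sketch.update(i.toFloat))

// Serialize and heapify back, as StatsGenerator does with stored IRs.
val bytes: Array[Byte] = sketch.toByteArray
val restored = KllFloatsSketch.heapify(Memory.wrap(bytes))

// Rank of the midpoint value should come back near 0.5 (approximate).
val medianRank = restored.getRank(500f)
```

Note that Memory also moved, from com.yahoo.memory to org.apache.datasketches.memory, which is why the import blocks above change in lockstep.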
@@ -18,8 +18,8 @@ package ai.chronon.aggregator.row

import ai.chronon.api
import ai.chronon.api.Extensions._
-import com.yahoo.memory.Memory
-import com.yahoo.sketches.kll.KllFloatsSketch
+import org.apache.datasketches.kll.KllFloatsSketch
+import org.apache.datasketches.memory.Memory

import java.util
import scala.collection.Seq
@@ -131,11 +131,12 @@ object StatsGenerator {
metrics :+ MetricTransform(totalColumn, InputTransform.One, api.Operation.COUNT)
}

-def lInfKllSketch(sketch1: AnyRef, sketch2: AnyRef, bins: Int = 128): AnyRef = {
+def lInfKllSketch(sketch1: AnyRef, sketch2: AnyRef, bins: Int = 20): AnyRef = {
if (sketch1 == null || sketch2 == null) return None
val sketchIr1 = KllFloatsSketch.heapify(Memory.wrap(sketch1.asInstanceOf[Array[Byte]]))
val sketchIr2 = KllFloatsSketch.heapify(Memory.wrap(sketch2.asInstanceOf[Array[Byte]]))
-val keySet = sketchIr1.getQuantiles(bins).union(sketchIr2.getQuantiles(bins))
+val binsToDoubles = (0 to bins).map(_.toDouble / bins).toArray
+val keySet = sketchIr1.getQuantiles(binsToDoubles).union(sketchIr2.getQuantiles(binsToDoubles))
var linfSimple = 0.0
keySet.foreach { key =>
val cdf1 = sketchIr1.getRank(key)
@@ -156,15 +157,17 @@
* and PSI>0.25 means "significant shift, action required"
* https://scholarworks.wmich.edu/dissertations/3208
*/
-def PSIKllSketch(reference: AnyRef, comparison: AnyRef, bins: Int = 128, eps: Double = 0.000001): AnyRef = {
+def PSIKllSketch(reference: AnyRef, comparison: AnyRef, bins: Int = 20, eps: Double = 0.000001): AnyRef = {
if (reference == null || comparison == null) return None
val referenceSketch = KllFloatsSketch.heapify(Memory.wrap(reference.asInstanceOf[Array[Byte]]))
val comparisonSketch = KllFloatsSketch.heapify(Memory.wrap(comparison.asInstanceOf[Array[Byte]]))
-val keySet = referenceSketch.getQuantiles(bins).union(comparisonSketch.getQuantiles(bins)).toSet.toArray.sorted
+val binsToDoubles = (0 to bins).map(_.toDouble / bins).toArray
+val keySet =
+  referenceSketch.getQuantiles(binsToDoubles).union(comparisonSketch.getQuantiles(binsToDoubles)).distinct.sorted

🛠️ Refactor suggestion

Consider parameter validation and functional improvements.

While the documentation is good, the method could benefit from parameter validation and a more functional approach.

Consider these improvements:

   def PSIKllSketch(reference: AnyRef, comparison: AnyRef, bins: Int = 20, eps: Double = 0.000001): AnyRef = {
+    require(bins > 0, "Number of bins must be positive")
+    require(eps > 0, "Epsilon must be positive")
     if (reference == null || comparison == null) return None
     val referenceSketch = KllFloatsSketch.heapify(Memory.wrap(reference.asInstanceOf[Array[Byte]]))
     val comparisonSketch = KllFloatsSketch.heapify(Memory.wrap(comparison.asInstanceOf[Array[Byte]]))
     val binsToDoubles = (0 to bins).map(_.toDouble / bins).toArray
     val keySet =
       referenceSketch.getQuantiles(binsToDoubles).union(comparisonSketch.getQuantiles(binsToDoubles)).distinct.sorted
     val referencePMF = regularize(referenceSketch.getPMF(keySet), eps)
     val comparisonPMF = regularize(comparisonSketch.getPMF(keySet), eps)
-    var psi = 0.0
-    for (i <- referencePMF.indices) {
-      psi += (referencePMF(i) - comparisonPMF(i)) * Math.log(referencePMF(i) / comparisonPMF(i))
-    }
-    psi.asInstanceOf[AnyRef]
+    referencePMF.zip(comparisonPMF)
+      .map { case (ref, comp) => (ref - comp) * Math.log(ref / comp) }
+      .sum
+      .asInstanceOf[AnyRef]

Also applies to: 170-173

val referencePMF = regularize(referenceSketch.getPMF(keySet), eps)
val comparisonPMF = regularize(comparisonSketch.getPMF(keySet), eps)
var psi = 0.0
-for (i <- 0 until referencePMF.length) {
+for (i <- referencePMF.indices) {
psi += (referencePMF(i) - comparisonPMF(i)) * Math.log(referencePMF(i) / comparisonPMF(i))
}
psi.asInstanceOf[AnyRef]
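The PSI computed above is Σ (pᵢ − qᵢ)·ln(pᵢ/qᵢ) over binned PMFs, with ε-regularization so empty bins never divide by zero. A self-contained sketch of just that arithmetic, using plain arrays in place of KLL sketches — this `regularize` is an illustrative stand-in for Chronon's helper, not its actual implementation:

```scala
// Replace zero (or tiny) probabilities with eps, then renormalize to sum to 1.
def regularize(pmf: Array[Double], eps: Double): Array[Double] = {
  val padded = pmf.map(p => math.max(p, eps))
  val total = padded.sum
  padded.map(_ / total)
}

// Population Stability Index between a reference and a comparison PMF.
// Each term (p - q) * ln(p / q) is non-negative, so PSI >= 0,
// with 0 exactly when the two distributions match bin-for-bin.
def psi(reference: Array[Double], comparison: Array[Double], eps: Double = 1e-6): Double = {
  val ref = regularize(reference, eps)
  val comp = regularize(comparison, eps)
  ref.zip(comp).map { case (p, q) => (p - q) * math.log(p / q) }.sum
}

val uniform = Array(0.25, 0.25, 0.25, 0.25)
val shifted = Array(0.4, 0.3, 0.2, 0.1)
```

Against the rules of thumb quoted in the docstring (PSI < 0.1 "no shift", > 0.25 "significant shift"), identical PMFs give 0 and a visibly shifted one gives a positive score.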
@@ -18,8 +18,8 @@ package ai.chronon.aggregator.test

import ai.chronon.aggregator.base.ApproxPercentiles
import ai.chronon.aggregator.row.StatsGenerator
-import com.yahoo.sketches.kll.KllFloatsSketch
import junit.framework.TestCase
+import org.apache.datasketches.kll.KllFloatsSketch
import org.junit.Assert._
import org.slf4j.Logger
import org.slf4j.LoggerFactory
@@ -28,7 +28,8 @@ import scala.util.Random

class ApproxPercentilesTest extends TestCase {
@transient lazy val logger: Logger = LoggerFactory.getLogger(getClass)
-def testBasicImpl(nums: Int, slide: Int, k: Int, percentiles: Array[Double], errorPercent: Float): Unit = {
+
+def basicImplTestHelper(nums: Int, slide: Int, k: Int, percentiles: Array[Double], errorPercent: Float): Unit = {
val sorted = (0 to nums).map(_.toFloat)
val elems = Random.shuffle(sorted.toList).toArray
val chunks = elems.sliding(slide, slide)
@@ -58,14 +59,14 @@
def testBasicPercentiles: Unit = {
val percentiles_tested: Int = 31
val percentiles: Array[Double] = (0 to percentiles_tested).toArray.map(i => i * 1.0 / percentiles_tested)
-testBasicImpl(3000, 5, 100, percentiles, errorPercent = 4)
-testBasicImpl(30000, 50, 200, percentiles, errorPercent = 2)
-testBasicImpl(30000, 50, 50, percentiles, errorPercent = 5)
+basicImplTestHelper(3000, 5, 100, percentiles, errorPercent = 4)
+basicImplTestHelper(30000, 50, 200, percentiles, errorPercent = 2)
+basicImplTestHelper(30000, 50, 50, percentiles, errorPercent = 5)

🛠️ Refactor suggestion

Consider adding edge case tests

While the current test cases cover a good range of scenarios, consider adding tests for edge cases:

  • Empty or single-element input
  • Extreme percentiles (0.0 and 1.0)
  • Very small k values

Example addition:

basicImplTestHelper(0, 1, 50, Array(0.0, 1.0), errorPercent = 1)
basicImplTestHelper(1, 1, 2, Array(0.0, 0.5, 1.0), errorPercent = 1)

}

def getPSIDrift(sample1: Array[Float], sample2: Array[Float]): Double = {
-val sketch1 = new KllFloatsSketch(200)
-val sketch2 = new KllFloatsSketch(200)
+val sketch1 = KllFloatsSketch.newHeapInstance(200)
+val sketch2 = KllFloatsSketch.newHeapInstance(200)
sample1.map(sketch1.update)
sample2.map(sketch2.update)
val drift = StatsGenerator.PSIKllSketch(sketch1.toByteArray, sketch2.toByteArray).asInstanceOf[Double]
47 changes: 41 additions & 6 deletions aggregator/src/test/scala/ai/chronon/aggregator/test/DataGen.scala
@@ -35,6 +35,12 @@ abstract class CStream[+T: ClassTag] {
else min + ((max - min) * math.random)
}

protected def rollFloat(max: JDouble, min: JDouble = 0, nullRate: Double = 0.1): java.lang.Float = {
val dice: Double = math.random
if (dice < nullRate) null
else (min + ((max - min) * math.random)).toFloat
}

// roll a dice that gives max to min uniformly, with nulls interspersed as per null rate
protected def roll(max: JLong, min: JLong = 0, nullRate: Double = 0.1): JLong = {
val roll = rollDouble(max.toDouble, min.toDouble, nullRate)
@@ -45,9 +51,9 @@
Stream.fill(count)(next())
}

-def chunk(minSize: Long = 0, maxSize: Long = 10, nullRate: Double = 0.1): CStream[Seq[T]] = {
+def chunk(minSize: Long = 0, maxSize: Long = 10, nullRate: Double = 0.1): CStream[Any] = {
def innerNext(): T = next()
-new CStream[Seq[T]] {
+new CStream[Any] {

⚠️ Potential issue

Consider preserving type safety in the chunk method's return type

Changing the return type of chunk from CStream[Seq[T]] to CStream[Any] reduces type safety and may lead to runtime type casting issues. Preserving the specific type CStream[Seq[T]] ensures compile-time checks and maintains clarity about the data being generated.

Apply this diff to restore the original return type and maintain type safety:

-def chunk(minSize: Long = 0, maxSize: Long = 10, nullRate: Double = 0.1): CStream[Any] = {
+def chunk(minSize: Long = 0, maxSize: Long = 10, nullRate: Double = 0.1): CStream[Seq[T]] = {
     def innerNext(): T = next()
     new CStream[Seq[T]] {
         override def next(): Seq[T] = {

Committable suggestion was skipped due to low confidence.

override def next(): Seq[T] = {
val size = roll(minSize, maxSize, nullRate)
if (size != null) {
@@ -58,17 +64,36 @@ abstract class CStream[+T: ClassTag] {
}
}
}

def zipChunk[Other](other: CStream[Other], minSize: Int = 0, maxSize: Int = 20, nullRate: Double = 0.1): CStream[Any] = {
def nextKey(): T = next()
def nextValue(): Other = other.next()

new CStream[Any] {
override def next(): Map[T, Other] = {
val size = roll(minSize, maxSize, nullRate)
if (size != null) {
(0 until size.toInt).map { _ => nextKey() -> nextValue() }.toMap
} else {
null
}
}
}
}

⚠️ Potential issue

Preserve type information in the zipChunk method

The zipChunk method currently returns CStream[Any], which obscures the actual data type being generated. Since the method produces a Map[T, Other], updating the return type to CStream[Map[T, Other]] enhances type safety and usability.

Apply this diff to update the return type and maintain type safety:

-def zipChunk[Other](other: CStream[Other], minSize: Int = 0, maxSize: Int = 20, nullRate: Double = 0.1): CStream[Any] = {
+def zipChunk[Other](other: CStream[Other], minSize: Int = 0, maxSize: Int = 20, nullRate: Double = 0.1): CStream[Map[T, Other]] = {
     def nextKey(): T = next()
     def nextValue(): Other = other.next()

     new CStream[Map[T, Other]] {
-        override def next(): Map[T, Other] = {
+        override def next(): Map[T, Other] = {

Committable suggestion was skipped due to low confidence.

}
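Both review comments above make the same point: widening chunk and zipChunk to CStream[Any] discards the element type, forcing downstream casts. The effect is easy to see with a stripped-down covariant stream — Gen here is a toy illustration, not the Chronon CStream API:

```scala
abstract class Gen[+T] { self =>
  def next(): T

  // Returning Gen[Seq[T]] (not Gen[Any]) keeps compile-time checks downstream.
  // +T is legal here because T only appears in return (covariant) position.
  def chunk(size: Int): Gen[Seq[T]] = new Gen[Seq[T]] {
    def next(): Seq[T] = Seq.fill(size)(self.next())
  }
}

val ints = new Gen[Int] { def next(): Int = 42 }
val chunks: Gen[Seq[Int]] = ints.chunk(3) // no asInstanceOf needed
val total = chunks.next().sum
```

Had chunk returned Gen[Any], the `.sum` call would not compile without a cast, which is exactly the runtime-casting risk the reviewer flags.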

object CStream {
private type JLong = java.lang.Long
private type JDouble = java.lang.Double
private type JFloat = java.lang.Float

def genTimestamps(window: Window,
count: Int,
roundMillis: Int = 1,
maxTs: Long = System.currentTimeMillis()): Array[Long] =
-new CStream.TimeStream(window, roundMillis, maxTs).gen(count).toArray.sorted
+new CStream.TimeStream(window, roundMillis, maxTs).gen(count).toArray.sorted(new Ordering[Any] {
+  override def compare(x: Any, y: Any): Int = x.asInstanceOf[Long].compareTo(y.asInstanceOf[Long])
+})

def genPartitions(count: Int, partitionSpec: PartitionSpec): Array[String] = {
val today = partitionSpec.at(System.currentTimeMillis())
@@ -121,6 +146,11 @@ object CStream {
Option(rollDouble(max, 1, nullRate = nullRate)).map(java.lang.Double.valueOf(_)).orNull
}

class FloatStream(max: Double = 10000, nullRate: Double = 0.1) extends CStream[JFloat] {
override def next(): java.lang.Float =
Option(rollFloat(max, 1, nullRate = nullRate)).map(java.lang.Float.valueOf(_)).orNull
}

class ZippedStream(streams: CStream[Any]*)(tsIndex: Int) extends CStream[TestRow] {
override def next(): TestRow =
new TestRow(streams.map(_.next()).toArray: _*)(tsIndex)
@@ -139,7 +169,7 @@ object CStream {
}

case class Column(name: String, `type`: DataType, cardinality: Int, chunkSize: Int = 10, nullRate: Double = 0.1) {
-def genImpl(dtype: DataType, partitionColumn: String, partitionSpec: PartitionSpec): CStream[Any] =
+def genImpl(dtype: DataType, partitionColumn: String, partitionSpec: PartitionSpec, nullRate: Double): CStream[Any] =
dtype match {
case StringType =>
name match {
Expand All @@ -148,18 +178,23 @@ case class Column(name: String, `type`: DataType, cardinality: Int, chunkSize: I
}
case IntType => new IntStream(cardinality, nullRate)
case DoubleType => new DoubleStream(cardinality, nullRate)
case FloatType => new FloatStream(cardinality, nullRate)
case LongType =>
name match {
case Constants.TimeColumn => new TimeStream(new Window(cardinality, TimeUnit.DAYS))
case _ => new LongStream(cardinality, nullRate)
}
case ListType(elementType) =>
-genImpl(elementType, partitionColumn, partitionSpec).chunk(chunkSize)
+genImpl(elementType, partitionColumn, partitionSpec, nullRate).chunk(chunkSize)
case MapType(keyType, valueType) =>
val keyStream = genImpl(keyType, partitionColumn, partitionSpec, 0)
val valueStream = genImpl(valueType, partitionColumn, partitionSpec, nullRate)
keyStream.zipChunk(valueStream, maxSize = chunkSize)
case otherType => throw new UnsupportedOperationException(s"Can't generate random data for $otherType yet.")
}

def gen(partitionColumn: String, partitionSpec: PartitionSpec): CStream[Any] =
-genImpl(`type`, partitionColumn, partitionSpec)
+genImpl(`type`, partitionColumn, partitionSpec, nullRate)
def schema: (String, DataType) = name -> `type`
}
case class RowsWithSchema(rows: Array[TestRow], schema: Seq[(String, DataType)])
@@ -42,13 +42,13 @@ class TwoStackLiteAggregatorTest extends TestCase{
val topK = new TopK[Integer](IntType, 2)
val bankersBuffer = new TwoStackLiteAggregationBuffer(topK, 5)
assertEquals(null, bankersBuffer.query) // null
-Seq(7, 8, 9).map(x => new Integer(x)).foreach(i => bankersBuffer.push(i))
+Seq(7, 8, 9).map(x => Integer.valueOf(x)).foreach(i => bankersBuffer.push(i))
def assertBufferEquals(a: Seq[Int], b: java.util.ArrayList[Integer]): Unit = {
if(a==null || b == null) {
assertEquals(a, b)
} else {
assertArrayEquals(
-Option(a).map(_.map(x => new Integer(x).asInstanceOf[AnyRef]).toArray).orNull,
+Option(a).map(_.map(x => Integer.valueOf(x).asInstanceOf[AnyRef]).toArray).orNull,
Option(b).map(_.toArray).orNull)
}
}
@@ -59,7 +59,7 @@
assertBufferEquals(Seq(9), bankersBuffer.query)
bankersBuffer.pop()
assertBufferEquals(null, bankersBuffer.query)
-bankersBuffer.push(new Integer(10))
+bankersBuffer.push(Integer.valueOf(10))
assertBufferEquals(Seq(10), bankersBuffer.query)
}

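The `new Integer(x)` → `Integer.valueOf(x)` changes above swap a constructor that has been deprecated since Java 9 for the factory method, which draws small values from the JLS-mandated boxed-integer cache (−128 to 127). A quick illustration of the difference:

```scala
val a = Integer.valueOf(100)
val b = Integer.valueOf(100)
// Values in [-128, 127] come from a shared cache, so the references match.
val cached = a eq b

// Larger values may be freshly allocated, but value equality always holds.
val bigA = Integer.valueOf(1000)
val bigB = Integer.valueOf(1000)
val valueEqual = bigA == bigB
```

Since the test only cares about value equality, the swap is behavior-preserving while silencing the deprecation warning.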
8 changes: 4 additions & 4 deletions api/py/python-api-build.sh
@@ -17,8 +17,8 @@

set -o xtrace

-export CHRONON_VERSION_STR=$1
-export CHRONON_BRANCH_STR=$2
+export VERSION=$1
+export BRANCH=$2
ACTION=$3

echo "Finding working directory.."
@@ -37,8 +37,8 @@ elif [[ "${ACTION}" == "release" ]]; then
PYPI_REPOSITORY="chronon-pypi"
# Sanity checks, git state, Run Tests, Build, Release
# Make sure the version string doesn't contain SNAPSHOT if so it signifies development build and cannot be released.
-if [[ "${CHRONON_VERSION_STR}" == *"SNAPSHOT"* ]]; then
-echo "Python releases cannot be done for in development versions. Version: ${CHRONON_VERSION_STR}"
+if [[ "${VERSION}" == *"SNAPSHOT"* ]]; then
+echo "Python releases cannot be done for in development versions. Version: ${VERSION}"

⚠️ Potential issue

Fix variable name inconsistency in version check.

The script exports CHRONON_VERSION_STR but uses VERSION in the SNAPSHOT check. This mismatch will cause the version validation to fail.

Apply this fix:

-  if [[ "${VERSION}" == *"SNAPSHOT"* ]]; then
-    echo "Python releases cannot be done for in development versions. Version: ${VERSION}"
+  if [[ "${CHRONON_VERSION_STR}" == *"SNAPSHOT"* ]]; then
+    echo "Python releases cannot be done for in development versions. Version: ${CHRONON_VERSION_STR}"
📝 Committable suggestion


Suggested change
-if [[ "${VERSION}" == *"SNAPSHOT"* ]]; then
-echo "Python releases cannot be done for in development versions. Version: ${VERSION}"
+if [[ "${CHRONON_VERSION_STR}" == *"SNAPSHOT"* ]]; then
+echo "Python releases cannot be done for in development versions. Version: ${CHRONON_VERSION_STR}"

💡 Codebase verification

I can now provide the final response based on the analysis of the code:

Bug found: VERSION variable mismatch in python-api-build.sh

There's an inconsistency in the version variable usage in python-api-build.sh:

  • Line 21 exports CHRONON_VERSION_STR=$1
  • Line 40 checks ${VERSION} instead of ${CHRONON_VERSION_STR}

This is a bug as the script exports and uses different variable names for the same purpose, which will cause the SNAPSHOT version check to fail silently since ${VERSION} will always be empty.

🔗 Analysis chain

Verify version variable usage across related files.

Let's ensure consistent version variable naming across related files.


Let me analyze the version variable usage more deeply by checking the actual implementation and usage context.

The results will help us understand if there's any inconsistency in version handling between these files and if the VERSION check in python-api-build.sh is correctly implemented.

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Description: Check for inconsistent version variable usage
# Expected: All files should use the same version variable name

# Search for both version variable names in Python and shell scripts
echo "Checking version variable usage in setup.py and related files:"
rg -l "CHRONON_VERSION_STR|VERSION" --type py --type sh

echo -e "\nChecking specific version variable usage:"
rg "(CHRONON_VERSION_STR|VERSION)\s*=" --type py --type sh

Length of output: 672


Script:

#!/bin/bash
# Check how VERSION is used in python-api-build.sh
echo "Content of python-api-build.sh:"
cat api/py/python-api-build.sh

echo -e "\nChecking VERSION usage in setup.py:"
cat api/py/setup.py

Length of output: 4855

exit 1
fi
echo "Running tests, git check, build and release..."
4 changes: 2 additions & 2 deletions api/py/setup.py
@@ -29,8 +29,8 @@
__version__ = "local"
__branch__ = "main"
def get_version():
-version_str = os.environ.get("CHRONON_VERSION_STR", __version__)
-branch_str = os.environ.get("CHRONON_BRANCH_STR", __branch__)
+version_str = os.environ.get("VERSION", __version__)
+branch_str = os.environ.get("BRANCH", __branch__)

💡 Codebase verification

Update required in python-api-build.sh script for environment variable changes

The verification has identified that api/py/python-api-build.sh is using the old environment variable names (CHRONON_VERSION_STR and CHRONON_BRANCH_STR). This script needs to be updated to match the new simplified variable names (VERSION and BRANCH). No other files in the codebase appear to be using these environment variables.

  • api/py/python-api-build.sh: Update the environment variable exports:
    - export CHRONON_VERSION_STR=$1
    - export CHRONON_BRANCH_STR=$2
    + export VERSION=$1
    + export BRANCH=$2
🔗 Analysis chain

Environment variable names have been simplified.

The change to use simpler environment variable names (VERSION and BRANCH) is good for maintainability. However, this is a breaking change that requires coordination.

Let's verify the impact of this change:

Consider the following to ensure a smooth transition:

  1. Update CI/CD configurations that set these environment variables
  2. Update build scripts that reference these variables
  3. Update documentation to reflect the new variable names
  4. Consider adding temporary backward compatibility by checking both old and new variable names during a transition period

Would you like me to help create a GitHub issue to track these tasks?


Let me check the content of the identified shell script to understand the impact better.


Let me check if there are any CI/CD configuration files that might be using these environment variables.

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Description: Check for any references to the old environment variable names
# that need to be updated.

echo "Checking for old environment variable references..."
rg -l "CHRONON_VERSION_STR|CHRONON_BRANCH_STR"

echo "Checking for any shell scripts that might be setting these variables..."
fd -e sh -x grep -l "CHRONON_VERSION_STR|CHRONON_BRANCH_STR" {}

Length of output: 409


Script:

#!/bin/bash
# Check the content of the build script that uses these variables
cat api/py/python-api-build.sh

Length of output: 2204


Script:

#!/bin/bash
# Check for CI/CD configuration files that might reference these variables
echo "Checking GitHub Actions workflows..."
fd -e yml -e yaml -p '.github/workflows' -x grep -l "CHRONON_VERSION_STR\|CHRONON_BRANCH_STR" {}

echo "Checking for other potential CI/CD configs..."
fd -e yml -e yaml -x grep -l "CHRONON_VERSION_STR\|CHRONON_BRANCH_STR" {}

echo "Checking for any Dockerfiles..."
fd Dockerfile -x grep -l "CHRONON_VERSION_STR\|CHRONON_BRANCH_STR" {}

Length of output: 507

# Replace "-SNAPSHOT" with ".dev"
version_str = version_str.replace("-SNAPSHOT", ".dev")
# If the prefix is the branch name, then convert it as suffix after '+' to make it Python PEP440 complaint
4 changes: 3 additions & 1 deletion api/src/main/scala/ai/chronon/api/Builders.scala
@@ -275,7 +275,9 @@ object Builders {
result.setProduction(production)
result.setCustomJson(customJson)
result.setOutputNamespace(namespace)
-result.setTeam(Option(team).getOrElse("chronon"))
+val effectiveTeam = Option(team).getOrElse(name.split("\\.").headOption.getOrElse("chronon"))
+
+result.setTeam(effectiveTeam)
result.setHistoricalBackfill(historicalBackill)
if (dependencies != null)
result.setDependencies(dependencies.toSeq.toJava)
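The new fallback in Builders resolves the team in three steps: use the explicit team if given, otherwise take the first dot-separated segment of the metadata name, and only then default to "chronon". A standalone sketch of that resolution order (the helper name is illustrative):

```scala
// Resolution order: explicit team > first segment of the name > "chronon".
def effectiveTeam(team: String, name: String): String =
  Option(team).getOrElse(name.split("\\.").headOption.getOrElse("chronon"))
```

One subtlety worth noting: `"".split("\\.")` returns `Array("")`, so for any non-null name the `headOption` branch always yields something (possibly the empty string) and the final `"chronon"` default is effectively unreachable — likely harmless here since metadata names are non-empty dotted paths.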
1 change: 1 addition & 0 deletions api/src/main/scala/ai/chronon/api/Constants.scala
@@ -48,6 +48,7 @@ object Constants {
val MutationAvroFields: Seq[StructField] = Seq(TimeField, ReversalField)
val MutationAvroColumns: Seq[String] = MutationAvroFields.map(_.name)
val MutationFields: Seq[StructField] = Seq(MutationTimeField, ReversalField)
val TileColumn: String = "_tile"
val TimedKvRDDKeySchemaKey: String = "__keySchema"
val TimedKvRDDValueSchemaKey: String = "__valueSchema"
val StatsKeySchema: StructType = StructType("keySchema", Array(StructField("JoinPath", StringType)))
13 changes: 9 additions & 4 deletions api/src/main/scala/ai/chronon/api/Extensions.scala
@@ -43,14 +43,16 @@ object Extensions {
implicit class TimeUnitOps(timeUnit: TimeUnit) {
def str: String =
timeUnit match {
-case TimeUnit.HOURS => "h"
-case TimeUnit.DAYS => "d"
+case TimeUnit.HOURS   => "h"
+case TimeUnit.DAYS    => "d"
+case TimeUnit.MINUTES => "m"
}

def millis: Long =
timeUnit match {
-case TimeUnit.HOURS => 3600 * 1000
-case TimeUnit.DAYS => 24 * 3600 * 1000
+case TimeUnit.HOURS   => 3600 * 1000
+case TimeUnit.DAYS    => 24 * 3600 * 1000
+case TimeUnit.MINUTES => 60 * 1000
}
}
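With MINUTES added, the conversion table reads: minutes → 60 000 ms, hours → 3 600 000 ms, days → 86 400 000 ms. A dependency-free sketch mirroring the mapping — Chronon's TimeUnit is a Thrift enum, so this toy ADT is only a stand-in:

```scala
sealed trait TimeUnitT
case object Minutes extends TimeUnitT
case object Hours   extends TimeUnitT
case object Days    extends TimeUnitT

// Same arithmetic as the TimeUnitOps.millis extension in the diff.
def millis(tu: TimeUnitT): Long = tu match {
  case Minutes => 60L * 1000
  case Hours   => 3600L * 1000
  case Days    => 24L * 3600 * 1000
}
```

Exhaustive matching on the sealed trait is what makes this extension safe: adding MINUTES to the enum without extending both `str` and `millis` would surface as a compiler warning rather than a runtime MatchError in production.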

@@ -108,6 +110,9 @@ object Extensions {
def outputFinalView: String = s"${metaData.outputNamespace}.${metaData.cleanName}_labeled"
def outputLatestLabelView: String = s"${metaData.outputNamespace}.${metaData.cleanName}_labeled_latest"
def loggedTable: String = s"${outputTable}_logged"
def summaryTable: String = s"${outputTable}_summary"
def packedSummaryTable: String = s"${outputTable}_summary_packed"
def driftTable: String = s"${outputTable}_drift"

def bootstrapTable: String = s"${outputTable}_bootstrap"
