
Commit 2cfe42d

[WIP] Drift metrics (#59)
## Summary

## Checklist
- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **New Features**
  - Introduced a method for converting objects to a pretty-printed JSON string format.
  - Added functionality for calculating drift metrics between `TileSummary` instances.
  - Enhanced drift analysis capabilities with new metrics and structures.
  - New endpoints for model prediction and model drift in the API.
  - Introduced utility functions for transforming and aggregating data related to `TileSummary` and `TileDrift`.
  - Enhanced metadata handling with new constants and improved dataset references.
  - Added a method for processing percentiles and breakpoints to generate interval assignments.
- **Bug Fixes**
  - Improved error handling in various methods for better clarity and logging.
- **Refactor**
  - Renamed variables and methods for clarity and consistency.
  - Updated method signatures to accommodate new features and improve usability.
  - Consolidated import statements for better organization.
  - Removed deprecated objects and methods to streamline functionality.
- **Tests**
  - Added comprehensive unit tests for drift metrics and pivot functionality.
  - Enhanced test coverage for new and modified features.
  - Removed outdated tests and added new tests for handling key mappings in joins.
1 parent 6bd677f commit 2cfe42d


44 files changed: +1704 additions, −964 deletions

api/src/main/scala/ai/chronon/api/Builders.scala

Lines changed: 5 additions & 2 deletions
@@ -267,7 +267,8 @@ object Builders {
         samplePercent: Double = 100,
         consistencySamplePercent: Double = 5,
         tableProperties: Map[String, String] = Map.empty,
-        historicalBackill: Boolean = true
+        historicalBackfill: Boolean = true,
+        driftSpec: DriftSpec = null
     ): MetaData = {
       val result = new MetaData()
       result.setName(name)
@@ -283,7 +284,7 @@
       }

       result.setTeam(effectiveTeam)
-      result.setHistoricalBackfill(historicalBackill)
+      result.setHistoricalBackfill(historicalBackfill)
       if (dependencies != null)
         result.setDependencies(dependencies.toSeq.toJava)
       if (samplePercent > 0)
@@ -292,6 +293,8 @@
       result.setConsistencySamplePercent(consistencySamplePercent)
       if (tableProperties.nonEmpty)
         result.setTableProperties(tableProperties.toJava)
+      if (driftSpec != null)
+        result.setDriftSpec(driftSpec)
       result
     }
   }
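The new `driftSpec` parameter defaults to `null`, so existing call sites compile unchanged and drift computation stays opt-in. A minimal opt-in sketch (the join name and spec values are illustrative, not from this commit):

```scala
import ai.chronon.api.{Builders, DriftMetric, DriftSpec, TimeUnit, Window}

// hypothetical values - any tile size / metric combination works
val spec = new DriftSpec()
spec.setTileSize(new Window(30, TimeUnit.MINUTES))
spec.setDriftMetric(DriftMetric.JENSEN_SHANNON)

val metaData = Builders.MetaData(
  name = "my_team.my_join.v1", // hypothetical join name
  driftSpec = spec             // defaults to null, so this is opt-in
)
```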

api/src/main/scala/ai/chronon/api/Constants.scala

Lines changed: 5 additions & 2 deletions
@@ -37,7 +37,7 @@ object Constants {
   val ChrononDynamicTable = "chronon_dynamic_table"
   val ChrononOOCTable: String = "chronon_ooc_table"
   val ChrononLogTable: String = "chronon_log_table"
-  val ChrononMetadataKey = "ZIPLINE_METADATA"
+  val MetadataDataset = "CHRONON_METADATA"
   val SchemaPublishEvent = "SCHEMA_PUBLISH_EVENT"
   val StatsBatchDataset = "CHRONON_STATS_BATCH"
   val ConsistencyMetricsDataset = "CHRONON_CONSISTENCY_METRICS_STATS_BATCH"
@@ -62,5 +62,8 @@ object Constants {
   val LabelViewPropertyFeatureTable: String = "feature_table"
   val LabelViewPropertyKeyLabelTable: String = "label_table"
   val ChrononRunDs: String = "CHRONON_RUN_DS"
-  val DriftStatsTable: String = "drift_statistics"
+
+  val TiledSummaryDataset: String = "TILE_SUMMARIES"
+
+  val DefaultDriftTileSize: Window = new Window(30, TimeUnit.MINUTES)
 }

api/src/main/scala/ai/chronon/api/Extensions.scala

Lines changed: 71 additions & 17 deletions
@@ -158,6 +158,15 @@ object Extensions {
       val teamOverride = Try(customJsonLookUp(Constants.TeamOverride).asInstanceOf[String]).toOption
       teamOverride.getOrElse(metaData.team)
     }
+
+    // if drift spec is set but tile size is not set, default to 30 minutes
+    def driftTileSize: Option[Window] = {
+      Option(metaData.getDriftSpec) match {
+        case Some(driftSpec) =>
+          Option(driftSpec.getTileSize).orElse(Some(Constants.DefaultDriftTileSize))
+        case None => None
+      }
+    }
   }

   // one per output column - so single window
@@ -879,24 +888,69 @@
       partHashes ++ Map(leftSourceKey -> leftHash, join.metaData.bootstrapTable -> bootstrapHash) ++ derivedHashMap
     }

-    /*
-      External features computed in online env and logged
-      This method will get the external feature column names
-     */
-    def getExternalFeatureCols: Seq[String] = {
-      Option(join.onlineExternalParts)
-        .map(_.toScala
-          .map { part =>
-            {
-              val keys = part.source.getKeySchema.params.toScala
-                .map(_.name)
-              val values = part.source.getValueSchema.params.toScala
-                .map(_.name)
-              keys ++ values
+    def externalPartColumns: Map[String, Array[String]] =
+      Option(join.onlineExternalParts) match {
+        case Some(parts) =>
+          parts.toScala.map { part =>
+            val keys = part.source.getKeySchema.params.toScala.map(_.name)
+            val values = part.source.getValueSchema.params.toScala.map(_.name)
+            part.fullName -> (keys ++ values).toArray
+          }.toMap
+        case None => Map.empty
+      }
+
+    def derivedColumns: Array[String] =
+      Option(join.getDerivations) match {
+        case Some(derivations) =>
+          derivations.toScala.flatMap { derivation =>
+            derivation.getName match {
+              case "*" => None
+              case _   => Some(derivation.getName)
             }
-          }
-          .flatMap(_.toSet))
-        .getOrElse(Seq.empty)
+          }.toArray
+        case None => Array.empty
+      }
+
+    // renamed cols are no longer part of the output
+    private def renamedColumns: Set[String] =
+      Option(join.derivations)
+        .map {
+          _.toScala.renameOnlyDerivations.map(_.expression).toSet
+        }
+        .getOrElse(Set.empty)
+
+    def joinPartColumns: Map[String, Array[String]] =
+      Option(join.getJoinParts) match {
+        case None => Map.empty
+        case Some(parts) =>
+          parts.toScala.map { part =>
+            val prefix = Option(part.prefix)
+            val groupByName = part.getGroupBy.getMetaData.cleanName
+            val partName = (prefix.toSeq :+ groupByName).mkString("_")
+
+            val outputColumns = part.getGroupBy.valueColumns
+            val cols = outputColumns.map { column =>
+              (prefix.toSeq :+ groupByName :+ column).mkString("_")
+            }
+            partName -> cols
+          }.toMap
+      }
+
+    def outputColumnsByGroup: Map[String, Array[String]] = {
+      val preDeriveCols = (joinPartColumns ++ externalPartColumns)
+      val preDerivedWithoutRenamed = preDeriveCols.mapValues(_.filterNot(renamedColumns.contains))
+      val derivedColumns: Array[String] = Option(join.derivations) match {
+        case Some(derivations) => derivations.toScala.map { _.getName }.filter(_ == "*").toArray
+        case None              => Array.empty
+      }
+      preDerivedWithoutRenamed ++ Map("derivations" -> derivedColumns)
+    }
+
+    def keyColumns: Array[String] = {
+      val joinPartKeys = join.joinParts.toScala.flatMap(_.groupBy.keyColumns.toScala).toSet
+      val externalKeys = join.onlineExternalParts.toScala.flatMap(_.source.keyNames).toSet
+      val bootstrapKeys = join.bootstrapParts.toScala.flatMap(_.keyColumns.toScala).toSet
+      (joinPartKeys ++ externalKeys ++ bootstrapKeys).toArray
     }

     /*
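The new `driftTileSize` helper only yields a tile size when a `DriftSpec` is present, and falls back to `Constants.DefaultDriftTileSize` (30 minutes) when the spec omits one. A small sketch of that contract, assuming the usual `Extensions._` implicit import brings the helper into scope on `MetaData`:

```scala
import ai.chronon.api.Extensions._

val md = new MetaData()
assert(md.driftTileSize.isEmpty)            // no driftSpec => drift is off, no tiling

md.setDriftSpec(new DriftSpec())            // spec present, tileSize left unset
assert(md.driftTileSize.contains(Constants.DefaultDriftTileSize)) // 30-minute default
```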

api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala

Lines changed: 8 additions & 0 deletions
@@ -25,6 +25,7 @@ import ai.chronon.api.thrift.protocol.TSimpleJSONProtocol
 import com.fasterxml.jackson.databind.DeserializationFeature
 import com.fasterxml.jackson.databind.JsonNode
 import com.fasterxml.jackson.databind.ObjectMapper
+import com.google.gson.GsonBuilder
 import org.slf4j.Logger
 import org.slf4j.LoggerFactory

@@ -48,6 +49,13 @@ object ThriftJsonCodec {
     new String(serializer.serialize(obj), Constants.UTF8)
   }

+  @transient private lazy val prettyGson = new GsonBuilder().setPrettyPrinting().create()
+  def toPrettyJsonStr[T <: TBase[_, _]: Manifest](obj: T): String = {
+    val raw = toJsonStr(obj)
+    val je = prettyGson.fromJson(raw, classOf[com.google.gson.JsonElement])
+    prettyGson.toJson(je)
+  }
+
   def toJsonList[T <: TBase[_, _]: Manifest](obj: util.List[T]): String = {
     if (obj == null) return ""
     obj.toScala
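`toPrettyJsonStr` round-trips the compact Thrift-JSON output through Gson purely for indentation; it changes formatting, not content. A usage sketch (the field value is illustrative):

```scala
val md = new MetaData()
md.setName("my_team.my_join.v1") // hypothetical value

val compact: String = ThriftJsonCodec.toJsonStr(md)      // single-line JSON
val pretty: String = ThriftJsonCodec.toPrettyJsonStr(md) // same JSON, indented
println(pretty)
```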

api/thrift/api.thrift

Lines changed: 83 additions & 5 deletions
@@ -234,16 +234,12 @@ enum Cardinality {
 +----------------------------------+-------------------+----------------+----------------------------------+
 | Hellinger Distance               | 0.1 - 0.25        | > 0.25         | Ranges from 0 to 1               |
 +----------------------------------+-------------------+----------------+----------------------------------+
-| Kolmogorov-Smirnov (K-S)         | 0.1 - 0.2         | > 0.2          | Ranges from 0 to 1               |
-| Distance                         |                   |                |                                  |
-+----------------------------------+-------------------+----------------+----------------------------------+
 | Population Stability Index (PSI) | 0.1 - 0.2         | > 0.2          | Industry standard in some fields |
 +----------------------------------+-------------------+----------------+----------------------------------+
 **/
 enum DriftMetric {
     JENSEN_SHANNON = 0,
     HELLINGER = 1,
-    KOLMOGOROV_SMIRNOV = 2,
     PSI = 3
 }

@@ -254,7 +250,10 @@ struct TileKey {
     4: optional i64 sizeMillis
 }

-struct TileSummaries {
+// summary of distribution & coverage etc for a given (table, column, slice, tileWindow)
+// for categorical types, distribution is histogram, otherwise percentiles
+// we also handle container types by counting inner value distribution and inner value coverage
+struct TileSummary {
     1: optional list<double> percentiles
     2: optional map<string, i64> histogram
     3: optional i64 count
@@ -269,6 +268,72 @@
     8: optional list<i32> stringLengthPercentiles
 }

+struct TileSeriesKey {
+    1: optional string column    // name of the column - avg_txns
+    2: optional string slice     // value of the slice - merchant_category
+    3: optional string groupName // name of the columnGroup within node, for join - joinPart name, externalPart name etc
+    4: optional string nodeName  // name of the node - join name etc
+}
+
+// array of tuples of (TileSummary, timestamp) ==(pivot)==> TileSummarySeries
+struct TileSummarySeries {
+    1: optional list<list<double>> percentiles
+    2: optional map<string, list<i64>> histogram
+    3: optional list<i64> count
+    4: optional list<i64> nullCount
+
+    // for container types
+    5: optional list<i64> innerCount // total of number of entries within all containers of this column
+    6: optional list<i64> innerNullCount
+    7: optional list<list<i32>> lengthPercentiles
+
+    // high cardinality string type
+    8: optional list<list<i32>> stringLengthPercentiles
+
+    200: optional list<i64> timestamps
+    300: optional TileSeriesKey key
+}
+
+// (DriftMetric + old TileSummary + new TileSummary) = TileDrift
+struct TileDrift {
+
+    // for continuous values - scalar values or within containers
+    // (lists - for eg. via last_k or maps for eg. via bucketing)
+    1: optional double percentileDrift
+    // for categorical values - scalar values or within containers
+    2: optional double histogramDrift
+
+    // for all types
+    3: optional double countChangePercent
+    4: optional double nullRatioChangePercent
+
+    // additional tracking for container types
+    5: optional double innerCountChangePercent // total of number of entries within all containers of this column
+    6: optional double innerNullCountChangePercent
+    7: optional double lengthPercentilesDrift
+
+    // additional tracking for string types
+    8: optional double stringLengthPercentilesDrift
+}
+
+// PivotUtils.pivot(Array[(Long, TileDrift)]) = TileDriftSeries
+// used in front end after this is computed
+struct TileDriftSeries {
+    1: optional list<double> percentileDriftSeries
+    2: optional list<double> histogramDriftSeries
+    3: optional list<double> countChangePercentSeries
+    4: optional list<double> nullRatioChangePercentSeries
+
+    5: optional list<double> innerCountChangePercentSeries
+    6: optional list<double> innerNullCountChangePercentSeries
+    7: optional list<double> lengthPercentilesDriftSeries
+    8: optional list<double> stringLengthPercentilesDriftSeries
+
+    200: optional list<i64> timestamps
+
+    300: optional TileSeriesKey key
+}
+
 struct DriftSpec {
     // slices is another key to summarize the data with - besides the column & slice
     // currently supports only one slice
@@ -279,9 +344,19 @@
     // likes_over_dislines = IF(dislikes > likes, 1, 0)
     // or any other expression that you care about
     2: optional map<string, string> derivations
+
     // we measure the unique counts of the columns and decide if they are categorical and numeric
     // you can use this to override that decision by setting cardinality hints
     3: optional map<string, Cardinality> columnCardinalityHints
+
+    4: optional Window tileSize
+
+    // the current tile summary will be compared with older summaries using the metric
+    // if the drift is more than the threshold, we will raise an alert
+    5: optional list<Window> lookbackWindows
+
+    // default drift metric to use
+    6: optional DriftMetric driftMetric = DriftMetric.JENSEN_SHANNON
 }

 struct MetaData {
@@ -315,6 +390,9 @@
     // Flag to indicate whether join backfill should backfill previous holes.
     // Setting to false will only backfill latest single partition
     14: optional bool historicalBackfill
+
+    // specify how to compute drift
+    15: optional DriftSpec driftSpec
 }

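The series structs invert the storage layout: a list of per-tile structs becomes one struct of per-field lists, aligned against a single `timestamps` list, which is the shape a charting front end wants. `PivotUtils.pivot` itself is not shown in this commit section, so the sketch below is an assumption about its shape, limited to two of the `TileDrift` fields:

```scala
import scala.collection.JavaConverters._

// assumed shape of the pivot: Array[(timestampMillis, TileDrift)] => TileDriftSeries
def pivotSketch(drifts: Array[(Long, TileDrift)]): TileDriftSeries = {
  val sorted = drifts.sortBy(_._1)
  val series = new TileDriftSeries()
  // one timestamps list, aligned index-by-index with every value series
  series.setTimestamps(sorted.map(d => java.lang.Long.valueOf(d._1)).toSeq.asJava)
  series.setPercentileDriftSeries(
    sorted.map(d => java.lang.Double.valueOf(d._2.getPercentileDrift)).toSeq.asJava)
  series
}
```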

build.sbt

Lines changed: 1 addition & 1 deletion
@@ -122,7 +122,7 @@ lazy val online = project
       "com.datadoghq" % "java-dogstatsd-client" % "4.4.1",
       "org.rogach" %% "scallop" % "5.1.0",
       "net.jodah" % "typetools" % "0.6.3",
-      "com.github.ben-manes.caffeine" % "caffeine" % "3.1.8"
+      "com.github.ben-manes.caffeine" % "caffeine" % "3.1.8",
     ),
     libraryDependencies ++= jackson,
     libraryDependencies ++= spark_all.map(_ % "provided"),

cloud_aws/src/main/scala/ai/chronon/integrations/aws/DynamoDBKVStoreImpl.scala

Lines changed: 2 additions & 2 deletions
@@ -127,7 +127,7 @@ class DynamoDBKVStoreImpl(dynamoDbClient: DynamoDbClient) extends KVStore {
   override def multiGet(requests: Seq[KVStore.GetRequest]): Future[Seq[KVStore.GetResponse]] = {
     // partition our requests into pure get style requests (where we're missing timestamps and only have key lookup)
     // and query requests (we want to query a range based on afterTsMillis -> endTsMillis or now() )
-    val (getLookups, queryLookups) = requests.partition(r => r.afterTsMillis.isEmpty)
+    val (getLookups, queryLookups) = requests.partition(r => r.startTsMillis.isEmpty)
     val getItemRequestPairs = getLookups.map { req =>
       val keyAttributeMap = primaryKeyMap(req.keyBytes)
       (req, GetItemRequest.builder.key(keyAttributeMap.asJava).tableName(req.dataset).build)
@@ -325,7 +325,7 @@
     val partitionAlias = "#pk"
     val timeAlias = "#ts"
     val attrNameAliasMap = Map(partitionAlias -> partitionKeyColumn, timeAlias -> sortKeyColumn)
-    val startTs = request.afterTsMillis.get
+    val startTs = request.startTsMillis.get
     val endTs = request.endTsMillis.getOrElse(System.currentTimeMillis())
     val attrValuesMap =
       Map(

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/BigTableKVStoreImpl.scala

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ class BigTableKVStoreImpl(projectId: String, instanceId: String) extends KVStore

     val queryTime = System.currentTimeMillis()
     // scan from afterTsMillis to now - skip events with future timestamps
-    request.afterTsMillis.foreach { ts =>
+    request.startTsMillis.foreach { ts =>
       // Bigtable uses microseconds
       query.filter(Filters.FILTERS.timestamp().range().startOpen(ts * 1000).endClosed(queryTime))
     }

docs/source/setup/Online_Integration.md

Lines changed: 2 additions & 2 deletions
@@ -18,8 +18,8 @@ If you'd to start with an example, please refer to the [MongoDB Implementation i

 ```scala
 object KVStore {
-  // `afterTsMillis` implies that this is a range scan of all values with `timestamp` >= to the specified one. This can be implemented efficiently, if `timestamp` can be a secondary key. Some databases have a native version id concept which also can map to timestamp.
-  case class GetRequest(keyBytes: Array[Byte], dataset: String, afterTsMillis: Option[Long] = None)
+  // `startTsMillis` implies that this is a range scan of all values with `timestamp` >= to the specified one. This can be implemented efficiently, if `timestamp` can be a secondary key. Some databases have a native version id concept which also can map to timestamp.
+  case class GetRequest(keyBytes: Array[Byte], dataset: String, startTsMillis: Option[Long] = None)

   // response is a series of values that are
   case class TimedValue(bytes: Array[Byte], millis: Long)
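With the rename, a request carrying `startTsMillis` is treated as a range scan (optionally bounded by `endTsMillis`), while a request without it stays a plain point lookup, as the DynamoDB partition above shows. A usage sketch (the key bytes and one-hour window are illustrative):

```scala
val oneHourAgo = System.currentTimeMillis() - 3600 * 1000L

val request = KVStore.GetRequest(
  keyBytes = "some-entity-key".getBytes("UTF-8"), // hypothetical key
  dataset = Constants.TiledSummaryDataset,
  startTsMillis = Some(oneHourAgo) // scan values with timestamp >= oneHourAgo
)
```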

online/src/main/java/ai/chronon/online/JavaFetcher.java

Lines changed: 0 additions & 12 deletions
@@ -141,18 +141,6 @@ private Metrics.Context getGroupByContext(String groupByName) {
     return new Metrics.Context("group_by.fetch", null, groupByName, null, false, null, null, null, null);
   }

-  public CompletableFuture<JavaSeriesStatsResponse> fetchStatsTimeseries(JavaStatsRequest request) {
-    Future<Fetcher.SeriesStatsResponse> response = this.fetcher.fetchStatsTimeseries(request.toScalaRequest());
-    // Convert responses to CompletableFuture
-    return FutureConverters.toJava(response).toCompletableFuture().thenApply(JavaFetcher::toJavaSeriesStatsResponse);
-  }
-
-  public CompletableFuture<JavaSeriesStatsResponse> fetchLogStatsTimeseries(JavaStatsRequest request) {
-    Future<Fetcher.SeriesStatsResponse> response = this.fetcher.fetchLogStatsTimeseries(request.toScalaRequest());
-    // Convert responses to CompletableFuture
-    return FutureConverters.toJava(response).toCompletableFuture().thenApply(JavaFetcher::toJavaSeriesStatsResponse);
-  }
-
   public CompletableFuture<JavaSeriesStatsResponse> fetchConsistencyMetricsTimeseries(JavaStatsRequest request) {
     Future<Fetcher.SeriesStatsResponse> response = this.fetcher.fetchConsistencyMetricsTimeseries(request.toScalaRequest());
     // Convert responses to CompletableFuture

online/src/main/scala/ai/chronon/online/Api.scala

Lines changed: 3 additions & 3 deletions
@@ -46,7 +46,7 @@ object KVStore {
   // endTsMillis - end range of the scan (starts from afterTsMillis to endTsMillis)
   case class GetRequest(keyBytes: Array[Byte],
                         dataset: String,
-                        afterTsMillis: Option[Long] = None,
+                        startTsMillis: Option[Long] = None,
                         endTsMillis: Option[Long] = None)
   case class TimedValue(bytes: Array[Byte], millis: Long)
   case class GetResponse(request: GetRequest, values: Try[Seq[TimedValue]]) {
@@ -261,7 +261,7 @@ abstract class Api(userConf: Map[String, String]) extends Serializable {
                   callerName: String = null,
                   disableErrorThrows: Boolean = false): Fetcher =
     new Fetcher(genKvStore,
-                Constants.ChrononMetadataKey,
+                Constants.MetadataDataset,
                 logFunc = responseConsumer,
                 debug = debug,
                 externalSourceRegistry = externalRegistry,
@@ -272,7 +272,7 @@

   final def buildJavaFetcher(callerName: String = null, disableErrorThrows: Boolean = false): JavaFetcher = {
     new JavaFetcher(genKvStore,
-                    Constants.ChrononMetadataKey,
+                    Constants.MetadataDataset,
                     timeoutMillis,
                     responseConsumer,
                     externalRegistry,
