Commit d0064b2

Switch to bazel scalaFmt for the CI workflow (#360)
## Summary

Modified our GitHub workflow to run scalafmt checks using Bazel instead of sbt, and deleted the build.sbt file since it is no longer needed.

## Checklist

- [ ] Added Unit Tests
- [x] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **Chores**
  - Streamlined build and continuous integration setups, transitioning away from legacy tooling.
  - Modernized internal infrastructure for improved consistency and stability.
- **Refactor / Style**
  - Enhanced code readability with comprehensive cosmetic and documentation updates.
  - Unified formatting practices across the codebase to support future maintainability.
  - Adjusted formatting of comments and code blocks for improved clarity without altering functionality.
- **Tests**
  - Reformatted test suites for clarity and consistency while preserving all functional behaviors.
  - Improved formatting in various test cases and methods for better readability without altering functionality.
Parent: 353b590

117 files changed (+1407, −2086 lines)


.scalafmt.conf

Lines changed: 2 additions & 1 deletion
```diff
@@ -3,4 +3,5 @@ align.openParenCallSite = true
 align.openParenDefnSite = true
 danglingParentheses.defnSite = false
 danglingParentheses.callSite = false
-maxColumn = 120
+docstrings.wrap = false
+maxColumn = 120
```
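The added `docstrings.wrap = false` line disables scalafmt's re-wrapping of Scaladoc bodies; together with the formatter's docstring style it accounts for the comment-only churn in the Scala diffs below, where the summary sentence moves onto the `/**` line. A minimal before/after sketch, drawn from the StatsGenerator diff further down (illustrative only; exact output depends on the scalafmt version in use):

```scala
// Before formatting: summary on its own line, stray "* */" closer.
/**
  * InputTransform acts as a signal of how to process the metric.
  * */

// After formatting: summary joined onto the /** line, closer normalized.
/** InputTransform acts as a signal of how to process the metric.
  */
object InputTransform extends Enumeration {
  val IsNull, Raw, One = Value
}
```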

WORKSPACE

Lines changed: 5 additions & 4 deletions
```diff
@@ -91,19 +91,20 @@ scala_maven_import_external(
 )
 
 load("@io_bazel_rules_scala//scala:scala.bzl", "scala_repositories")
-
 scala_repositories()
 
 load("@io_bazel_rules_scala//scala:toolchains.bzl", "scala_register_toolchains")
-
 scala_register_toolchains()
 
 load("@io_bazel_rules_scala//testing:scalatest.bzl", "scalatest_repositories", "scalatest_toolchain")
-
 scalatest_repositories()
-
 scalatest_toolchain()
 
+# For scalafmt
+load("@io_bazel_rules_scala//scala/scalafmt:scalafmt_repositories.bzl", "scalafmt_default_config", "scalafmt_repositories")
+scalafmt_default_config()
+scalafmt_repositories()
+
 # For Protobuf support
 http_archive(
     name = "rules_proto",
```

aggregator/BUILD.bazel

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,6 +1,7 @@
 scala_library(
     name = "lib",
     srcs = glob(["src/main/**/*.scala"]),
+    format = True,
     visibility = ["//visibility:public"],
     deps = [
         "//api:lib",
@@ -49,6 +50,7 @@ test_deps = [
 scala_library(
     name = "test_lib",
     srcs = glob(["src/test/**/*.scala"]),
+    format = True,
     visibility = ["//visibility:public"],
     deps = test_deps,
 )
```

aggregator/src/main/scala/ai/chronon/aggregator/base/SimpleAggregators.scala

Lines changed: 7 additions & 8 deletions
```diff
@@ -435,14 +435,13 @@ class FrequentItems[T: FrequentItemsFriendly](val mapSize: Int, val errorType: E
     val items = ir.sketch.getFrequentItems(errorType).map(sk => sk.getItem -> sk.getEstimate)
     val heap = mutable.PriorityQueue[(T, Long)]()(Ordering.by(_._2))
 
-    items.foreach({
-      case (key, value) =>
-        if (heap.size < mapSize) {
-          heap.enqueue((key, value))
-        } else if (heap.head._2 < value) {
-          heap.dequeue()
-          heap.enqueue((key, value))
-        }
+    items.foreach({ case (key, value) =>
+      if (heap.size < mapSize) {
+        heap.enqueue((key, value))
+      } else if (heap.head._2 < value) {
+        heap.dequeue()
+        heap.enqueue((key, value))
+      }
     })
 
     val result = new util.HashMap[String, Long]()
```
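The reformatted block keeps only the `mapSize` most frequent sketch items with a size-bounded priority queue. A self-contained sketch of that top-K pattern (hypothetical `topK` helper, not part of the file; the ordering is reversed so the heap head is the smallest retained count and is the natural eviction candidate):

```scala
import scala.collection.mutable

// Keep the k entries with the highest counts. Reversing the ordering turns the
// queue into a min-heap on counts, so `head` is always the cheapest element to
// evict when a larger count arrives.
def topK[T](items: Seq[(T, Long)], k: Int): Seq[(T, Long)] = {
  val heap = mutable.PriorityQueue.empty[(T, Long)](Ordering.by[(T, Long), Long](_._2).reverse)
  items.foreach { case (key, count) =>
    if (heap.size < k) {
      heap.enqueue((key, count))
    } else if (heap.head._2 < count) {
      heap.dequeue() // drop the current smallest count
      heap.enqueue((key, count))
    }
  }
  heap.toList.sortBy(-_._2) // highest counts first
}
```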

aggregator/src/main/scala/ai/chronon/aggregator/row/RowAggregator.scala

Lines changed: 25 additions & 26 deletions
```diff
@@ -36,32 +36,31 @@ class RowAggregator(val inputSchema: Seq[(String, DataType)], val aggregationPar
   val indices: Range = 0 until length
   // has to be array for fast random access
   val columnAggregators: Array[ColumnAggregator] = {
-    aggregationParts.zipWithIndex.map {
-      case (spec: AggregationPart, aggregatorIndex: Int) =>
-        val ((_, inputType), inputIndex) = {
-          inputSchema.zipWithIndex.find(_._1._1 == spec.inputColumn).get
-        }
-
-        val bucketIndex: Option[Int] = Option(spec.bucket).map { bucketCol =>
-          val bIndex = inputSchema.indexWhere(_._1 == bucketCol)
-          assert(bIndex != -1, s"bucketing column: $bucketCol is not found in input: ${inputSchema.map(_._1)}")
-          val bucketType = inputSchema(bIndex)._2
-          assert(bucketType == StringType, s"bucketing column: $bucketCol needs to be a string, but found $bucketType")
-          bIndex
-        }
-        try {
-          ColumnAggregator.construct(
-            inputType,
-            spec,
-            ColumnIndices(inputIndex, aggregatorIndex),
-            bucketIndex
-          )
-        } catch {
-          case e: Exception =>
-            throw new RuntimeException(
-              s"Failed to create ${spec.operation} aggregator for ${spec.inputColumn} column of type $inputType",
-              e)
-        }
+    aggregationParts.zipWithIndex.map { case (spec: AggregationPart, aggregatorIndex: Int) =>
+      val ((_, inputType), inputIndex) = {
+        inputSchema.zipWithIndex.find(_._1._1 == spec.inputColumn).get
+      }
+
+      val bucketIndex: Option[Int] = Option(spec.bucket).map { bucketCol =>
+        val bIndex = inputSchema.indexWhere(_._1 == bucketCol)
+        assert(bIndex != -1, s"bucketing column: $bucketCol is not found in input: ${inputSchema.map(_._1)}")
+        val bucketType = inputSchema(bIndex)._2
+        assert(bucketType == StringType, s"bucketing column: $bucketCol needs to be a string, but found $bucketType")
+        bIndex
+      }
+      try {
+        ColumnAggregator.construct(
+          inputType,
+          spec,
+          ColumnIndices(inputIndex, aggregatorIndex),
+          bucketIndex
+        )
+      } catch {
+        case e: Exception =>
+          throw new RuntimeException(
+            s"Failed to create ${spec.operation} aggregator for ${spec.inputColumn} column of type $inputType",
+            e)
+      }
     }
   }.toArray
 
```
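For orientation, the bucket handling in the block above resolves an optional bucketing column by name and fails fast when it is missing or not a string column. A standalone sketch of that lookup (hypothetical helper, with simplified `(name, type-name)` schema pairs in place of Chronon's `DataType`):

```scala
// Hypothetical, simplified restatement of the bucket-column lookup above.
def bucketIndex(inputSchema: Seq[(String, String)], bucket: Option[String]): Option[Int] =
  bucket.map { bucketCol =>
    val bIndex = inputSchema.indexWhere(_._1 == bucketCol)
    assert(bIndex != -1, s"bucketing column: $bucketCol is not found in input: ${inputSchema.map(_._1)}")
    val bucketType = inputSchema(bIndex)._2
    assert(bucketType == "string", s"bucketing column: $bucketCol needs to be a string, but found $bucketType")
    bIndex
  }
```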

aggregator/src/main/scala/ai/chronon/aggregator/row/StatsGenerator.scala

Lines changed: 16 additions & 22 deletions
```diff
@@ -25,8 +25,7 @@ import org.apache.datasketches.memory.Memory
 import java.util
 import scala.collection.Seq
 
-/**
-  * Module managing FeatureStats Schema, Aggregations to be used by type and aggregator construction.
+/** Module managing FeatureStats Schema, Aggregations to be used by type and aggregator construction.
   *
   * Stats Aggregation has an offline/ batch component and an online component.
   * The metrics defined for stats depend on the schema of the join. The dataTypes and column names.
@@ -45,32 +44,29 @@ object StatsGenerator {
   val finalizedPercentilesSeries: Array[Double] = Array(0.05, 0.25, 0.5, 0.75, 0.95)
   val ignoreColumns: Seq[String] = Seq(api.Constants.TimeColumn, "ds", "date_key", "date", "datestamp")
 
-  /**
-    * InputTransform acts as a signal of how to process the metric.
+  /** InputTransform acts as a signal of how to process the metric.
     *
     * IsNull: Check if the input is null.
     *
     * Raw: Operate in the input column.
     *
     * One: lit(true) in spark. Used for row counts leveraged to obtain null rate values.
-    * */
+    */
   object InputTransform extends Enumeration {
     type InputTransform = Value
     val IsNull, Raw, One = Value
   }
   import InputTransform._
 
-  /**
-    * MetricTransform represents a single statistic built on top of an input column.
+  /** MetricTransform represents a single statistic built on top of an input column.
     */
   case class MetricTransform(name: String,
                              expression: InputTransform,
                              operation: api.Operation,
                              suffix: String = "",
                              argMap: util.Map[String, String] = null)
 
-  /**
-    * Post processing for finalized values or IRs when generating a time series of stats.
+  /** Post processing for finalized values or IRs when generating a time series of stats.
     * In the case of percentiles for examples we reduce to 5 values in order to generate candlesticks.
     */
   def SeriesFinalizer(key: String, value: AnyRef): AnyRef = {
@@ -115,17 +111,16 @@
   /** For the schema of the data define metrics to be aggregated */
   def buildMetrics(fields: Seq[(String, api.DataType)]): Seq[MetricTransform] = {
     val metrics = fields
-      .flatMap {
-        case (name, dataType) =>
-          if (ignoreColumns.contains(name)) {
-            Seq.empty
-          } else if (api.DataType.isNumeric(dataType) && dataType != api.ByteType) {
-            // ByteTypes are not supported due to Avro Encodings and limited support on aggregators.
-            // Needs to be casted on source if required.
-            numericTransforms(name)
-          } else {
-            anyTransforms(name)
-          }
+      .flatMap { case (name, dataType) =>
+        if (ignoreColumns.contains(name)) {
+          Seq.empty
+        } else if (api.DataType.isNumeric(dataType) && dataType != api.ByteType) {
+          // ByteTypes are not supported due to Avro Encodings and limited support on aggregators.
+          // Needs to be casted on source if required.
+          numericTransforms(name)
+        } else {
+          anyTransforms(name)
+        }
       }
       .sortBy(_.name)
     metrics :+ MetricTransform(totalColumn, InputTransform.One, api.Operation.COUNT)
@@ -147,8 +142,7 @@
     linfSimple.asInstanceOf[AnyRef]
   }
 
-  /**
-    * PSI is a measure of the difference between two probability distributions.
+  /** PSI is a measure of the difference between two probability distributions.
     * However, it's not defined for cases where a bin can have zero elements in either distribution
     * (meant for continuous measures). In order to support PSI for discrete measures we add a small eps value to
     * perturb the distribution in bins.
```

aggregator/src/main/scala/ai/chronon/aggregator/windowing/Resolution.scala

Lines changed: 2 additions & 3 deletions
```diff
@@ -64,10 +64,9 @@ object DailyResolution extends Resolution {
 
 object ResolutionUtils {
 
-  /**
-    * Find the smallest tail window resolution in a GroupBy. Returns 1D if the GroupBy does not define any windows (all-time aggregates).
+  /** Find the smallest tail window resolution in a GroupBy. Returns 1D if the GroupBy does not define any windows (all-time aggregates).
     * The window resolutions are: 5 min for a GroupBy a window < 12 hrs, 1 hr for < 12 days, 1 day for > 12 days.
-    * */
+    */
   def getSmallestWindowResolutionInMillis(groupBy: GroupBy): Long =
     Option(
       groupBy.aggregations.toScala.toArray
```

aggregator/src/main/scala/ai/chronon/aggregator/windowing/SawtoothMutationAggregator.scala

Lines changed: 4 additions & 10 deletions
```diff
@@ -26,17 +26,14 @@ import scala.collection.mutable
 case class BatchIr(collapsed: Array[Any], tailHops: HopsAggregator.IrMapType)
 case class FinalBatchIr(collapsed: Array[Any], tailHops: HopsAggregator.OutputArrayType)
 
-/**
-  * Mutations processing starts with an end of the day snapshot FinalBatchIR.
+/** Mutations processing starts with an end of the day snapshot FinalBatchIR.
   * On top of this FinalBatchIR mutations are processed.
   *
-  *
   * update/merge/finalize are related to snapshot data. As such they follow the snapshot Schema
   * and aggregators.
   * However mutations come into play later in the group by and a finalized version of the snapshot
   * data is created to be processed with the mutations rows.
   * Since the dataframe inputs are aligned between mutations and snapshot (input) no additional schema is needed.
-  *
   */
 class SawtoothMutationAggregator(aggregations: Seq[Aggregation],
                                  inputSchema: Seq[(String, DataType)],
@@ -106,8 +103,7 @@ class SawtoothMutationAggregator(aggregations: Seq[Aggregation],
   def finalizeSnapshot(batchIr: BatchIr): FinalBatchIr =
     FinalBatchIr(batchIr.collapsed, Option(batchIr.tailHops).map(hopsAggregator.toTimeSortedArray).orNull)
 
-  /**
-    * Go through the aggregators and update or delete the intermediate with the information of the row if relevant.
+  /** Go through the aggregators and update or delete the intermediate with the information of the row if relevant.
     * Useful for both online and mutations
     */
   def updateIr(ir: Array[Any], row: Row, queryTs: Long, hasReversal: Boolean = false): Unit = {
@@ -142,8 +138,7 @@ class SawtoothMutationAggregator(aggregations: Seq[Aggregation],
     }
   }
 
-  /**
-    * Update the intermediate results with tail hops data from a FinalBatchIr.
+  /** Update the intermediate results with tail hops data from a FinalBatchIr.
     */
   def mergeTailHops(ir: Array[Any], queryTs: Long, batchEndTs: Long, batchIr: FinalBatchIr): Array[Any] = {
     var i: Int = 0
@@ -171,8 +166,7 @@ class SawtoothMutationAggregator(aggregations: Seq[Aggregation],
     ir
   }
 
-  /**
-    * Given aggregations FinalBatchIRs at the end of the Snapshot (batchEndTs) and mutation and query times,
+  /** Given aggregations FinalBatchIRs at the end of the Snapshot (batchEndTs) and mutation and query times,
     * determine the values at the query times for the aggregations.
     * This is pretty much a mix of online with extra work for multiple queries ts support.
     */
```
