fix: remove references to custom json, float essential apis to top #492

Merged · 14 commits · Mar 13, 2025

Changes from 3 commits
9 changes: 7 additions & 2 deletions api/src/main/scala/ai/chronon/api/Builders.scala
@@ -264,14 +264,14 @@ object Builders {
online: Boolean = false,
production: Boolean = false,
customJson: String = null,
dependencies: Seq[String] = null,
namespace: String = null,
team: String = null,
samplePercent: Double = 100,
consistencySamplePercent: Double = 5,
tableProperties: Map[String, String] = Map.empty,
historicalBackfill: Boolean = true,
driftSpec: DriftSpec = null
driftSpec: DriftSpec = null,
additionalOutputPartitionColumns: Seq[String] = Seq.empty
): MetaData = {
val result = new MetaData()
result.setName(name)
@@ -298,6 +298,11 @@ object Builders {
result.setTableProperties(tableProperties.toJava)
if (driftSpec != null)
result.setDriftSpec(driftSpec)

if (additionalOutputPartitionColumns.nonEmpty) {
result.setAdditionalOutputPartitionColumns(additionalOutputPartitionColumns.toJava)
}

result
}
}
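A quick usage sketch of the new builder parameter (a minimal example; the name, namespace, and column names are illustrative):

import ai.chronon.api.Builders

// Output will be partitioned by the date column plus region and device.
val metaData = Builders.MetaData(
  name = "test.additional_partitions",
  namespace = "chronon_test",
  additionalOutputPartitionColumns = Seq("region", "device")
)
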
20 changes: 3 additions & 17 deletions api/src/main/scala/ai/chronon/api/Extensions.scala
@@ -154,20 +154,6 @@ object Extensions {
@deprecated("Use `name` instead.")
def nameToFilePath: String = metaData.name.replaceFirst("\\.", "/")

// helper function to extract values from customJson
def customJsonLookUp(key: String): Any = {
if (metaData.customJson == null) return null
val mapper = new ObjectMapper()
val typeRef = new TypeReference[java.util.HashMap[String, Object]]() {}
val jMap: java.util.Map[String, Object] = mapper.readValue(metaData.customJson, typeRef)
jMap.toScala.get(key).orNull
}

def owningTeam: String = {
val teamOverride = Try(customJsonLookUp(Constants.TeamOverride).asInstanceOf[String]).toOption
teamOverride.getOrElse(metaData.team)
}

// if drift spec is set but tile size is not set, default to 30 minutes
def driftTileSize: Option[Window] = {
Option(metaData.getDriftSpec) match {
@@ -462,9 +448,9 @@ object Extensions {

// Check if tiling is enabled for a given GroupBy. Tiling is on when streamWriteStrategy is SIMPLE_TILES or unset.
def isTilingEnabled: Boolean =
Contributor: can delete I think as this was moved to flagStore

Contributor Author: done

groupBy.getMetaData.customJsonLookUp("enable_tiling") match {
case s: Boolean => s
case _ => false
Option(groupBy.getMetaData.streamWriteStrategy) match {
case Some(StreamWriteStrategy.SIMPLE_TILES) | None => true
case _ => false
}

def semanticHash: String = {
18 changes: 0 additions & 18 deletions api/src/test/scala/ai/chronon/api/test/ExtensionsTest.scala
@@ -41,24 +41,6 @@ class ExtensionsTest extends AnyFlatSpec {
)
}

it should "owning team" in {
val metadata =
Builders.MetaData(
customJson = "{\"check_consistency\": true, \"lag\": 0, \"team_override\": \"ml_infra\"}",
team = "chronon"
)

assertEquals(
"ml_infra",
metadata.owningTeam
)

assertEquals(
"chronon",
metadata.team
)
}

it should "row identifier" in {
val labelPart = Builders.LabelPart();
val res = labelPart.rowIdentifier(Arrays.asList("yoyo", "yujia"), "ds")
20 changes: 16 additions & 4 deletions api/thrift/api.thrift
@@ -249,13 +249,19 @@ struct MetaData {

4: optional string outputNamespace

5: optional map<string, string> tableProperties
/**
* By default the output is partitioned only by the date column - set via "spark.chronon.partition.column".
* With this set, the output is additionally partitioned by the specified columns.
**/
5: optional list<string> additionalOutputPartitionColumns

6: optional map<string, string> tableProperties
Comment on lines +256 to +258

Contributor: Should we keep tableProperties the same field number as before (5)

Contributor Author: thought outputPartitionCols are more important than table props. Safe to change these for now actually.

Contributor: I guess we should call out that folks need to recompile their existing configs right?

Contributor Author: good call - we need to release a wheel and cut them over. was working on it separately. (basically I am doing the compile for them)

// tag_key -> tag_value - tags allow for repository wide querying, deprecations etc
// this is object level tag - applies to all columns produced by the object - GroupBy, Join, Model etc
6: optional map<string, string> tags
20: optional map<string, string> tags
Contributor: Field number here too?

Contributor Author: the spacing in field nums allows for adding new fields in the right order. so if we find some other thing later, we can add it in the right place instead of at the end.

// column -> tag_key -> tag_value
7: optional map<string, map<string, string>> columnTags
21: optional map<string, map<string, string>> columnTags

// marking this as true means that the conf can be served online
// once marked online, a conf cannot be changed - compiling the conf won't be allowed
@@ -284,9 +290,15 @@ struct MetaData {

# information that needs to be present on every physical node
204: optional common.ExecutionInfo executionInfo
}

300: optional StreamWriteStrategy streamWriteStrategy
}

enum StreamWriteStrategy {
RAW,
SIMPLE_TILES,
CUMULATIVE_AND_TILES,
}

// Equivalent to a FeatureSet in chronon terms
struct GroupBy {
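For context, a minimal sketch of how the new streamWriteStrategy field replaces the old enable_tiling customJson flag, mirroring the isTilingEnabled change above (assumes the standard thrift-generated accessors on MetaData):

import ai.chronon.api.{MetaData, StreamWriteStrategy}

// Tiling is on when the strategy is SIMPLE_TILES or left unset,
// matching the pattern match in Extensions.isTilingEnabled.
def tilingEnabled(metaData: MetaData): Boolean =
  Option(metaData.streamWriteStrategy) match {
    case Some(StreamWriteStrategy.SIMPLE_TILES) | None => true
    case _                                             => false
  }

val md = new MetaData()
assert(tilingEnabled(md)) // unset defaults to tiled writes

md.setStreamWriteStrategy(StreamWriteStrategy.RAW)
assert(!tilingEnabled(md)) // RAW opts out of tiling
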
18 changes: 4 additions & 14 deletions online/src/main/scala/ai/chronon/online/MetadataEndPoint.scala
@@ -25,22 +25,12 @@ object MetadataEndPoint {
val ConfByKeyEndPointName = "CHRONON_METADATA"
val NameByTeamEndPointName = "CHRONON_ENTITY_BY_TEAM"

private def getTeamFromMetadata(metaData: MetaData): String = {
val team = metaData.team
if (metaData.customJson != null && metaData.customJson.nonEmpty) {
implicit val formats = DefaultFormats
val customJson = parse(metaData.customJson)
val teamFromJson: String = (customJson \ "team_override").extractOpt[String].getOrElse("")
if (teamFromJson.nonEmpty) teamFromJson else team
} else team
}

private def parseTeam[Conf <: TBase[_, _]: Manifest: ClassTag](conf: Conf): String = {
conf match {
case join: Join => "joins/" + getTeamFromMetadata(join.metaData)
case groupBy: GroupBy => "group_bys/" + getTeamFromMetadata(groupBy.metaData)
case stagingQuery: StagingQuery => "staging_queries/" + getTeamFromMetadata(stagingQuery.metaData)
case model: Model => "models/" + getTeamFromMetadata(model.metaData)
case join: Join => "joins/" + join.metaData.team
case groupBy: GroupBy => "group_bys/" + groupBy.metaData.team
case stagingQuery: StagingQuery => "staging_queries/" + stagingQuery.metaData.team
case model: Model => "models/" + model.metaData.team
case _ =>
logger.error(s"Failed to parse team from $conf")
throw new Exception(s"Failed to parse team from $conf")
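A small sketch of the keys parseTeam now produces (team names illustrative; assumes the thrift-generated setMetaData/metaData accessors on Join):

import ai.chronon.api.{Builders, Join}

// The endpoint key now depends only on metaData.team;
// a customJson {"team_override": ...} no longer takes effect.
val join = new Join()
join.setMetaData(Builders.MetaData(name = "my.join", team = "chronon"))
val key = "joins/" + join.metaData.team // "joins/chronon"
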
6 changes: 3 additions & 3 deletions online/src/main/scala/ai/chronon/online/Metrics.scala
@@ -98,7 +98,7 @@ object Metrics {
environment = environment,
join = join.metaData.cleanName,
production = join.metaData.isProduction,
team = join.metaData.owningTeam
team = join.metaData.team
)
}

@@ -108,7 +108,7 @@
groupBy = groupBy.metaData.cleanName,
production = groupBy.metaData.isProduction,
accuracy = groupBy.inferredAccuracy,
team = groupBy.metaData.owningTeam,
team = groupBy.metaData.team,
join = groupBy.sources.toScala
.find(_.isSetJoinSource)
.map(_.getJoinSource.join.metaData.cleanName)
@@ -127,7 +127,7 @@
environment = environment,
groupBy = stagingQuery.metaData.cleanName,
production = stagingQuery.metaData.isProduction,
team = stagingQuery.metaData.owningTeam
team = stagingQuery.metaData.team
)
}

6 changes: 1 addition & 5 deletions spark/src/main/scala/ai/chronon/spark/StagingQuery.scala
@@ -35,11 +35,7 @@ class StagingQuery(stagingQueryConf: api.StagingQuery, endPartition: String, tab
.orNull

private val partitionCols: Seq[String] =
Seq(Option(stagingQueryConf.getPartitionColumn).getOrElse(tableUtils.partitionColumn)) ++
Option(stagingQueryConf.metaData.customJsonLookUp(key = "additional_partition_cols"))
.getOrElse(new java.util.ArrayList[String]())
.asInstanceOf[java.util.ArrayList[String]]
.toScala
Seq(tableUtils.partitionColumn) ++ stagingQueryConf.metaData.additionalOutputPartitionColumns.toScala
Contributor Author: this is the main change that ben needs


def computeStagingQuery(stepDays: Option[Int] = None,
enableAutoExpand: Option[Boolean] = Some(true),
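To make the effect concrete: with additionalOutputPartitionColumns = Seq("region", "device"), partitionCols resolves to the date column followed by region and device. A minimal sketch of the resulting partitioned write, assuming a plain Spark partitionBy path (the actual write goes through TableUtils and may differ):

import org.apache.spark.sql.{DataFrame, SaveMode}

// Hypothetical write step: the date partition column first,
// then any additionalOutputPartitionColumns from the MetaData.
def writePartitioned(df: DataFrame, outputTable: String, partitionCols: Seq[String]): Unit =
  df.write
    .mode(SaveMode.Overwrite)
    .partitionBy(partitionCols: _*)
    .saveAsTable(outputTable)

// e.g. writePartitioned(result, "ns.test_additional_partition_cols", Seq("ds", "region", "device"))
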
87 changes: 85 additions & 2 deletions spark/src/test/scala/ai/chronon/spark/test/StagingQueryTest.scala
@@ -25,14 +25,14 @@ import ai.chronon.spark.SparkSessionBuilder
import ai.chronon.spark.StagingQuery
import ai.chronon.spark.TableUtils
import org.apache.spark.sql.SparkSession
import org.junit.Assert.assertEquals
import org.junit.Assert.{assertEquals, assertTrue}
import org.scalatest.flatspec.AnyFlatSpec
import org.slf4j.Logger
import org.slf4j.LoggerFactory

class StagingQueryTest extends AnyFlatSpec {
@transient lazy val logger: Logger = LoggerFactory.getLogger(getClass)
lazy val spark: SparkSession = SparkSessionBuilder.build("StagingQueryTest", local = true)
implicit lazy val spark: SparkSession = SparkSessionBuilder.build("StagingQueryTest", local = true)
implicit private val tableUtils: TableUtils = TableUtils(spark)

private val today = tableUtils.partitionSpec.at(System.currentTimeMillis())
@@ -285,4 +285,87 @@
}
assertEquals(0, diff.count())
}

private def getPartitionColumnNames(tableName: String)(implicit spark: SparkSession): Seq[String] = {
// Get the catalog table information
val tableIdentifier = spark.sessionState.sqlParser.parseTableIdentifier(tableName)
val catalogTable = spark.sessionState.catalog.getTableMetadata(tableIdentifier)

// Extract partition column names from the table schema
catalogTable.partitionColumnNames
}

it should "handle additional output partition columns" in {
val schema = List(
Column("user", StringType, 10),
Column("region", StringType, 5, nullRate = 0.0), // partition columns cannot have null
Column("device", StringType, 3, nullRate = 0.0), // partition columns cannot have null
Column("session_length", IntType, 1000)
)

// Generate test data with columns that can be used for additional partitioning
val df = DataFrameGen
.events(spark, schema, count = 10000, partitions = 20)
.dropDuplicates("ts")
logger.info("Generated test data for additional partition columns:")
df.show()

val tableName = s"$namespace.test_additional_partition_cols"
df.save(tableName)

// Define a staging query with multiple additional partition columns
val stagingQueryConf = Builders.StagingQuery(
query = s"select * from $tableName WHERE ds BETWEEN {{ start_date }} AND {{ end_date }}",
startPartition = ninetyDaysAgo,
metaData = Builders.MetaData(
name = "test.additional_partitions",
namespace = namespace,
additionalOutputPartitionColumns = Seq("region", "device"), // Explicitly specify additional partition columns
tableProperties = Map("key" -> "val")
)
)

val stagingQuery = new StagingQuery(stagingQueryConf, today, tableUtils)
stagingQuery.computeStagingQuery(stepDays = Option(30))

// Verify the data was written correctly
val expected = tableUtils.sql(
s"select * from $tableName where ds between '$ninetyDaysAgo' and '$today'"
)

val computed = tableUtils.sql(s"select * from ${stagingQueryConf.metaData.outputTable}")
val diff = Comparison.sideBySide(expected, computed, List("user", "ts", "ds"))

val diffCount = diff.count()
if (diffCount > 0) {
logger.info("Different rows between expected and computed")

logger.info("Expected rows")
expected.show()

logger.info("Computed rows")
computed.show()

logger.info("Diff rows (SxS)")
diff.show()
}

assertEquals(0, diff.count())

// Verify the table was created with the additional partition columns
val tableDesc = spark.sql(s"DESCRIBE ${stagingQueryConf.metaData.outputTable}")
val partitionInfo = spark.sql(s"SHOW PARTITIONS ${stagingQueryConf.metaData.outputTable}")

logger.info("Table description:")
tableDesc.show()
logger.info("Partition information:")
partitionInfo.show()

// Get the partition column names from the table metadata
val partitionColumnNames = getPartitionColumnNames(stagingQueryConf.metaData.outputTable)(spark)

// Verify all expected partition columns are present
val expectedPartitionCols = Seq(tableUtils.partitionColumn, "region", "device")
assertEquals(expectedPartitionCols.toSet, partitionColumnNames.toSet)
}
}