
Commit 6c17647

Switch to env var driven approach
1 parent fdd910e commit 6c17647

File tree

4 files changed

+159
-138
lines changed


.circleci/config.yml

Lines changed: 29 additions & 0 deletions
@@ -78,6 +78,35 @@ jobs:
           destination: spark_warehouse.tar.gz
           when: on_fail
 
+  # run these separately as we need an isolated JVM to not have Spark session settings interfere with other runs
+  "Scala 13 -- Delta Lake Format Tests":
+    executor: docker_baseimg_executor
+    steps:
+      - checkout
+      - run:
+          name: Run Scala 13 tests for Delta Lake format
+          environment:
+            format_test: deltalake
+          shell: /bin/bash -leuxo pipefail
+          command: |
+            conda activate chronon_py
+            # Increase if we see OOM.
+            export SBT_OPTS="-XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=4G -Xmx4G -Xms2G"
+            sbt '++ 2.13.6' "testOnly ai.chronon.spark.test.TableUtilsFormatTest"
+      - store_test_results:
+          path: /chronon/spark/target/test-reports
+      - store_test_results:
+          path: /chronon/aggregator/target/test-reports
+      - run:
+          name: Compress spark-warehouse
+          command: |
+            cd /tmp/ && tar -czvf spark-warehouse.tar.gz chronon/spark-warehouse
+          when: on_fail
+      - store_artifacts:
+          path: /tmp/spark-warehouse.tar.gz
+          destination: spark_warehouse.tar.gz
+          when: on_fail
+
   "Scala 11 -- Compile":
     executor: docker_baseimg_executor
     steps:
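
The job above isolates the Delta Lake run in its own JVM and communicates the desired format purely through the `format_test` environment variable. As a rough sketch of that contract (the object name here is illustrative, not part of the commit), any code running in that JVM can resolve the format like this, falling back to Hive when the variable is unset:

object FormatEnvCheck {
  def main(args: Array[String]): Unit = {
    // The CI job exports format_test=deltalake; local runs without it default to "hive".
    val format = sys.env.getOrElse("format_test", "hive")
    println(s"TableUtilsFormatTest will exercise the '$format' table format")
  }
}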

spark/src/main/scala/ai/chronon/spark/ChrononKryoRegistrator.scala

Lines changed: 3 additions & 3 deletions
@@ -147,9 +147,9 @@ class ChrononKryoRegistrator extends KryoRegistrator {
       "org.apache.spark.sql.catalyst.InternalRow$$anonfun$getAccessor$8",
       "org.apache.spark.sql.catalyst.InternalRow$$anonfun$getAccessor$5",
       "scala.collection.immutable.ArraySeq$ofRef",
-      "org.apache.spark.sql.catalyst.expressions.GenericInternalRow"
-      // "org.apache.spark.sql.delta.stats.DeltaFileStatistics",
-      // "org.apache.spark.sql.delta.actions.AddFile"
+      "org.apache.spark.sql.catalyst.expressions.GenericInternalRow",
+      "org.apache.spark.sql.delta.stats.DeltaFileStatistics",
+      "org.apache.spark.sql.delta.actions.AddFile"
     )
     names.foreach { name =>
       try {
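
For context, the surrounding registrator walks this list of class names inside a try block, so entries that are absent from the classpath (for example the Delta classes in a build without delta-spark) are simply skipped. A minimal sketch of that pattern, assuming resolution via Class.forName and Kryo's standard register(Class) API (the helper name is illustrative):

import com.esotericsoftware.kryo.Kryo

// Sketch: register classes by name, silently skipping any that are missing
// from the current classpath (e.g. Delta classes in a Hive-only build).
def registerByName(kryo: Kryo, names: Seq[String]): Unit =
  names.foreach { name =>
    try kryo.register(Class.forName(name))
    catch { case _: ClassNotFoundException => () }
  }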

spark/src/main/scala/ai/chronon/spark/SparkSessionBuilder.scala

Lines changed: 20 additions & 4 deletions
@@ -22,14 +22,14 @@ import org.apache.spark.SPARK_VERSION
 
 import java.io.File
 import java.util.logging.Logger
-import scala.reflect.io.Path
 import scala.util.Properties
 
 object SparkSessionBuilder {
   @transient private lazy val logger = LoggerFactory.getLogger(getClass)
 
   private val warehouseId = java.util.UUID.randomUUID().toString.takeRight(6)
   private val DefaultWarehouseDir = new File("/tmp/chronon/spark-warehouse_" + warehouseId)
+  val FormatTestEnvVar: String = "format_test"
 
   def expandUser(path: String): String = path.replaceFirst("~", System.getProperty("user.home"))
   // we would want to share locally generated warehouse during CI testing
@@ -38,6 +38,23 @@
             localWarehouseLocation: Option[String] = None,
             additionalConfig: Option[Map[String, String]] = None,
             enforceKryoSerializer: Boolean = true): SparkSession = {
+
+    // allow us to override the format by specifying env vars. This allows us to not have to worry about interference
+    // between Spark sessions created in existing chronon tests that need the hive format and some specific tests
+    // that require a format override like delta lake.
+    val formatConfigs = sys.env.get(FormatTestEnvVar) match {
+      case Some("deltalake") =>
+        Map(
+          "spark.sql.extensions" -> "io.delta.sql.DeltaSparkSessionExtension",
+          "spark.sql.catalog.spark_catalog" -> "org.apache.spark.sql.delta.catalog.DeltaCatalog",
+          "spark.chronon.table_write.format" -> "delta"
+        )
+      case _ => Map.empty
+    }
+
+    // tack on format configs with additional configs
+    val mergedConfigs = additionalConfig.getOrElse(Map.empty) ++ formatConfigs
+
     if (local) {
       //required to run spark locally with hive support enabled - for sbt test
       System.setSecurityManager(null)
@@ -65,9 +82,8 @@
         .config("spark.kryoserializer.buffer.max", "2000m")
         .config("spark.kryo.referenceTracking", "false")
     }
-    additionalConfig.foreach { configMap =>
-      configMap.foreach { config => baseBuilder = baseBuilder.config(config._1, config._2) }
-    }
+
+    mergedConfigs.foreach { config => baseBuilder = baseBuilder.config(config._1, config._2) }
 
     if (SPARK_VERSION.startsWith("2")) {
       // Otherwise files left from deleting the table with the same name result in test failures
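
One detail worth noting: because `formatConfigs` sits on the right-hand side of `++`, the env-var-driven settings take precedence over any overlapping keys passed in via `additionalConfig`. A small standalone illustration of that merge behaviour (the literal values are made up for the example):

// Scala's Map ++ keeps the right-hand value on key collisions, so the
// env-driven format configs win over caller-supplied settings.
val additionalConfig = Map("spark.chronon.table_write.format" -> "hive",
                           "spark.ui.enabled"                 -> "false")
val formatConfigs    = Map("spark.chronon.table_write.format" -> "delta")

val mergedConfigs = additionalConfig ++ formatConfigs
assert(mergedConfigs("spark.chronon.table_write.format") == "delta")
assert(mergedConfigs("spark.ui.enabled") == "false")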

spark/src/test/scala/ai/chronon/spark/test/TableUtilsFormatTest.scala

Lines changed: 107 additions & 131 deletions
@@ -1,169 +1,144 @@
 package ai.chronon.spark.test
 
 import ai.chronon.api.{DoubleType, IntType, LongType, StringType, StructField, StructType}
+import ai.chronon.spark.SparkSessionBuilder.FormatTestEnvVar
 import ai.chronon.spark.test.TestUtils.makeDf
-import ai.chronon.spark.{DeltaLake, Format, Hive, IncompatibleSchemaException, SparkSessionBuilder, TableUtils}
-import org.apache.spark.SparkContext
+import ai.chronon.spark.{IncompatibleSchemaException, SparkSessionBuilder, TableUtils}
 import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SparkSession}
 import org.junit.Assert.{assertEquals, assertTrue}
-import org.scalatest.BeforeAndAfterEach
 import org.scalatest.funsuite.AnyFunSuite
-import org.scalatest.prop.TableDrivenPropertyChecks._
 
 import scala.util.Try
 
-class TestTableUtils(sparkSession: SparkSession, format: Format) extends TableUtils(sparkSession) {
-  override def getWriteFormat: Format = format
-}
-
-class TableUtilsFormatTest extends AnyFunSuite with BeforeAndAfterEach {
+class TableUtilsFormatTest extends AnyFunSuite {
 
   import TableUtilsFormatTest._
 
-  val deltaConfigMap = Map(
-    "spark.sql.extensions" -> "io.delta.sql.DeltaSparkSessionExtension",
-    "spark.sql.catalog.spark_catalog" -> "org.apache.spark.sql.delta.catalog.DeltaCatalog",
-  )
-  val hiveConfigMap = Map.empty[String, String]
-
-  // TODO: include Hive + Iceberg support in these tests
-  val formats =
-    Table(
-      ("format", "configs"),
-      (DeltaLake, deltaConfigMap),
-      (Hive, hiveConfigMap)
-    )
-
-  private def withSparkSession[T](configs: Map[String, String])(test: SparkSession => T): T = {
-    val spark = SparkSessionBuilder.build("TableUtilsFormatTest", local = true, additionalConfig = Some(configs))
-    val sc = SparkContext.getOrCreate()
+  // Read the format we want this instantiation of the test to run via environment vars
+  val format: String = sys.env.getOrElse(FormatTestEnvVar, "hive")
+
+  private def withSparkSession[T](test: SparkSession => T): T = {
+    val spark = SparkSessionBuilder.build("TableUtilsFormatTest", local = true)
     try {
       test(spark)
     } finally {
-      configs.keys.foreach(cfg => sc.getConf.remove(cfg))
       spark.stop()
     }
   }
 
   ignore("test insertion of partitioned data and adding of columns") {
-    forAll(formats) { (format, configs) =>
-      withSparkSession(configs) { spark =>
-        val tableUtils = new TestTableUtils(spark, format)
-
-        val tableName = s"db.test_table_1_$format"
-        spark.sql("CREATE DATABASE IF NOT EXISTS db")
-        val columns1 = Array(
-          StructField("long_field", LongType),
-          StructField("int_field", IntType),
-          StructField("string_field", StringType)
+    withSparkSession { spark =>
+      val tableUtils = TableUtils(spark)
+
+      val tableName = s"db.test_table_1_$format"
+      spark.sql("CREATE DATABASE IF NOT EXISTS db")
+      val columns1 = Array(
+        StructField("long_field", LongType),
+        StructField("int_field", IntType),
+        StructField("string_field", StringType)
+      )
+      val df1 = makeDf(
+        spark,
+        StructType(
+          tableName,
+          columns1 :+ StructField("ds", StringType)
+        ),
+        List(
+          Row(1L, 2, "3", "2022-10-01")
        )
-        val df1 = makeDf(
-          spark,
-          StructType(
-            tableName,
-            columns1 :+ StructField("ds", StringType)
-          ),
-          List(
-            Row(1L, 2, "3", "2022-10-01")
-          )
+      )
+
+      val df2 = makeDf(
+        spark,
+        StructType(
+          tableName,
+          columns1
+            :+ StructField("double_field", DoubleType)
+            :+ StructField("ds", StringType)
+        ),
+        List(
+          Row(4L, 5, "6", 7.0, "2022-10-02")
        )
-
-        val df2 = makeDf(
-          spark,
-          StructType(
-            tableName,
-            columns1
-              :+ StructField("double_field", DoubleType)
-              :+ StructField("ds", StringType)
-          ),
-          List(
-            Row(4L, 5, "6", 7.0, "2022-10-02")
-          )
-        )
-        testInsertPartitions(spark, tableUtils, tableName, format, df1, df2, ds1 = "2022-10-01", ds2 = "2022-10-02")
-      }
+      )
+      testInsertPartitions(spark, tableUtils, tableName, format, df1, df2, ds1 = "2022-10-01", ds2 = "2022-10-02")
     }
   }
 
   ignore("test insertion of partitioned data and removal of columns") {
-    forAll(formats) { (format, configs) =>
-      withSparkSession(configs) { spark =>
-        val tableUtils = TableUtils(spark)
-        val tableName = s"db.test_table_2_$format"
-        spark.sql("CREATE DATABASE IF NOT EXISTS db")
-        val columns1 = Array(
-          StructField("long_field", LongType),
-          StructField("int_field", IntType),
-          StructField("string_field", StringType)
+    withSparkSession { spark =>
+      val tableUtils = TableUtils(spark)
+      val tableName = s"db.test_table_2_$format"
+      spark.sql("CREATE DATABASE IF NOT EXISTS db")
+      val columns1 = Array(
+        StructField("long_field", LongType),
+        StructField("int_field", IntType),
+        StructField("string_field", StringType)
+      )
+      val df1 = makeDf(
+        spark,
+        StructType(
+          tableName,
+          columns1
+            :+ StructField("double_field", DoubleType)
+            :+ StructField("ds", StringType)
+        ),
+        List(
+          Row(1L, 2, "3", 4.0, "2022-10-01")
        )
-        val df1 = makeDf(
-          spark,
-          StructType(
-            tableName,
-            columns1
-              :+ StructField("double_field", DoubleType)
-              :+ StructField("ds", StringType)
-          ),
-          List(
-            Row(1L, 2, "3", 4.0, "2022-10-01")
-          )
+      )
+
+      val df2 = makeDf(
+        spark,
+        StructType(
+          tableName,
+          columns1 :+ StructField("ds", StringType)
+        ),
+        List(
+          Row(5L, 6, "7", "2022-10-02")
        )
-
-        val df2 = makeDf(
-          spark,
-          StructType(
-            tableName,
-            columns1 :+ StructField("ds", StringType)
-          ),
-          List(
-            Row(5L, 6, "7", "2022-10-02")
-          )
-        )
-        testInsertPartitions(spark, tableUtils, tableName, format, df1, df2, ds1 = "2022-10-01", ds2 = "2022-10-02")
-      }
+      )
+      testInsertPartitions(spark, tableUtils, tableName, format, df1, df2, ds1 = "2022-10-01", ds2 = "2022-10-02")
    }
  }
 
   ignore("test insertion of partitioned data and modification of columns") {
-    forAll(formats) { (format, configs) =>
-      withSparkSession(configs) { spark =>
-        val tableUtils = TableUtils(spark)
-
-        val tableName = s"db.test_table_3_$format"
-        spark.sql("CREATE DATABASE IF NOT EXISTS db")
-        val columns1 = Array(
-          StructField("long_field", LongType),
-          StructField("int_field", IntType)
+    withSparkSession { spark =>
+      val tableUtils = TableUtils(spark)
+
+      val tableName = s"db.test_table_3_$format"
+      spark.sql("CREATE DATABASE IF NOT EXISTS db")
+      val columns1 = Array(
+        StructField("long_field", LongType),
+        StructField("int_field", IntType)
+      )
+      val df1 = makeDf(
+        spark,
+        StructType(
+          tableName,
+          columns1
+            :+ StructField("string_field", StringType)
+            :+ StructField("ds", StringType)
+        ),
+        List(
+          Row(1L, 2, "3", "2022-10-01")
        )
-        val df1 = makeDf(
-          spark,
-          StructType(
-            tableName,
-            columns1
-              :+ StructField("string_field", StringType)
-              :+ StructField("ds", StringType)
-          ),
-          List(
-            Row(1L, 2, "3", "2022-10-01")
-          )
+      )
+
+      val df2 = makeDf(
+        spark,
+        StructType(
+          tableName,
+          columns1
+            :+ StructField("string_field", DoubleType) // modified column data type
+            :+ StructField("ds", StringType)
+        ),
+        List(
+          Row(1L, 2, 3.0, "2022-10-02")
        )
+      )
 
-        val df2 = makeDf(
-          spark,
-          StructType(
-            tableName,
-            columns1
-              :+ StructField("string_field", DoubleType) // modified column data type
-              :+ StructField("ds", StringType)
-          ),
-          List(
-            Row(1L, 2, 3.0, "2022-10-02")
-          )
-        )
-
-        testInsertPartitions(spark, tableUtils, tableName, format, df1, df2, ds1 = "2022-10-01", ds2 = "2022-10-02")
-      }
+      testInsertPartitions(spark, tableUtils, tableName, format, df1, df2, ds1 = "2022-10-01", ds2 = "2022-10-02")
    }
  }
 }
@@ -172,7 +147,7 @@ object TableUtilsFormatTest {
   private def testInsertPartitions(spark: SparkSession,
                                    tableUtils: TableUtils,
                                    tableName: String,
-                                   format: Format,
+                                   format: String,
                                    df1: DataFrame,
                                    df2: DataFrame,
                                    ds1: String,
@@ -204,7 +179,8 @@
     tableUtils.insertPartitions(df2, tableName, autoExpand = true)
 
     // check that we wrote out a table in the right format
-    assertTrue(tableUtils.tableFormat(tableName) == format)
+    val readTableFormat = tableUtils.tableFormat(tableName).toString
+    assertTrue(s"Mismatch in table format: $readTableFormat; expected: $format", readTableFormat.toLowerCase == format)
 
     // check we have all the partitions written
     val returnedPartitions = tableUtils.partitions(tableName)
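
The lower-cased `toString` comparison is what lets the env var values line up with the format objects returned by `tableUtils.tableFormat`. Assuming the formats are case objects named `Hive` and `DeltaLake`, as the removed imports suggest, a tiny illustration of why that normalization works:

// Case objects stringify to their name, so lower-casing the read format
// aligns it with the env var values used by CI ("hive" or "deltalake").
sealed trait Format
case object Hive extends Format
case object DeltaLake extends Format

assert(Hive.toString.toLowerCase == "hive")
assert(DeltaLake.toString.toLowerCase == "deltalake")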
