
Commit fa27cf8

feat: unit tests for local iteration (#148)
## Summary

## Checklist
- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **New Features**
  - Introduced a new BigQuery client for enhanced interaction with BigQuery services.
  - Added functionality for managing partitions in Spark SQL tables through a new utility class.
- **Bug Fixes**
  - Improved error handling in the database creation process.
- **Tests**
  - Added a new test class for verifying BigQuery catalog functionality.
  - Updated existing test classes to utilize the new partition management utilities.
- **Chores**
  - Cleaned up deprecated methods in the TableUtils class.
  - Refactored comments for clarity regarding method dependencies.
1 parent b031ebc commit fa27cf8

8 files changed (+145, -79 lines)

build.sbt

Lines changed: 1 addition & 1 deletion
```diff
@@ -213,7 +213,7 @@ lazy val cloud_gcp = project
     libraryDependencies += "com.google.cloud" % "google-cloud-pubsub" % "1.131.0",
     libraryDependencies += "com.google.cloud" % "google-cloud-dataproc" % "4.51.0",
     libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "3.0.3", // it's what's on the cluster
-    libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop3-2.2.26", // it's what's on the cluster
+    libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop3-2.2.26",
     libraryDependencies += "com.google.cloud.bigdataoss" % "gcsio" % "3.0.3", // need it for https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcsio/src/main/java/com/google/cloud/hadoop/gcsio/GoogleCloudStorageFileSystem.java
     libraryDependencies += "io.circe" %% "circe-yaml" % "1.15.0",
     libraryDependencies += "org.mockito" % "mockito-core" % "5.12.0" % Test,
```

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/BigQueryFormat.scala

Lines changed: 2 additions & 0 deletions
```diff
@@ -12,6 +12,7 @@ import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.Tabl
 import org.apache.spark.sql.SparkSession
 
 case class GcpFormatProvider(sparkSession: SparkSession) extends FormatProvider {
+
   lazy val bigQueryClient = BigQueryOptions.getDefaultInstance.getService
   def readFormat(tableName: String): Format = {
 
@@ -126,6 +127,7 @@ case class BQuery(project: String) extends Format {
     sparkSession.conf.set("viewsEnabled", originalViewsEnabled)
     sparkSession.conf.set("materializationDataset", originalMaterializationDataset)
   }
+
 }
 
 def createTableTypeString: String = "BIGQUERY"
```
Lines changed: 61 additions & 1 deletion
```diff
@@ -1,6 +1,66 @@
 package ai.chronon.integrations.cloud_gcp.test
 
+import ai.chronon.integrations.cloud_gcp.BQuery
+import ai.chronon.integrations.cloud_gcp.GcpFormatProvider
+import ai.chronon.spark.SparkSessionBuilder
+import ai.chronon.spark.TableUtils
+import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS
+import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
+import org.apache.spark.sql.SparkSession
+import org.junit.Assert.assertEquals
+import org.junit.Assert.assertTrue
 import org.scalatest.funsuite.AnyFunSuite
 import org.scalatestplus.mockito.MockitoSugar
 
-class BigQueryCatalogTest extends AnyFunSuite with MockitoSugar {}
+class BigQueryCatalogTest extends AnyFunSuite with MockitoSugar {
+
+  lazy val spark: SparkSession = SparkSessionBuilder.build(
+    "BigQuerySparkTest",
+    local = true,
+    additionalConfig = Some(
+      Map(
+        "spark.chronon.table.format_provider.class" -> classOf[GcpFormatProvider].getName,
+        "hive.metastore.uris" -> "thrift://localhost:9083",
+        "spark.chronon.partition.column" -> "c",
+        "spark.hadoop.fs.gs.impl" -> classOf[GoogleHadoopFileSystem].getName,
+        "spark.hadoop.fs.AbstractFileSystem.gs.impl" -> classOf[GoogleHadoopFS].getName,
+        "spark.hadoop.google.cloud.auth.service.account.enable" -> true.toString,
+        "spark.hadoop.fs.gs.impl" -> classOf[GoogleHadoopFileSystem].getName
+      ))
+  )
+  lazy val tableUtils: TableUtils = TableUtils(spark)
+
+  test("hive uris are set") {
+    assertEquals("thrift://localhost:9083", spark.sqlContext.getConf("hive.metastore.uris"))
+  }
+
+  test("verify dynamic classloading of GCP providers") {
+    assertTrue(tableUtils.tableReadFormat("data.sample_native") match {
+      case BQuery(_) => true
+      case _         => false
+    })
+  }
+
+  ignore("integration testing bigquery load table") {
+    val externalTable = "data.checkouts_parquet"
+    val table = tableUtils.loadTable(externalTable)
+    tableUtils.isPartitioned(externalTable)
+    tableUtils.createDatabase("test_database")
+    tableUtils.allPartitions(externalTable)
+    table.show
+  }
+
+  ignore("integration testing bigquery partitions") {
+    // TODO(tchow): This test is ignored because it requires a running instance of the bigquery. Need to figure out stubbing locally.
+    // to run this:
+    // 1. Set up a tunnel to dataproc federation proxy:
+    //      gcloud compute ssh zipline-canary-cluster-m \
+    //        --zone us-central1-c \
+    //        -- -f -N -L 9083:localhost:9083
+    // 2. enable this test and off you go.
+    val externalPartitions = tableUtils.partitions("data.checkouts_parquet")
+    println(externalPartitions)
+    val nativePartitions = tableUtils.partitions("data.sample_native")
+    println(nativePartitions)
+  }
+}
```
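The ignored integration tests above need the tunnelled Hive metastore and a reachable BigQuery project; only the first two tests run purely locally. For local iteration outside the suite, the same wiring can be reproduced with `SparkSessionBuilder` — a minimal sketch reusing the configuration and table names from `BigQueryCatalogTest` (the object and app names here are placeholders, not part of the change):

```scala
import ai.chronon.integrations.cloud_gcp.GcpFormatProvider
import ai.chronon.spark.{SparkSessionBuilder, TableUtils}
import com.google.cloud.hadoop.fs.gcs.{GoogleHadoopFS, GoogleHadoopFileSystem}

object LocalBigQueryIteration {
  def main(args: Array[String]): Unit = {
    // Local session wired to the GCP format provider and the tunnelled Hive metastore,
    // mirroring the configuration used in BigQueryCatalogTest above.
    val spark = SparkSessionBuilder.build(
      "LocalBigQueryIteration", // placeholder app name
      local = true,
      additionalConfig = Some(
        Map(
          "spark.chronon.table.format_provider.class" -> classOf[GcpFormatProvider].getName,
          "hive.metastore.uris" -> "thrift://localhost:9083", // via the gcloud SSH tunnel noted in the test
          "spark.hadoop.fs.gs.impl" -> classOf[GoogleHadoopFileSystem].getName,
          "spark.hadoop.fs.AbstractFileSystem.gs.impl" -> classOf[GoogleHadoopFS].getName,
          "spark.hadoop.google.cloud.auth.service.account.enable" -> "true"
        ))
    )
    val tableUtils = TableUtils(spark)
    // Table name taken from the test; partitions() resolves the format dynamically via GcpFormatProvider.
    println(tableUtils.partitions("data.sample_native"))
  }
}
```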

spark/src/main/scala/ai/chronon/spark/TableUtils.scala

Lines changed: 30 additions & 65 deletions
```diff
@@ -63,7 +63,7 @@ import scala.util.Try
  * retrieve metadata / configure it appropriately at creation time
  */
 
-case class TableUtils(sparkSession: SparkSession) {
+class TableUtils(@transient val sparkSession: SparkSession) extends Serializable {
   @transient lazy val logger: Logger = LoggerFactory.getLogger(getClass)
 
   private val ARCHIVE_TIMESTAMP_FORMAT = "yyyyMMddHHmmss"
@@ -141,16 +141,24 @@ case class TableUtils(sparkSession: SparkSession) {
     rdd
   }
 
-  def tableExists(tableName: String): Boolean = sparkSession.catalog.tableExists(tableName)
+  // Needs provider
+  def tableExists(tableName: String): Boolean = {
+    sparkSession.catalog.tableExists(tableName)
+  }
 
-  def loadTable(tableName: String): DataFrame = sparkSession.table(tableName)
+  // Needs provider
+  def loadTable(tableName: String): DataFrame = {
+    sparkSession.table(tableName)
+  }
 
+  // Needs provider
   def isPartitioned(tableName: String): Boolean = {
     // TODO: use proper way to detect if a table is partitioned or not
     val schema = getSchemaFromTable(tableName)
     schema.fieldNames.contains(partitionColumn)
   }
 
+  // Needs provider
   def createDatabase(database: String): Boolean = {
     try {
       val command = s"CREATE DATABASE IF NOT EXISTS $database"
@@ -168,6 +176,7 @@
 
   def tableReadFormat(tableName: String): Format = tableFormatProvider.readFormat(tableName)
 
+  // Needs provider
   // return all specified partition columns in a table in format of Map[partitionName, PartitionValue]
   def allPartitions(tableName: String, partitionColumnsFilter: Seq[String] = Seq.empty): Seq[Map[String, String]] = {
     if (!tableExists(tableName)) return Seq.empty[Map[String, String]]
@@ -182,6 +191,7 @@
     }
   }
 
+  // Needs provider
   def partitions(tableName: String, subPartitionsFilter: Map[String, String] = Map.empty): Seq[String] = {
     if (!tableExists(tableName)) return Seq.empty[String]
     val format = tableReadFormat(tableName)
@@ -222,11 +232,13 @@
     }
   }
 
+  // Needs provider
   def getSchemaFromTable(tableName: String): StructType = {
     sparkSession.sql(s"SELECT * FROM $tableName LIMIT 1").schema
   }
 
   // method to check if a user has access to a table
+  // Needs provider
   def checkTablePermission(tableName: String,
                            fallbackPartition: String =
                              partitionSpec.before(partitionSpec.at(System.currentTimeMillis()))): Boolean = {
@@ -252,12 +264,15 @@
     }
   }
 
+  // Needs provider
   def lastAvailablePartition(tableName: String, subPartitionFilters: Map[String, String] = Map.empty): Option[String] =
     partitions(tableName, subPartitionFilters).reduceOption((x, y) => Ordering[String].max(x, y))
 
+  // Needs provider
   def firstAvailablePartition(tableName: String, subPartitionFilters: Map[String, String] = Map.empty): Option[String] =
     partitions(tableName, subPartitionFilters).reduceOption((x, y) => Ordering[String].min(x, y))
 
+  // Needs provider
   def insertPartitions(df: DataFrame,
                        tableName: String,
                        tableProperties: Map[String, String] = null,
@@ -351,6 +366,7 @@
     }
   }
 
+  // Needs provider
   def insertUnPartitioned(df: DataFrame,
                           tableName: String,
                           tableProperties: Map[String, String] = null,
@@ -412,6 +428,7 @@
     }.get
   }
 
+  // Needs provider
   private def repartitionAndWriteInternal(df: DataFrame,
                                           tableName: String,
                                           saveMode: SaveMode,
@@ -488,6 +505,7 @@
     }
   }
 
+  // Needs provider
   private def createTableSql(tableName: String,
                              schema: StructType,
                              partitionColumns: Seq[String],
@@ -526,6 +544,7 @@
     Seq(createFragment, partitionFragment, fileFormatString, propertiesFragment).mkString("\n")
   }
 
+  // Needs provider
   private def alterTablePropertiesSql(tableName: String, properties: Map[String, String]): String = {
     // Only SQL api exists for setting TBLPROPERTIES
     val propertiesString = properties
@@ -612,6 +631,7 @@
     Some(missingChunks)
   }
 
+  // Needs provider
   def getTableProperties(tableName: String): Option[Map[String, String]] = {
     try {
       val tableId = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
@@ -621,6 +641,7 @@
     }
   }
 
+  // Needs provider
   def dropTableIfExists(tableName: String): Unit = {
     val command = s"DROP TABLE IF EXISTS $tableName"
     logger.info(s"Dropping table with command: $command")
@@ -648,68 +669,6 @@
     }
   }
 
-  @deprecated
-  def dropPartitionsAfterHole(inputTable: String,
-                              outputTable: String,
-                              partitionRange: PartitionRange,
-                              subPartitionFilters: Map[String, String] = Map.empty): Option[String] = {
-
-    def partitionsInRange(table: String, partitionFilter: Map[String, String] = Map.empty): Set[String] = {
-      val allParts = partitions(table, partitionFilter)
-      val startPrunedParts = Option(partitionRange.start).map(start => allParts.filter(_ >= start)).getOrElse(allParts)
-      Option(partitionRange.end).map(end => startPrunedParts.filter(_ <= end)).getOrElse(startPrunedParts).toSet
-    }
-
-    val inputPartitions = partitionsInRange(inputTable)
-    val outputPartitions = partitionsInRange(outputTable, subPartitionFilters)
-    val earliestHoleOpt = (inputPartitions -- outputPartitions).reduceLeftOption(Ordering[String].min)
-    earliestHoleOpt.foreach { hole =>
-      val toDrop = outputPartitions.filter(_ > hole)
-      logger.info(s"""
-                     |Earliest hole at $hole in output table $outputTable, relative to $inputTable
-                     |Input Parts : ${inputPartitions.toArray.sorted.mkString("Array(", ", ", ")")}
-                     |Output Parts : ${outputPartitions.toArray.sorted.mkString("Array(", ", ", ")")}
-                     |Dropping Parts: ${toDrop.toArray.sorted.mkString("Array(", ", ", ")")}
-                     |Sub Partitions: ${subPartitionFilters.map(kv => s"${kv._1}=${kv._2}").mkString("Array(", ", ", ")")}
-                     """.stripMargin)
-      dropPartitions(outputTable, toDrop.toArray.sorted, partitionColumn, subPartitionFilters)
-    }
-    earliestHoleOpt
-  }
-
-  def dropPartitions(tableName: String,
-                     partitions: Seq[String],
-                     partitionColumn: String = partitionColumn,
-                     subPartitionFilters: Map[String, String] = Map.empty): Unit = {
-    if (partitions.nonEmpty && tableExists(tableName)) {
-      val partitionSpecs = partitions
-        .map { partition =>
-          val mainSpec = s"$partitionColumn='$partition'"
-          val specs = mainSpec +: subPartitionFilters.map {
-            case (key, value) => s"$key='$value'"
-          }.toSeq
-          specs.mkString("PARTITION (", ",", ")")
-        }
-        .mkString(",")
-      val dropSql = s"ALTER TABLE $tableName DROP IF EXISTS $partitionSpecs"
-      sql(dropSql)
-    } else {
-      logger.info(s"$tableName doesn't exist, please double check before drop partitions")
-    }
-  }
-
-  def dropPartitionRange(tableName: String,
-                         startDate: String,
-                         endDate: String,
-                         subPartitionFilters: Map[String, String] = Map.empty): Unit = {
-    if (tableExists(tableName)) {
-      val toDrop = Stream.iterate(startDate)(partitionSpec.after).takeWhile(_ <= endDate)
-      dropPartitions(tableName, toDrop, partitionColumn, subPartitionFilters)
-    } else {
-      logger.info(s"$tableName doesn't exist, please double check before drop partitions")
-    }
-  }
-
   /*
    * This method detects new columns that appear in newSchema but not in current table,
    * and append those new columns at the end of the existing table. This allows continuous evolution
@@ -837,6 +796,12 @@
   }
 }
 
+object TableUtils {
+  def apply(sparkSession: SparkSession): TableUtils = {
+    new TableUtils(sparkSession)
+  }
+}
+
 sealed case class IncompatibleSchemaException(inconsistencies: Seq[(String, DataType, DataType)]) extends Exception {
   override def getMessage: String = {
     val inconsistenciesStr =
```
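The headline change above is that `TableUtils` is no longer a case class: it becomes a plain `Serializable` class with a companion `apply`, so existing call sites such as `TableUtils(spark)` keep compiling, while subclassing by another case class (which Scala forbids for case classes) becomes possible — that is what lets the test-only `TableTestUtils` case class below extend it. A minimal sketch of both call patterns, assuming a local session; the subclass here is hypothetical and only illustrates the new extension point:

```scala
import ai.chronon.spark.TableUtils
import org.apache.spark.sql.SparkSession

object TableUtilsCallSites {
  def demo(spark: SparkSession): Unit = {
    // Unchanged call site: resolves through the new companion TableUtils.apply.
    val tableUtils: TableUtils = TableUtils(spark)
    println(tableUtils.tableExists("some_db.some_table")) // placeholder table name

    // Subclassing is now possible; TableTestUtils (added in this commit) relies on this
    // to host the partition-drop helpers removed from TableUtils.
    class LoggingTableUtils(session: SparkSession) extends TableUtils(session) { // hypothetical subclass
      override def createDatabase(database: String): Boolean = {
        logger.info(s"creating database $database") // logger is inherited from TableUtils
        super.createDatabase(database)
      }
    }
    new LoggingTableUtils(spark).createDatabase("sketch_db") // placeholder database name
  }
}
```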

spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -57,7 +57,7 @@ object TestRow {
 class JoinTest extends AnyFunSuite with TaggedFilterSuite {
 
   val spark: SparkSession = SparkSessionBuilder.build("JoinTest", local = true)
-  private implicit val tableUtils = TableUtils(spark)
+  private implicit val tableUtils = TableTestUtils(spark)
 
   private val today = tableUtils.partitionSpec.at(System.currentTimeMillis())
   private val monthAgo = tableUtils.partitionSpec.minus(today, new Window(30, TimeUnit.DAYS))
```

spark/src/test/scala/ai/chronon/spark/test/LabelJoinTest.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -37,7 +37,7 @@ class LabelJoinTest {
   private val namespace = "label_join"
   private val tableName = "test_label_join"
   private val labelDS = "2022-10-30"
-  private val tableUtils = TableUtils(spark)
+  private val tableUtils = TableTestUtils(spark)
   tableUtils.createDatabase(namespace)
 
   private val viewsGroupBy = TestUtils.createViewsGroupBy(namespace, spark)
```
Lines changed: 41 additions & 0 deletions
```diff
@@ -0,0 +1,41 @@
+package ai.chronon.spark.test
+
+import ai.chronon.spark.TableUtils
+import org.apache.spark.sql.SparkSession
+
+case class TableTestUtils(override val sparkSession: SparkSession) extends TableUtils(sparkSession: SparkSession) {
+
+  def dropPartitions(tableName: String,
+                     partitions: Seq[String],
+                     partitionColumn: String = partitionColumn,
+                     subPartitionFilters: Map[String, String] = Map.empty): Unit = {
+    if (partitions.nonEmpty && tableExists(tableName)) {
+      val partitionSpecs = partitions
+        .map { partition =>
+          val mainSpec = s"$partitionColumn='$partition'"
+          val specs = mainSpec +: subPartitionFilters.map {
+            case (key, value) => s"$key='$value'"
+          }.toSeq
+          specs.mkString("PARTITION (", ",", ")")
+        }
+        .mkString(",")
+      val dropSql = s"ALTER TABLE $tableName DROP IF EXISTS $partitionSpecs"
+      sql(dropSql)
+    } else {
+      logger.info(s"$tableName doesn't exist, please double check before drop partitions")
+    }
+  }
+
+  def dropPartitionRange(tableName: String,
+                         startDate: String,
+                         endDate: String,
+                         subPartitionFilters: Map[String, String] = Map.empty): Unit = {
+    if (tableExists(tableName)) {
+      val toDrop = Stream.iterate(startDate)(partitionSpec.after).takeWhile(_ <= endDate)
+      dropPartitions(tableName, toDrop, partitionColumn, subPartitionFilters)
+    } else {
+      logger.info(s"$tableName doesn't exist, please double check before drop partitions")
+    }
+  }
+
+}
```
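`dropPartitions` and `dropPartitionRange` were removed from `TableUtils` and now live only in this test utility, which `JoinTest` and `LabelJoinTest` pick up via `TableTestUtils(spark)`; the deprecated `dropPartitionsAfterHole` was dropped entirely. A minimal usage sketch, assuming a local session as in the test suites above (namespace, table, and dates are placeholders):

```scala
import ai.chronon.spark.SparkSessionBuilder
import ai.chronon.spark.test.TableTestUtils

object TableTestUtilsSketch {
  def main(args: Array[String]): Unit = {
    // Build a local session the same way the test suites do.
    val spark = SparkSessionBuilder.build("TableTestUtilsSketch", local = true)
    val tableUtils = TableTestUtils(spark)

    tableUtils.createDatabase("sketch_db") // inherited from TableUtils; placeholder namespace

    // Drop a contiguous date range of partitions, then a single explicit partition.
    tableUtils.dropPartitionRange("sketch_db.events", "2023-01-01", "2023-01-07") // placeholder table and dates
    tableUtils.dropPartitions("sketch_db.events", Seq("2023-01-08"))
  }
}
```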
