
Commit a7a9a42

fix: handle partition overwrite (#206)
## Summary

Based on [@David Han](https://zipline-2kh4520.slack.com/team/U0846REC8F7)'s observations around the spark-bigquery [connector](https://zipline-2kh4520.slack.com/archives/C08710CDH8D/p1736700947319249?thread_ts=1736644291.357239&cid=C08710CDH8D), there is indeed a lurking behavior:

- When creating BQ tables (in the case they don't exist), the user needs to specify a partitioning. This is expected, and we do it in the form of a partitionColumn write option.
- When the connector performs dynamic partition overwrites, you must not specify the partitioning at all. It does the right thing because the destination table was already created with a partition spec.

Ideally the above would be idempotent even if the user passed the partition column to the write, but unfortunately it is a strict requirement that the partition column is not defined when doing dynamic partition overwrites.

The fix is to specify the partition column only when the table does not exist and needs to be created, and to leave it out in all other cases (see the sketch at the end of this description).

## Checklist
- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **New Features**
  - Introduced separate read and write options for data pointers
  - Added support for the Google Cloud SDK tool
  - Added new plugins for `gcloud` and `thrift`
- **Bug Fixes**
  - Enhanced error handling in the BigQuery format provider
- **Refactor**
  - Standardized `DataPointer` instantiation from `apply` to `from`
  - Improved options handling in data operations
- **Chores**
  - Updated plugin and tool versions for the development environment

---

- To see the specific tasks where the Asana app for GitHub is being used, see:
  - https://app.asana.com/0/0/1209143482009688

Co-authored-by: Thomas Chow <[email protected]>
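For illustration, here is a minimal sketch of the write-side behavior described above, seen from the caller's side. The DataFrame, table name, and helper are hypothetical; the `partitionField` option key is the one used by `GcpFormatProvider` in this diff.

```scala
import org.apache.spark.sql.{DataFrame, SaveMode}

// Sketch only: pass "partitionField" solely when the destination table still has to be created.
def writeToBigQuery(df: DataFrame, table: String, tableExists: Boolean, partitionColumn: String): Unit = {
  val writer = df.write
    .mode(SaveMode.Overwrite)
    .format("bigquery")

  // Existing table: the connector reuses the table's partition spec and requires that no
  // partitionField be supplied for dynamic partition overwrites.
  // Missing table: partitionField is needed so the table is created with a partition spec.
  val writerWithPartition =
    if (tableExists) writer
    else writer.option("partitionField", partitionColumn)

  writerWithPartition.save(table)
}
```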
1 parent 7902121 commit a7a9a42

File tree

8 files changed, +51 -38 lines changed

.plugin-versions

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,7 @@
 asdf-plugin-manager https://github.com/asdf-community/asdf-plugin-manager.git b5862c1
+gcloud https://github.com/jthegedus/asdf-gcloud.git 00cdf06
 java https://github.com/halcyon/asdf-java.git 0ec69b2
 python https://github.com/danhper/asdf-python.git a3a0185
 sbt https://github.com/lerencao/asdf-sbt 53c9f4b
 scala https://github.com/asdf-community/asdf-scala.git 0533444
+thrift https://github.com/alisaifee/asdf-thrift.git fecdd6c

.tool-versions

Lines changed: 1 addition & 0 deletions
@@ -5,3 +5,4 @@ sbt 1.8.2
 python
 3.7.17
 3.11.0
+gcloud 504.0.1

api/src/main/scala/ai/chronon/api/DataPointer.scala

Lines changed: 9 additions & 3 deletions
@@ -5,16 +5,22 @@ abstract class DataPointer {
   def tableOrPath: String
   def readFormat: Option[String]
   def writeFormat: Option[String]
-  def options: Map[String, String]
+
+  def readOptions: Map[String, String]
+  def writeOptions: Map[String, String]

 }

 case class URIDataPointer(
     override val tableOrPath: String,
     override val readFormat: Option[String],
     override val writeFormat: Option[String],
-    override val options: Map[String, String]
-) extends DataPointer
+    options: Map[String, String]
+) extends DataPointer {
+
+  override val readOptions: Map[String, String] = options
+  override val writeOptions: Map[String, String] = options
+}

 // parses string representations of data pointers
 // ex: namespace.table
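Illustrative usage of the refactored interface (not part of the diff): a `URIDataPointer` still takes a single option map and exposes it through both new accessors. The path and option key below are hypothetical.

```scala
import ai.chronon.api.URIDataPointer

val pointer = URIDataPointer(
  tableOrPath = "gs://bucket/path/to/data", // hypothetical path
  readFormat = Some("parquet"),
  writeFormat = Some("parquet"),
  options = Map("mergeSchema" -> "true")    // hypothetical option
)

// A URI pointer exposes the same map through both views; catalog-aware pointers
// resolve the two separately via their format provider.
assert(pointer.readOptions == pointer.writeOptions)
```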

api/src/test/scala/ai/chronon/api/test/DataPointerTest.scala

Lines changed: 1 addition & 2 deletions
@@ -5,8 +5,7 @@ import ai.chronon.api.URIDataPointer
 import org.scalatest.flatspec.AnyFlatSpec
 import org.scalatest.matchers.should.Matchers

-class
-DataPointerTest extends AnyFlatSpec with Matchers {
+class DataPointerTest extends AnyFlatSpec with Matchers {

   "DataPointer.apply" should "parse a simple s3 path" in {
     val result = DataPointer("s3://bucket/path/to/data.parquet")

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GcpFormatProvider.scala

Lines changed: 11 additions & 4 deletions
@@ -32,17 +32,21 @@ case class GcpFormatProvider(sparkSession: SparkSession) extends FormatProvider
   override def readFormat(tableName: String): Format = format(tableName)

   override def writeFormat(table: String): Format = {
+    val tableId = BigQueryUtil.parseTableId(table)
+    assert(scala.Option(tableId.getProject).isDefined, s"project required for ${table}")
+    assert(scala.Option(tableId.getDataset).isDefined, s"dataset required for ${table}")

     val tu = TableUtils(sparkSession)
+    val partitionColumnOption =
+      if (tu.tableReachable(table)) Map.empty else Map("partitionField" -> tu.partitionColumn)

     val sparkOptions: Map[String, String] = Map(
-      "partitionField" -> tu.partitionColumn,
       // todo(tchow): No longer needed after https://github.com/GoogleCloudDataproc/spark-bigquery-connector/pull/1320
       "temporaryGcsBucket" -> sparkSession.conf.get("spark.chronon.table.gcs.temporary_gcs_bucket"),
       "writeMethod" -> "indirect"
-    )
+    ) ++ partitionColumnOption

-    BigQueryFormat(bqOptions.getProjectId, sparkOptions)
+    BigQueryFormat(tableId.getProject, sparkOptions)
   }

   private def getFormat(table: Table): Format =
@@ -72,7 +76,10 @@ case class GcpFormatProvider(sparkSession: SparkSession) extends FormatProvider
     val table = bigQueryClient.getTable(btTableIdentifier.getDataset, btTableIdentifier.getTable)

     // lookup bq for the table, if not fall back to hive
-    scala.Option(table).map(getFormat).getOrElse(Hive)
+    scala
+      .Option(table)
+      .map(getFormat)
+      .getOrElse(scala.Option(btTableIdentifier.getProject).map(BigQueryFormat(_, Map.empty)).getOrElse(Hive))

   }
 }
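To make the conditional explicit, a standalone sketch of the option-building behavior that `writeFormat` now encodes. In the real code the flag comes from `TableUtils.tableReachable(table)` and the bucket from `spark.chronon.table.gcs.temporary_gcs_bucket`; the helper and argument names here are illustrative.

```scala
def bigQueryWriteOptions(tableReachable: Boolean,
                         partitionColumn: String,
                         temporaryGcsBucket: String): Map[String, String] = {
  // Only a table that does not exist yet gets an explicit partition spec.
  val partitionColumnOption =
    if (tableReachable) Map.empty[String, String]
    else Map("partitionField" -> partitionColumn)

  Map(
    "temporaryGcsBucket" -> temporaryGcsBucket,
    "writeMethod" -> "indirect"
  ) ++ partitionColumnOption
}

// bigQueryWriteOptions(tableReachable = true,  "ds", "bucket") contains no "partitionField"
// bigQueryWriteOptions(tableReachable = false, "ds", "bucket") maps "partitionField" -> "ds"
```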

spark/src/main/scala/ai/chronon/spark/CatalogAwareDataPointer.scala

Lines changed: 7 additions & 5 deletions
@@ -10,10 +10,12 @@ case class CatalogAwareDataPointer(inputTableOrPath: String, formatProvider: For
     formatProvider.resolveTableName(inputTableOrPath)
   }

-  override lazy val options: Map[String, String] = {
-    // Hack for now, include both read and write options for the datapointer.
-    // todo(tchow): rework this abstraction. https://app.asana.com/0/1208785567265389/1209026103291854/f
-    formatProvider.readFormat(inputTableOrPath).options ++ formatProvider.writeFormat(inputTableOrPath).options
+  override lazy val readOptions: Map[String, String] = {
+    formatProvider.readFormat(inputTableOrPath).options
+  }
+
+  override lazy val writeOptions: Map[String, String] = {
+    formatProvider.writeFormat(inputTableOrPath).options
   }

   override lazy val readFormat: Option[String] = {
@@ -28,7 +30,7 @@ case class CatalogAwareDataPointer(inputTableOrPath: String, formatProvider: For

 object DataPointer {

-  def apply(tableOrPath: String, sparkSession: SparkSession): DataPointer = {
+  def from(tableOrPath: String, sparkSession: SparkSession): DataPointer = {

     CatalogAwareDataPointer(tableOrPath, FormatProvider.from(sparkSession))
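Call sites change accordingly; a small usage sketch, assuming the `DataPointer` companion lives in `ai.chronon.spark` as the file path suggests (the table name is hypothetical):

```scala
import org.apache.spark.sql.SparkSession
import ai.chronon.spark.DataPointer

val spark: SparkSession = SparkSession.builder().getOrCreate()

// The factory is now `from` rather than `apply`.
val dp = DataPointer.from("project.dataset.table", spark)

// Read and write options are resolved independently by the format provider.
val readOpts: Map[String, String] = dp.readOptions
val writeOpts: Map[String, String] = dp.writeOptions
```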

spark/src/main/scala/ai/chronon/spark/Extensions.scala

Lines changed: 13 additions & 16 deletions
@@ -304,28 +304,26 @@ object Extensions {

     def save(dataPointer: DataPointer): Unit = {

+      val optionDfw = dfw.options(dataPointer.writeOptions)
       dataPointer.writeFormat
         .map((wf) => {
           val normalized = wf.toLowerCase
           normalized match {
             case "bigquery" | "bq" =>
-              dfw
+              optionDfw
                 .format("bigquery")
-                .options(dataPointer.options)
                 .save(dataPointer.tableOrPath)
             case "snowflake" | "sf" =>
-              dfw
+              optionDfw
                 .format("net.snowflake.spark.snowflake")
-                .options(dataPointer.options)
                 .option("dbtable", dataPointer.tableOrPath)
                 .save()
             case "parquet" | "csv" =>
-              dfw
+              optionDfw
                 .format(normalized)
-                .options(dataPointer.options)
                 .save(dataPointer.tableOrPath)
             case "hive" | "delta" | "iceberg" =>
-              dfw
+              optionDfw
                 .format(normalized)
                 .insertInto(dataPointer.tableOrPath)
             case _ =>
@@ -334,7 +332,7 @@
         })
         .getOrElse(
           // None case is just table against default catalog
-          dfw
+          optionDfw
             .format("hive")
             .insertInto(dataPointer.tableOrPath))
     }
@@ -345,29 +343,28 @@
     def load(dataPointer: DataPointer): DataFrame = {
       val tableOrPath = dataPointer.tableOrPath

+      val optionDfr = dfr.options(dataPointer.readOptions)
+
       dataPointer.readFormat
         .map { fmt =>
           val fmtLower = fmt.toLowerCase

           fmtLower match {

             case "bigquery" | "bq" =>
-              dfr
+              optionDfr
                 .format("bigquery")
-                .options(dataPointer.options)
                 .load(tableOrPath)

             case "snowflake" | "sf" =>
-              dfr
+              optionDfr
                 .format("net.snowflake.spark.snowflake")
-                .options(dataPointer.options)
                 .option("dbtable", tableOrPath)
                 .load()

             case "parquet" | "csv" =>
-              dfr
-                .format(fmtLower)
-                .options(dataPointer.options)
+              optionDfr
+                .format(fmt)
                 .load(tableOrPath)

             case "hive" | "delta" | "iceberg" => dfr.table(tableOrPath)
@@ -379,7 +376,7 @@
         }
         .getOrElse {
           // None case is just table against default catalog
-          dfr.table(tableOrPath)
+          optionDfr.table(tableOrPath)
         }
     }
   }
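A usage sketch, assuming the implicit reader/writer wrappers in `Extensions` are in scope and the import path shown; the table name and write mode are illustrative. Readers now pick up `readOptions` before dispatching on `readFormat`, and writers pick up `writeOptions` before dispatching on `writeFormat`.

```scala
import org.apache.spark.sql.SparkSession
import ai.chronon.spark.DataPointer
import ai.chronon.spark.Extensions._

val spark = SparkSession.builder().getOrCreate()
val dp = DataPointer.from("project.dataset.table", spark) // hypothetical table

// load() applies dp.readOptions to the reader, then dispatches on dp.readFormat.
val df = spark.read.load(dp)

// save() applies dp.writeOptions to the writer, then dispatches on dp.writeFormat.
df.write.mode("overwrite").save(dp)
```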

spark/src/main/scala/ai/chronon/spark/TableUtils.scala

Lines changed: 7 additions & 8 deletions
@@ -125,7 +125,7 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
       true
     } catch {
       case ex: Exception =>
-        logger.info(s"""Couldn't reach $tableName. Error: ${ex.getMessage.red}
+        logger.debug(s"""Couldn't reach $tableName. Error: ${ex.getMessage.red}
                        |Call path:
                        |${cleanStackTrace(ex).yellow}
                        |""".stripMargin)
@@ -135,7 +135,7 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable

   // Needs provider
   def loadTable(tableName: String): DataFrame = {
-    sparkSession.read.load(DataPointer(tableName, sparkSession))
+    sparkSession.read.load(DataPointer.from(tableName, sparkSession))
   }

   def isPartitioned(tableName: String): Boolean = {
@@ -241,7 +241,7 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
   }

   def getSchemaFromTable(tableName: String): StructType = {
-    sparkSession.read.load(DataPointer(tableName, sparkSession)).limit(1).schema
+    sparkSession.read.load(DataPointer.from(tableName, sparkSession)).limit(1).schema
   }

   // method to check if a user has access to a table
@@ -254,7 +254,7 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
     // retrieve one row from the table
     val partitionFilter = lastAvailablePartition(tableName).getOrElse(fallbackPartition)
     sparkSession.read
-      .load(DataPointer(tableName, sparkSession))
+      .load(DataPointer.from(tableName, sparkSession))
       .where(s"$partitionColumn='$partitionFilter'")
       .limit(1)
       .collect()
@@ -545,8 +545,8 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
       (Seq(partitionColumn, saltCol), Seq(partitionColumn) ++ sortByCols)
     } else { (Seq(saltCol), sortByCols) }
     logger.info(s"Sorting within partitions with cols: $partitionSortCols")
+    val dataPointer = DataPointer.from(tableName, sparkSession)

-    val dataPointer = DataPointer(tableName, sparkSession)
     saltedDf
       .select(saltedDf.columns.map {
         case c if c == partitionColumn && dataPointer.writeFormat.map(_.toUpperCase).exists("BIGQUERY".equals) =>
@@ -763,14 +763,13 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
       wheres: Seq[String],
       rangeWheres: Seq[String],
       fallbackSelects: Option[Map[String, String]] = None): DataFrame = {
-
-    val dp = DataPointer(table, sparkSession)
+    val dp = DataPointer.from(table, sparkSession)
     var df = sparkSession.read.load(dp)
     val selects = QueryUtils.buildSelects(selectMap, fallbackSelects)

     logger.info(s""" Scanning data:
                    | table: ${dp.tableOrPath.green}
-                   | options: ${dp.options}
+                   | options: ${dp.readOptions}
                    | format: ${dp.readFormat}
                    | selects:
                    | ${selects.mkString("\n ").green}
