
Commit 31c78af

feat: use direct writes to bigquery (#264)
## Summary

- With #263 we control table creation ourselves, so we no longer need to rely on indirect writes to do the table creation (and partitioning) for us; we simply use the BigQuery Storage API to write directly into the table we created. Direct writes should be much more performant and preferred over indirect writes, because we don't need to stage data and then load it as a temporary BQ table.
- Remove configs that are used only for indirect writes.

## Checklist

- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

### Release Notes

- **Improvements**
  - Enhanced the BigQuery data writing process with more precise configuration options.
  - Simplified table creation and partition insertion logic.
  - Improved handling of DataFrame column arrangements during data operations.
- **Changes**
  - Updated the BigQuery write method to use a direct writing approach.
  - Introduced a new option to prevent table creation if the table does not exist.
  - Made the table creation process format-aware.
  - Streamlined the partition insertion mechanism.

These updates improve data management and writing efficiency in cloud data processing workflows.

---------

Co-authored-by: Thomas Chow <[email protected]>
1 parent 7697215 commit 31c78af
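
For context, a minimal sketch (not code from this commit) of what a direct write through the spark-bigquery-connector looks like with the two options this change configures. The table and column names are made up; `writeMethod` and `createDisposition` are the connector options the diff below actually sets:

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().appName("direct-write-sketch").getOrCreate()
val df = spark.range(10).toDF("id")

// Direct write via the BigQuery Storage Write API: no temporary GCS bucket
// and no intermediate load job. CREATE_NEVER assumes the target table was
// already created (and partitioned) up front, as #263 arranges.
df.write
  .format("bigquery")
  .option("writeMethod", "direct")
  .option("createDisposition", "CREATE_NEVER")
  .mode(SaveMode.Append)
  .save("my-project.my_dataset.my_table")
```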

File tree: 2 files changed, +18 −29 lines

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GcpFormatProvider.scala

Lines changed: 3 additions & 12 deletions
```diff
@@ -1,6 +1,4 @@
 package ai.chronon.integrations.cloud_gcp
-
-import ai.chronon.spark.TableUtils
 import ai.chronon.spark.format.Format
 import ai.chronon.spark.format.FormatProvider
 import ai.chronon.spark.format.Hive
@@ -36,17 +34,10 @@ case class GcpFormatProvider(sparkSession: SparkSession) extends FormatProvider
     assert(scala.Option(tableId.getProject).isDefined, s"project required for ${table}")
     assert(scala.Option(tableId.getDataset).isDefined, s"dataset required for ${table}")
 
-    val tu = TableUtils(sparkSession)
-    val partitionColumnOption =
-      if (tu.tableReachable(table)) Map.empty else Map("partitionField" -> tu.partitionColumn)
-
     val sparkOptions: Map[String, String] = Map(
-      // todo(tchow): No longer needed after https://github.com/GoogleCloudDataproc/spark-bigquery-connector/pull/1320
-      "temporaryGcsBucket" -> sparkSession.conf.get("spark.chronon.table.gcs.temporary_gcs_bucket"),
-      "writeMethod" -> "indirect", // writeMethod direct does not output partitioned tables. keep as indirect.
-      "materializationProject" -> tableId.getProject,
-      "materializationDataset" -> tableId.getDataset
-    ) ++ partitionColumnOption
+      "writeMethod" -> "direct",
+      "createDisposition" -> JobInfo.CreateDisposition.CREATE_NEVER.name
+    )
 
     BigQueryFormat(tableId.getProject, bigQueryClient, sparkOptions)
   }
```
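
`JobInfo.CreateDisposition` is the enum from the BigQuery Java client (`com.google.cloud.bigquery`), so `CREATE_NEVER.name` resolves to the string `"CREATE_NEVER"`. A minimal sketch of how these options would behave downstream, assuming `BigQueryFormat` ultimately hands them to the connector (its write path is outside this diff, and `writeDirect` is a hypothetical helper):

```scala
import com.google.cloud.bigquery.JobInfo
import org.apache.spark.sql.{DataFrame, SaveMode}

val sparkOptions: Map[String, String] = Map(
  "writeMethod" -> "direct", // use the Storage Write API; no GCS staging
  "createDisposition" -> JobInfo.CreateDisposition.CREATE_NEVER.name // fail if the table is missing
)

// Hypothetical forwarding helper; BigQueryFormat's write path is not shown here.
def writeDirect(df: DataFrame, fqtn: String): Unit =
  df.write
    .format("bigquery")
    .options(sparkOptions)
    .mode(SaveMode.Append)
    .save(fqtn) // e.g. "my-project.my_dataset.my_table"
```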

spark/src/main/scala/ai/chronon/spark/TableUtils.scala

Lines changed: 15 additions & 17 deletions
```diff
@@ -284,12 +284,12 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
                   tableProperties: Map[String, String] = null,
                   fileFormat: String,
                   autoExpand: Boolean = false): Unit = {
+    val writeFormat = tableFormatProvider.writeFormat(tableName)
 
     if (!tableReachable(tableName)) {
 
       try {
 
-        val writeFormat = tableFormatProvider.writeFormat(tableName)
         val createTableOperation =
           writeFormat.createTable(df, tableName, partitionColumns, tableProperties, fileFormat)
 
@@ -309,11 +309,13 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
 
       // TODO: we need to also allow for bigquery tables to have their table properties (or tags) to be persisted too.
       // https://app.asana.com/0/1208949807589885/1209111629687568/f
-      if (tableProperties != null && tableProperties.nonEmpty) {
-        sql(alterTablePropertiesSql(tableName, tableProperties))
-      }
-      if (autoExpand) {
-        expandTable(tableName, df.schema)
+      if (writeFormat.name.toUpperCase != "BIGQUERY") {
+        if (tableProperties != null && tableProperties.nonEmpty) {
+          sql(alterTablePropertiesSql(tableName, tableProperties))
+        }
+        if (autoExpand) {
+          expandTable(tableName, df.schema)
+        }
       }
     }
 
@@ -328,12 +330,13 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
                       stats: Option[DfStats] = None,
                       sortByCols: Seq[String] = Seq.empty): Unit = {
     // partitions to the last
-    val dfRearranged: DataFrame = if (!df.columns.endsWith(partitionColumns)) {
-      val colOrder = df.columns.diff(partitionColumns) ++ partitionColumns
-      df.select(colOrder.map(df.col): _*)
-    } else {
-      df
-    }
+    val dataPointer = DataPointer.from(tableName, sparkSession)
+    val colOrder = df.columns.diff(partitionColumns) ++ partitionColumns
+    val dfRearranged: DataFrame = df.select(colOrder.map {
+      case c if c == partitionColumn && dataPointer.writeFormat.map(_.toUpperCase).exists("BIGQUERY".equals) =>
+        to_date(df.col(c), partitionFormat).as(partitionColumn)
+      case c => df.col(c)
+    }: _*)
 
     createTable(dfRearranged, tableName, partitionColumns, tableProperties, fileFormat, autoExpand)
 
@@ -526,11 +529,6 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
     val dataPointer = DataPointer.from(tableName, sparkSession)
 
     saltedDf
-      .select(saltedDf.columns.map {
-        case c if c == partitionColumn && dataPointer.writeFormat.map(_.toUpperCase).exists("BIGQUERY".equals) =>
-          to_date(saltedDf.col(c), partitionFormat).as(partitionColumn)
-        case c => saltedDf.col(c)
-      }.toList: _*)
       .repartition(shuffleParallelism, repartitionCols.map(saltedDf.col): _*)
       .drop(saltCol)
      .sortWithinPartitions(partitionSortCols.map(col): _*)
```
0 commit comments