
Commit 2010a0b

nikhil-zlai, tchow-zlai, and thomaschow authored
fix: write to gcs parquet instead of bq native (#371)
## Checklist
- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit
- **New Features**
  - Enhanced the table creation process to return clear, detailed statuses, improving feedback during table building.
  - Introduced a new method for generating table builders that integrates with BigQuery, including error handling for partitioning.
  - Streamlined data writing operations to cloud storage with automatic path configuration and Parquet integration.
  - Added explicit partitioning for DataFrame saves in Hive, Delta, and Iceberg formats.
- **Refactor**
  - Overhauled logic to enforce partition restrictions and incorporate robust error handling for a smoother user experience.

---------

Co-authored-by: tchow-zlai <[email protected]>
Co-authored-by: Thomas Chow <[email protected]>
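For context, a minimal sketch of the write path this change introduces. The SparkSession, DataFrame, bucket name, and `ds` partition column are hypothetical; the config key and the Parquet-then-external-table flow come from the diffs below.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

// Sketch only: approximates what GCS.generateTableBuilder (below) does end to end.
def writeViaGcsParquet(spark: SparkSession, df: DataFrame, tableName: String): Unit = {
  // 1. Chronon now expects a GCS prefix for table data (bucket name is hypothetical).
  spark.conf.set("spark.chronon.table_write.prefix", "gs://example-chronon-data/")

  // 2. Data lands as Hive-partitioned Parquet under <prefix><sanitized table name>/
  //    (roughly approximating StringOps.sanitize with a regex here).
  val path = "gs://example-chronon-data/" + tableName.replaceAll("[^a-zA-Z0-9_]", "_") + "/"
  df.write
    .partitionBy("ds")
    .mode("overwrite")
    .parquet(path)

  // 3. A BigQuery external table is then registered over path + "*"
  //    (see GCSFormat.generateTableBuilder below) instead of loading a native table.
}
```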
1 parent 9802baf commit 2010a0b

File tree

6 files changed: +190 -81 lines changed


cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/BigQueryFormat.scala

Lines changed: 10 additions & 7 deletions
@@ -1,5 +1,7 @@
 package ai.chronon.integrations.cloud_gcp
 
+import ai.chronon.spark.TableUtils
+import ai.chronon.spark.TableUtils.TableCreationStatus
 import ai.chronon.spark.format.Format
 import com.google.cloud.bigquery.connector.common.BigQueryUtil
 import com.google.cloud.spark.bigquery.SchemaConverters
@@ -26,13 +28,14 @@ case class BigQueryFormat(project: String, bqClient: BigQuery, override val opti
   override def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String])(
       implicit sparkSession: SparkSession): Seq[String] =
     super.primaryPartitions(tableName, partitionColumn, subPartitionsFilter)
-  override def createTable(df: DataFrame,
-                           tableName: String,
-                           partitionColumns: Seq[String],
-                           tableProperties: Map[String, String],
-                           fileFormat: String): (String => Unit) => Unit = {
+  override def generateTableBuilder(df: DataFrame,
+                                    tableName: String,
+                                    partitionColumns: Seq[String],
+                                    tableProperties: Map[String, String],
+                                    fileFormat: String): (String => Unit) => TableCreationStatus = {
 
-    def inner(df: DataFrame, tableName: String, partitionColumns: Seq[String])(sqlEvaluator: String => Unit) = {
+    def inner(df: DataFrame, tableName: String, partitionColumns: Seq[String])(
+        sqlEvaluator: String => Unit): TableCreationStatus = {
 
       // See: https://cloud.google.com/bigquery/docs/partitioned-tables#limitations
       // "BigQuery does not support partitioning by multiple columns. Only one column can be used to partition a table."
@@ -57,8 +60,8 @@ case class BigQueryFormat(project: String, bqClient: BigQuery, override val opti
       val tableInfoBuilder = TableInfo.newBuilder(shadedTableId, tableDefinition.build)
 
       val tableInfo = tableInfoBuilder.build
-
       bqClient.create(tableInfo)
+      TableUtils.TableCreatedWithoutInitialData
     }
 
     inner(df, tableName, partitionColumns)
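A hedged sketch of how a caller might consume the new curried builder and its `TableCreationStatus`: the status values and the `generateTableBuilder` signature come from this PR, while the surrounding helper and the `runSql` evaluator are hypothetical, and it assumes the base `Format` trait exposes `generateTableBuilder` as these overrides imply.

```scala
import ai.chronon.spark.TableUtils
import ai.chronon.spark.TableUtils.TableCreationStatus
import ai.chronon.spark.format.Format
import org.apache.spark.sql.DataFrame

// Hypothetical caller: `runSql` stands in for whatever evaluates DDL on engines
// that need it (the BigQuery builder above never invokes it).
def createTable(format: Format, df: DataFrame, tableName: String, runSql: String => Unit): TableCreationStatus = {
  val builder: (String => Unit) => TableCreationStatus =
    format.generateTableBuilder(df, tableName, Seq("ds"), Map.empty, "PARQUET")

  val status = builder(runSql)
  status match {
    case TableUtils.TableCreatedWithInitialData    => println(s"$tableName created with data already written")
    case TableUtils.TableCreatedWithoutInitialData => println(s"$tableName created; a separate data load is still required")
    case _                                         => ()
  }
  status
}
```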

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GCSFormat.scala

Lines changed: 77 additions & 3 deletions
@@ -1,15 +1,33 @@
 package ai.chronon.integrations.cloud_gcp
 
+import ai.chronon.api.Extensions.StringOps
+import ai.chronon.api.ScalaJavaConversions.JListOps
+import ai.chronon.spark.TableUtils
+import ai.chronon.spark.TableUtils.{TableCreatedWithInitialData, TableCreationStatus}
 import ai.chronon.spark.format.Format
-import org.apache.spark.sql.Encoders
-import org.apache.spark.sql.Row
-import org.apache.spark.sql.SparkSession
+import com.google.cloud.bigquery.connector.common.BigQueryUtil
+import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.{
+  BigQuery,
+  BigQueryOptions,
+  ExternalTableDefinition,
+  FormatOptions,
+  HivePartitioningOptions,
+  TableInfo
+}
+import com.google.cloud.spark.bigquery.{SchemaConverters, SchemaConvertersConfiguration}
+import org.apache.spark.sql.{DataFrame, Encoders, Row, SparkSession}
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.FileSourceScanExec
 import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
+import org.slf4j.LoggerFactory
 
 case class GCS(sourceUri: String, fileFormat: String) extends Format {
 
+  private lazy val logger = LoggerFactory.getLogger(this.getClass.getName)
+
+  private lazy val bqOptions = BigQueryOptions.getDefaultInstance
+  lazy val bigQueryClient: BigQuery = bqOptions.getService
+
   override def name: String = fileFormat
 
   override def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String])(
@@ -72,6 +90,62 @@ case class GCS(sourceUri: String, fileFormat: String) extends Format {
     }.toMap)
   }
 
+  override def generateTableBuilder(df: DataFrame,
+                                    tableName: String,
+                                    partitionColumns: Seq[String],
+                                    tableProperties: Map[String, String],
+                                    fileFormat: String): (String => Unit) => TableCreationStatus = {
+
+    def inner(df: DataFrame, tableName: String, partitionColumns: Seq[String])(sqlEvaluator: String => Unit) = {
+
+      // See: https://cloud.google.com/bigquery/docs/partitioned-tables#limitations
+      // "BigQuery does not support partitioning by multiple columns. Only one column can be used to partition a table."
+      require(partitionColumns.size < 2,
+              s"BigQuery only supports at most one partition column, incoming spec: ${partitionColumns}")
+
+      val shadedTableId = BigQueryUtil.parseTableId(tableName)
+
+      val writePrefix = TableUtils(df.sparkSession).writePrefix
+      require(writePrefix.nonEmpty, "Please set conf 'spark.chronon.table_write.prefix' pointing to a data bucket.")
+
+      val path = writePrefix.get + tableName.sanitize + "/" // split("/").map(_.sanitize).mkString("/")
+      val dataGlob = path + "*"
+
+      logger.info(s"""
+                     |table source uri: $dataGlob
+                     |partition uri: $path
+                     |""".stripMargin)
+
+      df.write
+        .partitionBy(partitionColumns: _*)
+        .mode("overwrite") // or "append" based on your needs
+        .parquet(path)
+
+      val baseTableDef = ExternalTableDefinition
+        .newBuilder(dataGlob, FormatOptions.parquet())
+        .setAutodetect(true)
+
+      if (partitionColumns.nonEmpty) {
+        val timePartitioning = HivePartitioningOptions
+          .newBuilder()
+          .setFields(partitionColumns.toJava)
+          .setSourceUriPrefix(path)
+          .setMode("STRINGS")
+          .build()
+        baseTableDef.setHivePartitioningOptions(timePartitioning)
+      }
+
+      val tableInfo = TableInfo.newBuilder(shadedTableId, baseTableDef.build).build()
+      val createdTable = bigQueryClient.create(tableInfo)
+
+      println(s"Created external table ${createdTable.getTableId}")
+
+      TableCreatedWithInitialData
+    }
+
+    inner(df, tableName, partitionColumns)
+  }
+
   def createTableTypeString: String = throw new UnsupportedOperationException("GCS does not support create table")
 
   def fileFormatString(format: String): String = ""
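For orientation, a hedged illustration of the layout the builder above produces and how it maps back to the `GCS` format descriptor; the bucket, table, and partition column names are hypothetical.

```scala
// Assuming spark.chronon.table_write.prefix = "gs://example-chronon-data/" and a
// single partition column "ds", the builder above ends up with roughly:
//
//   Parquet data:    gs://example-chronon-data/project_dataset_my_table/ds=2025-01-01/part-*.parquet
//   External table:  source URIs             = gs://example-chronon-data/project_dataset_my_table/*
//                    sourceUriPrefix         = gs://example-chronon-data/project_dataset_my_table/
//                    hive partitioning mode  = STRINGS (partition values exposed as STRING columns)
//
// Reads over that data can then be described by the same case class defined above:
val gcsFormat = GCS(sourceUri = "gs://example-chronon-data/project_dataset_my_table/", fileFormat = "PARQUET")
```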

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GcpFormatProvider.scala

Lines changed: 13 additions & 31 deletions
@@ -1,18 +1,9 @@
 package ai.chronon.integrations.cloud_gcp
-import ai.chronon.spark.format.Format
-import ai.chronon.spark.format.FormatProvider
+import ai.chronon.api.Extensions.StringOps
+import ai.chronon.spark.TableUtils
+import ai.chronon.spark.format.{Format, FormatProvider}
 import com.google.cloud.bigquery.connector.common.BigQueryUtil
-import com.google.cloud.spark.bigquery.SparkBigQueryConfig
-import com.google.cloud.spark.bigquery.SparkBigQueryConfig.IntermediateFormat
-import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQuery
-import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryOptions
-import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.ExternalTableDefinition
-import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.FormatOptions
-import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.JobInfo
-import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.StandardTableDefinition
-import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.Table
-import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.TableDefinition
-import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.TableId
+import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery._
 import org.apache.spark.sql.SparkSession
 
 import scala.jdk.CollectionConverters._
@@ -38,24 +29,14 @@ case class GcpFormatProvider(sparkSession: SparkSession) extends FormatProvider
       }
       .getOrElse(tableName)
 
-  override def readFormat(tableName: String): Option[Format] = format(tableName)
+  override def readFormat(tableName: String): scala.Option[Format] = format(tableName)
 
   override def writeFormat(table: String): Format = {
-    val tableId = BigQueryUtil.parseTableId(table)
-    assert(scala.Option(tableId.getProject).isDefined, s"project required for ${table}")
-    assert(scala.Option(tableId.getDataset).isDefined, s"dataset required for ${table}")
+    val writePrefix = TableUtils(sparkSession).writePrefix
+    require(writePrefix.nonEmpty, "Please set conf 'spark.chronon.table_write.prefix' pointing to a data bucket.")
 
-    val sparkOptions: Map[String, String] = Map(
-      "temporaryGcsBucket" -> sparkSession.conf.get("spark.chronon.table.gcs.temporary_gcs_bucket"),
-      "writeMethod" -> "indirect",
-      SparkBigQueryConfig.INTERMEDIATE_FORMAT_OPTION -> IntermediateFormat.PARQUET.getDataSource,
-      SparkBigQueryConfig.ENABLE_LIST_INFERENCE -> true.toString,
-      "materializationProject" -> tableId.getProject,
-      "materializationDataset" -> tableId.getDataset,
-      "createDisposition" -> JobInfo.CreateDisposition.CREATE_NEVER.name
-    )
-
-    BigQueryFormat(tableId.getProject, bigQueryClient, sparkOptions)
+    val path = writePrefix.get + table.sanitize // split("/").map(_.sanitize).mkString("/")
+    GCS(path, "PARQUET")
   }
 
   private[cloud_gcp] def getFormat(table: Table): Format =
@@ -65,7 +46,8 @@ case class GcpFormatProvider(sparkSession: SparkSession) extends FormatProvider
       val formatOptions = definition.getFormatOptions
         .asInstanceOf[FormatOptions]
       val externalTable = table.getDefinition.asInstanceOf[ExternalTableDefinition]
-      val uri = Option(externalTable.getHivePartitioningOptions)
+      val uri = scala
+        .Option(externalTable.getHivePartitioningOptions)
         .map(_.getSourceUriPrefix)
         .getOrElse {
           val uris = externalTable.getSourceUris.asScala
@@ -81,10 +63,10 @@ case class GcpFormatProvider(sparkSession: SparkSession) extends FormatProvider
     case _ => throw new IllegalStateException(s"Cannot support table of type: ${table.getFriendlyName}")
   }
 
-  private def format(tableName: String): Option[Format] = {
+  private def format(tableName: String): scala.Option[Format] = {
 
     val btTableIdentifier: TableId = BigQueryUtil.parseTableId(tableName)
-    val table = Option(bigQueryClient.getTable(btTableIdentifier.getDataset, btTableIdentifier.getTable))
+    val table = scala.Option(bigQueryClient.getTable(btTableIdentifier.getDataset, btTableIdentifier.getTable))
     table
       .map(getFormat)
 
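A brief hedged sketch of the new write-side resolution, assuming a hypothetical bucket and table name and that `StringOps.sanitize` replaces non-alphanumeric characters with underscores.

```scala
import org.apache.spark.sql.SparkSession

// Previously writeFormat configured the Spark BigQuery connector's indirect write
// into a native table; after this change it resolves straight to the GCS/Parquet format.
def resolveWriteFormat(spark: SparkSession): Unit = {
  spark.conf.set("spark.chronon.table_write.prefix", "gs://example-chronon-data/")
  val format = GcpFormatProvider(spark).writeFormat("project.dataset.my_table")
  // Expected, per the diff above: GCS("gs://example-chronon-data/project_dataset_my_table", "PARQUET")
  println(format)
}
```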

spark/src/main/scala/ai/chronon/spark/Extensions.scala

Lines changed: 2 additions & 0 deletions
@@ -326,6 +326,7 @@ object Extensions {
       case "hive" | "delta" | "iceberg" =>
         optionDfw
           .format(normalized)
+          .partitionBy(null: _*)
           .insertInto(dataPointer.tableOrPath)
       case _ =>
         throw new UnsupportedOperationException(s"Unsupported write catalog: ${normalized}")
@@ -335,6 +336,7 @@
         // None case is just table against default catalog
         optionDfw
           .format("hive")
+          .partitionBy(null: _*)
           .insertInto(dataPointer.tableOrPath))
   }
 }
