Skip to content

Commit cbacae3

Browse files
feat: do partition filtering on bq native tables by union individual partitions (#690)
## Summary - Querying a range of partitions in BigQuery native tables returns a 403 error: ``` Response too large to return. Consider specifying a destination table in your job configuration ``` - Instead, query individual partitions of data as separate dataframes and union them together. ## Checklist - [ ] Added Unit Tests - [ ] Covered by existing CI - [ ] Integration tested - [ ] Documentation update <!-- av pr metadata This information is embedded by the av CLI when creating PRs to track the status of stacks when using Aviator. Please do not delete or edit this section of the PR. ``` {"parent":"main","parentHead":"","trunk":"main"} ``` --> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **Bug Fixes** - Improved handling of BigQuery partitioned tables, ensuring more accurate partition filtering and data retrieval. - **Refactor** - Streamlined the process for reading partitioned data from BigQuery, resulting in a clearer and more consistent approach for users working with partitioned tables. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Co-authored-by: Thomas Chow <[email protected]>
1 parent 0e3c2e4 commit cbacae3

File tree

1 file changed

+31
-26
lines changed

1 file changed

+31
-26
lines changed

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/BigQueryNative.scala

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import ai.chronon.spark.catalog.Format
55
import com.google.cloud.bigquery.BigQueryOptions
66
import com.google.cloud.spark.bigquery.v2.Spark35BigQueryTableProvider
77
import org.apache.spark.sql.{DataFrame, SparkSession}
8-
import org.apache.spark.sql.functions.{col, date_format, to_date}
8+
import org.apache.spark.sql.functions.{col, date_format, to_date, lit}
99

1010
case object BigQueryNative extends Format {
1111

@@ -16,56 +16,61 @@ case object BigQueryNative extends Format {
1616

1717
override def table(tableName: String, partitionFilters: String)(implicit sparkSession: SparkSession): DataFrame = {
1818
import sparkSession.implicits._
19+
20+
// First, need to clean the spark-based table name for the bigquery queries below.
1921
val bqTableId = SparkBQUtils.toTableId(tableName)
2022
val providedProject = scala.Option(bqTableId.getProject).getOrElse(bqOptions.getProjectId)
2123
val bqFriendlyName = f"${providedProject}.${bqTableId.getDataset}.${bqTableId.getTable}"
2224

25+
// Then, we query the BQ information schema to grab the table's partition column.
2326
val partColsSql =
2427
s"""
25-
|SELECT column_name, IS_SYSTEM_DEFINED FROM `${providedProject}.${bqTableId.getDataset}.INFORMATION_SCHEMA.COLUMNS`
28+
|SELECT column_name FROM `${providedProject}.${bqTableId.getDataset}.INFORMATION_SCHEMA.COLUMNS`
2629
|WHERE table_name = '${bqTableId.getTable}' AND is_partitioning_column = 'YES'
2730
|
2831
|""".stripMargin
2932

30-
val (partColName, systemDefined) = sparkSession.read
33+
val partColName = sparkSession.read
3134
.format(bqFormat)
3235
.option("project", providedProject)
3336
// See: https://github.com/GoogleCloudDataproc/spark-bigquery-connector/issues/434#issuecomment-886156191
3437
// and: https://cloud.google.com/bigquery/docs/information-schema-intro#limitations
3538
.option("viewsEnabled", true)
3639
.option("materializationDataset", bqTableId.getDataset)
3740
.load(partColsSql)
38-
.as[(String, String)]
41+
.as[String]
3942
.collect
4043
.headOption
41-
.getOrElse(throw new UnsupportedOperationException(s"No partition column for table ${tableName} found."))
42-
43-
val isPseudoColumn = systemDefined match {
44-
case "YES" => true
45-
case "NO" => false
46-
case _ => throw new IllegalArgumentException(s"Unknown partition column system definition: ${systemDefined}")
47-
}
48-
49-
logger.info(
50-
s"Found bigquery partition column: ${partColName} with system defined status: ${systemDefined} for table: ${tableName}")
44+
.getOrElse(
45+
throw new UnsupportedOperationException(s"No partition column for table ${tableName} found.")
46+
) // TODO: support unpartitioned tables (uncommon case).
5147

48+
// Next, we query the BQ table using the requested partitionFilter to grab all the distinct partition values that match the filter.
5249
val partitionWheres = if (partitionFilters.nonEmpty) s"WHERE ${partitionFilters}" else partitionFilters
5350
val partitionFormat = TableUtils(sparkSession).partitionFormat
54-
val dfw = sparkSession.read
51+
val select = s"SELECT distinct(${partColName}) AS ${internalBQCol} FROM ${bqFriendlyName} ${partitionWheres}"
52+
val selectedParts = sparkSession.read
5553
.format(bqFormat)
5654
.option("viewsEnabled", true)
5755
.option("materializationDataset", bqTableId.getDataset)
58-
if (isPseudoColumn) {
59-
val select = s"SELECT ${partColName} AS ${internalBQCol}, * FROM ${bqFriendlyName} ${partitionWheres}"
60-
logger.info(s"BQ select: ${select}")
61-
dfw
62-
.load(select)
63-
.withColumn(partColName, date_format(col(internalBQCol), partitionFormat))
64-
.drop(internalBQCol)
65-
} else {
66-
dfw
67-
.load(s"SELECT * FROM ${bqFriendlyName} ${partitionWheres}")
68-
}
56+
.load(select)
57+
.select(date_format(col(internalBQCol), partitionFormat))
58+
.as[String]
59+
.collect
60+
.toList
61+
logger.info(s"Part values: ${selectedParts}")
62+
63+
// Finally, we query the BQ table for each of the selected partition values and union them together.
64+
selectedParts
65+
.map((partValue) => {
66+
val pFilter = f"${partColName} = '${partValue}'"
67+
sparkSession.read
68+
.format(bqFormat)
69+
.option("filter", pFilter)
70+
.load(bqFriendlyName)
71+
.withColumn(partColName, lit(partValue))
72+
}) // todo: make it nullable
73+
.reduce(_ unionByName _)
6974
}
7075

7176
override def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String])(

0 commit comments

Comments
 (0)