
Commit bd30e04

tchow-zlai and thomaschow authored and committed
feat: Make delegating BigQueryMetastore just a SparkCatalog (#520)
## Summary

- Do not use the `DelegatingBigQueryMetastore` as a session catalog; have it be a custom catalog only. This changes the configuration set from:

```bash
spark.sql.catalog.spark_catalog.warehouse: "gs://zipline-warehouse-etsy/data/tables/"
spark.sql.catalog.spark_catalog.gcp_location: "us"
spark.sql.catalog.spark_catalog.gcp_project: "etsy-zipline-dev"
spark.sql.catalog.spark_catalog.catalog-impl: org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog
spark.sql.catalog.spark_catalog: ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog
spark.sql.catalog.spark_catalog.io-impl: org.apache.iceberg.io.ResolvingFileIO
spark.sql.catalog.default_iceberg: ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog
spark.sql.catalog.default_iceberg.catalog-impl: org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog
spark.sql.catalog.default_iceberg.io-impl: org.apache.iceberg.io.ResolvingFileIO
spark.sql.catalog.default_iceberg.warehouse: "gs://zipline-warehouse-etsy/data/tables/"
spark.sql.catalog.default_iceberg.gcp_location: "us"
spark.sql.catalog.default_iceberg.gcp_project: "etsy-zipline-dev"
spark.sql.defaultUrlStreamHandlerFactory.enabled: "false"
spark.kryo.registrator: "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator"
```

to:

```bash
spark.sql.defaultCatalog: "default_iceberg"
spark.sql.catalog.default_iceberg: "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog"
spark.sql.catalog.default_iceberg.catalog-impl: "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog"
spark.sql.catalog.default_iceberg.io-impl: "org.apache.iceberg.io.ResolvingFileIO"
spark.sql.catalog.default_iceberg.warehouse: "gs://zipline-warehouse-etsy/data/tables/"
spark.sql.catalog.default_iceberg.gcp_location: "us"
spark.sql.catalog.default_iceberg.gcp_project: "etsy-zipline-dev"
spark.sql.defaultUrlStreamHandlerFactory.enabled: "false"
spark.kryo.registrator: "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator"
spark.sql.catalog.default_bigquery: "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog"
```

## Checklist

- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **Refactor**
  - Improved internal table processing by restructuring class integrations and enhancing the error message when a table isn't found.
- **Tests**
  - Updated integration settings and adjusted reference parameters to keep validations aligned with the new catalog implementation.

---------

Co-authored-by: Thomas Chow <[email protected]>
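For context, here is a minimal sketch of what the post-change configuration might look like when applied programmatically to a Spark session. It simply mirrors the "to" block above; the warehouse bucket, project, and location values are the examples from this summary, and the table identifier queried at the end is hypothetical.

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: mirrors the "to" configuration block above. The bucket,
// project, and location values are the examples from this PR's summary.
val spark = SparkSession
  .builder()
  .appName("delegating-bqms-sketch")
  .config("spark.sql.defaultCatalog", "default_iceberg")
  .config("spark.sql.catalog.default_iceberg",
          "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog")
  .config("spark.sql.catalog.default_iceberg.catalog-impl",
          "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog")
  .config("spark.sql.catalog.default_iceberg.io-impl", "org.apache.iceberg.io.ResolvingFileIO")
  .config("spark.sql.catalog.default_iceberg.warehouse", "gs://zipline-warehouse-etsy/data/tables/")
  .config("spark.sql.catalog.default_iceberg.gcp_location", "us")
  .config("spark.sql.catalog.default_iceberg.gcp_project", "etsy-zipline-dev")
  .config("spark.sql.catalog.default_bigquery",
          "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog")
  .getOrCreate()

// With spark.sql.defaultCatalog set, unqualified identifiers resolve through
// the delegating catalog rather than the built-in spark_catalog.
// "data.sample_table" is a hypothetical identifier for illustration.
spark.sql("SELECT * FROM data.sample_table LIMIT 10").show()
```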
1 parent 18b9988 commit bd30e04

File tree

2 files changed (+5, -11 lines)


cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/DelegatingBigQueryMetastoreCatalog.scala

Lines changed: 3 additions & 9 deletions

```diff
@@ -20,6 +20,7 @@ import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
 
 import java.util
 import scala.jdk.CollectionConverters._
@@ -77,16 +78,14 @@ object DelegatingTable {
  * NOTE that this abstraction currently only supports querying tables that all belong to the same GCP project. Multi-project
  * support will depend on underlying libraries to support them.
  */
-class DelegatingBigQueryMetastoreCatalog extends CatalogExtension {
+class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNamespaces with FunctionCatalog {
 
   @transient private lazy val bqOptions = BigQueryOptions.getDefaultInstance
   @transient private lazy val bigQueryClient: BigQuery = bqOptions.getService
 
   @transient private lazy val icebergCatalog: SparkCatalog = new SparkCatalog()
   @transient private lazy val connectorCatalog: BigQueryCatalog = new BigQueryCatalog()
 
-  // Some stupid spark settings.
-  private var defaultSessionCatalog: CatalogPlugin = null
   private var catalogName: String =
     null // This corresponds to `spark_catalog` in `spark.sql.catalog.spark_catalog`. This is necessary for spark to correctly choose which implementation to use.
 
@@ -160,7 +159,7 @@ class DelegatingBigQueryMetastoreCatalog extends CatalogExtension {
         }
       }
     }
-      .getOrElse(defaultSessionCatalog.asInstanceOf[TableCatalog].loadTable(rawIdent))
+      .getOrElse(throw new NoSuchTableException(f"Table: ${ident} not found in bigquery catalog."))
   }
 
   override def createTable(ident: Identifier,
@@ -192,12 +191,7 @@ class DelegatingBigQueryMetastoreCatalog extends CatalogExtension {
 
   override def name(): String = catalogName
 
-  override def setDelegateCatalog(delegate: CatalogPlugin): Unit = {
-    defaultSessionCatalog = delegate
-  }
-
   override def listFunctions(namespace: Array[String]): Array[Identifier] = icebergCatalog.listFunctions(namespace)
 
   override def loadFunction(ident: Identifier): UnboundFunction = icebergCatalog.loadFunction(ident)
-
 }
```
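The core behavioral change is in `loadTable`: instead of falling back to a delegated session catalog, the catalog now surfaces a `NoSuchTableException` when neither backend knows the identifier. Below is a minimal sketch of that fallback shape. The `icebergCatalog` and `connectorCatalog` parameters stand in for the real class members; the actual `loadTable` also handles other table formats, so this is an illustration, not the implementation.

```scala
import scala.util.Try
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog}

// Sketch of the try-Iceberg-then-BigQuery resolution order implied by the diff:
// attempt the Iceberg catalog, fall back to the BigQuery connector catalog, and
// throw instead of delegating to a session catalog when both lookups fail.
def loadTableSketch(icebergCatalog: TableCatalog,
                    connectorCatalog: TableCatalog,
                    ident: Identifier): Table =
  Try(icebergCatalog.loadTable(ident))
    .orElse(Try(connectorCatalog.loadTable(ident)))
    .getOrElse(throw new NoSuchTableException(s"Table: ${ident} not found in bigquery catalog."))
```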

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/BigQueryCatalogTest.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -41,7 +41,7 @@ class BigQueryCatalogTest extends AnyFlatSpec with MockitoSugar {
     "spark.sql.catalogImplementation" -> "in-memory",
 
     // "spark.sql.defaultCatalog" -> "default_iceberg",
-    // "spark.sql.catalog.default_iceberg" -> classOf[SparkCatalog].getName,
+    // "spark.sql.catalog.default_iceberg" -> classOf[DelegatingBigQueryMetastoreCatalog].getName,
     // "spark.sql.catalog.default_iceberg.catalog-impl" -> classOf[BQMSCatalog].getName,
     // "spark.sql.catalog.default_iceberg.io-impl" -> classOf[ResolvingFileIO].getName,
     // "spark.sql.catalog.default_iceberg.warehouse" -> "gs://zipline-warehouse-canary/data/tables/",
@@ -95,7 +95,7 @@ class BigQueryCatalogTest extends AnyFlatSpec with MockitoSugar {
   }
 
   it should "integration testing bigquery native table" ignore {
-    val nativeTable = "data.sample_native"
+    val nativeTable = "data.checkouts"
     val table = tableUtils.loadTable(nativeTable)
     table.show
     // val database = tableUtils.createDatabase("test_database")
```
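As a usage note, with the new configuration the test's table can be addressed either through the default catalog or with an explicit catalog qualifier, since `default_bigquery` registers a second instance of the delegating catalog. A hedged sketch, assuming the session from the earlier example:

```scala
// "data.checkouts" is the table name from the updated test; "default_bigquery"
// is the second delegating-catalog registration from the new configuration.
spark.sql("SELECT * FROM data.checkouts LIMIT 5").show()                  // via spark.sql.defaultCatalog
spark.sql("SELECT * FROM default_bigquery.data.checkouts LIMIT 5").show() // explicit catalog qualifier
```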
