
Commit 6b482ea

chore: Remove use of DelegatingTable and bubble up exceptions properly (#638)
## Summary

- It's better to return the underlying table; that way we let Spark understand what capabilities are possible based on class-type matching.
- Also modify the catalog to conform to the TableCatalog contract by throwing NoSuchTableException. Improve logging in TableUtils.

## Checklist

- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **Refactor**
  - Simplified and streamlined table loading and error handling for BigQuery and external tables.
  - Improved reliability by updating exception handling and type-based logic for table formats.
- **Chores**
  - Removed unused classes, objects, and imports to reduce code complexity.

Co-authored-by: Thomas Chow <[email protected]>
1 parent 3cfd5eb commit 6b482ea
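
The summary's first point is easiest to see in a small standalone sketch: once `loadTable` hands back the underlying table, a caller can branch on the concrete class rather than on a synthetic provider property, as the GcpFormatProvider change below does. `formatOf` is a hypothetical helper written only for illustration; the three table classes are the ones this commit imports.

```scala
// Hedged sketch of the motivation: with the wrapper gone, callers (and Spark
// itself) see the table's real class and capabilities directly. `formatOf`
// is hypothetical; the class names mirror this commit's GcpFormatProvider change.
import com.google.cloud.spark.bigquery.v2.Spark31BigQueryTable
import org.apache.iceberg.spark.source.SparkTable
import org.apache.spark.sql.connector.catalog.Table
import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable

def formatOf(table: Table): String = table match {
  case _: SparkTable           => "ICEBERG"  // Iceberg table registered in the BigQuery metastore
  case _: Spark31BigQueryTable => "BIGQUERY" // native BigQuery table via the connector
  case _: ParquetTable         => "PARQUET"  // external table backed by Parquet files
  case other =>
    throw new IllegalStateException(s"Unsupported table type: ${other.getClass.getName}")
}
```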

File tree

2 files changed (+40, -77 lines)

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/DelegatingBigQueryMetastoreCatalog.scala

Lines changed: 27 additions & 65 deletions
@@ -9,61 +9,21 @@ import com.google.cloud.bigquery.{
   TableId
 }
 import com.google.cloud.spark.bigquery.BigQueryCatalog
+import org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog
 import org.apache.iceberg.spark.SparkCatalog
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.connector.catalog._
 import org.apache.spark.sql.connector.catalog.functions.UnboundFunction
 import org.apache.spark.sql.connector.expressions.Transform
-import org.apache.spark.sql.connector.read.ScanBuilder
-import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
-import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
-import org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog

 import java.util
 import scala.jdk.CollectionConverters._
-import scala.util.Try
-
-/** A table that delegates all operations to an internal table, but with additional properties.
-  * This is mostly for enriching SparkTables with metadata that cannot be accessed by spark directly.
-  * For example, we can use a bigquery client to fetch table metadata / properties and then hydrate the Spark table
-  * with that information, before we pass it back to the Spark compute engine.
-  *
-  * Down the line, we could also support custom partition management.
-  */
-class DelegatingTable(internalTable: Table,
-                      additionalProperties: Map[String, String],
-                      partitioning: Option[Array[Transform]] = None)
-    extends Table
-    with SupportsRead
-    with SupportsWrite {
-
-  override def name(): String = internalTable.name
-
-  override def schema(): StructType = internalTable.schema
-
-  override def capabilities(): util.Set[TableCapability] = internalTable.capabilities()
-
-  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder =
-    internalTable.asInstanceOf[SupportsRead].newScanBuilder(options)
-
-  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder =
-    internalTable.asInstanceOf[SupportsWrite].newWriteBuilder(info)
-
-  override def properties(): util.Map[String, String] =
-    (internalTable.properties().asScala ++ additionalProperties).asJava
-
-  override def partitioning(): Array[Transform] = partitioning.getOrElse(internalTable.partitioning())
-
-}
-
-object DelegatingTable {
-  def apply(table: Table, additionalProperties: Map[String, String] = Map.empty): Table =
-    new DelegatingTable(table, additionalProperties = additionalProperties)
-}
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
+import scala.util.{Failure, Success, Try}

 /** Galactus catalog that allows us to interact with BigQuery metastore as a spark catalog. This allows for
   * querying of a variety of table types directly in spark sql or the dataframe api.
@@ -118,13 +78,10 @@ class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNames

   override def loadTable(identNoCatalog: Identifier): Table = {
     Try {
-      val icebergSparkTable = icebergCatalog.loadTable(identNoCatalog)
-      DelegatingTable(icebergSparkTable,
-                      additionalProperties =
-                        Map(TableCatalog.PROP_EXTERNAL -> "false", TableCatalog.PROP_PROVIDER -> "ICEBERG"))
+      icebergCatalog.loadTable(identNoCatalog)
     }
       .recover {
-        case _ => {
+        case noIcebergTableEx: NoSuchTableException => {
           val project =
             catalogProps.getOrElse(BigQueryMetastoreCatalog.PROPERTIES_KEY_GCP_PROJECT, bqOptions.getProjectId)
           val tId = identNoCatalog.namespace().toList match {
@@ -134,7 +91,9 @@ class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNames
              throw new IllegalArgumentException(
                s"Table identifier namespace ${identNoCatalog} must have at least one part.")
          }
-         val table = bigQueryClient.getTable(tId)
+         val table = scala
+           .Option(bigQueryClient.getTable(tId))
+           .getOrElse(throw new NoSuchTableException(s"BigQuery table $identNoCatalog not found."))
          table.getDefinition.asInstanceOf[TableDefinition] match {
            case externalTable: ExternalTableDefinition => {
              val uris = externalTable.getSourceUris.asScala
@@ -146,33 +105,36 @@ class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNames
                uris.head.replaceAll("/\\*\\.parquet$", "")
              }

-             val fileBasedTable = ParquetTable(tId.toString,
-                                               SparkSession.active,
-                                               CaseInsensitiveStringMap.empty(),
-                                               List(uri),
-                                               None,
-                                               classOf[ParquetFileFormat])
-             DelegatingTable(fileBasedTable,
-                             Map(TableCatalog.PROP_EXTERNAL -> "true",
-                                 TableCatalog.PROP_LOCATION -> uri,
-                                 TableCatalog.PROP_PROVIDER -> "PARQUET"))
+             val fileBasedTable = ParquetTable(
+               tId.toString,
+               SparkSession.active,
+               new CaseInsensitiveStringMap(
+                 Map(TableCatalog.PROP_EXTERNAL -> "true",
+                     TableCatalog.PROP_LOCATION -> uri,
+                     TableCatalog.PROP_PROVIDER -> "PARQUET").asJava),
+               List(uri),
+               None,
+               classOf[ParquetFileFormat]
+             )
+             fileBasedTable
            }
            case _: StandardTableDefinition => {
              //todo(tchow): Support partitioning

              // Hack because there's a bug in the BigQueryCatalog where they ignore the projectId.
              // See: https://github.com/GoogleCloudDataproc/spark-bigquery-connector/pull/1340
-             val connectorTable = connectorCatalog.loadTable(Identifier.of(Array(tId.getDataset), tId.getTable))
              // ideally it should be the below:
              // val connectorTable = connectorCatalog.loadTable(ident)
-             DelegatingTable(connectorTable,
-                             Map(TableCatalog.PROP_EXTERNAL -> "false", TableCatalog.PROP_PROVIDER -> "BIGQUERY"))
+             connectorCatalog.loadTable(Identifier.of(Array(tId.getDataset), tId.getTable))
            }
-           case _ => throw new IllegalStateException(s"Cannot support table of type: ${table.getFriendlyName}")
+           case _ => throw new IllegalStateException(s"Cannot support table of type: ${table.getDefinition}")
          }
        }
-      }
-        .getOrElse(throw new NoSuchTableException(f"Table: ${identNoCatalog} not found in bigquery catalog."))
+        case other: Throwable => throw other
+      } match {
+        case Success(table)     => table
+        case Failure(exception) => throw exception
+      }
  }

  override def createTable(ident: Identifier,
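
The reworked `loadTable` above follows the TableCatalog contract: a table that cannot be found surfaces as Spark's `NoSuchTableException`, and every other failure is rethrown instead of being collapsed into a generic not-found error. Below is a minimal standalone sketch of that shape, with `loadIceberg` and `lookupBigQuery` as hypothetical stand-ins for the Iceberg catalog and BigQuery client calls; the `NoSuchTableException(String)` constructor mirrors its usage in this diff.

```scala
// Hedged sketch of the bubbled-up exception flow; the two loader functions
// are hypothetical stand-ins for icebergCatalog.loadTable and
// bigQueryClient.getTable.
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
import org.apache.spark.sql.connector.catalog.Table

import scala.util.{Failure, Success, Try}

def loadOrFail(name: String,
               loadIceberg: String => Table,
               lookupBigQuery: String => Table): Table =
  Try {
    loadIceberg(name) // first choice: the Iceberg view of the table
  }.recover {
    case _: NoSuchTableException =>
      // Not an Iceberg table: fall back to BigQuery, and treat a null lookup
      // result as "table does not exist" per the TableCatalog contract.
      Option(lookupBigQuery(name))
        .getOrElse(throw new NoSuchTableException(s"Table $name not found."))
    case other: Throwable => throw other // bubble up everything else untouched
  } match {
    case Success(table)     => table
    case Failure(exception) => throw exception
  }
```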

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GcpFormatProvider.scala

Lines changed: 13 additions & 12 deletions
@@ -1,13 +1,15 @@
 package ai.chronon.integrations.cloud_gcp
 import ai.chronon.spark.format.{DefaultFormatProvider, Format, Iceberg}
 import com.google.cloud.bigquery._
+import com.google.cloud.spark.bigquery.v2.Spark31BigQueryTable
 import org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog
 import org.apache.iceberg.spark.SparkCatalog
+import org.apache.iceberg.spark.source.SparkTable
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.connector.catalog.TableCatalog
+import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable

 import scala.jdk.CollectionConverters._
-import scala.util.Try
+import scala.util.{Failure, Success, Try}

 class GcpFormatProvider(override val sparkSession: SparkSession) extends DefaultFormatProvider(sparkSession) {

@@ -26,18 +28,17 @@ class GcpFormatProvider(override val sparkSession: SparkSession) extends Default
     cat match {
       case delegating: DelegatingBigQueryMetastoreCatalog =>
         Try {
-          delegating
-            .loadTable(identifier)
-            .properties
-            .asScala
-            .getOrElse(TableCatalog.PROP_PROVIDER, "")
-            .toUpperCase match {
-            case "ICEBERG"  => Iceberg
-            case "BIGQUERY" => BigQueryNative
-            case "PARQUET"  => BigQueryExternal
+          val tbl = delegating.loadTable(identifier)
+          tbl match {
+            case iceberg: SparkTable            => Iceberg
+            case bigquery: Spark31BigQueryTable => BigQueryNative
+            case parquet: ParquetTable          => BigQueryExternal
             case unsupported => throw new IllegalStateException(s"Unsupported provider type: ${unsupported}")
           }
-        }.toOption
+        } match {
+          case s @ Success(_)     => s.toOption
+          case Failure(exception) => throw exception
+        }
       case iceberg: SparkCatalog if (iceberg.icebergCatalog().isInstanceOf[BigQueryMetastoreCatalog]) =>
         scala.Option(Iceberg)
       case _ => super.readFormat(tableName)
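
A usage sketch for the provider after this change. It assumes `readFormat(tableName: String): Option[Format]`, which is not shown in this hunk, and a placeholder table name; the practical difference is that the old `.toOption` swallowed every failure into `None`, while the new `match` rethrows real errors from the catalog.

```scala
// Hedged usage sketch; the readFormat signature and the table name below are
// assumptions, not taken from this diff.
import ai.chronon.spark.format.Format
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val provider = new GcpFormatProvider(spark)

// With this change, a failure inside the catalog (for example a permission
// error) is rethrown here instead of being silently converted to None.
provider.readFormat("data_project.dataset.table") match {
  case Some(format: Format) => println(s"Resolved format: $format")
  case None                 => println("No format resolved for this table")
}
```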
