Skip to content

Commit eb08cb1

Browse files
fix: properly detect bigquery catalog
Co-authored-by: Thomas Chow <[email protected]>
1 parent 6547bf0 commit eb08cb1

File tree

1 file changed

+42
-12
lines changed

1 file changed

+42
-12
lines changed

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GcpFormatProvider.scala

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@ package ai.chronon.integrations.cloud_gcp
22
import ai.chronon.spark.format.{DefaultFormatProvider, Format, Iceberg}
33
import com.google.cloud.bigquery._
44
import com.google.cloud.iceberg.bigquery.relocated.com.google.api.services.bigquery.model.TableReference
5+
import com.google.cloud.spark.bigquery.BigQueryCatalog
56
import org.apache.iceberg.exceptions.NoSuchIcebergTableException
6-
import org.apache.iceberg.gcp.bigquery.{BigQueryClient, BigQueryClientImpl}
7+
import org.apache.iceberg.gcp.bigquery.{BigQueryClient, BigQueryClientImpl, BigQueryMetastoreCatalog}
8+
import org.apache.iceberg.spark.SparkCatalog
79
import org.apache.spark.sql.SparkSession
810

911
import scala.jdk.CollectionConverters._
@@ -23,19 +25,47 @@ class GcpFormatProvider(override val sparkSession: SparkSession) extends Default
2325
private lazy val icebergClient: BigQueryClient = new BigQueryClientImpl()
2426

2527
/** Resolves the storage [[Format]] for `tableName`, routing by owning catalog.
  *
  * Tables living in a BigQuery-backed catalog are resolved through the BigQuery
  * client; any other catalog falls back to the default provider's detection.
  *
  * @param tableName possibly multi-part table identifier (catalog.namespace.table)
  * @return the detected format, or None when the default provider finds nothing
  * @throws IllegalStateException when the table claims a BigQuery catalog but
  *         cannot be resolved there — that inconsistency is not recoverable here
  */
override def readFormat(tableName: String): scala.Option[Format] = {
  val catalogName = getCatalog(tableName)

  if (!isBigQueryCatalog(catalogName)) {
    logger.info(s"Detected non-BigQuery catalog: $catalogName")
    super.readFormat(tableName)
  } else {
    logger.info(s"Detected BigQuery catalog: $catalogName")
    val lookup = Try {
      val tableId = SparkBQUtils.toTableId(tableName)(sparkSession)
      getFormat(bigQueryClient.getTable(tableId))
    }
    lookup.fold(
      // The catalog says BigQuery owns this table, yet the lookup failed:
      // surface the inconsistency instead of silently falling through.
      cause =>
        throw new IllegalStateException(
          s"${tableName} belongs to bigquery catalog ${catalogName} but could not be found",
          cause),
      detected => scala.Option(detected)
    )
  }
}
2749

28-
// order is important here. we want the Hive case where we just check for table in catalog to be last
29-
Try {
30-
val btTableIdentifier = SparkBQUtils.toTableId(tableName)(sparkSession)
31-
val bqTable = bigQueryClient.getTable(btTableIdentifier)
32-
getFormat(bqTable)
33-
} match {
34-
case Success(format) => scala.Option(format)
35-
case Failure(e) =>
36-
logger.info(s"${tableName} is not a BigQuery table")
37-
super.readFormat(tableName)
50+
/** Extracts the catalog portion of a (possibly partially qualified) table name.
  *
  * A fully qualified name (`catalog.namespace.table`) yields its explicit
  * catalog; shorter forms (`namespace.table`, `table`) fall back to the
  * session's current catalog.
  *
  * Fixes vs. previous revision: the match bindings `namespace`/`tableName`
  * shadowed the method parameter (compiler warning, misleading) — unused
  * components are now wildcards — and the log message no longer claims to be
  * retrieving a read format.
  *
  * @param tableName table identifier to parse
  * @return the catalog name owning the table
  * @throws IllegalStateException for identifiers with more than three parts
  */
private def getCatalog(tableName: String): String = {
  logger.info(s"Resolving catalog for table: ${tableName}")
  val parts = sparkSession.sessionState.sqlParser.parseMultipartIdentifier(tableName)
  parts match {
    case catalog :: _ :: _ :: Nil => catalog // catalog.namespace.table
    case _ :: _ :: Nil            => sparkSession.catalog.currentCatalog() // namespace.table
    case _ :: Nil                 => sparkSession.catalog.currentCatalog() // bare table
    case _ =>
      throw new IllegalStateException(s"Invalid table naming convention specified: ${tableName}")
  }
}
61+
62+
/** Returns true when the named Spark catalog is backed by BigQuery.
  *
  * Three shapes count as "BigQuery": our delegating metastore catalog, the
  * Spark BigQuery connector's catalog, and an Iceberg [[SparkCatalog]] whose
  * underlying catalog is a [[BigQueryMetastoreCatalog]].
  *
  * Rewritten from an `isInstanceOf`/`asInstanceOf` chain to an exhaustive
  * pattern match — same semantics, no unchecked casts.
  *
  * @param catalog name registered with the session's catalog manager
  */
private def isBigQueryCatalog(catalog: String): Boolean = {
  sparkSession.sessionState.catalogManager.catalog(catalog) match {
    case _: DelegatingBigQueryMetastoreCatalog => true
    case _: BigQueryCatalog                    => true
    case iceberg: SparkCatalog                 => iceberg.icebergCatalog().isInstanceOf[BigQueryMetastoreCatalog]
    case _                                     => false
  }
}
4070

4171
private[cloud_gcp] def getFormat(table: Table): Format = {

0 commit comments

Comments
 (0)