Commit 492c4d0

feat: TableUtils to be compatible with DataPointer (part 1) (#158)
## Summary

- Fixing some earlier bugs: case matching on Java classes doesn't behave the same way as matching on Scala case classes (see the sketch below the commit header).
- Adding a few more params to the GCS format. We need the source URI and the `format` string (which is basically the file format).
- Deleting some queries against INFORMATION_SCHEMA in the GCS format; they are no longer needed since we are using the BQ client.
- Adding some code to handle Spark InternalRows. We are using a low-level implementation to get at the InMemoryFileIndex, which contains the file partitions. It gives us internal rows, so we need to translate those into rows, which involves the correct serialization based on the column types.
- Add a couple of tests to BigQueryCatalogTest.
- Adding a `name` field to `Format`.
- Begin to migrate some TableUtils methods to delegate to DataPointer.
- https://app.asana.com/0/1208949807589885/1208960391734329/f
- https://app.asana.com/0/1208949807589885/1208960391734331/f

## Checklist

- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Enhanced BigQuery and GCS format handling with improved table name resolution and data source support.
  - Updated Spark table utilities with more robust data loading and management capabilities.
  - Introduced new methods for resolving table names and handling data formats.
  - Added support for new dependencies related to Google Cloud Dataproc.
  - Introduced unit tests for GCS format functionality.
- **Bug Fixes**
  - Improved error handling for data source formats and table operations.
  - Streamlined data pointer operations for better format compatibility.
- **Refactor**
  - Simplified data loading and schema retrieval methods.
  - Consolidated format handling logic in data source operations.
  - Enhanced organization and clarity in data pointer handling.
  - Cleaned up dependency declarations and project settings in build configuration.
  - Improved error handling and control flow in join computation processes.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

<!-- av pr metadata
This information is embedded by the av CLI when creating PRs to track the status of stacks when using Aviator. Please do not delete or edit this section of the PR.
```
{"parent":"main","parentHead":"","trunk":"main"}
```
-->

---------

Co-authored-by: Thomas Chow <[email protected]>
1 parent a41fdb4 commit 492c4d0
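
The first Summary bullet points at the difference between matching on Java classes and on Scala case classes, which is presumably why the new `GcpFormatProvider` switches to `isInstanceOf` checks on BigQuery's `TableDefinition` subclasses. A minimal illustrative sketch of that distinction; the `GCS` case class here only mirrors the shape introduced in this PR, and the helper names are hypothetical:

```scala
import com.google.cloud.bigquery.{ExternalTableDefinition, StandardTableDefinition, TableDefinition}

// Scala case classes get a compiler-generated unapply, so a match can destructure them.
case class GCS(project: String, sourceUri: String, fileFormat: String)

def uriOf(format: Any): Option[String] = format match {
  case GCS(_, uri, _) => Some(uri) // fields bound positionally via unapply
  case _              => None
}

// Java classes, such as BigQuery's TableDefinition hierarchy, expose no extractors:
// a match can only perform type tests, equivalent to the isInstanceOf checks in the diff below.
def kindOf(definition: TableDefinition): String = definition match {
  case _: ExternalTableDefinition => "external (GCS-backed)"
  case _: StandardTableDefinition => "native BigQuery"
  case other                      => throw new IllegalStateException(s"Cannot support table of type: $other")
}
```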

File tree

7 files changed: +231 -174 lines changed


build.sbt

Lines changed: 6 additions & 9 deletions

@@ -215,6 +215,7 @@ lazy val cloud_gcp = project
     libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "3.0.3", // it's what's on the cluster
     libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop3-2.2.26",
     libraryDependencies += "com.google.cloud.bigdataoss" % "gcsio" % "3.0.3", // need it for https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcsio/src/main/java/com/google/cloud/hadoop/gcsio/GoogleCloudStorageFileSystem.java
+    libraryDependencies += "com.google.cloud.bigdataoss" % "util-hadoop" % "3.0.0", // need it for https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/util-hadoop/src/main/java/com/google/cloud/hadoop/util/HadoopConfigurationProperty.java
     libraryDependencies += "io.circe" %% "circe-yaml" % "1.15.0",
     libraryDependencies += "com.google.cloud.spark" %% s"spark-bigquery-with-dependencies" % "0.41.0",
     libraryDependencies += "com.google.cloud.bigtable" % "bigtable-hbase-2.x" % "2.14.2",
@@ -389,7 +390,6 @@ lazy val hub = (project in file("hub"))
   }
 )

-
 val scala_test = "org.scalatest" %% "scalatest" % "3.2.19" % "test"
 val sl4j = "org.slf4j" % "slf4j-api" % slf4jApiVersion
 val logback = "ch.qos.logback" % "logback-classic" % logbackClassicVersion
@@ -403,25 +403,22 @@ val commonDependencies = Seq(
 lazy val orchestration = project
   .dependsOn(online.%("compile->compile;test->test"))
   .settings(
-
     assembly / mainClass := Some("ai.chronon.orchestration.RepoParser"),
     Compile / run / mainClass := Some("ai.chronon.orchestration.RepoParser"),
-
     assembly / assemblyMergeStrategy := {
-      case "log4j2.properties" => MergeStrategy.first
+      case "log4j2.properties"                  => MergeStrategy.first
       case "META-INF/log4j-provider.properties" => MergeStrategy.first
-      case PathList("org", "apache", "logging", "log4j", "core", "config", "plugins", "Log4j2Plugins.dat") => MergeStrategy.first
+      case PathList("org", "apache", "logging", "log4j", "core", "config", "plugins", "Log4j2Plugins.dat") =>
+        MergeStrategy.first
       case x => (assembly / assemblyMergeStrategy).value(x)
     },
-
     libraryDependencies ++= commonDependencies ++ Seq(
       "org.apache.logging.log4j" % "log4j-api" % log4j2_version,
       "org.apache.logging.log4j" % "log4j-core" % log4j2_version,
-      "org.apache.logging.log4j" % "log4j-slf4j-impl" % log4j2_version,
-    ),
+      "org.apache.logging.log4j" % "log4j-slf4j-impl" % log4j2_version
+    )
   )

-
 ThisBuild / assemblyMergeStrategy := {
   case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard
   case PathList("META-INF", _*) => MergeStrategy.filterDistinctLines

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/BigQueryFormat.scala

Lines changed: 82 additions & 44 deletions

@@ -5,63 +5,94 @@ import ai.chronon.spark.FormatProvider
 import ai.chronon.spark.Hive
 import com.google.cloud.bigquery.BigQueryOptions
 import com.google.cloud.bigquery.ExternalTableDefinition
+import com.google.cloud.bigquery.FormatOptions
 import com.google.cloud.bigquery.StandardTableDefinition
+import com.google.cloud.bigquery.Table
 import com.google.cloud.bigquery.connector.common.BigQueryUtil
-import com.google.cloud.bigquery.{TableId => BTableId}
 import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.TableId
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.{col, to_date}
+
+import scala.collection.JavaConverters._

 case class GcpFormatProvider(sparkSession: SparkSession) extends FormatProvider {
+  // Order of Precedence for Default Project
+  // Explicitly configured project in code (e.g., setProjectId()).
+  // GOOGLE_CLOUD_PROJECT environment variable.
+  // project_id from the ADC service account JSON file.
+  // Active project in the gcloud CLI configuration.
+  // No default project: An error will occur if no project ID is available.
+  lazy val bqOptions = BigQueryOptions.getDefaultInstance
+  lazy val bigQueryClient = bqOptions.getService
+
+  override def resolveTableName(tableName: String): String = {
+    format(tableName: String) match {
+      case GCS(_, uri, _) => uri
+      case _ => tableName
+    }
+  }
+
+  override def readFormat(tableName: String): Format = format(tableName)
+
+  // Fixed to BigQuery for now.
+  override def writeFormat(tableName: String): Format = BQuery(bqOptions.getProjectId)

-  lazy val bigQueryClient = BigQueryOptions.getDefaultInstance.getService
-  def readFormat(tableName: String): Format = {
+  private def format(tableName: String): Format = {

     val btTableIdentifier: TableId = BigQueryUtil.parseTableId(tableName)
-    val unshadedTI: BTableId =
-      BTableId.of(btTableIdentifier.getProject, btTableIdentifier.getDataset, btTableIdentifier.getTable)
-
-    val tableOpt = Option(bigQueryClient.getTable(unshadedTI))
-
-    tableOpt match {
-      case Some(table) => {
-        table.getDefinition match {
-          case _: ExternalTableDefinition => BQuery(unshadedTI.getProject)
-          case _: StandardTableDefinition => GCS(unshadedTI.getProject)
-        }
-      }
-      case None => Hive
-    }
+
+    val tableOpt: Option[Table] = Option(
+      bigQueryClient.getTable(btTableIdentifier.getDataset, btTableIdentifier.getTable))
+    tableOpt
+      .map((table) => {
+
+        if (table.getDefinition.isInstanceOf[ExternalTableDefinition]) {
+          val uris = table.getDefinition
+            .asInstanceOf[ExternalTableDefinition]
+            .getSourceUris
+            .asScala
+            .toList
+            .map((uri) => uri.stripSuffix("/*") + "/")
+
+          assert(uris.length == 1, s"External table ${tableName} can be backed by only one URI.")
+
+          val formatStr = table.getDefinition
+            .asInstanceOf[ExternalTableDefinition]
+            .getFormatOptions
+            .asInstanceOf[FormatOptions]
+            .getType
+
+          GCS(table.getTableId.getProject, uris.head, formatStr)
+        } else if (table.getDefinition.isInstanceOf[StandardTableDefinition]) BQuery(table.getTableId.getProject)
+        else throw new IllegalStateException(s"Cannot support table of type: ${table.getDefinition}")
+      })
+      .getOrElse(Hive)

     /**
-    Using federation
-    val tableIdentifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
-    val tableMeta = sparkSession.sessionState.catalog.getTableRawMetadata(tableIdentifier)
-    val storageProvider = tableMeta.provider
-    storageProvider match {
-      case Some("com.google.cloud.spark.bigquery") => {
-        val tableProperties = tableMeta.properties
-        val project = tableProperties
-          .get("FEDERATION_BIGQUERY_TABLE_PROPERTY")
-          .map(BigQueryUtil.parseTableId)
-          .map(_.getProject)
-          .getOrElse(throw new IllegalStateException("bigquery project required!"))
-        val bigQueryTableType = tableProperties.get("federation.bigquery.table.type")
-        bigQueryTableType.map(_.toUpperCase) match {
-          case Some("EXTERNAL") => GCS(project)
-          case Some("MANAGED") => BQuery(project)
-          case None => throw new IllegalStateException("Dataproc federation service must be available.")
-
-        }
-      }
-
-      case Some("hive") | None => Hive
-    }
+      * Using federation
+      * val tableIdentifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
+      * val tableMeta = sparkSession.sessionState.catalog.getTableRawMetadata(tableIdentifier)
+      * val storageProvider = tableMeta.provider
+      * storageProvider match {
+      *   case Some("com.google.cloud.spark.bigquery") => {
+      *     val tableProperties = tableMeta.properties
+      *     val project = tableProperties
+      *       .get("FEDERATION_BIGQUERY_TABLE_PROPERTY")
+      *       .map(BigQueryUtil.parseTableId)
+      *       .map(_.getProject)
+      *       .getOrElse(throw new IllegalStateException("bigquery project required!"))
+      *     val bigQueryTableType = tableProperties.get("federation.bigquery.table.type")
+      *     bigQueryTableType.map(_.toUpperCase) match {
+      *       case Some("EXTERNAL") => GCS(project)
+      *       case Some("MANAGED") => BQuery(project)
+      *       case None => throw new IllegalStateException("Dataproc federation service must be available.")
+      *
+      *     }
+      *
+      *   case Some("hive") | None => Hive
+      * }
      * */
-
   }
-
-  // For now, fix to BigQuery. We'll clean this up.
-  def writeFormat(tableName: String): Format = ???
 }

 case class BQuery(project: String) extends Format {
@@ -120,6 +151,13 @@ case class BQuery(project: String) extends Format {
       .option("project", project)
       .option("query", partValsSql)
       .load()
+      .select(
+        to_date(col("partition_id"),
+                "yyyyMMdd"
+        ) // Note: this "yyyyMMdd" format is hardcoded but we need to change it to be something else.
+          .as("partition_id"))
+      .na // Should filter out '__NULL__' and '__UNPARTITIONED__'. See: https://cloud.google.com/bigquery/docs/partitioned-tables#date_timestamp_partitioned_tables
+      .drop()
       .as[String]
       .collect
      .toList
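
The new `to_date`/`na.drop()` chain in the BQuery hunk above works because partition ids that are not real dates (the `__NULL__` and `__UNPARTITIONED__` sentinels referenced in the inline comment) parse to null under `to_date`, and `na.drop()` then removes those rows. A small, self-contained sketch of that behaviour under default (non-ANSI) Spark settings; the local session and sample values are illustrative, not taken from the PR:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, to_date}

object PartitionIdFilterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("partition-id-filter").getOrCreate()
    import spark.implicits._

    // A mix of real yyyyMMdd partition ids and the sentinel values a partition listing can report.
    val ids = Seq("20240101", "20240102", "__NULL__", "__UNPARTITIONED__").toDF("partition_id")

    // to_date returns null for values that don't parse with the given pattern,
    // so na.drop() leaves only the genuine date partitions.
    ids
      .select(to_date(col("partition_id"), "yyyyMMdd").as("partition_id"))
      .na
      .drop()
      .show()

    spark.stop()
  }
}
```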

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GCSFormat.scala

Lines changed: 22 additions & 41 deletions

@@ -1,53 +1,21 @@
 package ai.chronon.integrations.cloud_gcp

 import ai.chronon.spark.Format
+import org.apache.spark.sql.Encoders
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.FileSourceScanExec
 import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
-import org.apache.spark.sql.functions.col
-import org.apache.spark.sql.functions.explode
-import org.apache.spark.sql.functions.url_decode
+case class GCS(project: String, sourceUri: String, fileFormat: String) extends Format {

-case class GCS(project: String) extends Format {
-
-  override def name: String = ""
+  override def name: String = fileFormat

   override def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String])(
      implicit sparkSession: SparkSession): Seq[String] =
    super.primaryPartitions(tableName, partitionColumn, subPartitionsFilter)

   override def partitions(tableName: String)(implicit sparkSession: SparkSession): Seq[Map[String, String]] = {
-    import sparkSession.implicits._
-
-    val tableIdentifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
-    val table = tableIdentifier.table
-    val database = tableIdentifier.database.getOrElse(throw new IllegalArgumentException("database required!"))
-
-    // See: https://github.com/GoogleCloudDataproc/spark-bigquery-connector/issues/434#issuecomment-886156191
-    // and: https://cloud.google.com/bigquery/docs/information-schema-intro#limitations
-    sparkSession.conf.set("viewsEnabled", "true")
-    sparkSession.conf.set("materializationDataset", database)
-
-    // First, grab the URI location from BQ
-    val uriSQL =
-      s"""
-         |select JSON_EXTRACT_STRING_ARRAY(option_value) as option_values from `${project}.${database}.INFORMATION_SCHEMA.TABLE_OPTIONS`
-         |WHERE table_name = '${table}' and option_name = 'uris'
-         |
-         |""".stripMargin
-
-    val uris = sparkSession.read
-      .format("bigquery")
-      .option("project", project)
-      .option("query", uriSQL)
-      .load()
-      .select(explode(col("option_values")).as("option_value"))
-      .select(url_decode(col("option_value")))
-      .as[String]
-      .collect
-      .toList
-
-    assert(uris.length == 1, s"External table ${tableName} can be backed by only one URI.")

     /**
      * Given:
@@ -70,7 +38,8 @@ case class GCS(project: String) extends Format {
      *
      */
     val partitionSpec = sparkSession.read
-      .parquet(uris: _*)
+      .format(fileFormat)
+      .load(sourceUri)
      .queryExecution
      .sparkPlan
      .asInstanceOf[FileSourceScanExec]
@@ -82,16 +51,28 @@ case class GCS(project: String) extends Format {
     val partitionColumns = partitionSpec.partitionColumns
     val partitions = partitionSpec.partitions.map(_.values)

-    partitions
+    val deserializer =
+      try {
+        Encoders.row(partitionColumns).asInstanceOf[ExpressionEncoder[Row]].resolveAndBind().createDeserializer()
+      } catch {
+        case e: Exception =>
+          throw new RuntimeException(s"Failed to create deserializer for partition columns: ${e.getMessage}", e)
+      }
+
+    val roundTripped = sparkSession
+      .createDataFrame(sparkSession.sparkContext.parallelize(partitions.map(deserializer)), partitionColumns)
+      .collect
+      .toList
+
+    roundTripped
      .map((part) =>
        partitionColumns.fields.toList.zipWithIndex.map {
          case (field, idx) => {
            val fieldName = field.name
-            val fieldValue = part.get(idx, field.dataType)
+            val fieldValue = part.get(idx)
            fieldName -> fieldValue.toString // Just going to cast this as a string.
          }
        }.toMap)
-      .toList
   }

   def createTableTypeString: String = throw new UnsupportedOperationException("GCS does not support create table")
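
The GCS `partitions` rewrite above reads partition values out of Spark's file index, which hands back Catalyst `InternalRow`s; the PR round-trips them through an `ExpressionEncoder` deserializer so the values can be read as ordinary `Row`s with the correct column types. A self-contained sketch of that conversion, assuming the same `Encoders.row` call the diff itself uses (available in recent Spark releases) and a made-up single-column partition schema:

```scala
import org.apache.spark.sql.{Encoders, Row}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

object InternalRowRoundTripSketch {
  def main(args: Array[String]): Unit = {
    // Stand-in for the partition schema recovered from the FileSourceScanExec.
    val partitionColumns = StructType(Seq(StructField("ds", StringType)))

    // Catalyst stores strings as UTF8String inside an InternalRow.
    val internal: InternalRow = InternalRow.fromSeq(Seq(UTF8String.fromString("2024-01-01")))

    // Same construction as the diff: build a resolved deserializer for the schema...
    val deserializer =
      Encoders.row(partitionColumns).asInstanceOf[ExpressionEncoder[Row]].resolveAndBind().createDeserializer()

    // ...and apply it to turn the InternalRow back into an external Row.
    val row: Row = deserializer(internal)
    println(row.getString(0)) // 2024-01-01
  }
}
```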

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/test/BigQueryCatalogTest.scala

Lines changed: 30 additions & 6 deletions

@@ -6,6 +6,8 @@ import ai.chronon.spark.SparkSessionBuilder
 import ai.chronon.spark.TableUtils
 import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS
 import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
+import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration
+import com.google.cloud.hadoop.util.HadoopConfigurationProperty
 import org.apache.spark.sql.SparkSession
 import org.junit.Assert.assertEquals
 import org.junit.Assert.assertTrue
@@ -24,8 +26,7 @@ class BigQueryCatalogTest extends AnyFunSuite with MockitoSugar {
       "spark.chronon.partition.column" -> "c",
       "spark.hadoop.fs.gs.impl" -> classOf[GoogleHadoopFileSystem].getName,
       "spark.hadoop.fs.AbstractFileSystem.gs.impl" -> classOf[GoogleHadoopFS].getName,
-      "spark.hadoop.google.cloud.auth.service.account.enable" -> true.toString,
-      "spark.hadoop.fs.gs.impl" -> classOf[GoogleHadoopFileSystem].getName
+      "spark.hadoop.google.cloud.auth.service.account.enable" -> true.toString
     ))
   )
   lazy val tableUtils: TableUtils = TableUtils(spark)
@@ -34,20 +35,43 @@ class BigQueryCatalogTest extends AnyFunSuite with MockitoSugar {
     assertEquals("thrift://localhost:9083", spark.sqlContext.getConf("hive.metastore.uris"))
   }

+  test("google runtime classes are available") {
+    assertTrue(GoogleHadoopFileSystemConfiguration.BLOCK_SIZE.isInstanceOf[HadoopConfigurationProperty[Long]])
+    assertCompiles("classOf[GoogleHadoopFileSystem]")
+    assertCompiles("classOf[GoogleHadoopFS]")
+
+  }
+
   test("verify dynamic classloading of GCP providers") {
     assertTrue(tableUtils.tableReadFormat("data.sample_native") match {
       case BQuery(_) => true
       case _ => false
     })
   }

-  ignore("integration testing bigquery load table") {
+  ignore("integration testing bigquery native table") {
+    val nativeTable = "data.sample_native"
+    val table = tableUtils.loadTable(nativeTable)
+    table.show
+    val partitioned = tableUtils.isPartitioned(nativeTable)
+    println(partitioned)
+    // val database = tableUtils.createDatabase("test_database")
+    val allParts = tableUtils.allPartitions(nativeTable)
+    println(allParts)
+  }
+
+  ignore("integration testing bigquery external table") {
     val externalTable = "data.checkouts_parquet"
+
+    val bs = GoogleHadoopFileSystemConfiguration.BLOCK_SIZE
+    println(bs)
     val table = tableUtils.loadTable(externalTable)
-    tableUtils.isPartitioned(externalTable)
-    tableUtils.createDatabase("test_database")
-    tableUtils.allPartitions(externalTable)
     table.show
+    val partitioned = tableUtils.isPartitioned(externalTable)
+    println(partitioned)
+    // val database = tableUtils.createDatabase("test_database")
+    val allParts = tableUtils.allPartitions(externalTable)
+    println(allParts)
   }

   ignore("integration testing bigquery partitions") {

spark/src/main/scala/ai/chronon/spark/Driver.scala

Lines changed: 1 addition & 1 deletion

@@ -278,7 +278,7 @@ object Driver {
       val join = new Join(
         args.joinConf,
         args.endDate(),
-        args.buildTableUtils(),
+        tableUtils,
         !args.runFirstHole(),
         selectedJoinParts = args.selectedJoinParts.toOption
       )
