package ai.chronon.integrations.cloud_gcp

+import ai.chronon.api.Extensions.StringOps
+import ai.chronon.api.ScalaJavaConversions.JListOps
+import ai.chronon.spark.TableUtils
+import ai.chronon.spark.TableUtils.{TableCreatedWithInitialData, TableCreationStatus}
import ai.chronon.spark.format.Format
-import org.apache.spark.sql.Encoders
-import org.apache.spark.sql.Row
-import org.apache.spark.sql.SparkSession
+import com.google.cloud.bigquery.connector.common.BigQueryUtil
+import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.{
+  BigQuery,
+  BigQueryOptions,
+  ExternalTableDefinition,
+  FormatOptions,
+  HivePartitioningOptions,
+  TableInfo
+}
+import com.google.cloud.spark.bigquery.{SchemaConverters, SchemaConvertersConfiguration}
+import org.apache.spark.sql.{DataFrame, Encoders, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
+import org.slf4j.LoggerFactory

case class GCS(sourceUri: String, fileFormat: String) extends Format {

+  private lazy val logger = LoggerFactory.getLogger(this.getClass.getName)
+
+  private lazy val bqOptions = BigQueryOptions.getDefaultInstance
+  lazy val bigQueryClient: BigQuery = bqOptions.getService
+
  override def name: String = fileFormat

  override def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String])(
@@ -72,6 +90,62 @@ case class GCS(sourceUri: String, fileFormat: String) extends Format {
      }.toMap)
  }

+  override def generateTableBuilder(df: DataFrame,
+                                    tableName: String,
+                                    partitionColumns: Seq[String],
+                                    tableProperties: Map[String, String],
+                                    fileFormat: String): (String => Unit) => TableCreationStatus = {
+
+    def inner(df: DataFrame, tableName: String, partitionColumns: Seq[String])(sqlEvaluator: String => Unit) = {
+
+      // See: https://cloud.google.com/bigquery/docs/partitioned-tables#limitations
+      // "BigQuery does not support partitioning by multiple columns. Only one column can be used to partition a table."
+      require(partitionColumns.size < 2,
+              s"BigQuery only supports at most one partition column, incoming spec: ${partitionColumns}")
+
+      val shadedTableId = BigQueryUtil.parseTableId(tableName)
+
+      val writePrefix = TableUtils(df.sparkSession).writePrefix
+      require(writePrefix.nonEmpty, "Please set conf 'spark.chronon.table_write.prefix' pointing to a data bucket.")
+
+      val path = writePrefix.get + tableName.sanitize + "/"
+      val dataGlob = path + "*"
+
+      logger.info(s"""
+                     |table source uri: $dataGlob
+                     |partition uri: $path
+                     |""".stripMargin)
+
+      df.write
+        .partitionBy(partitionColumns: _*)
+        .mode("overwrite") // replace any existing data under the table's write prefix
+        .parquet(path)
+
+      val baseTableDef = ExternalTableDefinition
+        .newBuilder(dataGlob, FormatOptions.parquet())
+        .setAutodetect(true)
+
+      if (partitionColumns.nonEmpty) {
+        val hivePartitioningOptions = HivePartitioningOptions
+          .newBuilder()
+          .setFields(partitionColumns.toJava)
+          .setSourceUriPrefix(path)
+          .setMode("STRINGS")
+          .build()
+        baseTableDef.setHivePartitioningOptions(hivePartitioningOptions)
+      }
+
+      val tableInfo = TableInfo.newBuilder(shadedTableId, baseTableDef.build).build()
+      val createdTable = bigQueryClient.create(tableInfo) // register the external table with BigQuery
+
+      logger.info(s"Created external table ${createdTable.getTableId}")
+
+      TableCreatedWithInitialData
+    }
+
+    inner(df, tableName, partitionColumns)
+  }
+
  def createTableTypeString: String = throw new UnsupportedOperationException("GCS does not support create table")

  def fileFormatString(format: String): String = ""
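For reference, the builder returned by generateTableBuilder is curried: the first call captures the DataFrame and table spec, and the resulting closure still takes a SQL evaluator, which this GCS implementation ignores because the external table is registered directly through the BigQuery client rather than via a CREATE TABLE statement. A minimal caller-side sketch, assuming a SparkSession named spark, a GCS instance gcsFormat, and a partitioned DataFrame df (hypothetical names, not part of this change):

    import ai.chronon.spark.TableUtils.{TableCreatedWithInitialData, TableCreationStatus}

    // Partially apply: capture the data, target table, and partition spec.
    val builder: (String => Unit) => TableCreationStatus =
      gcsFormat.generateTableBuilder(
        df,
        tableName = "my_project.my_dataset.my_table", // hypothetical BigQuery table id
        partitionColumns = Seq("ds"),
        tableProperties = Map.empty,
        fileFormat = "parquet"
      )

    // Supply the SQL evaluator; the GCS implementation ignores it and returns
    // TableCreatedWithInitialData after writing parquet and creating the external table.
    val status: TableCreationStatus = builder(sql => spark.sql(sql))
    assert(status == TableCreatedWithInitialData)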