@@ -9,21 +9,61 @@ import com.google.cloud.bigquery.{
  TableId
}
import com.google.cloud.spark.bigquery.BigQueryCatalog
-import org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog
import org.apache.iceberg.spark.SparkCatalog
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.catalog._
import org.apache.spark.sql.connector.catalog.functions.UnboundFunction
import org.apache.spark.sql.connector.expressions.Transform
+import org.apache.spark.sql.connector.read.ScanBuilder
+import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
+import org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog

import java.util
import scala.jdk.CollectionConverters._
-import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
-import scala.util.{Failure, Success, Try}
+import scala.util.Try
+
+/** A table that delegates all operations to an internal table, but with additional properties.
+  * This is mostly for enriching SparkTables with metadata that cannot be accessed by spark directly.
+  * For example, we can use a bigquery client to fetch table metadata / properties and then hydrate the Spark table
+  * with that information, before we pass it back to the Spark compute engine.
+  *
+  * Down the line, we could also support custom partition management.
+  */
+class DelegatingTable(internalTable: Table,
+                      additionalProperties: Map[String, String],
+                      partitioning: Option[Array[Transform]] = None)
+    extends Table
+    with SupportsRead
+    with SupportsWrite {
+
+  override def name(): String = internalTable.name
+
+  override def schema(): StructType = internalTable.schema
+
+  override def capabilities(): util.Set[TableCapability] = internalTable.capabilities()
+
+  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder =
+    internalTable.asInstanceOf[SupportsRead].newScanBuilder(options)
+
+  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder =
+    internalTable.asInstanceOf[SupportsWrite].newWriteBuilder(info)
+
+  override def properties(): util.Map[String, String] =
+    (internalTable.properties().asScala ++ additionalProperties).asJava
+
+  override def partitioning(): Array[Transform] = partitioning.getOrElse(internalTable.partitioning())
+
+}
+
+object DelegatingTable {
+  def apply(table: Table, additionalProperties: Map[String, String] = Map.empty): Table =
+    new DelegatingTable(table, additionalProperties = additionalProperties)
+}
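
For illustration only (not part of this diff), a minimal usage sketch of the wrapper; `someLoadedTable` is a stand-in for any Spark `Table` obtained from an underlying catalog:

    // Hypothetical usage: wrap an existing table and surface one extra property.
    val enriched: Table = DelegatingTable(someLoadedTable,
                                          additionalProperties = Map(TableCatalog.PROP_PROVIDER -> "ICEBERG"))
    // Reads and writes still go through the wrapped table; only properties() is augmented.
    assert(enriched.properties().get(TableCatalog.PROP_PROVIDER) == "ICEBERG")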

/** Galactus catalog that allows us to interact with BigQuery metastore as a spark catalog. This allows for
  * querying of a variety of table types directly in spark sql or the dataframe api.
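
For context (illustrative only; the catalog name and implementation class path below are placeholders, not taken from this change), such a catalog is registered through Spark's standard `spark.sql.catalog.<name>` configuration and then queried directly:

    // Hypothetical wiring: "bq_metastore" and the FQCN are placeholders.
    val spark = SparkSession
      .builder()
      .appName("delegating-bq-catalog-example")
      .config("spark.sql.catalog.bq_metastore", "com.example.DelegatingBigQueryMetastoreCatalog")
      .getOrCreate()

    // Iceberg, external Parquet, and native BigQuery tables can all be read with the same syntax.
    spark.sql("SELECT * FROM bq_metastore.my_dataset.my_table LIMIT 10").show()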
@@ -78,10 +118,13 @@ class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNames
  override def loadTable(identNoCatalog: Identifier): Table = {
    Try {
-      icebergCatalog.loadTable(identNoCatalog)
+      val icebergSparkTable = icebergCatalog.loadTable(identNoCatalog)
+      DelegatingTable(icebergSparkTable,
+                      additionalProperties =
+                        Map(TableCatalog.PROP_EXTERNAL -> "false", TableCatalog.PROP_PROVIDER -> "ICEBERG"))
    }
      .recover {
-        case noIcebergTableEx: NoSuchTableException => {
+        case _ => {
          val project =
            catalogProps.getOrElse(BigQueryMetastoreCatalog.PROPERTIES_KEY_GCP_PROJECT, bqOptions.getProjectId)
          val tId = identNoCatalog.namespace().toList match {
@@ -91,9 +134,7 @@ class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNames
              throw new IllegalArgumentException(
                s"Table identifier namespace ${identNoCatalog} must have at least one part.")
          }
-          val table = scala
-            .Option(bigQueryClient.getTable(tId))
-            .getOrElse(throw new NoSuchTableException(s"BigQuery table $identNoCatalog not found."))
+          val table = bigQueryClient.getTable(tId)
          table.getDefinition.asInstanceOf[TableDefinition] match {
            case externalTable: ExternalTableDefinition => {
              val uris = externalTable.getSourceUris.asScala
@@ -105,36 +146,33 @@ class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNames
                  uris.head.replaceAll("/\\*\\.parquet$", "")
                }

-              val fileBasedTable = ParquetTable(
-                tId.toString,
-                SparkSession.active,
-                new CaseInsensitiveStringMap(
-                  Map(TableCatalog.PROP_EXTERNAL -> "true",
-                      TableCatalog.PROP_LOCATION -> uri,
-                      TableCatalog.PROP_PROVIDER -> "PARQUET").asJava),
-                List(uri),
-                None,
-                classOf[ParquetFileFormat]
-              )
-              fileBasedTable
+              val fileBasedTable = ParquetTable(tId.toString,
+                                                SparkSession.active,
+                                                CaseInsensitiveStringMap.empty(),
+                                                List(uri),
+                                                None,
+                                                classOf[ParquetFileFormat])
+              DelegatingTable(fileBasedTable,
+                              Map(TableCatalog.PROP_EXTERNAL -> "true",
+                                  TableCatalog.PROP_LOCATION -> uri,
+                                  TableCatalog.PROP_PROVIDER -> "PARQUET"))
            }
            case _: StandardTableDefinition => {
              // todo(tchow): Support partitioning

              // Hack because there's a bug in the BigQueryCatalog where they ignore the projectId.
              // See: https://github.com/GoogleCloudDataproc/spark-bigquery-connector/pull/1340
+              val connectorTable = connectorCatalog.loadTable(Identifier.of(Array(tId.getDataset), tId.getTable))
              // ideally it should be the below:
              // val connectorTable = connectorCatalog.loadTable(ident)
-              connectorCatalog.loadTable(Identifier.of(Array(tId.getDataset), tId.getTable))
+              DelegatingTable(connectorTable,
+                              Map(TableCatalog.PROP_EXTERNAL -> "false", TableCatalog.PROP_PROVIDER -> "BIGQUERY"))
            }
-            case _ => throw new IllegalStateException(s"Cannot support table of type: ${table.getDefinition}")
+            case _ => throw new IllegalStateException(s"Cannot support table of type: ${table.getFriendlyName}")
          }
        }
-        case other: Throwable => throw other
-      } match {
-      case Success(table) => table
-      case Failure(exception) => throw exception
-    }
+      }
+      .getOrElse(throw new NoSuchTableException(f"Table: ${identNoCatalog} not found in bigquery catalog."))
  }

  override def createTable(ident: Identifier,