@@ -9,61 +9,21 @@ import com.google.cloud.bigquery.{
  TableId
}
import com.google.cloud.spark.bigquery.BigQueryCatalog
+ import org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog
import org.apache.iceberg.spark.SparkCatalog
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.catalog._
import org.apache.spark.sql.connector.catalog.functions.UnboundFunction
import org.apache.spark.sql.connector.expressions.Transform
- import org.apache.spark.sql.connector.read.ScanBuilder
- import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
- import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
- import org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog

import java.util
import scala.jdk.CollectionConverters._
- import scala.util.Try
-
- /** A table that delegates all operations to an internal table, but with additional properties.
-   * This is mostly for enriching SparkTables with metadata that cannot be accessed by spark directly.
-   * For example, we can use a bigquery client to fetch table metadata / properties and then hydrate the Spark table
-   * with that information, before we pass it back to the Spark compute engine.
-   *
-   * Down the line, we could also support custom partition management.
-   */
- class DelegatingTable(internalTable: Table,
-                       additionalProperties: Map[String, String],
-                       partitioning: Option[Array[Transform]] = None)
-     extends Table
-     with SupportsRead
-     with SupportsWrite {
-
-   override def name(): String = internalTable.name
-
-   override def schema(): StructType = internalTable.schema
-
-   override def capabilities(): util.Set[TableCapability] = internalTable.capabilities()
-
-   override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder =
-     internalTable.asInstanceOf[SupportsRead].newScanBuilder(options)
-
-   override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder =
-     internalTable.asInstanceOf[SupportsWrite].newWriteBuilder(info)
-
-   override def properties(): util.Map[String, String] =
-     (internalTable.properties().asScala ++ additionalProperties).asJava
-
-   override def partitioning(): Array[Transform] = partitioning.getOrElse(internalTable.partitioning())
-
- }
-
- object DelegatingTable {
-   def apply(table: Table, additionalProperties: Map[String, String] = Map.empty): Table =
-     new DelegatingTable(table, additionalProperties = additionalProperties)
- }
+ import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
+ import scala.util.{Failure, Success, Try}

/** Galactus catalog that allows us to interact with BigQuery metastore as a spark catalog. This allows for
  * querying of a variety of table types directly in spark sql or the dataframe api.
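
For orientation (not part of this change): a minimal sketch of how a Spark V2 catalog such as this one is typically registered and queried. The catalog name "bq_catalog", the fully qualified class name, the "gcp_project" option key, and the dataset/table names below are assumptions for illustration only, not taken from this PR.

import org.apache.spark.sql.SparkSession

// Hypothetical wiring of the catalog into a SparkSession; names and keys are assumed.
val spark = SparkSession
  .builder()
  .appName("bq-metastore-catalog-demo")
  // Spark's standard V2 catalog plugin registration; the package prefix is an assumption.
  .config("spark.sql.catalog.bq_catalog", "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog")
  // Options under the catalog prefix are forwarded to the catalog; this key mirrors
  // BigQueryMetastoreCatalog.PROPERTIES_KEY_GCP_PROJECT and its value here is a placeholder.
  .config("spark.sql.catalog.bq_catalog.gcp_project", "my-gcp-project")
  .getOrCreate()

// Iceberg, external Parquet, and native BigQuery tables all resolve through loadTable below.
spark.sql("SELECT * FROM bq_catalog.my_dataset.my_table LIMIT 10").show()
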
@@ -118,13 +78,10 @@ class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNames
  override def loadTable(identNoCatalog: Identifier): Table = {
    Try {
-     val icebergSparkTable = icebergCatalog.loadTable(identNoCatalog)
-     DelegatingTable(icebergSparkTable,
-                     additionalProperties =
-                       Map(TableCatalog.PROP_EXTERNAL -> "false", TableCatalog.PROP_PROVIDER -> "ICEBERG"))
+     icebergCatalog.loadTable(identNoCatalog)
    }
      .recover {
-       case _ => {
+       case noIcebergTableEx: NoSuchTableException => {
          val project =
            catalogProps.getOrElse(BigQueryMetastoreCatalog.PROPERTIES_KEY_GCP_PROJECT, bqOptions.getProjectId)
          val tId = identNoCatalog.namespace().toList match {
@@ -134,7 +91,9 @@ class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNames
            throw new IllegalArgumentException(
              s"Table identifier namespace ${identNoCatalog} must have at least one part.")
          }
-         val table = bigQueryClient.getTable(tId)
+         val table = scala
+           .Option(bigQueryClient.getTable(tId))
+           .getOrElse(throw new NoSuchTableException(s"BigQuery table $identNoCatalog not found."))
          table.getDefinition.asInstanceOf[TableDefinition] match {
            case externalTable: ExternalTableDefinition => {
              val uris = externalTable.getSourceUris.asScala
@@ -146,33 +105,36 @@ class DelegatingBigQueryMetastoreCatalog extends TableCatalog with SupportsNames
                  uris.head.replaceAll("/\\*\\.parquet$", "")
                }

-             val fileBasedTable = ParquetTable(tId.toString,
-                                               SparkSession.active,
-                                               CaseInsensitiveStringMap.empty(),
-                                               List(uri),
-                                               None,
-                                               classOf[ParquetFileFormat])
-             DelegatingTable(fileBasedTable,
-                             Map(TableCatalog.PROP_EXTERNAL -> "true",
-                                 TableCatalog.PROP_LOCATION -> uri,
-                                 TableCatalog.PROP_PROVIDER -> "PARQUET"))
+             val fileBasedTable = ParquetTable(
+               tId.toString,
+               SparkSession.active,
+               new CaseInsensitiveStringMap(
+                 Map(TableCatalog.PROP_EXTERNAL -> "true",
+                     TableCatalog.PROP_LOCATION -> uri,
+                     TableCatalog.PROP_PROVIDER -> "PARQUET").asJava),
+               List(uri),
+               None,
+               classOf[ParquetFileFormat]
+             )
+             fileBasedTable
            }
            case _: StandardTableDefinition => {
              // todo(tchow): Support partitioning

              // Hack because there's a bug in the BigQueryCatalog where they ignore the projectId.
              // See: https://github.com/GoogleCloudDataproc/spark-bigquery-connector/pull/1340
-             val connectorTable = connectorCatalog.loadTable(Identifier.of(Array(tId.getDataset), tId.getTable))
              // ideally it should be the below:
              // val connectorTable = connectorCatalog.loadTable(ident)
-             DelegatingTable(connectorTable,
-                             Map(TableCatalog.PROP_EXTERNAL -> "false", TableCatalog.PROP_PROVIDER -> "BIGQUERY"))
+             connectorCatalog.loadTable(Identifier.of(Array(tId.getDataset), tId.getTable))
            }
-           case _ => throw new IllegalStateException(s"Cannot support table of type: ${table.getFriendlyName}")
+           case _ => throw new IllegalStateException(s"Cannot support table of type: ${table.getDefinition}")
          }
        }
-     }
-       .getOrElse(throw new NoSuchTableException(f"Table: ${identNoCatalog} not found in bigquery catalog."))
+       case other: Throwable => throw other
+     } match {
+       case Success(table) => table
+       case Failure(exception) => throw exception
+     }
  }

  override def createTable(ident: Identifier,