@@ -4,9 +4,9 @@ import ai.chronon.spark.Format
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
-import org.apache.spark.sql.functions.col
-import org.apache.spark.sql.functions.explode
-import org.apache.spark.sql.functions.url_decode
+import org.apache.spark.sql.Encoders
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.Row

case class GCS(project: String, sourceUri: String, format: String) extends Format {

@@ -17,37 +17,6 @@ case class GCS(project: String, sourceUri: String, format: String) extends Format
    super.primaryPartitions(tableName, partitionColumn, subPartitionsFilter)

  override def partitions(tableName: String)(implicit sparkSession: SparkSession): Seq[Map[String, String]] = {
-    import sparkSession.implicits._
-
-    val tableIdentifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
-    val table = tableIdentifier.table
-    val database = tableIdentifier.database.getOrElse(throw new IllegalArgumentException("database required!"))
-
-    // See: https://github.com/GoogleCloudDataproc/spark-bigquery-connector/issues/434#issuecomment-886156191
-    // and: https://cloud.google.com/bigquery/docs/information-schema-intro#limitations
-    sparkSession.conf.set("viewsEnabled", "true")
-    sparkSession.conf.set("materializationDataset", database)
-
-    // First, grab the URI location from BQ
-    val uriSQL =
-      s"""
-         |select JSON_EXTRACT_STRING_ARRAY(option_value) as option_values from `${project}.${database}.INFORMATION_SCHEMA.TABLE_OPTIONS`
-         |WHERE table_name = '${table}' and option_name = 'uris'
-         |
-         |""".stripMargin
-
-    val uris = sparkSession.read
-      .format("bigquery")
-      .option("project", project)
-      .option("query", uriSQL)
-      .load()
-      .select(explode(col("option_values")).as("option_value"))
-      .select(url_decode(col("option_value")))
-      .as[String]
-      .collect
-      .toList
-
-    assert(uris.length == 1, s"External table ${tableName} can be backed by only one URI.")

    /**
      * Given:
@@ -70,7 +39,7 @@ case class GCS(project: String, sourceUri: String, format: String) extends Format
      *
      */
    val partitionSpec = sparkSession.read
-      .parquet(uris: _*)
+      .parquet(sourceUri)
      .queryExecution
      .sparkPlan
      .asInstanceOf[FileSourceScanExec]
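
Side note: the plan-introspection trick above can be exercised on its own. Below is a minimal sketch, not part of this diff, assuming Spark 3.x and a made-up local path laid out like a Hive-partitioned table (.../ds=2024-01-01/part-*.parquet):

// Sketch only: recover partition columns and values from a bare parquet read.
// The path is illustrative; any directory with key=value partition folders works.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// With no projection or filter, the sparkPlan of a plain parquet read is a
// FileSourceScanExec, and its relation.location is a PartitioningAwareFileIndex.
val scan = spark.read
  .parquet("/tmp/example_partitioned_parquet")
  .queryExecution
  .sparkPlan
  .asInstanceOf[FileSourceScanExec]

// partitionSpec() carries the discovered partition schema plus the per-directory
// partition values, which Spark stores as Catalyst InternalRows.
val spec = scan.relation.location.asInstanceOf[PartitioningAwareFileIndex].partitionSpec()
println(spec.partitionColumns)         // e.g. StructType(StructField(ds,StringType,true))
println(spec.partitions.map(_.values)) // InternalRows, one per partition directory
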
@@ -82,16 +51,23 @@ case class GCS(project: String, sourceUri: String, format: String) extends Format
    val partitionColumns = partitionSpec.partitionColumns
    val partitions = partitionSpec.partitions.map(_.values)

-    partitions
+    val deserializer =
+      Encoders.row(partitionColumns).asInstanceOf[ExpressionEncoder[Row]].resolveAndBind().createDeserializer()
+
+    val roundTripped = sparkSession
+      .createDataFrame(sparkSession.sparkContext.parallelize(partitions.map(deserializer)), partitionColumns)
+      .collect
+      .toList
+
+    roundTripped
      .map((part) =>
        partitionColumns.fields.toList.zipWithIndex.map {
          case (field, idx) => {
            val fieldName = field.name
-            val fieldValue = part.get(idx, field.dataType)
+            val fieldValue = part.get(idx)
            fieldName -> fieldValue.toString // Just going to cast this as a string.
          }
        }.toMap)
-      .toList
  }

  def createTableTypeString: String = throw new UnsupportedOperationException("GCS does not support create table")
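
The Encoders.row round-trip added above is what turns the Catalyst InternalRows held by the partition spec into external Rows with plain JVM values (java.lang.String instead of UTF8String), which is why the untyped part.get(idx) followed by toString is safe afterwards. A standalone sketch of just that conversion, assuming Spark 3.5+ (where Encoders.row exists) and an illustrative two-column schema:

// Sketch only: deserialize InternalRows into external Rows via an ExpressionEncoder.
import org.apache.spark.sql.{Encoders, Row}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

// Stand-in for partitionSpec.partitionColumns.
val schema = StructType(Seq(StructField("ds", StringType), StructField("region", StringType)))

// Partition values come out of the file index as InternalRows holding UTF8String.
val internal: Seq[InternalRow] =
  Seq(InternalRow(UTF8String.fromString("2024-01-01"), UTF8String.fromString("us")))

// Encoders.row yields an ExpressionEncoder[Row]; resolveAndBind it before asking
// for a deserializer, which is a function InternalRow => Row.
val deserializer =
  Encoders.row(schema).asInstanceOf[ExpressionEncoder[Row]].resolveAndBind().createDeserializer()

val rows: Seq[Row] = internal.map(deserializer)
rows.foreach(r => println(s"ds=${r.get(0)}, region=${r.get(1)}")) // plain Strings now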