
Commit 3aa7369

feat: Add unit tests for GCP support (#162)
## Summary

- Adds unit tests for #147
- https://app.asana.com/0/1208949807589885/1208960391734329/f

## Checklist

- [x] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **New Features**
  - Added a new test class to validate GCS format partitioning functionality.
- **Refactor**
  - Updated the package structure for test files.
  - Removed imports made redundant by the package move.
- **Chores**
  - Added an import for a BigQuery table in the format handling.

Co-authored-by: Thomas Chow <[email protected]>
1 parent 688db65 commit 3aa7369

File tree: 4 files changed (+96 -7 lines)


cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/BigQueryFormat.scala

Lines changed: 2 additions & 1 deletion

```diff
@@ -11,7 +11,8 @@ import com.google.cloud.bigquery.Table
 import com.google.cloud.bigquery.connector.common.BigQueryUtil
 import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.TableId
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.functions.{col, to_date}
+import org.apache.spark.sql.functions.col
+import org.apache.spark.sql.functions.to_date

 import scala.collection.JavaConverters._
```
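For readers unfamiliar with these two helpers: `col` references a DataFrame column by name and `to_date` parses a string column into `DateType` values using a pattern. A standalone illustration of their behavior (the app name, sample data, and date pattern below are placeholders, not taken from BigQueryFormat.scala):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions.to_date

object ToDateExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("to-date-example").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq("20241223", "20241224").toDF("ds")
    // to_date(column, pattern) parses the string column into DateType values
    df.select(to_date(col("ds"), "yyyyMMdd").as("ds")).show()

    spark.stop()
  }
}
```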

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/test/BigQueryCatalogTest.scala
renamed to cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/BigQueryCatalogTest.scala

Lines changed: 1 addition & 3 deletions

```diff
@@ -1,7 +1,5 @@
-package ai.chronon.integrations.cloud_gcp.test
+package ai.chronon.integrations.cloud_gcp

-import ai.chronon.integrations.cloud_gcp.BQuery
-import ai.chronon.integrations.cloud_gcp.GcpFormatProvider
 import ai.chronon.spark.SparkSessionBuilder
 import ai.chronon.spark.TableUtils
 import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS
```

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/test/DataprocSubmitterTest.scala
renamed to cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitterTest.scala

Lines changed: 1 addition & 3 deletions

```diff
@@ -1,7 +1,5 @@
-package ai.chronon.integrations.cloud_gcp.test
+package ai.chronon.integrations.cloud_gcp

-import ai.chronon.integrations.cloud_gcp.DataprocSubmitter
-import ai.chronon.integrations.cloud_gcp.SubmitterConf
 import com.google.api.gax.rpc.UnaryCallable
 import com.google.cloud.dataproc.v1._
 import com.google.cloud.dataproc.v1.stub.JobControllerStub
```
cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/GCSFormatTest.scala (new file)

Lines changed: 92 additions & 0 deletions

```scala
package ai.chronon.integrations.cloud_gcp

import ai.chronon.spark.SparkSessionBuilder
import org.apache.spark.sql.Row
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.junit.Assert.assertEquals
import org.scalatest.funsuite.AnyFunSuite

import java.nio.file.Files

class GCSFormatTest extends AnyFunSuite {

  lazy val spark: SparkSession = SparkSessionBuilder.build(
    "BigQuerySparkTest",
    local = true
  )

  test("partitions method should return correctly parsed partitions as maps") {

    val testData = List(
      ("20241223", "b", "c"),
      ("20241224", "e", "f"),
      ("20241225", "h", "i")
    )

    val dir = Files.createTempDirectory("spark-test-output").toFile
    dir.deleteOnExit()

    val df = spark.createDataFrame(testData).toDF("ds", "first", "second")
    df.write.partitionBy("ds").format("parquet").mode(SaveMode.Overwrite).save(dir.getAbsolutePath)
    val gcsFormat = GCS(project = "test-project", sourceUri = dir.getAbsolutePath, fileFormat = "parquet")
    val partitions = gcsFormat.partitions("unused_table")(spark)

    assertEquals(Set(Map("ds" -> "20241223"), Map("ds" -> "20241224"), Map("ds" -> "20241225")), partitions.toSet)

  }

  test("partitions method should handle empty partitions gracefully") {

    val testData = List(
      ("20241223", "b", "c"),
      ("20241224", "e", "f"),
      ("20241225", "h", "i")
    )

    val dir = Files.createTempDirectory("spark-test-output").toFile
    dir.deleteOnExit()

    val df = spark.createDataFrame(testData).toDF("ds", "first", "second")
    df.write.format("parquet").mode(SaveMode.Overwrite).save(dir.getAbsolutePath)
    val gcsFormat = GCS(project = "test-project", sourceUri = dir.getAbsolutePath, fileFormat = "parquet")
    val partitions = gcsFormat.partitions("unused_table")(spark)

    assertEquals(Set.empty, partitions.toSet)

  }

  test("partitions method should handle date types") {
    val testData = List(
      Row("2024-12-23", "b", "c"),
      Row("2024-12-24", "e", "f"),
      Row("2024-12-25", "h", "i")
    )

    val dir = Files.createTempDirectory("spark-test-output").toFile
    dir.deleteOnExit()

    val schema = StructType(
      Seq(
        StructField("ds", StringType, nullable = true),
        StructField("first", StringType, nullable = true),
        StructField("second", StringType, nullable = true)
      ))

    val df =
      spark
        .createDataFrame(spark.sparkContext.parallelize(testData), schema)
        .toDF("ds", "first", "second")
        .select(to_date(col("ds"), "yyyy-MM-dd").as("ds"), col("first"), col("second"))
    df.write.format("parquet").partitionBy("ds").mode(SaveMode.Overwrite).save(dir.getAbsolutePath)
    val gcsFormat = GCS(project = "test-project", sourceUri = dir.getAbsolutePath, fileFormat = "parquet")
    val partitions = gcsFormat.partitions("unused_table")(spark)

    assertEquals(Set(Map("ds" -> "2024-12-23"), Map("ds" -> "2024-12-24"), Map("ds" -> "2024-12-25")), partitions.toSet)

  }
}
```
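For context on the contract these tests pin down: `GCS(project, sourceUri, fileFormat).partitions(table)(spark)` evidently reads the files under `sourceUri` and returns one `Map(partitionColumn -> value)` per Hive-style partition directory (e.g. `ds=20241223`), and an empty sequence when the data is unpartitioned. Below is a minimal sketch of that contract using a hypothetical stand-in class (`GCSSketch`); it is not the real `GCS` implementation from this PR, which may compute partitions differently (e.g. via catalog metadata):

```scala
// Hypothetical sketch only: illustrates the partitions contract exercised
// by the tests above, not the actual GCS class from this PR.
import org.apache.spark.sql.SparkSession

case class GCSSketch(project: String, sourceUri: String, fileFormat: String) {

  // Returns one Map(column -> value) per distinct Hive-style partition
  // directory (e.g. ".../ds=20241223/part-00000.parquet") under sourceUri.
  def partitions(tableName: String)(implicit spark: SparkSession): Seq[Map[String, String]] = {
    val df = spark.read.format(fileFormat).load(sourceUri)
    df.inputFiles.toSeq
      .map { path =>
        // Partition directories show up as key=value segments in file paths
        path
          .split("/")
          .collect { case seg if seg.contains("=") =>
            val Array(k, v) = seg.split("=", 2)
            k -> v
          }
          .toMap
      }
      .filter(_.nonEmpty) // unpartitioned layouts yield no key=value segments
      .distinct
  }
}
```

Under this reading, the second test passes because an unpartitioned write produces no `key=value` path segments, so the result is empty; the third passes because Spark renders `DateType` partition values as `yyyy-MM-dd` directory names.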
