Commit 2d9ac20

use json4s pinned to the spark version
Co-authored-by: Thomas Chow <[email protected]>
1 parent a850609 commit 2d9ac20

7 files changed: +90 -62 lines

build.sbt

Lines changed: 11 additions & 8 deletions

@@ -190,8 +190,10 @@ lazy val spark = project
     libraryDependencies += "com.google.guava" % "guava" % "33.3.1-jre",
     libraryDependencies ++= log4j2,
     libraryDependencies ++= delta.map(_ % "provided"),
-    libraryDependencies += "io.circe" %% "circe-yaml" % "1.15.0",
-    libraryDependencies ++= circe
+    libraryDependencies += "org.json4s" % "json4s-jackson_2.12" % "3.7.0-M11",
+    libraryDependencies += "org.json4s" %% "json4s-native" % "3.7.0-M11", // Use json4s-native or json4s-jackson
+    libraryDependencies += "org.json4s" %% "json4s-core" % "3.7.0-M11",
+    libraryDependencies += "org.yaml" % "snakeyaml" % "2.3"
   )

 lazy val flink = project
@@ -213,16 +215,17 @@ lazy val cloud_gcp = project
     libraryDependencies += "com.google.cloud" % "google-cloud-bigquery" % "2.42.0",
     libraryDependencies += "com.google.cloud" % "google-cloud-bigtable" % "2.41.0",
     libraryDependencies += "com.google.cloud" % "google-cloud-pubsub" % "1.131.0",
-    libraryDependencies += "com.google.cloud" % "google-cloud-dataproc" % "4.51.0",
-    libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "3.0.3", // it's what's on the cluster
+    libraryDependencies += "com.google.cloud" % "google-cloud-dataproc" % "4.52.0",
     libraryDependencies += "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop3-2.2.26",
     libraryDependencies += "com.google.cloud.bigdataoss" % "gcsio" % "3.0.3", // need it for https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcsio/src/main/java/com/google/cloud/hadoop/gcsio/GoogleCloudStorageFileSystem.java
     libraryDependencies += "com.google.cloud.bigdataoss" % "util-hadoop" % "3.0.0", // need it for https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/util-hadoop/src/main/java/com/google/cloud/hadoop/util/HadoopConfigurationProperty.java
-    libraryDependencies += "io.circe" %% "circe-yaml" % "1.15.0",
-    libraryDependencies += "com.google.cloud.spark" %% s"spark-bigquery-with-dependencies" % "0.41.0",
-    libraryDependencies += "com.google.cloud.spark.bigtable" %% "spark-bigtable" % "0.2.1",
+    libraryDependencies += "com.google.cloud.spark" %% "spark-bigquery-with-dependencies" % "0.41.0",
+    libraryDependencies += "org.json4s" % "json4s-jackson_2.12" % "3.7.0-M11",
+    libraryDependencies += "org.json4s" %% "json4s-native" % "3.7.0-M11", // Use json4s-native or json4s-jackson
+    libraryDependencies += "org.json4s" %% "json4s-core" % "3.7.0-M11",
+    libraryDependencies += "org.yaml" % "snakeyaml" % "2.3",
+    // libraryDependencies += "com.google.cloud.spark.bigtable" %% "spark-bigtable" % "0.2.1",
     libraryDependencies += "com.google.cloud.bigtable" % "bigtable-hbase-2.x" % "2.14.2",
-    libraryDependencies ++= circe,
     libraryDependencies ++= avro,
     libraryDependencies ++= spark_all_provided,
     dependencyOverrides ++= jackson,

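The same json4s and snakeyaml coordinates are repeated in both modules above. A minimal sketch of how the pins could be centralized in build.sbt, assuming a shared version val set to whatever json4s the target Spark release bundles (the names json4sVersion and yamlDeps are illustrative, not part of this commit):

  // Sketch only: keep every module on the json4s version the Spark runtime already ships.
  lazy val json4sVersion = "3.7.0-M11" // assumed to match the json4s bundled with the target Spark

  lazy val yamlDeps = Seq(
    "org.json4s" %% "json4s-core" % json4sVersion,
    "org.json4s" %% "json4s-jackson" % json4sVersion,
    "org.json4s" %% "json4s-native" % json4sVersion,
    "org.yaml" % "snakeyaml" % "2.3"
  )

  // usage inside a module definition:
  // lazy val spark = project.settings(libraryDependencies ++= yamlDeps, ...)
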
cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitter.scala

Lines changed: 18 additions & 12 deletions

@@ -3,8 +3,9 @@ import ai.chronon.spark.JobAuth
 import ai.chronon.spark.JobSubmitter
 import com.google.api.gax.rpc.ApiException
 import com.google.cloud.dataproc.v1._
-import io.circe.generic.auto._
-import io.circe.yaml.parser
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
+import org.yaml.snakeyaml.Yaml

 import scala.io.Source

@@ -88,16 +89,21 @@ object DataprocSubmitter {
   }

   private[cloud_gcp] def loadConfig: SubmitterConf = {
-    val is = getClass.getClassLoader.getResourceAsStream("dataproc-submitter-conf.yaml")
-    val confStr = Source.fromInputStream(is).mkString
-    val res: Either[io.circe.Error, SubmitterConf] = parser
-      .parse(confStr)
-      .flatMap(_.as[SubmitterConf])
-    res match {
-
-      case Right(v) => v
-      case Left(e) => throw e
-    }
+    val isO = Option(getClass.getClassLoader.getResourceAsStream("dataproc-submitter-conf.yaml"))
+    val yamlLoader = new Yaml()
+    implicit val formats: Formats = DefaultFormats
+    isO
+      .map(Source.fromInputStream)
+      .map((is) =>
+        try { is.mkString }
+        finally { is.close })
+      .map(yamlLoader.load(_).asInstanceOf[java.util.Map[String, Any]])
+      .map((jMap) => Extraction.decompose(jMap.asScala.toMap))
+      .map((jVal) => render(jVal))
+      .map(compact)
+      .map(parse(_).extract[SubmitterConf])
+      .getOrElse(throw new IllegalArgumentException(s"Yaml conf not found or invalid yaml"))
+
   }
 }

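For reference, a self-contained sketch of the pattern the new loadConfig follows: SnakeYAML parses the classpath resource into a java.util.Map, json4s decomposes the Scala view of that map into a JValue, a render/compact/parse round trip normalizes it, and extract binds it to the target case class. The names here (YamlConf, YamlConfLoader, loadYamlResource) are illustrative; SubmitterConf plays the target role in the real code.

  import org.json4s._
  import org.json4s.jackson.JsonMethods._
  import org.yaml.snakeyaml.Yaml

  import scala.collection.JavaConverters._
  import scala.io.Source

  // Illustrative target type; in DataprocSubmitter the equivalent is SubmitterConf.
  case class YamlConf(projectId: String, region: String, clusterName: String)

  object YamlConfLoader {
    implicit val formats: Formats = DefaultFormats

    // Load a YAML classpath resource and bind it to T via json4s.
    def loadYamlResource[T: Manifest](resource: String): T =
      Option(getClass.getClassLoader.getResourceAsStream(resource))
        .map(Source.fromInputStream)
        .map(src => try src.mkString finally src.close())
        .map(new Yaml().load(_).asInstanceOf[java.util.Map[String, Any]])
        .map(jMap => Extraction.decompose(jMap.asScala.toMap)) // java Map -> Scala Map -> JValue
        .map(jVal => parse(compact(render(jVal))))             // normalize via a JSON round trip
        .map(_.extract[T])
        .getOrElse(throw new IllegalArgumentException(s"Yaml conf not found or invalid yaml: $resource"))
  }

Usage would look like YamlConfLoader.loadYamlResource[YamlConf]("dataproc-submitter-conf.yaml"), mirroring what loadConfig does with SubmitterConf.
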
cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/BigQueryCatalogTest.scala

Lines changed: 3 additions & 1 deletion

@@ -5,12 +5,13 @@ import ai.chronon.spark.TableUtils
 import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS
 import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
 import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration
-import com.google.cloud.hadoop.util.HadoopConfigurationProperty
 import org.apache.spark.sql.SparkSession
 import org.junit.Assert.assertEquals
 import org.junit.Assert.assertTrue
 import org.scalatest.funsuite.AnyFunSuite
 import org.scalatestplus.mockito.MockitoSugar
+import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem
+import com.google.cloud.hadoop.fs.gcs.HadoopConfigurationProperty

 class BigQueryCatalogTest extends AnyFunSuite with MockitoSugar {

@@ -37,6 +38,7 @@ class BigQueryCatalogTest extends AnyFunSuite with MockitoSugar {
     assertTrue(GoogleHadoopFileSystemConfiguration.BLOCK_SIZE.isInstanceOf[HadoopConfigurationProperty[Long]])
     assertCompiles("classOf[GoogleHadoopFileSystem]")
     assertCompiles("classOf[GoogleHadoopFS]")
+    assertCompiles("classOf[GoogleCloudStorageFileSystem]")

   }

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/BigTableKVStoreTest.scala

Lines changed: 23 additions & 15 deletions

@@ -52,13 +52,15 @@ class BigTableKVStoreTest {
   @Before
   def setup(): Unit = {
     // Configure settings to use emulator
-    val dataSettings = BigtableDataSettings.newBuilderForEmulator(bigtableEmulator.getPort)
+    val dataSettings = BigtableDataSettings
+      .newBuilderForEmulator(bigtableEmulator.getPort)
       .setProjectId(projectId)
       .setInstanceId(instanceId)
       .setCredentialsProvider(NoCredentialsProvider.create())
       .build()

-    val adminSettings = BigtableTableAdminSettings.newBuilderForEmulator(bigtableEmulator.getPort)
+    val adminSettings = BigtableTableAdminSettings
+      .newBuilderForEmulator(bigtableEmulator.getPort)
       .setProjectId(projectId)
       .setInstanceId(instanceId)
       .setCredentialsProvider(NoCredentialsProvider.create())
@@ -153,11 +155,10 @@ class BigTableKVStoreTest {
     val kvStore = new BigTableKVStoreImpl(dataClient, adminClient)
     kvStore.create(dataset)

-    val putReqs = (0 until 100).map {
-      i =>
-        val key = s"key-$i"
-        val value = s"""{"name": "name-$i", "age": $i}"""
-        PutRequest(key.getBytes, value.getBytes, dataset, None)
+    val putReqs = (0 until 100).map { i =>
+      val key = s"key-$i"
+      val value = s"""{"name": "name-$i", "age": $i}"""
+      PutRequest(key.getBytes, value.getBytes, dataset, None)
     }

     val putResults = Await.result(kvStore.multiPut(putReqs), 1.second)
@@ -185,7 +186,9 @@ class BigTableKVStoreTest {

     // lets collect all the keys and confirm we got everything
     val allKeys = (listValues1 ++ listValues2).map(v => new String(v.keyBytes, StandardCharsets.UTF_8))
-    allKeys.toSet shouldBe putReqs.map(r => new String(buildRowKey(r.keyBytes, r.dataset), StandardCharsets.UTF_8)).toSet
+    allKeys.toSet shouldBe putReqs
+      .map(r => new String(buildRowKey(r.keyBytes, r.dataset), StandardCharsets.UTF_8))
+      .toSet
   }

   @Test
@@ -227,7 +230,8 @@ class BigTableKVStoreTest {

     when(mockDataClient.readRowsCallable()).thenReturn(serverStreamingCallable)
     when(serverStreamingCallable.all()).thenReturn(unaryCallable)
-    val failedFuture = ApiFutures.immediateFailedFuture[util.List[Row]](new RuntimeException("some BT exception on read"))
+    val failedFuture =
+      ApiFutures.immediateFailedFuture[util.List[Row]](new RuntimeException("some BT exception on read"))
     when(unaryCallable.futureCall(any[Query])).thenReturn(failedFuture)

     val getResult = Await.result(kvStoreWithMocks.multiGet(Seq(getReq1, getReq2)), 1.second)
@@ -323,11 +327,15 @@ class BigTableKVStoreTest {
     val getResult1 = Await.result(kvStore.multiGet(Seq(getRequest1)), 1.second)
     getResult1.size shouldBe 1
     // we expect results to only cover the time range where we have data
-    val expectedTimeSeriesPoints = (queryStartsTs until dataEndTs by 1.hour.toMillis).toSeq
+    val expectedTimeSeriesPoints = (queryStartsTs until dataEndTs by 1.hour.toMillis).toSeq
     validateTimeSeriesValueExpectedPayload(getResult1.head, expectedTimeSeriesPoints, fakePayload)
   }

-  private def writeGeneratedTimeSeriesData(kvStore: BigTableKVStoreImpl, dataset: String, key: String, tsRange: Seq[Long], payload: String): Unit = {
+  private def writeGeneratedTimeSeriesData(kvStore: BigTableKVStoreImpl,
+                                           dataset: String,
+                                           key: String,
+                                           tsRange: Seq[Long],
+                                           payload: String): Unit = {
     val points = Seq.fill(tsRange.size)(payload)
     val putRequests = tsRange.zip(points).map {
       case (ts, point) =>
@@ -350,10 +358,10 @@ class BigTableKVStoreTest {
     }
   }

-  private def validateTimeSeriesValueExpectedPayload(response: GetResponse, expectedTimestamps: Seq[Long], expectedPayload: String): Unit = {
-    for (
-      tSeq <- response.values
-    ) {
+  private def validateTimeSeriesValueExpectedPayload(response: GetResponse,
+                                                     expectedTimestamps: Seq[Long],
+                                                     expectedPayload: String): Unit = {
+    for (tSeq <- response.values) {
       tSeq.map(_.millis).toSet shouldBe expectedTimestamps.toSet
       tSeq.map(v => new String(v.bytes, StandardCharsets.UTF_8)).foreach(v => v shouldBe expectedPayload)
       tSeq.length shouldBe expectedTimestamps.length

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitterTest.scala

Lines changed: 1 addition & 1 deletion

@@ -47,7 +47,7 @@ class DataprocSubmitterTest extends AnyFunSuite with MockitoSugar {
     BigQueryUtilScala.validateScalaVersionCompatibility()
   }

-  ignore("Used to iterate locally. Do not enable this in CI/CD!") {
+  test("Used to iterate locally. Do not enable this in CI/CD!") {

     val submitter = DataprocSubmitter()
     val submittedJobId =

spark/src/main/scala/ai/chronon/spark/Driver.scala

Lines changed: 13 additions & 12 deletions

@@ -69,8 +69,9 @@ import scala.reflect.internal.util.ScalaClassLoader
 import scala.util.Failure
 import scala.util.Success
 import scala.util.Try
-import io.circe.generic.auto._
-import io.circe.yaml.parser
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
+import org.yaml.snakeyaml.Yaml

 // useful to override spark.sql.extensions args - there is no good way to unset that conf apparently
 // so we give it dummy extensions
@@ -152,18 +153,18 @@ object Driver {
     protected def isLocal: Boolean = localTableMapping.nonEmpty || localDataPath.isDefined

     protected def buildSparkSession(): SparkSession = {
+      implicit val formats: Formats = DefaultFormats
+      val yamlLoader = new Yaml()
       val additionalConfs = additionalConfPath.toOption
         .map(Source.fromFile)
-        .map(_.mkString)
-        .map((cp) => {
-          parser
-            .parse(cp)
-            .flatMap((r) => r.as[Map[String, String]])
-        })
-        .map {
-          case Right(v) => v
-          case Left(e) => throw e
-        }
+        .map((src) =>
+          try { src.mkString }
+          finally { src.close })
+        .map(yamlLoader.load(_).asInstanceOf[java.util.Map[String, Any]])
+        .map((map) => Extraction.decompose(map.asScala.toMap))
+        .map((v) => render(v))
+        .map(compact)
+        .map((str) => parse(str).extract[Map[String, String]])

       // We use the KryoSerializer for group bys and joins since we serialize the IRs.
       // But since staging query is fairly freeform, it's better to stick to the java serializer.

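The diff only shows the parsing side of additionalConfs; how the resulting Map[String, String] would typically feed the session builder is sketched below. The wiring, app name, and master are assumptions for illustration, not taken from Driver.scala.

  import org.apache.spark.sql.SparkSession

  // Assumed wiring: fold the parsed additional confs into the SparkSession builder.
  def sessionWith(additionalConfs: Map[String, String]): SparkSession =
    additionalConfs
      .foldLeft(SparkSession.builder().appName("chronon-driver").master("local[*]")) {
        case (builder, (key, value)) => builder.config(key, value)
      }
      .getOrCreate()
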
spark/src/test/scala/ai/chronon/spark/test/OfflineSubcommandTest.scala

Lines changed: 21 additions & 13 deletions

@@ -17,12 +17,15 @@
 package ai.chronon.spark.test

 import ai.chronon.spark.Driver.OfflineSubcommand
-import io.circe.yaml.parser
 import org.apache.spark.sql.SparkSession
 import org.junit.Assert.assertEquals
 import org.junit.Assert.assertTrue
 import org.junit.Test
 import org.rogach.scallop.ScallopConf
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
+import org.yaml.snakeyaml.Yaml
+import collection.JavaConverters._

 import scala.io.Source

@@ -60,22 +63,27 @@ class OfflineSubcommandTest {
   }

   @Test
-  def additionalConfsParsedCorrectly: Unit = {
+  def additionalConfsParsedCorrectly(): Unit = {
+    implicit val formats: Formats = DefaultFormats
+
     val url = getClass.getClassLoader.getResource("test-driver-additional-confs.yaml")

     val args = new TestArgs(Seq("--conf-path", "does_not_exist", "--additional-conf-path", url.toURI.getPath).toArray)
     val sparkSession = args.buildSparkSession()
-
-    val is = getClass.getClassLoader.getResourceAsStream("test-driver-additional-confs.yaml")
-
-    val additionalConfs = parser
-      .parse(Source.fromInputStream(is).mkString)
-      .flatMap((r) => r.as[Map[String, String]])
-
-    val confs = additionalConfs match {
-      case Right(v) => v
-      case Left(e) => throw e
-    }
+    val yamlLoader = new Yaml()
+
+    val confs = Option(getClass.getClassLoader
+      .getResourceAsStream("test-driver-additional-confs.yaml"))
+      .map(Source.fromInputStream)
+      .map((is) =>
+        try { is.mkString }
+        finally { is.close })
+      .map(yamlLoader.load(_).asInstanceOf[java.util.Map[String, Any]])
+      .map((jMap) => Extraction.decompose(jMap.asScala.toMap))
+      .map((jVal) => render(jVal))
+      .map(compact)
+      .map(parse(_).extract[Map[String, String]])
+      .getOrElse(throw new IllegalArgumentException(s"Yaml conf not found or invalid yaml"))

     val confKey = "spark.chronon.table.format_provider.class"
     assertEquals(confs.get(confKey), sparkSession.conf.getOption(confKey))
