
Commit d8550b5

Merge branch 'main' into vz--cherry_pick_oss_888_withColumn_fix
2 parents e9dd863 + 2b0546f commit d8550b5

File tree

4 files changed: +213, -18 lines


build.sbt

Lines changed: 38 additions & 3 deletions
@@ -109,7 +109,9 @@ val circe = Seq(
 val flink_all = Seq(
   "org.apache.flink" % "flink-metrics-dropwizard",
   "org.apache.flink" % "flink-clients",
-  "org.apache.flink" % "flink-yarn"
+  "org.apache.flink" % "flink-yarn",
+  "org.apache.flink" % "flink-connector-kafka",
+  "org.apache.flink" % "flink-avro",
 ).map(_ % flink_1_17)
 
 val vertx_java = Seq(
@@ -220,6 +222,8 @@ lazy val flink = project
     // mark the flink-streaming scala as provided as otherwise we end up with some extra Flink classes in our jar
     // and errors at runtime like: java.io.InvalidClassException: org.apache.flink.streaming.api.scala.DataStream$$anon$1; local class incompatible
     libraryDependencies += "org.apache.flink" %% "flink-streaming-scala" % flink_1_17 % "provided",
+    libraryDependencies += "org.apache.flink" % "flink-connector-files" % flink_1_17 % "provided",
+    libraryDependencies += "org.apache.spark" %% "spark-avro" % spark_3_5,
     assembly / assemblyMergeStrategy := {
       case PathList("META-INF", "services", xs @ _*) => MergeStrategy.concat
       case "reference.conf" => MergeStrategy.concat
@@ -239,13 +243,38 @@ lazy val flink = project
           .startsWith("protobuf")
       }
     },
+    assembly / packageOptions += Package.ManifestAttributes(
+      ("Main-Class", "ai.chronon.flink.FlinkJob")
+    ),
     libraryDependencies += "org.apache.flink" % "flink-test-utils" % flink_1_17 % Test excludeAll (
       ExclusionRule(organization = "org.apache.logging.log4j", name = "log4j-api"),
       ExclusionRule(organization = "org.apache.logging.log4j", name = "log4j-core"),
       ExclusionRule(organization = "org.apache.logging.log4j", name = "log4j-slf4j-impl")
     )
   )
 
+// We carve out a separate module for the Flink Kafka ingestion job. This isn't included in the main root module list
+// for now as we use this for testing adhoc using: sbt "project flink_kafka_ingest" assembly
+lazy val flink_kafka_ingest = project
+  .dependsOn(flink)
+  .settings(
+    // Exclude Hadoop & Guava from the assembled JAR
+    // Else we hit an error - IllegalAccessError: class org.apache.hadoop.hdfs.web.HftpFileSystem cannot access its
+    // superinterface org.apache.hadoop.hdfs.web.TokenAspect$TokenManagementDelegator
+    // Or: java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(...)
+    // Or: 'com/google/protobuf/MapField' is not assignable to 'com/google/protobuf/MapFieldReflectionAccessor'
+    assembly / assemblyExcludedJars := {
+      val cp = (assembly / fullClasspath).value
+      cp filter { jar =>
+        jar.data.getName.startsWith("hadoop-") || jar.data.getName.startsWith("guava") || jar.data.getName
+          .startsWith("protobuf")
+      }
+    },
+    assembly / packageOptions += Package.ManifestAttributes(
+      ("Main-Class", "ai.chronon.flink.FlinkKafkaBeaconEventDriver")
+    ),
+  )
+
 // GCP requires java 11, can't cross compile higher
 
 javacOptions ++= Seq("-source", "11", "-target", "11")
@@ -266,6 +295,9 @@ lazy val cloud_gcp = project
     libraryDependencies += "org.json4s" %% "json4s-core" % "3.7.0-M11",
     libraryDependencies += "org.yaml" % "snakeyaml" % "2.3",
     libraryDependencies += "io.grpc" % "grpc-netty-shaded" % "1.62.2",
+    libraryDependencies += "com.google.cloud.hosted.kafka" % "managed-kafka-auth-login-handler" % "1.0.3" excludeAll (
+      ExclusionRule(organization = "io.confluent", name = "kafka-schema-registry-client")
+    ),
     libraryDependencies ++= avro,
     libraryDependencies ++= spark_all_provided,
     dependencyOverrides ++= jackson,
@@ -279,9 +311,12 @@ lazy val cloud_gcp = project
     },
     libraryDependencies += "org.mockito" % "mockito-core" % "5.12.0" % Test,
     libraryDependencies += "com.google.cloud" % "google-cloud-bigtable-emulator" % "0.178.0" % Test,
-    // force a newer version of reload4j to sidestep: https://security.snyk.io/vuln/SNYK-JAVA-CHQOSRELOAD4J-5731326
+    // force some newer versions of reload4j and kafka-clients to sidestep:
+    // https://security.snyk.io/vuln/SNYK-JAVA-CHQOSRELOAD4J-5731326
+    // https://security.snyk.io/vuln/SNYK-JAVA-ORGAPACHEKAFKA-8528112
     dependencyOverrides ++= Seq(
-      "ch.qos.reload4j" % "reload4j" % "1.2.25"
+      "ch.qos.reload4j" % "reload4j" % "1.2.25",
+      "org.apache.kafka" % "kafka-clients" % "3.8.1"
     )
   )
 
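Both assemblies now stamp an explicit entry point via Package.ManifestAttributes, so the fat jar carries its main class in the manifest rather than relying on the launcher to name it. Below is a minimal, hypothetical Scala sanity check of the produced manifest, not part of the commit; the jar path is an assumed local output of sbt "project flink_kafka_ingest" assembly and may differ in your layout.

import java.util.jar.JarFile

object AssemblyManifestCheck {
  def main(args: Array[String]): Unit = {
    // Hypothetical default path; the actual output location depends on the Scala version and project layout.
    val jarPath = args.headOption.getOrElse(
      "flink_kafka_ingest/target/scala-2.12/flink_kafka_ingest-assembly-0.1.0-SNAPSHOT.jar")
    val jar = new JarFile(jarPath)
    try {
      // Should print ai.chronon.flink.FlinkKafkaBeaconEventDriver, per the Package.ManifestAttributes setting above.
      println(jar.getManifest.getMainAttributes.getValue("Main-Class"))
    } finally jar.close()
  }
}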

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # configurations for testing
 projectId: "canary-443022"
 region: "us-central1"
-clusterName: "canary-2"
+clusterName: "zipline-canary-cluster"

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitterTest.scala

Lines changed: 26 additions & 14 deletions
@@ -16,7 +16,7 @@ import org.scalatestplus.mockito.MockitoSugar
 
 class DataprocSubmitterTest extends AnyFlatSpec with MockitoSugar {
 
-  "DataprocClient" should "return job id when a job is submitted" in {
+  "DataprocClient" should "return job id when a job is submitted" in {
 
     // Mock dataproc job client.
     val jobId = "mock-job-id"
@@ -52,33 +52,46 @@ class DataprocSubmitterTest extends AnyFlatSpec with MockitoSugar {
   }
 
   it should "test flink job locally" ignore {
-
+
+    val submitter = DataprocSubmitter()
+    submitter.submit(spark.FlinkJob,
+                     Map(MainClass -> "ai.chronon.flink.FlinkJob",
+                         FlinkMainJarURI -> "gs://zipline-jars/flink-assembly-0.1.0-SNAPSHOT.jar",
+                         JarURI -> "gs://zipline-jars/cloud_gcp_bigtable.jar"),
+                     List.empty,
+                     "--online-class=ai.chronon.integrations.cloud_gcp.GcpApiImpl",
+                     "--groupby-name=e2e-count",
+                     "-ZGCP_PROJECT_ID=bigtable-project-id",
+                     "-ZGCP_INSTANCE_ID=bigtable-instance-id")
+  }
+
+  it should "test flink kafka ingest job locally" ignore {
+
     val submitter = DataprocSubmitter()
     val submittedJobId =
       submitter.submit(spark.FlinkJob,
-                       Map(MainClass -> "ai.chronon.flink.FlinkJob",
-                           FlinkMainJarURI -> "gs://zipline-jars/flink-assembly-0.1.0-SNAPSHOT.jar",
+                       Map(MainClass -> "ai.chronon.flink.FlinkKafkaBeaconEventDriver",
+                           FlinkMainJarURI -> "gs://zipline-jars/flink_kafka_ingest-assembly-0.1.0-SNAPSHOT.jar",
                            JarURI -> "gs://zipline-jars/cloud_gcp_bigtable.jar"),
                        List.empty,
-                       "--online-class=ai.chronon.integrations.cloud_gcp.GcpApiImpl",
-                       "--groupby-name=e2e-count",
-                       "-ZGCP_PROJECT_ID=bigtable-project-id",
-                       "-ZGCP_INSTANCE_ID=bigtable-instance-id")
+                       "--kafka-bootstrap=bootstrap.zipline-kafka-cluster.us-central1.managedkafka.canary-443022.cloud.goog:9092",
+                       "--kafka-topic=test-beacon-main",
+                       "--data-file-name=gs://zl-warehouse/beacon_events/beacon-output.avro",
+      )
     println(submittedJobId)
-
   }
 
-  it should "Used to iterate locally. Do not enable this in CI/CD!" ignore {
+  it should "Used to iterate locally. Do not enable this in CI/CD!" ignore {
 
     val submitter = DataprocSubmitter()
     val submittedJobId =
       submitter.submit(
         spark.SparkJob,
         Map(MainClass -> "ai.chronon.spark.Driver",
-            JarURI -> "gs://zipline-jars/cloud_gcp-assembly-0.1.0-SNAPSHOT.jar"),
+            JarURI -> "gs://zipline-jars/cloud_gcp-assembly-0.1.0-SNAPSHOT.jar"),
         List("gs://zipline-jars/training_set.v1",
-             "gs://zipline-jars/dataproc-submitter-conf.yaml",
-             "gs://zipline-jars/additional-confs.yaml"),
+             "gs://zipline-jars/dataproc-submitter-conf.yaml",
+             "gs://zipline-jars/additional-confs.yaml"),
         "join",
         "--end-date=2024-12-10",
         "--additional-conf-path=additional-confs.yaml",
@@ -107,5 +120,4 @@ class DataprocSubmitterTest extends AnyFlatSpec with MockitoSugar {
     println(submittedJobId)
     assertEquals(submittedJobId, "mock-job-id")
   }
-
 }
Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
+package ai.chronon.flink
+
+import org.apache.avro.Schema
+import org.apache.avro.generic.GenericRecord
+import org.apache.flink.api.common.functions.MapFunction
+import org.apache.flink.api.common.typeinfo.TypeInformation
+import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema
+import org.apache.flink.connector.kafka.sink.KafkaSink
+import org.apache.flink.core.fs.Path
+import org.apache.flink.formats.avro.AvroInputFormat
+import org.apache.flink.formats.avro.AvroSerializationSchema
+import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo
+import org.apache.flink.formats.avro.utils.AvroKryoSerializerUtils
+import org.apache.flink.streaming.api.scala.DataStream
+import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
+import org.apache.kafka.clients.producer.ProducerConfig
+import org.rogach.scallop.ScallopConf
+import org.rogach.scallop.ScallopOption
+import org.rogach.scallop.Serialization
+
+// Canary test app that can point to a source data file and will emit an event to Kafka periodically with an updated timestamp
+object FlinkKafkaBeaconEventDriver {
+  // Pull in the Serialization trait to sidestep: https://github.com/scallop/scallop/issues/137
+  class JobArgs(args: Seq[String]) extends ScallopConf(args) with Serialization {
+    val dataFileName: ScallopOption[String] =
+      opt[String](required = true, descr = "Name of the file on GCS to read data from")
+    val kafkaBootstrap: ScallopOption[String] =
+      opt[String](required = true, descr = "Kafka bootstrap server in host:port format")
+    val kafkaTopic: ScallopOption[String] = opt[String](required = true, descr = "Kafka topic to write to")
+    val eventDelayMillis: ScallopOption[Int] =
+      opt[Int](required = false,
+               descr = "Delay to use between event publishes (dictates the eps)",
+               default = Some(1000))
+
+    verify()
+  }
+
+  def main(args: Array[String]): Unit = {
+    val jobArgs = new JobArgs(args)
+    val dataFileName = jobArgs.dataFileName()
+    val bootstrapServers = jobArgs.kafkaBootstrap()
+    val kafkaTopic = jobArgs.kafkaTopic()
+    val eventDelayMillis = jobArgs.eventDelayMillis()
+
+    val schema = buildAvroSchema()
+    // Configure GCS source
+    val avroFormat = new AvroInputFormat[GenericRecord](
+      new Path(dataFileName),
+      classOf[GenericRecord]
+    )
+
+    implicit val typeInfo: TypeInformation[GenericRecord] = new GenericRecordAvroTypeInfo(schema)
+
+    // Set up the streaming execution environment
+    val env = StreamExecutionEnvironment.getExecutionEnvironment
+    env.getConfig
+      .enableForceKryo() // use kryo for complex types that Flink's default ser system doesn't support (e.g case classes)
+    env.getConfig.enableGenericTypes() // more permissive type checks
+    env.addDefaultKryoSerializer(classOf[Schema], classOf[AvroKryoSerializerUtils.AvroSchemaSerializer])
+
+    val stream = env
+      .createInput(avroFormat)
+      .setParallelism(1)
+
+    val transformedStream: DataStream[GenericRecord] = stream
+      .map(new DelayedSourceTransformFn(eventDelayMillis))
+      .setParallelism(stream.parallelism)
+
+    // Configure Kafka sink
+    val serializationSchema = KafkaRecordSerializationSchema
+      .builder()
+      .setTopic(kafkaTopic)
+      .setValueSerializationSchema(AvroSerializationSchema.forGeneric(schema))
+      .build()
+
+    val producerConfig = new java.util.Properties()
+    producerConfig.setProperty(ProducerConfig.ACKS_CONFIG, "all")
+    producerConfig.setProperty(ProducerConfig.RETRIES_CONFIG, "3")
+    producerConfig.setProperty("security.protocol", "SASL_SSL")
+    producerConfig.setProperty("sasl.mechanism", "OAUTHBEARER")
+    producerConfig.setProperty("sasl.login.callback.handler.class",
+                               "com.google.cloud.hosted.kafka.auth.GcpLoginCallbackHandler")
+    producerConfig.setProperty("sasl.jaas.config",
+                               "org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required;")
+
+    val kafkaSink = KafkaSink
+      .builder()
+      .setBootstrapServers(bootstrapServers)
+      .setRecordSerializer(serializationSchema)
+      .setKafkaProducerConfig(producerConfig)
+      .build()
+
+    // Write to Kafka
+    transformedStream
+      .sinkTo(kafkaSink)
+      .setParallelism(transformedStream.parallelism)
+
+    // Execute program
+    env.execute("Periodic Kafka Beacon Data Producer")
+  }
+
+  def buildAvroSchema(): Schema = {
+    new Schema.Parser().parse(
+      """
+      {
+        "type": "record",
+        "name": "Beacon",
+        "namespace": "com.etsy",
+        "fields": [
+          {"name": "event_name", "type": ["null", "string"], "default": null},
+          {"name": "timestamp", "type": "long"},
+          {"name": "browser_id", "type": ["null", "string"], "default": null},
+          {"name": "primary_event", "type": "boolean"},
+          {"name": "guid", "type": ["null", "string"], "default": null},
+          {"name": "page_guid", "type": ["null", "string"], "default": null},
+          {"name": "event_logger", "type": ["null", "string"], "default": null},
+          {"name": "event_source", "type": ["null", "string"], "default": null},
+          {"name": "ip", "type": ["null", "string"], "default": null},
+          {"name": "user_agent", "type": ["null", "string"], "default": null},
+          {"name": "loc", "type": ["null", "string"], "default": null},
+          {"name": "ref", "type": ["null", "string"], "default": null},
+          {"name": "cookies", "type": ["null", {"type": "map", "values": ["null", "string"]}], "default": null},
+          {"name": "ab", "type": ["null", {"type": "map", "values": ["null", {"type": "array", "items": ["null", "string"]}]}], "default": null},
+          {"name": "user_id", "type": ["null", "long"], "default": null},
+          {"name": "isMobileRequest", "type": ["null", "boolean"], "default": null},
+          {"name": "isMobileDevice", "type": ["null", "boolean"], "default": null},
+          {"name": "isMobileTemplate", "type": ["null", "boolean"], "default": null},
+          {"name": "detected_currency_code", "type": ["null", "string"], "default": null},
+          {"name": "detected_language", "type": ["null", "string"], "default": null},
+          {"name": "detected_region", "type": ["null", "string"], "default": null},
+          {"name": "listing_ids", "type": ["null", {"type": "array", "items": "long"}], "default": null},
+          {"name": "event_timestamp", "type": ["null", "long"], "default": null},
+          {"name": "properties", "type": ["null", {"type": "map", "values": ["null", "string"]}], "default": null}
+        ]
+      }
+      """)
+  }
+}
+
+class DelayedSourceTransformFn(delayMs: Int) extends MapFunction[GenericRecord, GenericRecord] {
+  override def map(value: GenericRecord): GenericRecord = {
+    val updatedTimestamp = System.currentTimeMillis()
+    // Update the timestamp field in the record
+    value.put("timestamp", updatedTimestamp)
+    Thread.sleep(delayMs)
+    value
+  }
+}
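For local iteration outside Dataproc, here is a minimal, hypothetical sketch of driving FlinkKafkaBeaconEventDriver directly with the same flags the ignored DataprocSubmitterTest case passes through the submitter. The --event-delay-millis flag name is assumed from Scallop's default kebab-case conversion of the eventDelayMillis option; the GCS path, bootstrap server, and topic are the canary test values and would need adjusting for another environment.

import ai.chronon.flink.FlinkKafkaBeaconEventDriver

object FlinkKafkaBeaconEventDriverLocalRun {
  def main(args: Array[String]): Unit = {
    // Values mirror the ignored DataprocSubmitterTest case; reading a gs:// path locally also assumes
    // a GCS filesystem implementation is available on the classpath.
    FlinkKafkaBeaconEventDriver.main(Array(
      "--data-file-name=gs://zl-warehouse/beacon_events/beacon-output.avro",
      "--kafka-bootstrap=bootstrap.zipline-kafka-cluster.us-central1.managedkafka.canary-443022.cloud.goog:9092",
      "--kafka-topic=test-beacon-main",
      // Assumed flag name for the eventDelayMillis option; omit to keep the 1000 ms default.
      "--event-delay-millis=1000"
    ))
  }
}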
