
Commit d8550b5

Merge branch 'main' into vz--cherry_pick_oss_888_withColumn_fix
2 parents e9dd863 + 2b0546f commit d8550b5

File tree

4 files changed: +213, -18 lines


build.sbt

Lines changed: 38 additions & 3 deletions
@@ -109,7 +109,9 @@ val circe = Seq(
 val flink_all = Seq(
   "org.apache.flink" % "flink-metrics-dropwizard",
   "org.apache.flink" % "flink-clients",
-  "org.apache.flink" % "flink-yarn"
+  "org.apache.flink" % "flink-yarn",
+  "org.apache.flink" % "flink-connector-kafka",
+  "org.apache.flink" % "flink-avro",
 ).map(_ % flink_1_17)
 
 val vertx_java = Seq(
@@ -220,6 +222,8 @@ lazy val flink = project
     // mark the flink-streaming scala as provided as otherwise we end up with some extra Flink classes in our jar
     // and errors at runtime like: java.io.InvalidClassException: org.apache.flink.streaming.api.scala.DataStream$$anon$1; local class incompatible
     libraryDependencies += "org.apache.flink" %% "flink-streaming-scala" % flink_1_17 % "provided",
+    libraryDependencies += "org.apache.flink" % "flink-connector-files" % flink_1_17 % "provided",
+    libraryDependencies += "org.apache.spark" %% "spark-avro" % spark_3_5,
     assembly / assemblyMergeStrategy := {
       case PathList("META-INF", "services", xs @ _*) => MergeStrategy.concat
       case "reference.conf" => MergeStrategy.concat
@@ -239,13 +243,38 @@ lazy val flink = project
           .startsWith("protobuf")
       }
     },
+    assembly / packageOptions += Package.ManifestAttributes(
+      ("Main-Class", "ai.chronon.flink.FlinkJob")
+    ),
     libraryDependencies += "org.apache.flink" % "flink-test-utils" % flink_1_17 % Test excludeAll (
       ExclusionRule(organization = "org.apache.logging.log4j", name = "log4j-api"),
       ExclusionRule(organization = "org.apache.logging.log4j", name = "log4j-core"),
       ExclusionRule(organization = "org.apache.logging.log4j", name = "log4j-slf4j-impl")
     )
   )
 
+// We carve out a separate module for the Flink Kafka ingestion job. This isn't included in the main root module list
+// for now as we use this for testing adhoc using: sbt "project flink_kafka_ingest" assembly
+lazy val flink_kafka_ingest = project
+  .dependsOn(flink)
+  .settings(
+    // Exclude Hadoop & Guava from the assembled JAR
+    // Else we hit an error - IllegalAccessError: class org.apache.hadoop.hdfs.web.HftpFileSystem cannot access its
+    // superinterface org.apache.hadoop.hdfs.web.TokenAspect$TokenManagementDelegator
+    // Or: java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(...)
+    // Or: 'com/google/protobuf/MapField' is not assignable to 'com/google/protobuf/MapFieldReflectionAccessor'
+    assembly / assemblyExcludedJars := {
+      val cp = (assembly / fullClasspath).value
+      cp filter { jar =>
+        jar.data.getName.startsWith("hadoop-") || jar.data.getName.startsWith("guava") || jar.data.getName
+          .startsWith("protobuf")
+      }
+    },
+    assembly / packageOptions += Package.ManifestAttributes(
+      ("Main-Class", "ai.chronon.flink.FlinkKafkaBeaconEventDriver")
+    ),
+  )
+
 // GCP requires java 11, can't cross compile higher
 
 javacOptions ++= Seq("-source", "11", "-target", "11")
@@ -266,6 +295,9 @@ lazy val cloud_gcp = project
     libraryDependencies += "org.json4s" %% "json4s-core" % "3.7.0-M11",
     libraryDependencies += "org.yaml" % "snakeyaml" % "2.3",
     libraryDependencies += "io.grpc" % "grpc-netty-shaded" % "1.62.2",
+    libraryDependencies += "com.google.cloud.hosted.kafka" % "managed-kafka-auth-login-handler" % "1.0.3" excludeAll (
+      ExclusionRule(organization = "io.confluent", name = "kafka-schema-registry-client")
+    ),
     libraryDependencies ++= avro,
     libraryDependencies ++= spark_all_provided,
     dependencyOverrides ++= jackson,
@@ -279,9 +311,12 @@ lazy val cloud_gcp = project
     },
     libraryDependencies += "org.mockito" % "mockito-core" % "5.12.0" % Test,
     libraryDependencies += "com.google.cloud" % "google-cloud-bigtable-emulator" % "0.178.0" % Test,
-    // force a newer version of reload4j to sidestep: https://security.snyk.io/vuln/SNYK-JAVA-CHQOSRELOAD4J-5731326
+    // force some newer versions of reload4j and kafka-clients to sidestep:
+    // https://security.snyk.io/vuln/SNYK-JAVA-CHQOSRELOAD4J-5731326
+    // https://security.snyk.io/vuln/SNYK-JAVA-ORGAPACHEKAFKA-8528112
     dependencyOverrides ++= Seq(
-      "ch.qos.reload4j" % "reload4j" % "1.2.25"
+      "ch.qos.reload4j" % "reload4j" % "1.2.25",
+      "org.apache.kafka" % "kafka-clients" % "3.8.1"
     )
   )
 
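Both assemblies now stamp an explicit entry point via Package.ManifestAttributes, so the fat jar carries its main class in the manifest rather than relying on the launcher to name it. Below is a minimal, hypothetical Scala sanity check of the produced manifest, not part of the commit; the jar path is an assumed local output of sbt "project flink_kafka_ingest" assembly and may differ in your layout.

import java.util.jar.JarFile

object AssemblyManifestCheck {
  def main(args: Array[String]): Unit = {
    // Hypothetical default path; the actual output location depends on the Scala version and project layout.
    val jarPath = args.headOption.getOrElse(
      "flink_kafka_ingest/target/scala-2.12/flink_kafka_ingest-assembly-0.1.0-SNAPSHOT.jar")
    val jar = new JarFile(jarPath)
    try {
      // Should print ai.chronon.flink.FlinkKafkaBeaconEventDriver, per the Package.ManifestAttributes setting above.
      println(jar.getManifest.getMainAttributes.getValue("Main-Class"))
    } finally jar.close()
  }
}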

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # configurations for testing
 projectId: "canary-443022"
 region: "us-central1"
-clusterName: "canary-2"
+clusterName: "zipline-canary-cluster"

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitterTest.scala

Lines changed: 26 additions & 14 deletions
@@ -16,7 +16,7 @@ import org.scalatestplus.mockito.MockitoSugar
 
 class DataprocSubmitterTest extends AnyFlatSpec with MockitoSugar {
 
-  "DataprocClient" should "return job id when a job is submitted" in {
+  "DataprocClient" should "return job id when a job is submitted" in {
 
     // Mock dataproc job client.
     val jobId = "mock-job-id"
@@ -52,33 +52,46 @@ class DataprocSubmitterTest extends AnyFlatSpec with MockitoSugar {
   }
 
   it should "test flink job locally" ignore {
-
+
+    val submitter = DataprocSubmitter()
+    submitter.submit(spark.FlinkJob,
+                     Map(MainClass -> "ai.chronon.flink.FlinkJob",
+                         FlinkMainJarURI -> "gs://zipline-jars/flink-assembly-0.1.0-SNAPSHOT.jar",
+                         JarURI -> "gs://zipline-jars/cloud_gcp_bigtable.jar"),
+                     List.empty,
+                     "--online-class=ai.chronon.integrations.cloud_gcp.GcpApiImpl",
+                     "--groupby-name=e2e-count",
+                     "-ZGCP_PROJECT_ID=bigtable-project-id",
+                     "-ZGCP_INSTANCE_ID=bigtable-instance-id")
+  }
+
+  it should "test flink kafka ingest job locally" ignore {
+
     val submitter = DataprocSubmitter()
     val submittedJobId =
       submitter.submit(spark.FlinkJob,
-                       Map(MainClass -> "ai.chronon.flink.FlinkJob",
-                           FlinkMainJarURI -> "gs://zipline-jars/flink-assembly-0.1.0-SNAPSHOT.jar",
+                       Map(MainClass -> "ai.chronon.flink.FlinkKafkaBeaconEventDriver",
+                           FlinkMainJarURI -> "gs://zipline-jars/flink_kafka_ingest-assembly-0.1.0-SNAPSHOT.jar",
                            JarURI -> "gs://zipline-jars/cloud_gcp_bigtable.jar"),
                        List.empty,
-                       "--online-class=ai.chronon.integrations.cloud_gcp.GcpApiImpl",
-                       "--groupby-name=e2e-count",
-                       "-ZGCP_PROJECT_ID=bigtable-project-id",
-                       "-ZGCP_INSTANCE_ID=bigtable-instance-id")
+                       "--kafka-bootstrap=bootstrap.zipline-kafka-cluster.us-central1.managedkafka.canary-443022.cloud.goog:9092",
+                       "--kafka-topic=test-beacon-main",
+                       "--data-file-name=gs://zl-warehouse/beacon_events/beacon-output.avro",
+      )
     println(submittedJobId)
-
   }
 
-  it should "Used to iterate locally. Do not enable this in CI/CD!" ignore {
+  it should "Used to iterate locally. Do not enable this in CI/CD!" ignore {
 
     val submitter = DataprocSubmitter()
     val submittedJobId =
       submitter.submit(
         spark.SparkJob,
         Map(MainClass -> "ai.chronon.spark.Driver",
-            JarURI -> "gs://zipline-jars/cloud_gcp-assembly-0.1.0-SNAPSHOT.jar"),
+            JarURI -> "gs://zipline-jars/cloud_gcp-assembly-0.1.0-SNAPSHOT.jar"),
         List("gs://zipline-jars/training_set.v1",
-             "gs://zipline-jars/dataproc-submitter-conf.yaml",
-             "gs://zipline-jars/additional-confs.yaml"),
+             "gs://zipline-jars/dataproc-submitter-conf.yaml",
+             "gs://zipline-jars/additional-confs.yaml"),
         "join",
         "--end-date=2024-12-10",
         "--additional-conf-path=additional-confs.yaml",
@@ -107,5 +120,4 @@ class DataprocSubmitterTest extends AnyFlatSpec with MockitoSugar {
     println(submittedJobId)
     assertEquals(submittedJobId, "mock-job-id")
   }
-
 }
Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
+package ai.chronon.flink
+
+import org.apache.avro.Schema
+import org.apache.avro.generic.GenericRecord
+import org.apache.flink.api.common.functions.MapFunction
+import org.apache.flink.api.common.typeinfo.TypeInformation
+import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema
+import org.apache.flink.connector.kafka.sink.KafkaSink
+import org.apache.flink.core.fs.Path
+import org.apache.flink.formats.avro.AvroInputFormat
+import org.apache.flink.formats.avro.AvroSerializationSchema
+import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo
+import org.apache.flink.formats.avro.utils.AvroKryoSerializerUtils
+import org.apache.flink.streaming.api.scala.DataStream
+import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
+import org.apache.kafka.clients.producer.ProducerConfig
+import org.rogach.scallop.ScallopConf
+import org.rogach.scallop.ScallopOption
+import org.rogach.scallop.Serialization
+
+// Canary test app that can point to a source data file and will emit an event to Kafka periodically with an updated timestamp
+object FlinkKafkaBeaconEventDriver {
+  // Pull in the Serialization trait to sidestep: https://github.com/scallop/scallop/issues/137
+  class JobArgs(args: Seq[String]) extends ScallopConf(args) with Serialization {
+    val dataFileName: ScallopOption[String] =
+      opt[String](required = true, descr = "Name of the file on GCS to read data from")
+    val kafkaBootstrap: ScallopOption[String] =
+      opt[String](required = true, descr = "Kafka bootstrap server in host:port format")
+    val kafkaTopic: ScallopOption[String] = opt[String](required = true, descr = "Kafka topic to write to")
+    val eventDelayMillis: ScallopOption[Int] =
+      opt[Int](required = false,
+               descr = "Delay to use between event publishes (dictates the eps)",
+               default = Some(1000))
+
+    verify()
+  }
+
+  def main(args: Array[String]): Unit = {
+    val jobArgs = new JobArgs(args)
+    val dataFileName = jobArgs.dataFileName()
+    val bootstrapServers = jobArgs.kafkaBootstrap()
+    val kafkaTopic = jobArgs.kafkaTopic()
+    val eventDelayMillis = jobArgs.eventDelayMillis()
+
+    val schema = buildAvroSchema()
+    // Configure GCS source
+    val avroFormat = new AvroInputFormat[GenericRecord](
+      new Path(dataFileName),
+      classOf[GenericRecord]
+    )
+
+    implicit val typeInfo: TypeInformation[GenericRecord] = new GenericRecordAvroTypeInfo(schema)
+
+    // Set up the streaming execution environment
+    val env = StreamExecutionEnvironment.getExecutionEnvironment
+    env.getConfig
+      .enableForceKryo() // use kryo for complex types that Flink's default ser system doesn't support (e.g case classes)
+    env.getConfig.enableGenericTypes() // more permissive type checks
+    env.addDefaultKryoSerializer(classOf[Schema], classOf[AvroKryoSerializerUtils.AvroSchemaSerializer])
+
+    val stream = env
+      .createInput(avroFormat)
+      .setParallelism(1)
+
+    val transformedStream: DataStream[GenericRecord] = stream
+      .map(new DelayedSourceTransformFn(eventDelayMillis))
+      .setParallelism(stream.parallelism)
+
+    // Configure Kafka sink
+    val serializationSchema = KafkaRecordSerializationSchema
+      .builder()
+      .setTopic(kafkaTopic)
+      .setValueSerializationSchema(AvroSerializationSchema.forGeneric(schema))
+      .build()
+
+    val producerConfig = new java.util.Properties()
+    producerConfig.setProperty(ProducerConfig.ACKS_CONFIG, "all")
+    producerConfig.setProperty(ProducerConfig.RETRIES_CONFIG, "3")
+    producerConfig.setProperty("security.protocol", "SASL_SSL")
+    producerConfig.setProperty("sasl.mechanism", "OAUTHBEARER")
+    producerConfig.setProperty("sasl.login.callback.handler.class",
+                               "com.google.cloud.hosted.kafka.auth.GcpLoginCallbackHandler")
+    producerConfig.setProperty("sasl.jaas.config",
+                               "org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required;")
+
+    val kafkaSink = KafkaSink
+      .builder()
+      .setBootstrapServers(bootstrapServers)
+      .setRecordSerializer(serializationSchema)
+      .setKafkaProducerConfig(producerConfig)
+      .build()
+
+    // Write to Kafka
+    transformedStream
+      .sinkTo(kafkaSink)
+      .setParallelism(transformedStream.parallelism)
+
+    // Execute program
+    env.execute("Periodic Kafka Beacon Data Producer")
+  }
+
+  def buildAvroSchema(): Schema = {
+    new Schema.Parser().parse(
+      """
+      {
+        "type": "record",
+        "name": "Beacon",
+        "namespace": "com.etsy",
+        "fields": [
+          {"name": "event_name", "type": ["null", "string"], "default": null},
+          {"name": "timestamp", "type": "long"},
+          {"name": "browser_id", "type": ["null", "string"], "default": null},
+          {"name": "primary_event", "type": "boolean"},
+          {"name": "guid", "type": ["null", "string"], "default": null},
+          {"name": "page_guid", "type": ["null", "string"], "default": null},
+          {"name": "event_logger", "type": ["null", "string"], "default": null},
+          {"name": "event_source", "type": ["null", "string"], "default": null},
+          {"name": "ip", "type": ["null", "string"], "default": null},
+          {"name": "user_agent", "type": ["null", "string"], "default": null},
+          {"name": "loc", "type": ["null", "string"], "default": null},
+          {"name": "ref", "type": ["null", "string"], "default": null},
+          {"name": "cookies", "type": ["null", {"type": "map", "values": ["null", "string"]}], "default": null},
+          {"name": "ab", "type": ["null", {"type": "map", "values": ["null", {"type": "array", "items": ["null", "string"]}]}], "default": null},
+          {"name": "user_id", "type": ["null", "long"], "default": null},
+          {"name": "isMobileRequest", "type": ["null", "boolean"], "default": null},
+          {"name": "isMobileDevice", "type": ["null", "boolean"], "default": null},
+          {"name": "isMobileTemplate", "type": ["null", "boolean"], "default": null},
+          {"name": "detected_currency_code", "type": ["null", "string"], "default": null},
+          {"name": "detected_language", "type": ["null", "string"], "default": null},
+          {"name": "detected_region", "type": ["null", "string"], "default": null},
+          {"name": "listing_ids", "type": ["null", {"type": "array", "items": "long"}], "default": null},
+          {"name": "event_timestamp", "type": ["null", "long"], "default": null},
+          {"name": "properties", "type": ["null", {"type": "map", "values": ["null", "string"]}], "default": null}
+        ]
+      }
+      """)
+  }
+}
+
+class DelayedSourceTransformFn(delayMs: Int) extends MapFunction[GenericRecord, GenericRecord] {
+  override def map(value: GenericRecord): GenericRecord = {
+    val updatedTimestamp = System.currentTimeMillis()
+    // Update the timestamp field in the record
+    value.put("timestamp", updatedTimestamp)
+    Thread.sleep(delayMs)
+    value
+  }
+}
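For local iteration outside Dataproc, here is a minimal, hypothetical sketch of driving FlinkKafkaBeaconEventDriver directly with the same flags the ignored DataprocSubmitterTest case passes through the submitter. The --event-delay-millis flag name is assumed from Scallop's default kebab-case conversion of the eventDelayMillis option; the GCS path, bootstrap server, and topic are the canary test values and would need adjusting for another environment.

import ai.chronon.flink.FlinkKafkaBeaconEventDriver

object FlinkKafkaBeaconEventDriverLocalRun {
  def main(args: Array[String]): Unit = {
    // Values mirror the ignored DataprocSubmitterTest case; reading a gs:// path locally also assumes
    // a GCS filesystem implementation is available on the classpath.
    FlinkKafkaBeaconEventDriver.main(Array(
      "--data-file-name=gs://zl-warehouse/beacon_events/beacon-output.avro",
      "--kafka-bootstrap=bootstrap.zipline-kafka-cluster.us-central1.managedkafka.canary-443022.cloud.goog:9092",
      "--kafka-topic=test-beacon-main",
      // Assumed flag name for the eventDelayMillis option; omit to keep the 1000 ms default.
      "--event-delay-millis=1000"
    ))
  }
}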
