Commit 811bc41

Add a Flink canary app that can be run on demand (#762)
## Summary

Builds on top of PR #751. This PR adds a streaming GroupBy that can be run as a canary to sanity-check and test things out while making Flink changes. I used this to sanity-check the creation and use of a mock schema SerDe that some users have been asking for.

Can be submitted via:

```
$ CHRONON_ROOT=`pwd`/api/python/test/canary
$ zipline compile --chronon-root=$CHRONON_ROOT
$ zipline run --repo=$CHRONON_ROOT --version $VERSION --mode streaming --conf compiled/group_bys/gcp/item_event_canary.actions_v1 --kafka-bootstrap=bootstrap.zipline-kafka-cluster.us-central1.managedkafka.canary-443022.cloud.goog:9092 --groupby-name gcp.item_event_canary.actions_v1 --validate
```

(Needs the Flink event driver to be running, triggered via DataprocSubmitterTest.)

## Checklist

- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [x] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **New Features**
  - Introduced a new group-by aggregation for item event actions, supporting real-time analytics by listing ID with data sourced from GCP Kafka and BigQuery.
  - Added a mock schema provider for testing item event ingestion.
- **Bug Fixes**
  - Updated test configurations to use new event schemas, topics, and data paths for improved accuracy in Flink Kafka ingest job tests.
- **Refactor**
  - Renamed and restructured the event driver to focus on item events, with a streamlined schema and updated job naming.
- **Chores**
  - Added a new environment variable for Flink state storage configuration.
  - Updated build configuration to reference the renamed event driver.
1 parent 02a10f4 · commit 811bc41

File tree

6 files changed: +106, -37 lines

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
from ai.chronon.api.ttypes import EventSource, Source
from ai.chronon.group_by import Aggregation, GroupBy, Operation
from ai.chronon.query import Query, selects
from ai.chronon.types import ConfigProperties

_action_events = [
    "backend_add_to_cart",
    "view_listing",
    "backend_cart_payment",
    "backend_favorite_item2",
]
_action_events_csv = ", ".join([f"'{event}'" for event in _action_events])
_action_events_filter = f"event_type in ({_action_events_csv})"


def build_source(topic: str) -> Source:
    return Source(
        events=EventSource(
            # This source table contains a custom struct ('attributes') that enables
            # attributes['key'] style access pattern in a BQ native table.
            table="data.item_events_parquet_compat",
            topic=topic,
            query=Query(
                selects=selects(
                    listing_id="EXPLODE(TRANSFORM(SPLIT(COALESCE(attributes['sold_listing_ids'], attributes['listing_id']), ','), e -> CAST(e AS LONG)))",
                    add_cart="IF(event_type = 'backend_add_to_cart', 1, 0)",
                    view="IF(event_type = 'view_listing', 1, 0)",
                    purchase="IF(event_type = 'backend_cart_payment', 1, 0)",
                    favorite="IF(event_type = 'backend_favorite_item2', 1, 0)",
                ),
                wheres=[_action_events_filter],
                time_column="timestamp",
            ),
        )
    )


# GCP Kafka clusters require TLS
google_kafka_cfgs = "security.protocol=SASL_SSL/sasl.mechanism=OAUTHBEARER/sasl.login.callback.handler.class=com.google.cloud.hosted.kafka.auth.GcpLoginCallbackHandler/sasl.jaas.config=org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required;"
schema_provider_cfgs = "provider_class=ai.chronon.flink.deser.MockCustomSchemaProvider/schema_name=item_event"
kafka_topic = f"kafka://test-item-event-data/{schema_provider_cfgs}/{google_kafka_cfgs}"
actions_source = build_source(kafka_topic)

actions_v1 = GroupBy(
    sources=[actions_source],
    keys=["listing_id"],
    online=True,
    aggregations=[
        Aggregation(input_column="add_cart", operation=Operation.SUM, windows=["1d"]),
        Aggregation(input_column="view", operation=Operation.SUM, windows=["1d"]),
        Aggregation(input_column="purchase", operation=Operation.SUM, windows=["7d"]),
        Aggregation(input_column="favorite", operation=Operation.SUM, windows=["1d"]),
    ],
    conf=ConfigProperties(
        common={
            "spark.chronon.partition.column": "_DATE",
        }
    ),
)

api/python/test/canary/teams.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@
     "GCP_REGION": "us-central1",
     "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster",
     "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance",
+    "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state",
     },
   ),
 )
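The new `FLINK_STATE_URI` points at the GCS prefix used for Flink checkpoint/state storage (the Dataproc submitter test below describes it as where checkpoints are written while the job runs). How the Zipline/Chronon runner consumes this variable is not shown in this diff; as a hedged sketch only, a Flink job would typically wire such a URI into its checkpoint configuration like this (the 30s interval is illustrative, not from this PR):

```scala
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment

object CheckpointConfigSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Checkpoint periodically and persist state under the configured GCS prefix.
    // The actual interval and the wiring of FLINK_STATE_URI are handled by the runner, not shown here.
    env.enableCheckpointing(30 * 1000L)
    env.getCheckpointConfig.setCheckpointStorage("gs://zipline-warehouse-canary/flink-state")
  }
}
```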

cloud_gcp/src/test/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitterTest.scala

Lines changed: 3 additions & 3 deletions
@@ -82,7 +82,7 @@ class DataprocSubmitterTest extends AnyFlatSpec with MockitoSugar {
     submitter.submit(
       spark.submission.FlinkJob,
       Map(
-        MainClass -> "ai.chronon.flink.FlinkKafkaBeaconEventDriver",
+        MainClass -> "ai.chronon.flink.FlinkKafkaItemEventDriver",
         FlinkMainJarURI -> "gs://zipline-jars/flink_kafka_ingest-assembly-0.1.0-SNAPSHOT.jar",
         JarURI -> "gs://zipline-jars/cloud_gcp_bigtable.jar",
         // This is where we write out checkpoints / persist state while the job is running
@@ -91,8 +91,8 @@ class DataprocSubmitterTest extends AnyFlatSpec with MockitoSugar {
       Map.empty,
       List.empty,
       "--kafka-bootstrap=bootstrap.zipline-kafka-cluster.us-central1.managedkafka.canary-443022.cloud.goog:9092",
-      "--kafka-topic=test-beacon-main",
-      "--data-file-name=gs://zl-warehouse/beacon_events/beacon-output.avro",
+      "--kafka-topic=test-item-event-data",
+      "--data-file-name=gs://zl-warehouse/canary_item_events/events-output.avro",
       "--event-delay-millis=10",
     )
     println(submittedJobId)

flink/BUILD.bazel

Lines changed: 1 addition & 1 deletion
@@ -88,6 +88,6 @@ jvm_binary(
     # To exclude runtime dependencies not needed for flink environment in the cluster
     # otherwise we run into version conflict errors
     deploy_env = ["//tools/build_rules/flink:flink"],
-    main_class = "ai.chronon.flink.FlinkKafkaBeaconEventDriver",
+    main_class = "ai.chronon.flink.FlinkKafkaItemEventDriver",
     runtime_deps = [":lib"],
 )

flink/src/main/scala/ai/chronon/flink/FlinkKafkaBeaconEventDriver.scala renamed to flink/src/main/scala/ai/chronon/flink/FlinkKafkaItemEventDriver.scala

Lines changed: 18 additions & 31 deletions
@@ -19,7 +19,7 @@ import org.rogach.scallop.ScallopOption
 import org.rogach.scallop.Serialization

 // Canary test app that can point to a source data file and will emit an event to Kafka periodically with an updated timestamp
-object FlinkKafkaBeaconEventDriver {
+object FlinkKafkaItemEventDriver {
   // Pull in the Serialization trait to sidestep: https://github.com/scallop/scallop/issues/137
   class JobArgs(args: Seq[String]) extends ScallopConf(args) with Serialization {
     val dataFileName: ScallopOption[String] =
@@ -42,14 +42,13 @@ object FlinkKafkaBeaconEventDriver {
     val kafkaTopic = jobArgs.kafkaTopic()
     val eventDelayMillis = jobArgs.eventDelayMillis()

-    val schema = buildAvroSchema()
     // Configure GCS source
     val avroFormat = new AvroInputFormat[GenericRecord](
       new Path(dataFileName),
       classOf[GenericRecord]
     )

-    implicit val typeInfo: TypeInformation[GenericRecord] = new GenericRecordAvroTypeInfo(schema)
+    implicit val typeInfo: TypeInformation[GenericRecord] = new GenericRecordAvroTypeInfo(avroSchema)

     // Set up the streaming execution environment
     val env = StreamExecutionEnvironment.getExecutionEnvironment
@@ -70,7 +69,7 @@ object FlinkKafkaBeaconEventDriver {
     val serializationSchema = KafkaRecordSerializationSchema
       .builder()
       .setTopic(kafkaTopic)
-      .setValueSerializationSchema(AvroSerializationSchema.forGeneric(schema))
+      .setValueSerializationSchema(AvroSerializationSchema.forGeneric(avroSchema))
       .build()

     val producerConfig = new java.util.Properties()
@@ -96,40 +95,28 @@ object FlinkKafkaBeaconEventDriver {
       .setParallelism(transformedStream.getParallelism)

     // Execute program
-    env.execute("Periodic Kafka Beacon Data Producer")
+    env.execute("Periodic Kafka Data Producer")
   }

-  def buildAvroSchema(): Schema = {
+  lazy val avroSchema: Schema = {
     new Schema.Parser().parse("""
       {
         "type": "record",
-        "name": "Beacon",
-        "namespace": "com.customer",
+        "name": "Event",
+        "namespace": "ai.chronon",
         "fields": [
-          {"name": "event_name", "type": ["null", "string"], "default": null},
+          {"name": "event_type", "type": ["null", "string"], "default": null},
           {"name": "timestamp", "type": "long"},
-          {"name": "browser_id", "type": ["null", "string"], "default": null},
-          {"name": "primary_event", "type": "boolean"},
-          {"name": "guid", "type": ["null", "string"], "default": null},
-          {"name": "page_guid", "type": ["null", "string"], "default": null},
-          {"name": "event_logger", "type": ["null", "string"], "default": null},
-          {"name": "event_source", "type": ["null", "string"], "default": null},
-          {"name": "ip", "type": ["null", "string"], "default": null},
-          {"name": "user_agent", "type": ["null", "string"], "default": null},
-          {"name": "loc", "type": ["null", "string"], "default": null},
-          {"name": "ref", "type": ["null", "string"], "default": null},
-          {"name": "cookies", "type": ["null", {"type": "map", "values": ["null", "string"]}], "default": null},
-          {"name": "ab", "type": ["null", {"type": "map", "values": ["null", {"type": "array", "items": ["null", "string"]}]}], "default": null},
-          {"name": "user_id", "type": ["null", "long"], "default": null},
-          {"name": "isMobileRequest", "type": ["null", "boolean"], "default": null},
-          {"name": "isMobileDevice", "type": ["null", "boolean"], "default": null},
-          {"name": "isMobileTemplate", "type": ["null", "boolean"], "default": null},
-          {"name": "detected_currency_code", "type": ["null", "string"], "default": null},
-          {"name": "detected_language", "type": ["null", "string"], "default": null},
-          {"name": "detected_region", "type": ["null", "string"], "default": null},
-          {"name": "listing_ids", "type": ["null", {"type": "array", "items": "long"}], "default": null},
-          {"name": "event_timestamp", "type": ["null", "long"], "default": null},
-          {"name": "properties", "type": ["null", {"type": "map", "values": ["null", "string"]}], "default": null}
+          {"name": "visitor_id", "type": ["null", "string"], "default": null},
+          {"name": "is_primary", "type": "boolean"},
+          {"name": "logger_name", "type": ["null", "string"], "default": null},
+          {"name": "source", "type": ["null", "string"], "default": null},
+          {"name": "is_mobile_req", "type": ["null", "boolean"], "default": null},
+          {"name": "is_mobile_device", "type": ["null", "boolean"], "default": null},
+          {"name": "is_mobile_view", "type": ["null", "boolean"], "default": null},
+          {"name": "item_ids", "type": ["null", {"type": "array", "items": "long"}], "default": null},
+          {"name": "created_at", "type": ["null", "long"], "default": null},
+          {"name": "attributes", "type": ["null", {"type": "map", "values": ["null", "string"]}], "default": null}
         ]
       }
     """)

flink/src/main/scala/ai/chronon/flink/deser/CustomSchemaSerDe.scala

Lines changed: 22 additions & 2 deletions
@@ -1,10 +1,12 @@
 package ai.chronon.flink.deser

+import ai.chronon.api.StructType
+import ai.chronon.flink.FlinkKafkaItemEventDriver
 import ai.chronon.online.TopicInfo
-import ai.chronon.online.serde.SerDe
+import ai.chronon.online.serde.{AvroConversions, AvroSerDe, Mutation, SerDe}

 // Configured in topic config in this fashion:
-// kafka://test-beacon-main/provider_class=ai.chronon.flink.deser.MockCustomSchemaProvider/schema_name=beacon
+// kafka://my-test-topic/provider_class=ai.chronon.flink.deser.MockCustomSchemaProvider/schema_name=item_event
 object CustomSchemaSerDe {
   val ProviderClass = "provider_class"
   val SchemaName = "schema_name"
@@ -19,3 +21,21 @@ object CustomSchemaSerDe {
     provider.asInstanceOf[SerDe]
   }
 }
+
+/** Mock custom schema provider that vends out a custom hardcoded event schema
+  */
+class MockCustomSchemaProvider(topicInfo: TopicInfo) extends SerDe {
+  private val schemaName = topicInfo.params.getOrElse(CustomSchemaSerDe.SchemaName, "item_event")
+  require(schemaName == "item_event", s"Schema name must be 'item_event', but got $schemaName")
+
+  lazy val chrononSchema: StructType =
+    AvroConversions.toChrononSchema(FlinkKafkaItemEventDriver.avroSchema).asInstanceOf[StructType]
+
+  lazy val avroSerDe = new AvroSerDe(FlinkKafkaItemEventDriver.avroSchema)
+
+  override def schema: StructType = chrononSchema
+
+  override def fromBytes(messageBytes: Array[Byte]): Mutation = {
+    avroSerDe.fromBytes(messageBytes)
+  }
+}
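MockCustomSchemaProvider shows the shape a custom schema provider takes: it is constructed with the `TopicInfo`, reads its parameters, and implements `schema` and `fromBytes`. Below is a hedged sketch of what a user-supplied provider could look like, mirroring the mock; the `InlineAvroSchemaProvider` class and its `schema_json` topic param are hypothetical illustrations, not part of this PR, and passing a full schema through the topic URI would need escaping in practice:

```scala
package ai.chronon.flink.deser

import ai.chronon.api.StructType
import ai.chronon.online.TopicInfo
import ai.chronon.online.serde.{AvroConversions, AvroSerDe, Mutation, SerDe}
import org.apache.avro.Schema

// Hypothetical user-supplied provider, following the same pattern as MockCustomSchemaProvider.
// Wired up via: kafka://my-topic/provider_class=ai.chronon.flink.deser.InlineAvroSchemaProvider/schema_json=<escaped Avro schema>
class InlineAvroSchemaProvider(topicInfo: TopicInfo) extends SerDe {
  private val schemaJson =
    topicInfo.params.getOrElse("schema_json", throw new IllegalArgumentException("schema_json param is required"))

  private lazy val avroSchema: Schema = new Schema.Parser().parse(schemaJson)
  private lazy val avroSerDe = new AvroSerDe(avroSchema)

  // Expose the Chronon view of the Avro schema, as the mock provider does.
  override def schema: StructType =
    AvroConversions.toChrononSchema(avroSchema).asInstanceOf[StructType]

  // Delegate byte decoding to the Avro SerDe built from the same schema.
  override def fromBytes(messageBytes: Array[Byte]): Mutation =
    avroSerDe.fromBytes(messageBytes)
}
```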
