Connect drift metrics computation in Spark with Hub for serving to frontend #95

Merged: 37 commits, Nov 27, 2024
Changes from 34 commits

Commits (37)
45b4af3
Changes
nikhil-zlai Nov 3, 2024
e3d2115
changes so far
nikhil-zlai Nov 3, 2024
c0e5d8f
remove unused file
nikhil-zlai Nov 3, 2024
e718f91
fix
nikhil-zlai Nov 3, 2024
d3ff253
adding back color printer
nikhil-zlai Nov 3, 2024
bbf1ddd
scalafmt fix
nikhil-zlai Nov 4, 2024
d61c83a
assign intervals
nikhil-zlai Nov 7, 2024
ce39428
assign intervals
nikhil-zlai Nov 7, 2024
787b664
tile summary distance
nikhil-zlai Nov 7, 2024
e576dad
histogram drift
nikhil-zlai Nov 8, 2024
3ca60fa
tile drift
nikhil-zlai Nov 8, 2024
ea115e3
test wiring
nikhil-zlai Nov 12, 2024
1af7b76
Rename DynamoDB store to monitoring model store
piyush-zlai Nov 20, 2024
1228488
First cut wiring up with passing tests
piyush-zlai Nov 22, 2024
86b1cfd
Rip out mock data generation and corresponding endpoints
piyush-zlai Nov 22, 2024
3657bb7
Add joins endpoints and switch search to use joins
piyush-zlai Nov 22, 2024
622405a
Switch to correct metadata table
piyush-zlai Nov 25, 2024
db0619c
observability script for demo
nikhil-zlai Nov 24, 2024
12db7cd
running observability demo
nikhil-zlai Nov 25, 2024
0a8c8b3
Add support for in-memory controller + kv store module
piyush-zlai Nov 26, 2024
e7e2d16
Clean up scripts to load data and query via time series controller
piyush-zlai Nov 26, 2024
281fc7c
Address scalafix + fmt
piyush-zlai Nov 26, 2024
7fc0637
Update colPrefix pass through in drift store
piyush-zlai Nov 26, 2024
4d78fa5
Add details to join response + join get endpoint
piyush-zlai Nov 26, 2024
7e7d2a1
Rebase + comments
piyush-zlai Nov 27, 2024
470fd9e
Revert TableUtils for now
piyush-zlai Nov 27, 2024
18e260c
Swap to new uploader app and revert old code
piyush-zlai Nov 27, 2024
de044c4
Downgrade in mem controller log to debug
piyush-zlai Nov 27, 2024
dfc4e1f
style: Apply scalafix and scalafmt changes
piyush-zlai Nov 27, 2024
6ab315c
Handle empty responses
piyush-zlai Nov 27, 2024
62687b9
Remove redundant log4j props file
piyush-zlai Nov 27, 2024
5316680
Use thread locals for thrift serializers
piyush-zlai Nov 27, 2024
92f0d11
Rebase + comments
piyush-zlai Nov 27, 2024
4965c8a
style: Apply scalafix and scalafmt changes
piyush-zlai Nov 27, 2024
06ccac7
Add breaks method
piyush-zlai Nov 27, 2024
3089feb
style: Apply scalafix and scalafmt changes
piyush-zlai Nov 27, 2024
4930a98
Wrap team in optional
piyush-zlai Nov 27, 2024
7 changes: 6 additions & 1 deletion .github/workflows/test_scala_no_spark.yaml
@@ -60,4 +60,9 @@ jobs:
- name: Run api tests
run: |
sbt "++ 2.12.18 api/test"
sbt "++ 2.12.18 api/test"
- name: Run hub tests
run: |
export SBT_OPTS="-Xmx8G -Xms2G"
sbt "++ 2.12.18 hub/test"
20 changes: 15 additions & 5 deletions build.sbt
@@ -80,6 +80,13 @@ val jackson = Seq(
"com.fasterxml.jackson.module" %% "jackson-module-scala"
).map(_ % jackson_2_15)

// Circe is used to ser / deser case class payloads for the Hub Play webservice
val circe = Seq(
"io.circe" %% "circe-core",
"io.circe" %% "circe-generic",
"io.circe" %% "circe-parser",
).map(_ % circeVersion)

val flink_all = Seq(
"org.apache.flink" %% "flink-streaming-scala",
"org.apache.flink" % "flink-metrics-dropwizard",
@@ -129,6 +136,8 @@ lazy val online = project
"com.github.ben-manes.caffeine" % "caffeine" % "3.1.8"
),
libraryDependencies ++= jackson,
// dep needed for HTTPKvStore - yank when we rip this out
libraryDependencies += "com.softwaremill.sttp.client3" %% "core" % "3.9.7",
libraryDependencies ++= spark_all.map(_ % "provided"),
libraryDependencies ++= flink_all.map(_ % "provided")
)
@@ -236,20 +245,18 @@ lazy val frontend = (project in file("frontend"))
// build interop between one module solely on 2.13 and others on 2.12 is painful
lazy val hub = (project in file("hub"))
.enablePlugins(PlayScala)
.dependsOn(cloud_aws)
.dependsOn(cloud_aws, spark)
.settings(
name := "hub",
libraryDependencies ++= Seq(
guice,
"org.scalatestplus.play" %% "scalatestplus-play" % "5.1.0" % Test,
"org.scalatestplus" %% "mockito-3-4" % "3.2.10.0" % "test",
"io.circe" %% "circe-core" % circeVersion,
"io.circe" %% "circe-generic" % circeVersion,
"io.circe" %% "circe-parser" % circeVersion,
"org.scala-lang.modules" %% "scala-xml" % "2.1.0",
"org.scala-lang.modules" %% "scala-parser-combinators" % "2.3.0",
"org.scala-lang.modules" %% "scala-java8-compat" % "1.0.2"
),
libraryDependencies ++= circe,
libraryDependencySchemes ++= Seq(
"org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always,
"org.scala-lang.modules" %% "scala-parser-combinators" % VersionScheme.Always,
@@ -258,7 +265,10 @@ lazy val hub = (project in file("hub"))
excludeDependencies ++= Seq(
ExclusionRule(organization = "org.slf4j", name = "slf4j-log4j12"),
ExclusionRule(organization = "log4j", name = "log4j"),
ExclusionRule(organization = "org.apache.logging.log4j", name = "log4j-to-slf4j")
ExclusionRule(organization = "org.apache.logging.log4j", name = "log4j-to-slf4j"),
ExclusionRule("org.apache.logging.log4j", "log4j-slf4j-impl"),
ExclusionRule("org.apache.logging.log4j", "log4j-core"),
ExclusionRule("org.apache.logging.log4j", "log4j-api")
),
// Ensure consistent versions of logging libraries
dependencyOverrides ++= Seq(
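How the two new dependency groups above fit together: circe handles JSON ser/deser of case class payloads for the Hub Play service, and the sttp client gives the online module a way to ship those payloads over HTTP (per the HTTPKvStore comment). A rough sketch of that pattern follows; the payload type and endpoint path are assumptions for illustration, not the actual Chronon API.

```scala
import io.circe.generic.auto._
import io.circe.syntax._
import sttp.client3._

// Illustrative payload type; a real client would ship KVStore.PutRequest values instead.
case class DemoPayload(dataset: String, key: String, value: String)

object HttpPutSketch extends App {
  // Simple synchronous backend from sttp client3.
  val backend = HttpURLConnectionBackend()

  // circe turns the case class list into the JSON body.
  val json = List(DemoPayload("demo_dataset", "user-123", "0.42")).asJson.noSpaces

  // The endpoint path below is an assumption for illustration only.
  val response = basicRequest
    .post(uri"http://localhost:9000/api/v1/dataset/data")
    .contentType("application/json")
    .body(json)
    .send(backend)

  println(s"status=${response.code}")
}
```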
1 change: 1 addition & 0 deletions docker-init/Dockerfile
@@ -43,6 +43,7 @@ ENV CHRONON_DRIVER_JAR="/app/cli/spark.jar"
# Set up Spark dependencies to help with launching CLI
# Copy Spark JARs from the Bitnami image
COPY --from=spark-source /opt/bitnami/spark/jars /opt/spark/jars
COPY --from=spark-source /opt/bitnami/spark/bin /opt/spark/bin

# Add all Spark JARs to the classpath
ENV CLASSPATH=/opt/spark/jars/*
20 changes: 20 additions & 0 deletions docker-init/demo/README.md
@@ -1,5 +1,25 @@
# Populate Observability Demo Data
To populate the observability demo data:
* Launch the set of docker containers:
```bash
~/workspace/chronon $ docker-compose -f docker-init/compose.yaml up --build
...
app-1 | [info] 2024-11-26 05:10:45,758 [main] INFO play.api.Play - Application started (Prod) (no global state)
app-1 | [info] 2024-11-26 05:10:45,958 [main] INFO play.core.server.AkkaHttpServer - Listening for HTTP on /[0:0:0:0:0:0:0:0]:9000
```
(you can skip the --build if you don't wish to rebuild your code)
Comment on lines +1 to +10

🛠️ Refactor suggestion

Enhance Docker setup instructions

The setup instructions should include prerequisites and verification steps.

Add the following sections:

 # Populate Observability Demo Data
+
+## Prerequisites
+- Docker v20.10 or higher
+- Docker Compose v2.0 or higher
+- Minimum 4GB RAM available for containers
+
+## Setup
 To populate the observability demo data:
 * Launch the set of docker containers:
 ```bash
 ~/workspace/chronon $ docker-compose -f docker-init/compose.yaml up --build
 ...
 app-1           | [info] 2024-11-26 05:10:45,758 [main] INFO  play.api.Play - Application started (Prod) (no global state)
 app-1           | [info] 2024-11-26 05:10:45,958 [main] INFO  play.core.server.AkkaHttpServer - Listening for HTTP on /[0:0:0:0:0:0:0:0]:9000

(you can skip the --build if you don't wish to rebuild your code)
+
+Expected startup time: ~2-3 minutes
+
+### Verify Setup
+Ensure all containers are running:
+```bash
+~/workspace/chronon $ docker-compose -f docker-init/compose.yaml ps
+```
+
+All services should show status as "running".




Now you can trigger the script to load summary data:
```bash
~/workspace/chronon $ docker-init/demo/load_summaries.sh
...
Done uploading summaries! 🥳
```
Comment on lines +12 to +17

🛠️ Refactor suggestion

Enhance data loading documentation

The data loading instructions could be more informative about:

  1. The expected duration of the loading process
  2. What data is being loaded
  3. How to verify successful loading
  4. Troubleshooting steps

Consider expanding the documentation:

 Now you can trigger the script to load summary data:
 ```bash
 ~/workspace/chronon $ docker-init/demo/load_summaries.sh
 ...
 Done uploading summaries! 🥳

+The script will:
+- Load drift metrics data
+- Configure joins
+- Populate the in-memory KV store
+
+Expected duration: ~5 minutes
+
+## Verification
+
+You can verify the data loading by accessing the following endpoints:
+
+```bash
+# Check join drift metrics
+curl http://localhost:9000/api/v1/joins/drift
+
+# Check metadata
+curl http://localhost:9000/api/v1/metadata
+```
+
+## Troubleshooting
+
+If you encounter issues:
+1. Check Docker container logs: docker-compose logs app
+2. Ensure all containers are running: docker-compose ps
+3. Verify system resources (memory usage, disk space)



Comment on lines +2 to +17

🛠️ Refactor suggestion

Add cleanup and shutdown instructions

The documentation should include instructions for cleaning up resources and shutting down the environment properly.

Add a cleanup section:

+## Cleanup
+
+To stop and remove the containers:
+```bash
+~/workspace/chronon $ docker-compose -f docker-init/compose.yaml down
+```
+
+To clean up all data and start fresh:
+```bash
+~/workspace/chronon $ docker-compose -f docker-init/compose.yaml down -v
+```


# Streamlit local experimentation
run build.sh once, and you can repeatedly exec to quickly visualize

In first terminal: `sbt spark/assembly`
In second terminal: `./run.sh` to load the built jar and serve the data on localhost:8181
In third terminal: `streamlit run viz.py`

Comment on lines +19 to +25

🛠️ Refactor suggestion

Restructure and clarify Streamlit instructions

The Streamlit section needs better formatting, clearer instructions, and proper grammar.

Replace the section with:

-# Streamlit local experimentation
-run build.sh once, and you can repeatedly exec to quickly visualize 
-
-In first terminal: `sbt spark/assembly` 
-In second terminal: `./run.sh` to load the built jar and serve the data on localhost:8181
-In third terminal: `streamlit run viz.py`
+# Local Development with Streamlit
+
+For local experimentation with the Streamlit visualization:
+
+1. Build the Spark assembly (required once):
+   ```bash
+   ~/workspace/chronon $ sbt spark/assembly
+   ```
+
+2. Start the data server:
+   ```bash
+   ~/workspace/chronon $ ./run.sh
+   ```
+   This loads the built JAR and serves data at http://localhost:8181
+
+3. Launch the Streamlit application:
+   ```bash
+   ~/workspace/chronon $ streamlit run viz.py
+   ```
+   Access the visualization at http://localhost:8501
🧰 Tools
🪛 LanguageTool
Flagged grammar issues in the original Streamlit section: "exec" used as a verb (~20), and missing "the" before "first/second/third terminal" (~22-~24).
12 changes: 12 additions & 0 deletions docker-init/demo/load_summaries.sh
@@ -0,0 +1,12 @@
# Kick off the ObsDemo spark job in the app container

Comment on lines +1 to +2

🛠️ Refactor suggestion

Add shell script essentials

The script is missing crucial shell script components:

  • Shebang directive
  • Execution permissions
  • Usage instructions and requirements

Apply this diff to improve the script header:

+#!/bin/bash
+
+# Script: load_summaries.sh
+# Purpose: Initiates ObservabilityDemo Spark job in the app container
+#
+# Requirements:
+# - Docker and docker-compose must be running
+# - Containers must be up (see docker-init/demo/README.md)
+#
+# Usage: ./load_summaries.sh
+
 # Kick off the ObsDemo spark job in the app container
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Kick off the ObsDemo spark job in the app container
#!/bin/bash
# Script: load_summaries.sh
# Purpose: Initiates ObservabilityDemo Spark job in the app container
#
# Requirements:
# - Docker and docker-compose must be running
# - Containers must be up (see docker-init/demo/README.md)
#
# Usage: ./load_summaries.sh
# Kick off the ObsDemo spark job in the app container
🧰 Tools
🪛 Shellcheck (0.10.0)
[error] SC2148 (line 1): Tips depend on target shell and yours is unknown. Add a shebang or a 'shell' directive.

docker-compose -f docker-init/compose.yaml exec app /opt/spark/bin/spark-submit \
--master "local[*]" \
--driver-memory 8g \
--conf "spark.driver.maxResultSize=6g" \
--conf "spark.driver.memory=8g" \
Comment on lines +4 to +7

💡 Codebase verification

Memory configuration in demo environment needs adjustment

The verification reveals inconsistency in memory configurations across the codebase. While the demo environment uses fixed high values (8GB driver memory, 6GB max result size), other scripts in the test environment use a more flexible approach with environment variables defaulting to 1GB, which is more suitable for demo purposes.

Suggested changes:

  • Consider adopting the same environment variable approach used in test scripts (DRIVER_MEMORY:-1G)
  • Reduce the default memory allocation to be more accommodating for demo environments
  • Remove the duplicate memory configuration (--driver-memory and spark.driver.memory)
🔗 Analysis chain

Review memory configuration for demo environment

The current configuration allocates 8GB of memory and 6GB for max result size, which might be excessive for a demo environment and could cause issues on machines with limited resources.

Let's check if these memory settings are documented or consistent across the codebase:

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Search for similar Spark memory configurations
rg -A 2 "driver-memory|maxResultSize" --type yaml --type sh

Length of output: 935

--driver-class-path "/opt/spark/jars/*:/app/cli/*" \
--conf "spark.driver.host=localhost" \
--conf "spark.driver.bindAddress=0.0.0.0" \
--class ai.chronon.spark.scripts.ObservabilityDemoDataLoader \
/app/cli/spark.jar
17 changes: 0 additions & 17 deletions docker-init/demo/log4j2.properties

This file was deleted.

17 changes: 0 additions & 17 deletions docker-init/start.sh
@@ -39,23 +39,6 @@ echo "DynamoDB Table created successfully!"

start_time=$(date +%s)

if ! java \
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
--add-opens=java.base/sun.security.action=ALL-UNNAMED \
-cp $SPARK_JAR:$CLASSPATH ai.chronon.spark.Driver summarize-and-upload \
--online-jar=$CLOUD_AWS_JAR \
--online-class=$ONLINE_CLASS \
--parquet-path="$(pwd)/drift_data" \
--conf-path=/chronon_sample/production/ \
--time-column=transaction_time; then
echo "Error: Failed to load summary data into DynamoDB" >&2
exit 1
else
end_time=$(date +%s)
elapsed_time=$((end_time - start_time))
echo "Summary load completed successfully! Took $elapsed_time seconds."
fi

# Add these java options as without them we hit the below error:
# throws java.lang.ClassFormatError accessible: module java.base does not "opens java.lang" to unnamed module @36328710
export JAVA_OPTS="--add-opens java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED"
60 changes: 60 additions & 0 deletions hub/app/controllers/InMemKVStoreController.scala
@@ -0,0 +1,60 @@
package controllers

import ai.chronon.online.KVStore
import ai.chronon.online.KVStore.PutRequest
import io.circe.Codec
import io.circe.Decoder
import io.circe.Encoder
import io.circe.generic.semiauto.deriveCodec
import io.circe.parser.decode
import play.api.Logger
import play.api.mvc
import play.api.mvc.BaseController
import play.api.mvc.ControllerComponents
import play.api.mvc.RawBuffer

import java.util.Base64
import javax.inject.Inject
import scala.concurrent.ExecutionContext
import scala.concurrent.Future

class InMemKVStoreController @Inject() (val controllerComponents: ControllerComponents, kvStore: KVStore)(implicit
ec: ExecutionContext)
extends BaseController {

import PutRequestCodec._

val logger: Logger = Logger(this.getClass)

def bulkPut(): mvc.Action[RawBuffer] =
Action(parse.raw).async { request =>
request.body.asBytes() match {
case Some(bytes) =>
decode[Array[PutRequest]](bytes.utf8String) match {
case Right(putRequests) =>
logger.debug(s"Attempting a bulkPut with ${putRequests.length} items")
val resultFuture = kvStore.multiPut(putRequests)
resultFuture.map { responses =>
if (responses.contains(false)) {
logger.warn("Some write failures encountered")
}
Ok("Success")
}
case Left(error) => Future.successful(BadRequest(error.getMessage))
}
case None => Future.successful(BadRequest("Empty body"))
}
}
}

object PutRequestCodec {
// Custom codec for byte arrays using Base64
implicit val byteArrayEncoder: Encoder[Array[Byte]] =
Encoder.encodeString.contramap[Array[Byte]](Base64.getEncoder.encodeToString)

implicit val byteArrayDecoder: Decoder[Array[Byte]] =
Decoder.decodeString.map(Base64.getDecoder.decode)

// Derive codec for PutRequest
implicit val putRequestCodec: Codec[PutRequest] = deriveCodec[PutRequest]
}
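A note on the wire format bulkPut expects: byte fields travel as Base64 strings inside the JSON body, and the controller decodes an Array[PutRequest] from the raw bytes. Below is a minimal, self-contained sketch of the same round trip, using a hypothetical DemoPut case class in place of the real KVStore.PutRequest (whose exact fields aren't shown in this diff).

```scala
import io.circe.{Codec, Decoder, Encoder}
import io.circe.generic.semiauto.deriveCodec
import io.circe.parser.decode
import io.circe.syntax._

import java.util.Base64

// Hypothetical stand-in for KVStore.PutRequest; field names here are illustrative only.
case class DemoPut(keyBytes: Array[Byte], valueBytes: Array[Byte], dataset: String)

object DemoPutCodec {
  // Same technique as PutRequestCodec above: byte arrays travel as Base64 strings in JSON.
  implicit val byteArrayEncoder: Encoder[Array[Byte]] =
    Encoder.encodeString.contramap[Array[Byte]](Base64.getEncoder.encodeToString)
  implicit val byteArrayDecoder: Decoder[Array[Byte]] =
    Decoder.decodeString.map(Base64.getDecoder.decode)
  implicit val demoPutCodec: Codec[DemoPut] = deriveCodec[DemoPut]
}

object RoundTripSketch extends App {
  import DemoPutCodec._

  val put  = DemoPut("user-123".getBytes("UTF-8"), """{"score":0.42}""".getBytes("UTF-8"), "demo_dataset")
  val json = List(put).asJson.noSpaces    // the shape a client would POST to bulkPut
  val back = decode[Array[DemoPut]](json) // mirrors the controller's decode step

  println(json)
  println(back.map(_.head.dataset))       // Right(demo_dataset)
}
```

The Base64 detour keeps arbitrary key/value bytes safe inside a UTF-8 JSON document, at the cost of roughly 33% payload inflation.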
54 changes: 54 additions & 0 deletions hub/app/controllers/JoinController.scala
@@ -0,0 +1,54 @@
package controllers

import io.circe.generic.auto._
import io.circe.syntax._
import model.ListJoinResponse
import play.api.mvc._
import store.MonitoringModelStore

import javax.inject._

/**
* Controller for the Zipline Join entities
*/
@Singleton
class JoinController @Inject() (val controllerComponents: ControllerComponents, monitoringStore: MonitoringModelStore)
extends BaseController
with Paginate {

/**
* Powers the /api/v1/joins endpoint. Returns a list of joins
* @param offset - For pagination. We skip over offset entries before returning results
* @param limit - Number of elements to return
*/
def list(offset: Option[Int], limit: Option[Int]): Action[AnyContent] =
Action { implicit request: Request[AnyContent] =>
// Default values if the parameters are not provided
val offsetValue = offset.getOrElse(defaultOffset)
val limitValue = limit.map(l => math.min(l, maxLimit)).getOrElse(defaultLimit)

if (offsetValue < 0) {
BadRequest("Invalid offset - expect a positive number")
} else if (limitValue < 0) {
BadRequest("Invalid limit - expect a positive number")
} else {
val joins = monitoringStore.getJoins
val paginatedResults = paginateResults(joins, offsetValue, limitValue)
val json = ListJoinResponse(offsetValue, paginatedResults).asJson.noSpaces
Ok(json)
}
}
Comment on lines +24 to +40

⚠️ Potential issue

Add error handling and improve response structure

The current implementation has several areas for improvement:

  1. No exception handling for store operations
  2. Missing total count in response for proper pagination
  3. No input sanitization beyond negative checks

Consider applying these improvements:

 def list(offset: Option[Int], limit: Option[Int]): Action[AnyContent] =
   Action { implicit request: Request[AnyContent] =>
+    Logger.debug(s"Fetching joins with offset=$offset, limit=$limit")
     val offsetValue = offset.getOrElse(defaultOffset)
     val limitValue = limit.map(l => math.min(l, maxLimit)).getOrElse(defaultLimit)

     if (offsetValue < 0) {
       BadRequest("Invalid offset - expect a positive number")
     } else if (limitValue < 0) {
       BadRequest("Invalid limit - expect a positive number")
     } else {
-      val joins = monitoringStore.getJoins
-      val paginatedResults = paginateResults(joins, offsetValue, limitValue)
-      val json = ListJoinResponse(offsetValue, paginatedResults).asJson.noSpaces
-      Ok(json)
+      Try {
+        val joins = monitoringStore.getJoins
+        val totalCount = joins.size
+        val paginatedResults = paginateResults(joins, offsetValue, limitValue)
+        val response = ListJoinResponse(
+          offset = offsetValue,
+          total = totalCount,
+          results = paginatedResults
+        )
+        Ok(response.asJson.noSpaces)
+      } recover {
+        case ex: Exception =>
+          Logger.error("Failed to fetch joins", ex)
+          InternalServerError(s"Failed to fetch joins: ${ex.getMessage}")
+      } get
     }
   }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def list(offset: Option[Int], limit: Option[Int]): Action[AnyContent] =
Action { implicit request: Request[AnyContent] =>
// Default values if the parameters are not provided
val offsetValue = offset.getOrElse(defaultOffset)
val limitValue = limit.map(l => math.min(l, maxLimit)).getOrElse(defaultLimit)
if (offsetValue < 0) {
BadRequest("Invalid offset - expect a positive number")
} else if (limitValue < 0) {
BadRequest("Invalid limit - expect a positive number")
} else {
val joins = monitoringStore.getJoins
val paginatedResults = paginateResults(joins, offsetValue, limitValue)
val json = ListJoinResponse(offsetValue, paginatedResults).asJson.noSpaces
Ok(json)
}
}
def list(offset: Option[Int], limit: Option[Int]): Action[AnyContent] =
Action { implicit request: Request[AnyContent] =>
Logger.debug(s"Fetching joins with offset=$offset, limit=$limit")
val offsetValue = offset.getOrElse(defaultOffset)
val limitValue = limit.map(l => math.min(l, maxLimit)).getOrElse(defaultLimit)
if (offsetValue < 0) {
BadRequest("Invalid offset - expect a positive number")
} else if (limitValue < 0) {
BadRequest("Invalid limit - expect a positive number")
} else {
Try {
val joins = monitoringStore.getJoins
val totalCount = joins.size
val paginatedResults = paginateResults(joins, offsetValue, limitValue)
val response = ListJoinResponse(
offset = offsetValue,
total = totalCount,
results = paginatedResults
)
Ok(response.asJson.noSpaces)
} recover {
case ex: Exception =>
Logger.error("Failed to fetch joins", ex)
InternalServerError(s"Failed to fetch joins: ${ex.getMessage}")
} get
}
}


/**
* Returns a specific join by name
*/
def get(name: String): Action[AnyContent] = {
Action { implicit request: Request[AnyContent] =>
val maybeJoin = monitoringStore.getJoins.find(j => j.name.equalsIgnoreCase(name))
maybeJoin match {
case None => NotFound(s"Join: $name wasn't found")
case Some(join) => Ok(join.asJson.noSpaces)
}
}
}
Comment on lines +45 to +53

⚠️ Potential issue

Improve error handling and input validation in get endpoint

The endpoint needs several improvements:

  1. Add error handling for store operations
  2. Validate the name parameter
  3. Make response format consistent with list endpoint

Apply these improvements:

 def get(name: String): Action[AnyContent] = {
   Action { implicit request: Request[AnyContent] =>
+    Logger.info(s"Received request to get join with name=$name")
+    
+    if (name.trim.isEmpty) {
+      BadRequest("Join name cannot be empty")
+    } else Try {
       val maybeJoin = monitoringStore.getJoins.find(j => j.name.equalsIgnoreCase(name))
       maybeJoin match {
         case None       => NotFound(s"Join: $name wasn't found")
-        case Some(join) => Ok(join.asJson.noSpaces)
+        case Some(join) => Ok(Map("data" -> join).asJson.noSpaces)
       }
+    } recover {
+      case ex: Exception =>
+        Logger.error(s"Failed to fetch join: $name", ex)
+        InternalServerError(s"Failed to fetch join: ${ex.getMessage}")
+    } get
   }
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def get(name: String): Action[AnyContent] = {
Action { implicit request: Request[AnyContent] =>
val maybeJoin = monitoringStore.getJoins.find(j => j.name.equalsIgnoreCase(name))
maybeJoin match {
case None => NotFound(s"Join: $name wasn't found")
case Some(join) => Ok(join.asJson.noSpaces)
}
}
}
def get(name: String): Action[AnyContent] = {
Action { implicit request: Request[AnyContent] =>
Logger.info(s"Received request to get join with name=$name")
if (name.trim.isEmpty) {
BadRequest("Join name cannot be empty")
} else Try {
val maybeJoin = monitoringStore.getJoins.find(j => j.name.equalsIgnoreCase(name))
maybeJoin match {
case None => NotFound(s"Join: $name wasn't found")
case Some(join) => Ok(Map("data" -> join).asJson.noSpaces)
}
} recover {
case ex: Exception =>
Logger.error(s"Failed to fetch join: $name", ex)
InternalServerError(s"Failed to fetch join: ${ex.getMessage}")
} get
}
}

}
5 changes: 2 additions & 3 deletions hub/app/controllers/ModelController.scala
@@ -4,16 +4,15 @@ import io.circe.generic.auto._
import io.circe.syntax._
import model.ListModelResponse
import play.api.mvc._
import store.DynamoDBMonitoringStore
import store.MonitoringModelStore

import javax.inject._

/**
* Controller for the Zipline models entities
*/
@Singleton
class ModelController @Inject() (val controllerComponents: ControllerComponents,
monitoringStore: DynamoDBMonitoringStore)
class ModelController @Inject() (val controllerComponents: ControllerComponents, monitoringStore: MonitoringModelStore)
extends BaseController
with Paginate {

4 changes: 1 addition & 3 deletions hub/app/controllers/Paginate.scala
@@ -1,13 +1,11 @@
package controllers

import model.Model

trait Paginate {
val defaultOffset = 0
val defaultLimit = 10
val maxLimit = 100

def paginateResults(results: Seq[Model], offset: Int, limit: Int): Seq[Model] = {
def paginateResults[T](results: Seq[T], offset: Int, limit: Int): Seq[T] = {
Contributor
nice!!

results.slice(offset, offset + limit)
}
}
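Making paginateResults generic over T is what lets the same trait back both the model and the new join endpoints. A small sketch of the slicing behaviour, assuming the defaults shown above:

```scala
// Minimal sketch of the generic pagination; defaults mirror the trait above.
trait Paginate {
  val defaultOffset = 0
  val defaultLimit  = 10
  val maxLimit      = 100

  def paginateResults[T](results: Seq[T], offset: Int, limit: Int): Seq[T] =
    results.slice(offset, offset + limit)
}

object PaginateSketch extends App with Paginate {
  val joins = (1 to 25).map(i => s"join_$i")

  println(paginateResults(joins, offset = 0, limit = defaultLimit))  // join_1 .. join_10
  println(paginateResults(joins, offset = 20, limit = defaultLimit)) // join_21 .. join_25 (short last page)
}
```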
23 changes: 11 additions & 12 deletions hub/app/controllers/SearchController.scala
@@ -2,24 +2,23 @@ package controllers

import io.circe.generic.auto._
import io.circe.syntax._
import model.Model
import model.SearchModelResponse
import model.Join
import model.SearchJoinResponse
import play.api.mvc._
import store.DynamoDBMonitoringStore
import store.MonitoringModelStore

import javax.inject._

/**
* Controller to power search related APIs
*/
class SearchController @Inject() (val controllerComponents: ControllerComponents,
monitoringStore: DynamoDBMonitoringStore)
class SearchController @Inject() (val controllerComponents: ControllerComponents, monitoringStore: MonitoringModelStore)
extends BaseController
with Paginate {

/**
* Powers the /api/v1/search endpoint. Returns a list of models
* @param term - Search term to search for (currently we only support searching model names)
* Powers the /api/v1/search endpoint. Returns a list of joins
* @param term - Search term to search for (currently we only support searching join names)
* @param offset - For pagination. We skip over offset entries before returning results
* @param limit - Number of elements to return
*/
@@ -36,14 +35,14 @@ class SearchController @Inject() (val controllerComponents: ControllerComponents
} else {
val searchResults = searchRegistry(term)
val paginatedResults = paginateResults(searchResults, offsetValue, limitValue)
val json = SearchModelResponse(offsetValue, paginatedResults).asJson.noSpaces
val json = SearchJoinResponse(offsetValue, paginatedResults).asJson.noSpaces
Ok(json)
}
}

// a trivial search where we check the model name for similarity with the search term
private def searchRegistry(term: String): Seq[Model] = {
val models = monitoringStore.getModels
models.filter(m => m.name.contains(term))
// a trivial search where we check the join name for similarity with the search term
private def searchRegistry(term: String): Seq[Join] = {
val joins = monitoringStore.getJoins
joins.filter(j => j.name.contains(term))
Comment on lines +43 to +46

🛠️ Refactor suggestion

Consider enhancing search functionality and error handling

The current implementation has several areas for improvement:

  1. Case-sensitive search might not be user-friendly
  2. No error handling for monitoringStore.getJoins
  3. Potential performance impact of filtering all joins in memory

Consider this improvement:

  private def searchRegistry(term: String): Seq[Join] = {
-   val joins = monitoringStore.getJoins
-   joins.filter(j => j.name.contains(term))
+   try {
+     val normalizedTerm = term.toLowerCase
+     monitoringStore.getJoins.filter(j => 
+       j.name.toLowerCase.contains(normalizedTerm)
+     )
+   } catch {
+     case e: Exception =>
+       logger.error(s"Failed to search joins: ${e.getMessage}", e)
+       Seq.empty
+   }
  }

Committable suggestion skipped: line range outside the PR's diff.

}
}