Commit f19dfe9

Connect GroupByUploadToKVBulkLoad from Driver.scala to run.py (#221)
## Summary
^^^

### Testing
This test assumes the GroupBy upload (GBU) job — the job that generates the data to BigQuery — has already run.

Tested with this command:

```
(dev_chronon) davidhan@Davids-MacBook-Pro: ~/zipline/chronon (main) $ python api/py/ai/chronon/repo/run.py --mode upload-to-kv --conf production/group_bys/quickstart/purchases.v1 --partition-string=2023-11-30 --dataproc --repo=/Users/davidhan/zipline/chronon/api/py/test/sample
```

which led to a successful run of this job: https://console.cloud.google.com/dataproc/jobs/2f6b0b81-7b34-4a92-840d-cb90059f3d42/monitoring?region=us-central1&project=canary-443022

## Checklist
- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit
- **New Features**
  - Added support for a new upload mode to Google Cloud Platform.
  - Enhanced configuration handling for GCP Dataproc clusters.
  - Introduced new GCP-related options in the command-line interface.
  - Updated JSON configuration with new GCP parameters.
- **Bug Fixes**
  - Improved error handling and argument processing in various components.
- **Refactor**
  - Updated environment variable naming conventions.
  - Restructured configuration management across multiple files.
  - Enhanced clarity and organization in code structure.
- **Chores**
  - Added support for multiple Java versions.
  - Updated build and deployment scripts for improved reliability.
1 parent 23892a5 · commit f19dfe9

File tree

9 files changed (+239 / -144 lines)


.tool-versions

Lines changed: 3 additions & 1 deletion
```diff
@@ -1,4 +1,6 @@
-java corretto-17.0.9.8.1
+java
+corretto-11.0.25.9.1
+corretto-17.0.9.8.1
 scala 2.12.18
 asdf-plugin-manager 1.4.0
 sbt 1.8.2
```

api/py/ai/chronon/repo/run.py

Lines changed: 86 additions & 48 deletions
Large diffs are not rendered by default.

api/py/test/sample/teams.json

Lines changed: 64 additions & 60 deletions
```diff
@@ -1,66 +1,70 @@
 {
-  "default": {
-    "table_properties": {
-      "source": "chronon"
-    },
-    "common_env": {
-      "VERSION": "latest",
-      "SPARK_SUBMIT_PATH": "[TODO]/path/to/spark-submit",
-      "JOB_MODE": "local[*]",
-      "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing",
-      "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class",
-      "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host=<YOUR_HOST> -Zkv-port=<YOUR_PORT>",
-      "PARTITION_COLUMN": "ds",
-      "PARTITION_FORMAT": "yyyy-MM-dd"
-    },
-    "production": {
-      "backfill" : {
-        "EXECUTOR_CORES": "1",
-        "DRIVER_MEMORY": "15G",
-        "EXECUTOR_MEMORY": "8G",
-        "PARALLELISM": "4000",
-        "MAX_EXECUTORS": "1000"
-      },
-      "upload" : {
-        "EXECUTOR_CORES": "1",
-        "EXECUTOR_MEMORY": "8G",
-        "PARALLELISM": "1000",
-        "MAX_EXECUTORS": "1000"
-      },
-      "streaming" : {
-        "EXECUTOR_CORES": "2",
-        "EXECUTOR_MEMORY": "4G",
-        "PARALLELISM": "16"
-      }
-    }
+  "default": {
+    "table_properties": {
+      "source": "chronon"
     },
-  "sample_team": {
-    "description": "Team description",
-    "namespace": "chronon_db",
-    "user": "# TODO: ldap user name to run the jobs as, from airflow or your own scheduler",
-    "production": {
-      "backfill" : {
-        "EXECUTOR_CORES": "4"
-      }
-    },
-    "dev": {
-      "backfill" : {
-        "EXECUTOR_CORES": "2",
-        "DRIVER_MEMORY": "30G"
-      }
-    }
+    "common_env": {
+      "VERSION": "latest",
+      "SPARK_SUBMIT_PATH": "[TODO]/path/to/spark-submit",
+      "JOB_MODE": "local[*]",
+      "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing",
+      "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class",
+      "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host=<YOUR_HOST> -Zkv-port=<YOUR_PORT>",
+      "PARTITION_COLUMN": "ds",
+      "PARTITION_FORMAT": "yyyy-MM-dd",
+      "CUSTOMER_ID": "canary",
+      "GCP_PROJECT_ID": "canary-443022",
+      "GCP_REGION": "us-central1",
+      "GCP_DATAPROC_CLUSTER_NAME": "canary-2",
+      "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance"
     },
-  "kaggle": {
-    "description": "Workspace for kaggle compeitions",
-    "namespace": "default"
-  },
-  "quickstart": {
-    "description": "Used for the quickstart example",
-    "namespace": "default"
+    "production": {
+      "backfill": {
+        "EXECUTOR_CORES": "1",
+        "DRIVER_MEMORY": "15G",
+        "EXECUTOR_MEMORY": "8G",
+        "PARALLELISM": "4000",
+        "MAX_EXECUTORS": "1000"
+      },
+      "upload": {
+        "EXECUTOR_CORES": "1",
+        "EXECUTOR_MEMORY": "8G",
+        "PARALLELISM": "1000",
+        "MAX_EXECUTORS": "1000"
+      },
+      "streaming": {
+        "EXECUTOR_CORES": "2",
+        "EXECUTOR_MEMORY": "4G",
+        "PARALLELISM": "16"
+      }
+    }
+  },
+  "sample_team": {
+    "description": "Team description",
+    "namespace": "chronon_db",
+    "user": "# TODO: ldap user name to run the jobs as, from airflow or your own scheduler",
+    "production": {
+      "backfill": {
+        "EXECUTOR_CORES": "4"
+      }
     },
-  "risk": {
-    "description": "Used for proof of concept",
-    "namespace": "default"
+    "dev": {
+      "backfill": {
+        "EXECUTOR_CORES": "2",
+        "DRIVER_MEMORY": "30G"
+      }
     }
-
+  },
+  "kaggle": {
+    "description": "Workspace for kaggle compeitions",
+    "namespace": "default"
+  },
+  "quickstart": {
+    "description": "Used for the quickstart example",
+    "namespace": "default"
+  },
+  "risk": {
+    "description": "Used for proof of concept",
+    "namespace": "default"
+  }
 }
```

chronon_dataproc_submitter.env

Lines changed: 0 additions & 4 deletions
This file was deleted.

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitter.scala

Lines changed: 27 additions & 11 deletions
```diff
@@ -181,41 +181,57 @@ object DataprocSubmitter {
     val chrononJarUri = args.filter(_.startsWith("--chronon_jar_uri"))(0).split("=")(1)

     // search args array for prefix `--gcs_files`
-    val gcsFiles = args
-      .filter(_.startsWith("--gcs_files"))(0)
-      .split("=")(1)
-      .split(",")
+    val gcsFilesArgs = args.filter(_.startsWith("--gcs_files"))
+    assert(gcsFilesArgs.length == 0 || gcsFilesArgs.length == 1)
+
+    val gcsFiles = if (gcsFilesArgs.isEmpty) {
+      Array.empty[String]
+    } else {
+      gcsFilesArgs(0).split("=")(1).split(",")
+    }

     val userArgs = args.filter(f => !f.startsWith("--gcs_files") && !f.startsWith("--chronon_jar_uri"))

     val required_vars = List.apply(
-      "ZIPLINE_GCP_PROJECT_ID",
-      "ZIPLINE_GCP_REGION",
-      "ZIPLINE_GCP_DATAPROC_CLUSTER_NAME"
+      "GCP_PROJECT_ID",
+      "GCP_REGION",
+      "GCP_DATAPROC_CLUSTER_NAME"
     )
     val missing_vars = required_vars.filter(!sys.env.contains(_))
     if (missing_vars.nonEmpty) {
       throw new Exception(s"Missing required environment variables: ${missing_vars.mkString(", ")}")
     }

-    val projectId = sys.env.getOrElse("ZIPLINE_GCP_PROJECT_ID", throw new Exception("ZIPLINE_GCP_PROJECT_ID not set"))
-    val region = sys.env.getOrElse("ZIPLINE_GCP_REGION", throw new Exception("ZIPLINE_GCP_REGION not set"))
+    val projectId = sys.env.getOrElse("GCP_PROJECT_ID", throw new Exception("GCP_PROJECT_ID not set"))
+    val region = sys.env.getOrElse("GCP_REGION", throw new Exception("GCP_REGION not set"))
     val clusterName = sys.env
-      .getOrElse("ZIPLINE_GCP_DATAPROC_CLUSTER_NAME", throw new Exception("ZIPLINE_GCP_DATAPROC_CLUSTER_NAME not set"))
+      .getOrElse("GCP_DATAPROC_CLUSTER_NAME", throw new Exception("GCP_DATAPROC_CLUSTER_NAME not set"))

     val submitterConf = SubmitterConf(
       projectId,
       region,
       clusterName
     )

+    val bigtableInstanceId = sys.env.getOrElse("GCP_BIGTABLE_INSTANCE_ID", "")
+
+    val gcpArgsToPass = Array.apply(
+      "--is-gcp",
+      s"--gcp-project-id=${projectId}",
+      s"--gcp-bigtable-instance-id=$bigtableInstanceId"
+    )
+
+    val finalArgs = Array.concat(userArgs, gcpArgsToPass)
+
+    println(finalArgs.mkString("Array(", ", ", ")"))
+
     val a = DataprocSubmitter(submitterConf)

     val jobId = a.submit(
       TypeSparkJob,
       Map(MainClass -> "ai.chronon.spark.Driver", JarURI -> chrononJarUri),
       gcsFiles.toList,
-      userArgs: _*
+      finalArgs: _*
     )
     println("Dataproc submitter job id: " + jobId)
   }
```
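The behavioral change here: `--gcs_files` becomes optional, and the submitter forwards the GCP coordinates it reads from the (renamed, no longer `ZIPLINE_`-prefixed) environment variables to the Driver as explicit flags. Below is a minimal, self-contained sketch of that argument handling; `SubmitterArgsSketch` is an illustrative name, not the actual `DataprocSubmitter` code.

```scala
// Standalone sketch, assuming the submitter receives raw CLI args and reads GCP
// coordinates from the environment (mirrors the diff above, not a drop-in replacement).
object SubmitterArgsSketch {
  def main(args: Array[String]): Unit = {
    // --gcs_files is now optional; at most one occurrence is accepted.
    val gcsFilesArgs = args.filter(_.startsWith("--gcs_files"))
    assert(gcsFilesArgs.length <= 1, "expected at most one --gcs_files argument")
    val gcsFiles: Array[String] =
      if (gcsFilesArgs.isEmpty) Array.empty[String]
      else gcsFilesArgs(0).split("=")(1).split(",")

    // Everything except submitter-only flags is passed through to the Driver.
    val userArgs = args.filterNot(a => a.startsWith("--gcs_files") || a.startsWith("--chronon_jar_uri"))

    // Renamed environment variables (formerly ZIPLINE_GCP_*); fail fast if any required one is missing.
    val required = List("GCP_PROJECT_ID", "GCP_REGION", "GCP_DATAPROC_CLUSTER_NAME")
    val missing = required.filterNot(sys.env.contains)
    require(missing.isEmpty, s"Missing required environment variables: ${missing.mkString(", ")}")

    val projectId = sys.env("GCP_PROJECT_ID")
    val bigtableInstanceId = sys.env.getOrElse("GCP_BIGTABLE_INSTANCE_ID", "")

    // GCP coordinates are appended as Driver flags so the online subcommands can build a GCP-backed Api.
    val gcpArgsToPass = Array("--is-gcp", s"--gcp-project-id=$projectId", s"--gcp-bigtable-instance-id=$bigtableInstanceId")
    val finalArgs = Array.concat(userArgs, gcpArgsToPass)

    println(s"gcs files: ${gcsFiles.mkString(",")}")
    println(finalArgs.mkString("Array(", ", ", ")"))
  }
}
```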

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GcpApiImpl.scala

Lines changed: 3 additions & 3 deletions
```diff
@@ -28,9 +28,9 @@ class GcpApiImpl(conf: Map[String, String]) extends Api(conf) {
     .getOrElse(throw new IllegalArgumentException("GCP_PROJECT_ID environment variable not set"))

   val instanceId = sys.env
-    .get("GCP_INSTANCE_ID")
-    .orElse(conf.get("GCP_INSTANCE_ID"))
-    .getOrElse(throw new IllegalArgumentException("GCP_INSTANCE_ID environment variable not set"))
+    .get("GCP_BIGTABLE_INSTANCE_ID")
+    .orElse(conf.get("GCP_BIGTABLE_INSTANCE_ID"))
+    .getOrElse(throw new IllegalArgumentException("GCP_BIGTABLE_INSTANCE_ID environment variable not set"))

   // Create settings builder based on whether we're in emulator mode (e.g. docker) or not
   val (dataSettingsBuilder, adminSettingsBuilder, maybeBQClient) = sys.env.get("BIGTABLE_EMULATOR_HOST") match {
```
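For clarity, the renamed key is resolved environment-first, then from the conf map passed to the `Api` constructor, which is where the Driver's `--gcp-bigtable-instance-id` flag ends up. A small standalone sketch of that lookup order (`BigtableInstanceIdSketch` is an illustrative name, not the actual `GcpApiImpl` code):

```scala
// Illustrative resolver mirroring the renamed lookup: env var wins, then the conf
// map handed to the Api constructor, otherwise fail loudly.
object BigtableInstanceIdSketch {
  def resolve(conf: Map[String, String]): String =
    sys.env
      .get("GCP_BIGTABLE_INSTANCE_ID")
      .orElse(conf.get("GCP_BIGTABLE_INSTANCE_ID"))
      .getOrElse(throw new IllegalArgumentException("GCP_BIGTABLE_INSTANCE_ID environment variable not set"))

  def main(args: Array[String]): Unit = {
    // Simulates the conf map built by OnlineSubcommand when --is-gcp is passed.
    val conf = Map("GCP_BIGTABLE_INSTANCE_ID" -> "zipline-canary-instance")
    println(resolve(conf))
  }
}
```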

distribution/build_and_upload_gcp_artifacts.sh

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,5 +1,7 @@
 #!/bin/bash

+set -e
+
 SCRIPT_DIRECTORY=$(dirname -- "$(realpath -- "$0")")
 CHRONON_ROOT_DIR=$(dirname "$SCRIPT_DIRECTORY")

```

spark/src/main/scala/ai/chronon/spark/Driver.scala

Lines changed: 52 additions & 15 deletions
```diff
@@ -86,7 +86,17 @@ object Driver {
   def parseConf[T <: TBase[_, _]: Manifest: ClassTag](confPath: String): T =
     ThriftJsonCodec.fromJsonFile[T](confPath, check = true)

-  trait OfflineSubcommand {
+  trait AddGcpSubCommandArgs {
+    this: ScallopConf =>
+    val isGcp: ScallopOption[Boolean] =
+      opt[Boolean](required = false, default = Some(false), descr = "Whether to use GCP")
+    val gcpProjectId: ScallopOption[String] =
+      opt[String](required = false, descr = "GCP project id")
+    val gcpBigtableInstanceId: ScallopOption[String] =
+      opt[String](required = false, descr = "GCP BigTable instance id")
+  }
+
+  trait OfflineSubcommand extends AddGcpSubCommandArgs {
     this: ScallopConf =>
     val confPath: ScallopOption[String] = opt[String](required = true, descr = "Path to conf")

@@ -513,10 +523,20 @@ object Driver {
   object GroupByUploader {
     class Args extends Subcommand("group-by-upload") with OfflineSubcommand {
       override def subcommandName() = "group-by-upload"
+
+      // jsonPercent
+      val jsonPercent: ScallopOption[Int] =
+        opt[Int](name = "json-percent",
+                 required = false,
+                 descr = "Percentage of json encoding to retain for debuggability",
+                 default = Some(1))
     }

     def run(args: Args): Unit = {
-      GroupByUpload.run(parseConf[api.GroupBy](args.confPath()), args.endDate(), Some(args.buildTableUtils()))
+      GroupByUpload.run(parseConf[api.GroupBy](args.confPath()),
+                        args.endDate(),
+                        Some(args.buildTableUtils()),
+                        jsonPercent = args.jsonPercent.apply())
     }
   }

@@ -564,7 +584,7 @@ object Driver {
   }

   // common arguments to all online commands
-  trait OnlineSubcommand { s: ScallopConf =>
+  trait OnlineSubcommand extends AddGcpSubCommandArgs { s: ScallopConf =>
     // this is `-Z` and not `-D` because sbt-pack plugin uses that for JAVA_OPTS
     val propsInner: Map[String, String] = props[String]('Z')
     val onlineJar: ScallopOption[String] =
@@ -573,14 +593,26 @@ object Driver {
       opt[String](required = true,
                   descr = "Fully qualified Online.Api based class. We expect the jar to be on the class path")

+    // TODO: davidhan - remove this when we've migrated away from additional-conf-path
+    val additionalConfPath: ScallopOption[String] =
+      opt[String](required = false, descr = "Path to additional driver job configurations")
+
     // hashmap implements serializable
     def serializableProps: Map[String, String] = {
       val map = new mutable.HashMap[String, String]()
       propsInner.foreach { case (key, value) => map.update(key, value) }
       map.toMap
     }

-    lazy val api: Api = impl(serializableProps)
+    lazy private val gcpMap = Map(
+      "GCP_PROJECT_ID" -> gcpProjectId.toOption.getOrElse(""),
+      "GCP_BIGTABLE_INSTANCE_ID" -> gcpBigtableInstanceId.toOption.getOrElse("")
+    )
+
+    lazy val api: Api = isGcp.toOption match {
+      case Some(true) => impl(serializableProps ++ gcpMap)
+      case _ => impl(serializableProps)
+    }

     def metaDataStore =
       new MetadataStore(impl(serializableProps).genKvStore, MetadataDataset, timeoutMillis = 10000)
@@ -734,31 +766,36 @@ object Driver {
   object GroupByUploadToKVBulkLoad {
     @transient lazy val logger: Logger = LoggerFactory.getLogger(getClass)
     class Args extends Subcommand("groupby-upload-bulk-load") with OnlineSubcommand {
-      val srcOfflineTable: ScallopOption[String] =
-        opt[String](required = true, descr = "Name of the source GroupBy Upload table")
-
-      val groupbyName: ScallopOption[String] =
-        opt[String](required = true, descr = "Name of the GroupBy that we're triggering this upload for")
+      // Expectation that run.py only sets confPath
+      val confPath: ScallopOption[String] = opt[String](required = false, descr = "path to groupBy conf")

       val partitionString: ScallopOption[String] =
         opt[String](required = true, descr = "Partition string (in 'yyyy-MM-dd' format) that we are uploading")
     }

     def run(args: Args): Unit = {
-      logger.info(s"Triggering bulk load for GroupBy: ${args.groupbyName()} for partition: ${args
-        .partitionString()} from table: ${args.srcOfflineTable()}")
+      val groupByConf = parseConf[api.GroupBy](args.confPath())
+
+      val offlineTable = groupByConf.metaData.uploadTable
+
+      val groupByName = groupByConf.metaData.name
+
+      logger.info(s"Triggering bulk load for GroupBy: ${groupByName} for partition: ${args
+        .partitionString()} from table: ${offlineTable}")
       val kvStore = args.api.genKvStore
       val startTime = System.currentTimeMillis()
+
       try {
-        kvStore.bulkPut(args.srcOfflineTable(), args.groupbyName(), args.partitionString())
+        // TODO: we may need to wrap this around TableUtils
+        kvStore.bulkPut(offlineTable, groupByName, args.partitionString())
       } catch {
         case e: Exception =>
-          logger.error(s"Failed to upload GroupBy: ${args.groupbyName()} for partition: ${args
-            .partitionString()} from table: ${args.srcOfflineTable()}",
+          logger.error(s"Failed to upload GroupBy: ${groupByName} for partition: ${args
+            .partitionString()} from table: $offlineTable",
                        e)
           throw e
       }
-      logger.info(s"Uploaded GroupByUpload data to KV store for GroupBy: ${args.groupbyName()}; partition: ${args
+      logger.info(s"Uploaded GroupByUpload data to KV store for GroupBy: ${groupByName}; partition: ${args
        .partitionString()} in ${(System.currentTimeMillis() - startTime) / 1000} seconds")
     }
   }
```

spark/src/main/scala/ai/chronon/spark/GroupByUpload.scala

Lines changed: 2 additions & 2 deletions
```diff
@@ -257,8 +257,8 @@ object GroupByUpload {
       .withColumn("ds", lit(endDs))
       .saveUnPartitioned(groupByConf.metaData.uploadTable, groupByConf.metaData.tableProps)

-    val kvDfReloaded = tableUtils.sparkSession
-      .table(groupByConf.metaData.uploadTable)
+    val kvDfReloaded = tableUtils
+      .loadTable(groupByConf.metaData.uploadTable)
       .where(not(col("key_json").eqNullSafe(Constants.GroupByServingInfoKey)))

     val metricRow =
```
