Commit 896e2fc

david-zlai, nikhil-zlai, tchow-zlai authored
Use execution spark configs from compiled confs in JobSubmitter (#549)
## Summary

Use execution Spark configs from the compiled confs in JobSubmitter. The Spark properties can be seen in the Configuration tab of this Dataproc job:
https://console.cloud.google.com/dataproc/jobs/3878fd24-6ca8-41df-8507-ecc7adcead91/configuration?region=us-central1&invt=Abt0gg&project=canary-443022

- removed the additional-confs.yaml integration

## Checklist

- [ ] Added Unit Tests
- [x] Covered by existing CI
- [x] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **New Features**
  - Enhanced configuration metadata with improved categorization.
  - Introduced dedicated submission properties for more robust cloud job configuration.
  - Added a new optional field `confType` to the metadata structure.
  - Introduced a new enumeration `ConfType` for configuration types.
- **Refactor**
  - Streamlined job submission workflows and command argument handling across platforms.
  - Simplified Spark session setup by removing extraneous configuration file processing.
- **Chores**
  - Updated logging dependencies for the cloud GCP target.
  - Modified artifact upload paths to streamline deployment processes.

---------

Co-authored-by: Nikhil Simha <[email protected]>
Co-authored-by: Thomas Chow <[email protected]>
1 parent e23a096 commit 896e2fc

File tree

15 files changed: +258 −142 lines changed


api/python/ai/chronon/cli/compile/parse_configs.py

Lines changed: 13 additions & 0 deletions

@@ -4,6 +4,8 @@
 import os
 from typing import List
 
+from ai.chronon.api.common.ttypes import ConfType
+from ai.chronon.api.ttypes import GroupBy, Join, Model, StagingQuery
 from ai.chronon.cli.compile import parse_teams, serializer
 from ai.chronon.cli.compile.compile_context import CompileContext
 from ai.chronon.cli.compile.display.compiled_obj import CompiledObj
@@ -24,6 +26,16 @@ def from_folder(
 
     results = []
 
+    conf_type = None
+    if cls == GroupBy:
+        conf_type = ConfType.GROUP_BYS
+    elif cls == Join:
+        conf_type = ConfType.JOINS
+    elif cls == Model:
+        conf_type = ConfType.MODELS
+    elif cls == StagingQuery:
+        conf_type = ConfType.STAGING_QUERIES
+
     for f in python_files:
 
         try:
@@ -32,6 +44,7 @@ def from_folder(
         for name, obj in results_dict.items():
             parse_teams.update_metadata(obj, compile_context.teams_dict)
             obj.metaData.sourceFile = f
+            obj.metaData.confType = conf_type
 
             tjson = serializer.thrift_simple_json(obj)
 
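The new class-to-ConfType mapping above is an if/elif chain; read as a lookup table, it is equivalent to the following minimal sketch (a hypothetical refactor, not part of this commit; the helper name is invented for illustration):

```python
from ai.chronon.api.common.ttypes import ConfType
from ai.chronon.api.ttypes import GroupBy, Join, Model, StagingQuery

# Lookup-table form of the if/elif chain in from_folder above.
_CONF_TYPE_BY_CLASS = {
    GroupBy: ConfType.GROUP_BYS,
    Join: ConfType.JOINS,
    Model: ConfType.MODELS,
    StagingQuery: ConfType.STAGING_QUERIES,
}


def conf_type_for(cls):
    # Returns None for any other class, matching the original default.
    return _CONF_TYPE_BY_CLASS.get(cls)
```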

api/python/ai/chronon/repo/aws.py

Lines changed: 2 additions & 1 deletion

@@ -194,7 +194,8 @@ def generate_emr_submitter_args(
                 job_type=job_type.value,
                 main_class=main_class,
             )
-            + f" --additional-conf-path={EMR_MOUNT_FILE_PREFIX}additional-confs.yaml --files={s3_file_args}"
+            + f" --additional-conf-path={EMR_MOUNT_FILE_PREFIX}additional-confs.yaml"
+            f" --files={s3_file_args}"
         )
     else:
         raise ValueError(f"Invalid job type: {job_type}")
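Since the EMR change only splits the long f-string, the submitted suffix is unchanged. A runnable sketch, with hypothetical values for EMR_MOUNT_FILE_PREFIX and s3_file_args (neither value is taken from this diff):

```python
# Both values below are assumptions, chosen only to show the suffix shape.
EMR_MOUNT_FILE_PREFIX = "/mnt/zipline/"
s3_file_args = "s3://my-bucket/metadata/my_join"

suffix = (
    f" --additional-conf-path={EMR_MOUNT_FILE_PREFIX}additional-confs.yaml"
    f" --files={s3_file_args}"
)
assert suffix == (
    " --additional-conf-path=/mnt/zipline/additional-confs.yaml"
    " --files=s3://my-bucket/metadata/my_join"
)
```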

api/python/ai/chronon/repo/default_runner.py

Lines changed: 52 additions & 28 deletions

@@ -12,19 +12,23 @@
     ROUTES,
     SPARK_MODES,
     UNIVERSAL_ROUTES,
+    RunMode,
 )
 
 
 class Runner:
     def __init__(self, args, jar_path):
         self.repo = args["repo"]
         self.conf = args["conf"]
+        self.local_abs_conf_path = os.path.realpath(os.path.join(self.repo, self.conf))
         self.sub_help = args["sub_help"]
         self.mode = args["mode"]
         self.online_jar = args.get(ONLINE_JAR_ARG)
         self.online_class = args.get(ONLINE_CLASS_ARG)
 
-        self.conf_type = args.get("conf_type", "").replace("-", "_")  # in case user sets dash instead of underscore
+        self.conf_type = args.get("conf_type", "").replace(
+            "-", "_"
+        )  # in case user sets dash instead of underscore
 
         # streaming flink
         self.groupby_name = args.get("groupby_name")
@@ -37,32 +41,35 @@ def __init__(self, args, jar_path):
         valid_jar = args["online_jar"] and os.path.exists(args["online_jar"])
 
         # fetch online jar if necessary
-        if (self.mode in ONLINE_MODES) and (not args["sub_help"]) and not valid_jar and (
-                args.get("online_jar_fetch")):
+        if (
+            (self.mode in ONLINE_MODES)
+            and (not args["sub_help"])
+            and not valid_jar
+            and (args.get("online_jar_fetch"))
+        ):
             print("Downloading online_jar")
-            self.online_jar = utils.check_output("{}".format(args["online_jar_fetch"])).decode(
-                "utf-8"
-            )
+            self.online_jar = utils.check_output(
+                "{}".format(args["online_jar_fetch"])
+            ).decode("utf-8")
             os.environ["CHRONON_ONLINE_JAR"] = self.online_jar
             print("Downloaded jar to {}".format(self.online_jar))
 
         if self.conf:
             try:
-                self.context, self.conf_type, self.team, _ = self.conf.split(
-                    "/")[-4:]
+                self.context, self.conf_type, self.team, _ = self.conf.split("/")[-4:]
             except Exception as e:
                 logging.error(
                     "Invalid conf path: {}, please ensure to supply the relative path to zipline/ folder".format(
                         self.conf
                     )
                 )
                 raise e
-            possible_modes = list(
-                ROUTES[self.conf_type].keys()) + UNIVERSAL_ROUTES
+            possible_modes = list(ROUTES[self.conf_type].keys()) + UNIVERSAL_ROUTES
             assert (
-                args["mode"] in possible_modes), ("Invalid mode:{} for conf:{} of type:{}, please choose from {}"
-                                                  .format(args["mode"], self.conf, self.conf_type, possible_modes
-                                                          ))
+                args["mode"] in possible_modes
+            ), "Invalid mode:{} for conf:{} of type:{}, please choose from {}".format(
+                args["mode"], self.conf, self.conf_type, possible_modes
+            )
 
         self.ds = args["end_ds"] if "end_ds" in args and args["end_ds"] else args["ds"]
         self.start_ds = (
@@ -124,7 +131,9 @@ def run_spark_streaming(self):
             )
         )
         if self.mode == "streaming":
-            assert (len(filtered_apps) == 1), "More than one found, please kill them all"
+            assert (
+                len(filtered_apps) == 1
+            ), "More than one found, please kill them all"
             print("All good. No need to start a new app.")
             return
         elif self.mode == "streaming-client":
@@ -139,9 +148,7 @@ def run_spark_streaming(self):
                 jar=self.jar_path,
                 subcommand=ROUTES[self.conf_type][self.mode],
                 args=self._gen_final_args(),
-                additional_args=os.environ.get(
-                    "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
-                ),
+                additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
             )
             return command
 
@@ -182,23 +189,22 @@ def run(self):
                 )
                 for start_ds, end_ds in date_ranges:
                     command = (
-                        "bash {script} --class ai.chronon.spark.Driver " +
-                        "{jar} {subcommand} {args} {additional_args}"
+                        "bash {script} --class ai.chronon.spark.Driver "
+                        + "{jar} {subcommand} {args} {additional_args}"
                     ).format(
                         script=self.spark_submit,
                         jar=self.jar_path,
                         subcommand=ROUTES[self.conf_type][self.mode],
-                        args=self._gen_final_args(
-                            start_ds=start_ds, end_ds=end_ds),
+                        args=self._gen_final_args(start_ds=start_ds, end_ds=end_ds),
                         additional_args=os.environ.get(
                             "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
                         ),
                     )
                     command_list.append(command)
             else:
                 command = (
-                    "bash {script} --class ai.chronon.spark.Driver "
-                    + "{jar} {subcommand} {args} {additional_args}"
+                    "bash {script} --class ai.chronon.spark.Driver "
+                    + "{jar} {subcommand} {args} {additional_args}"
                 ).format(
                     script=self.spark_submit,
                     jar=self.jar_path,
@@ -222,21 +228,39 @@ def run(self):
         elif len(command_list) == 1:
             utils.check_call(command_list[0])
 
-    def _gen_final_args(self, start_ds=None, end_ds=None, override_conf_path=None, **kwargs):
+    def _gen_final_args(
+        self, start_ds=None, end_ds=None, override_conf_path=None, **kwargs
+    ):
         base_args = MODE_ARGS[self.mode].format(
             conf_path=override_conf_path if override_conf_path else self.conf,
             ds=end_ds if end_ds else self.ds,
             online_jar=self.online_jar,
-            online_class=self.online_class
+            online_class=self.online_class,
         )
-        base_args = base_args + f" --conf-type={self.conf_type} " if self.conf_type else base_args
+
+        base_args = (
+            base_args + f" --conf-type={self.conf_type} "
+            if self.conf_type
+            else base_args
+        )
+
+        if self.mode != RunMode.FETCH:
+            base_args += " --local-conf-path={conf}".format(
+                conf=self.local_abs_conf_path
+            ) + " --original-mode={mode}".format(mode=self.mode)
 
         override_start_partition_arg = (
             "--start-partition-override=" + start_ds if start_ds else ""
         )
 
-        additional_args = " ".join(f"--{key.replace('_', '-')}={value}" for key, value in kwargs.items() if value)
+        additional_args = " ".join(
+            f"--{key.replace('_', '-')}={value}"
+            for key, value in kwargs.items()
+            if value
+        )
 
-        final_args = " ".join([base_args, str(self.args), override_start_partition_arg, additional_args])
+        final_args = " ".join(
+            [base_args, str(self.args), override_start_partition_arg, additional_args]
+        )
 
         return final_args
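The net effect of the _gen_final_args change: every non-fetch run now passes the compiled conf's absolute local path and the originating mode to the Driver. A sketch of just the appended flags, with hypothetical repo, conf, and mode values:

```python
import os

# Hypothetical values, to show the flags appended for non-fetch modes.
repo = "/home/me/zipline"
conf = "compiled/joins/my_team/my_join.v1"
mode = "backfill"

local_abs_conf_path = os.path.realpath(os.path.join(repo, conf))
extra_flags = " --local-conf-path={conf}".format(
    conf=local_abs_conf_path
) + " --original-mode={mode}".format(mode=mode)

print(extra_flags)
# e.g. " --local-conf-path=/home/me/zipline/compiled/joins/my_team/my_join.v1 --original-mode=backfill"
```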

api/python/ai/chronon/repo/gcp.py

Lines changed: 2 additions & 2 deletions

@@ -260,8 +260,8 @@ def generate_dataproc_submitter_args(
                 jar_uri=jar_uri,
                 job_type=job_type.value,
                 main_class=main_class,
-            )
-            + f" --additional-conf-path=additional-confs.yaml --gcs-files={gcs_file_args}"
+            ) + f" --files={gcs_file_args}"
+
         )
     else:
         raise ValueError(f"Invalid job type: {job_type}")
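On Dataproc the additional-confs.yaml flag is dropped outright; only the --files list remains. With a hypothetical gcs_file_args value:

```python
# Hypothetical file list; the only remaining dynamic piece of the suffix.
gcs_file_args = "gs://my-bucket/metadata/my_join"
suffix = f" --files={gcs_file_args}"
assert suffix == " --files=gs://my-bucket/metadata/my_join"
```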

api/thrift/api.thrift

Lines changed: 2 additions & 0 deletions

@@ -290,6 +290,8 @@ struct MetaData {
 
     # information that needs to be present on every physical node
     204: optional common.ExecutionInfo executionInfo
+
+    205: optional common.ConfType confType
 }
 
 // Equivalent to a FeatureSet in chronon terms

api/thrift/common.thrift

Lines changed: 7 additions & 0 deletions

@@ -132,4 +132,11 @@ struct ExecutionInfo {
     # note that batch jobs could in theory also depend on model training runs
     # in which case we will be polling
     # in the future we will add other types of dependencies
+}
+
+enum ConfType {
+    JOINS = 0
+    GROUP_BYS = 1
+    MODELS = 2
+    STAGING_QUERIES = 3
 }
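Taken together, the two Thrift changes let every compiled conf carry its own type. A minimal sketch of how the generated Python types line up with the schema (field and enum names are from this diff; the construction style assumes standard Thrift Python codegen):

```python
from ai.chronon.api.common.ttypes import ConfType
from ai.chronon.api.ttypes import MetaData

md = MetaData()
md.confType = ConfType.JOINS  # new optional field 205 on MetaData

# Enum values as declared in common.thrift above.
assert ConfType.JOINS == 0
assert ConfType.GROUP_BYS == 1
assert ConfType.MODELS == 2
assert ConfType.STAGING_QUERIES == 3
```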

cloud_aws/src/main/scala/ai/chronon/integrations/aws/EmrSubmitter.scala

Lines changed: 39 additions & 39 deletions

@@ -158,20 +158,23 @@ class EmrSubmitter(customerId: String, emrClient: EmrClient) extends JobSubmitte
   }
 
   override def submit(jobType: JobType,
+                      submissionProperties: Map[String, String],
                       jobProperties: Map[String, String],
                       files: List[String],
                       args: String*): String = {
-    if (jobProperties.get(ShouldCreateCluster).exists(_.toBoolean)) {
+    if (submissionProperties.get(ShouldCreateCluster).exists(_.toBoolean)) {
       // create cluster
       val runJobFlowBuilder = createClusterRequestBuilder(
-        emrReleaseLabel = jobProperties.getOrElse(EmrReleaseLabel, DefaultEmrReleaseLabel),
-        clusterIdleTimeout = jobProperties.getOrElse(ClusterIdleTimeout, DefaultClusterIdleTimeout.toString).toInt,
-        masterInstanceType = jobProperties.getOrElse(ClusterInstanceType, DefaultClusterInstanceType),
-        slaveInstanceType = jobProperties.getOrElse(ClusterInstanceType, DefaultClusterInstanceType),
-        instanceCount = jobProperties.getOrElse(ClusterInstanceCount, DefaultClusterInstanceCount.toString).toInt
+        emrReleaseLabel = submissionProperties.getOrElse(EmrReleaseLabel, DefaultEmrReleaseLabel),
+        clusterIdleTimeout =
+          submissionProperties.getOrElse(ClusterIdleTimeout, DefaultClusterIdleTimeout.toString).toInt,
+        masterInstanceType = submissionProperties.getOrElse(ClusterInstanceType, DefaultClusterInstanceType),
+        slaveInstanceType = submissionProperties.getOrElse(ClusterInstanceType, DefaultClusterInstanceType),
+        instanceCount = submissionProperties.getOrElse(ClusterInstanceCount, DefaultClusterInstanceCount.toString).toInt
       )
 
-      runJobFlowBuilder.steps(createStepConfig(files, jobProperties(MainClass), jobProperties(JarURI), args: _*))
+      runJobFlowBuilder.steps(
+        createStepConfig(files, submissionProperties(MainClass), submissionProperties(JarURI), args: _*))
 
       val responseJobId = emrClient.runJobFlow(runJobFlowBuilder.build()).jobFlowId()
       println("EMR job id: " + responseJobId)
@@ -181,11 +184,11 @@ class EmrSubmitter(customerId: String, emrClient: EmrClient) extends JobSubmitte
 
     } else {
       // use existing cluster
-      val existingJobId = jobProperties.getOrElse(ClusterId, throw new RuntimeException("JobFlowId not found"))
+      val existingJobId = submissionProperties.getOrElse(ClusterId, throw new RuntimeException("JobFlowId not found"))
       val request = AddJobFlowStepsRequest
         .builder()
         .jobFlowId(existingJobId)
-        .steps(createStepConfig(files, jobProperties(MainClass), jobProperties(JarURI), args: _*))
+        .steps(createStepConfig(files, submissionProperties(MainClass), submissionProperties(JarURI), args: _*))
         .build()
 
       val responseStepId = emrClient.addJobFlowSteps(request).stepIds().get(0)
@@ -230,40 +233,35 @@ object EmrSubmitter {
   def main(args: Array[String]): Unit = {
     // List of args that are not application args
     val internalArgs = Set(
-      JarUriArgKeyword,
-      JobTypeArgKeyword,
-      MainClassKeyword,
-      FlinkMainJarUriArgKeyword,
-      FlinkSavepointUriArgKeyword,
       ClusterInstanceTypeArgKeyword,
       ClusterInstanceCountArgKeyword,
       ClusterIdleTimeoutArgKeyword,
-      FilesArgKeyword,
       CreateClusterArgKeyword
-    )
+    ) ++ SharedInternalArgs
 
     val userArgs = args.filter(arg => !internalArgs.exists(arg.startsWith))
 
-    val jarUri =
-      args.find(_.startsWith(JarUriArgKeyword)).map(_.split("=")(1)).getOrElse(throw new Exception("Jar URI not found"))
-    val mainClass = args
-      .find(_.startsWith(MainClassKeyword))
-      .map(_.split("=")(1))
-      .getOrElse(throw new Exception("Main class not found"))
-    val jobTypeValue = args
-      .find(_.startsWith(JobTypeArgKeyword))
-      .map(_.split("=")(1))
-      .getOrElse(throw new Exception("Job type not found"))
-    val clusterInstanceType =
-      args.find(_.startsWith(ClusterInstanceTypeArgKeyword)).map(_.split("=")(1)).getOrElse(DefaultClusterInstanceType)
-    val clusterInstanceCount = args
-      .find(_.startsWith(ClusterInstanceCountArgKeyword))
-      .map(_.split("=")(1))
+    val jarUri = JobSubmitter
+      .getArgValue(args, JarUriArgKeyword)
+      .getOrElse(throw new Exception("Missing required argument: " + JarUriArgKeyword))
+    val mainClass = JobSubmitter
+      .getArgValue(args, MainClassKeyword)
+      .getOrElse(throw new Exception("Missing required argument: " + MainClassKeyword))
+    val jobTypeValue =
+      JobSubmitter
+        .getArgValue(args, JobTypeArgKeyword)
+        .getOrElse(throw new Exception("Missing required argument: " + JobTypeArgKeyword))
+
+    val clusterInstanceType = JobSubmitter
+      .getArgValue(args, ClusterInstanceTypeArgKeyword)
+      .getOrElse(DefaultClusterInstanceType)
+    val clusterInstanceCount = JobSubmitter
+      .getArgValue(args, ClusterInstanceCountArgKeyword)
       .getOrElse(DefaultClusterInstanceCount.toString)
-    val clusterIdleTimeout = args
-      .find(_.startsWith(ClusterIdleTimeoutArgKeyword))
-      .map(_.split("=")(1))
+    val clusterIdleTimeout = JobSubmitter
+      .getArgValue(args, ClusterIdleTimeoutArgKeyword)
       .getOrElse(DefaultClusterIdleTimeout.toString)
+
     val createCluster = args.exists(_.startsWith(CreateClusterArgKeyword))
 
     val clusterId = sys.env.get("EMR_CLUSTER_ID")
@@ -278,7 +276,7 @@ object EmrSubmitter {
       filesArgs(0).split("=")(1).split(",")
     }
 
-    val (jobType, jobProps) = jobTypeValue.toLowerCase match {
+    val (jobType, submissionProps) = jobTypeValue.toLowerCase match {
       case "spark" => {
         val baseProps = Map(
           MainClass -> mainClass,
@@ -299,13 +297,15 @@ object EmrSubmitter {
       case _ => throw new Exception("Invalid job type")
     }
 
-    val finalArgs = userArgs
+    val finalArgs = userArgs.toSeq
+    val modeConfigProperties = JobSubmitter.getModeConfigProperties(args)
 
     val emrSubmitter = EmrSubmitter()
     emrSubmitter.submit(
-      jobType,
-      jobProps,
-      files.toList,
+      jobType = jobType,
+      submissionProperties = submissionProps,
+      jobProperties = modeConfigProperties.getOrElse(Map.empty),
+      files = files.toList,
       finalArgs: _*
    )
  }
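The repeated `args.find(_.startsWith(...)).map(_.split("=")(1))` pattern is now centralized in `JobSubmitter.getArgValue`. A cross-language sketch of that parsing logic in Python (the split-once semantics here are an assumption; the Scala original takes index 1 of a full split on `=`):

```python
def get_arg_value(args, keyword):
    """Return the value of the first '<keyword>=<value>' arg, or None if absent."""
    match = next((arg for arg in args if arg.startswith(keyword)), None)
    return match.split("=", 1)[1] if match else None


args = ["--jar-uri=s3://bucket/chronon.jar", "--job-type=spark"]
assert get_arg_value(args, "--jar-uri") == "s3://bucket/chronon.jar"
assert get_arg_value(args, "--missing") is None
```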

cloud_aws/src/main/scala/ai/chronon/integrations/aws/LivySubmitter.scala

Lines changed: 0 additions & 15 deletions
This file was deleted.
