Commit bcc7d65
Branch versioning logic (#163)
## Summary

Writing down the cases & logic behind branching. Will continue writing about how these are internally represented.

## Checklist

- [x] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [x] Documentation update

## Summary by CodeRabbit

### Release Notes

- **Bug Fixes**
  - Corrected a typo in the `TableDependency` struct, renaming `forceComputae` to `forceCompute`.
- **New Features**
  - Added support for branch-based workflows in data orchestration.
  - Introduced `RepoIndex` for managing repository data and versions.
  - Created `SequenceMap` utility for managing unique value sequences.
  - Added `TablePrinter` for formatted data output.
  - Introduced `VersionUpdate` class for tracking version changes.
  - Added `StringExtensions` for MD5 hashing of strings.
- **Refactoring**
  - Removed `LineageIndex` and `LogicalSet` classes.
  - Updated naming conventions for logical nodes.
  - Restructured repository parsing and version tracking mechanisms.
- **Documentation**
  - Added a comprehensive README for branch workflow management.
  - Expanded documentation for global planning and batch workload processing.
- **Chores**
  - Updated logging dependencies.
  - Modified build configuration.
  - Added new test specifications.
1 parent 588627c commit bcc7d65

24 files changed: +1296 -243 lines

api/py/ai/chronon/utils.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -262,6 +262,7 @@ def join_part_output_table_name(join, jp, full_name: bool = False):
     def partOutputTable(jp: JoinPart): String = (Seq(join.metaData.outputTable) ++ Option(jp.prefix) :+
       jp.groupBy.metaData.cleanName).mkString("_")
     """
+    print(join)
     if not join.metaData.name and isinstance(join, api.Join):
         __set_name(join, api.Join, "joins")
     return "_".join(
```

api/thrift/orchestration.thrift

Lines changed: 1 addition & 13 deletions

```diff
@@ -82,18 +82,6 @@ struct NodeInfo {
     30: optional LogicalNode conf
 }
 
-
-/** First Pass
- * NodeInstance::(name, type, conf_hash) -> #[parent_nodes]
- * Node::(name, type) -> #[conf_hash]
-
- * Second Pass
- * Node::(name, type, compute_hash) -> #[parent_nodes]
-
- * different file_hashes but same lineage_hash should all go into the same orchestrator workflow
- * Node::(name, type, lineage_hash)
- **/
-
 struct NodeConnections {
     1: optional list<NodeKey> parents
     2: optional list<NodeKey> children
@@ -272,7 +260,7 @@ struct TableDependency {
      * JoinParts could use data from batch backfills or upload tables when available
      * When not available they shouldn't force computation of the backfills and upload tables.
      **/
-    21: optional bool forceComputae
+    21: optional bool forceCompute
 }
 
 union Dependency {
```

build.sbt

Lines changed: 12 additions & 21 deletions

```diff
@@ -136,7 +136,9 @@ lazy val api = project
       "org.scala-lang.modules" %% "scala-collection-compat" % "2.11.0",
       "com.novocode" % "junit-interface" % "0.11" % "test",
       "org.scalatest" %% "scalatest" % "3.2.19" % "test",
-      "org.scalatestplus" %% "mockito-3-4" % "3.2.10.0" % "test"
+      "org.scalatestplus" %% "mockito-3-4" % "3.2.10.0" % "test",
+      // needed by thrift
+      "org.slf4j" % "slf4j-api" % slf4jApiVersion,
     )
   )
@@ -410,33 +412,22 @@ lazy val hub = (project in file("hub"))
   }
 )
 
-val scala_test = "org.scalatest" %% "scalatest" % "3.2.19" % "test"
-val sl4j = "org.slf4j" % "slf4j-api" % slf4jApiVersion
-val logback = "ch.qos.logback" % "logback-classic" % logbackClassicVersion
-val commonDependencies = Seq(
-  scala_test,
-  sl4j,
-  logback
-)
 
 // orchestrator
 lazy val orchestration = project
   .dependsOn(online.%("compile->compile;test->test"))
   .settings(
     assembly / mainClass := Some("ai.chronon.orchestration.RepoParser"),
+
     Compile / run / mainClass := Some("ai.chronon.orchestration.RepoParser"),
-    assembly / assemblyMergeStrategy := {
-      case "log4j2.properties" => MergeStrategy.first
-      case "META-INF/log4j-provider.properties" => MergeStrategy.first
-      case PathList("org", "apache", "logging", "log4j", "core", "config", "plugins", "Log4j2Plugins.dat") =>
-        MergeStrategy.first
-      case x => (assembly / assemblyMergeStrategy).value(x)
-    },
-    libraryDependencies ++= commonDependencies ++ Seq(
-      "org.apache.logging.log4j" % "log4j-api" % log4j2_version,
-      "org.apache.logging.log4j" % "log4j-core" % log4j2_version,
-      "org.apache.logging.log4j" % "log4j-slf4j-impl" % log4j2_version
-    )
+    Compile / unmanagedResourceDirectories += baseDirectory.value / "src" / "main" / "resources",
+
+    libraryDependencies ++= Seq(
+      "org.apache.logging.log4j" %% "log4j-api-scala" % "13.1.0",
+      "org.apache.logging.log4j" % "log4j-core" % "2.20.0",
+      // "org.slf4j" % "slf4j-api" % slf4jApiVersion,
+      "org.scalatest" %% "scalatest" % "3.2.19" % "test",
+    ),
   )
 
 ThisBuild / assemblyMergeStrategy := {
```

orchestration/README.md

Lines changed: 290 additions & 0 deletions (new file)

# Branch support
We want to support "branches" that allow users to run pipelines and services with two
critical goals:

- don't pollute production datasets and end-points
- be cheap, by re-using as much of the existing datasets as possible

## Scenarios within experimentation

While developing on a branch, users could:

- make semantic updates - that change output data - e.g., logic within where-s, selects, aggregations etc.
- make non-semantic updates - that don't change output data - e.g., spark executor memory, # of pods / workers etc.
  - *note that everything stored in the metadata field within our APIs is non-semantic*
- add new nodes
- delete existing nodes
- make changes to several compute nodes at once
- decide to merge the branch into master

With this context, the goal of this document is to develop and describe a representation that handles the above user workflows.
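To make the semantic / non-semantic split concrete, here is a minimal sketch of one way to hash a compiled conf, assuming confs are JSON-like dicts whose `metaData` field carries only non-semantic settings (the field names and helpers here are illustrative, not the actual Chronon API):

```python
import hashlib
import json


def _md5(payload: str) -> str:
    return hashlib.md5(payload.encode("utf-8")).hexdigest()


def file_hash(conf: dict) -> str:
    """Changes whenever anything in the conf is edited."""
    return _md5(json.dumps(conf, sort_keys=True))


def semantic_hash(conf: dict) -> str:
    """Ignores the non-semantic metadata - only edits that affect
    output data should change this hash."""
    semantic_part = {k: v for k, v in conf.items() if k != "metaData"}
    return _md5(json.dumps(semantic_part, sort_keys=True))


# Bumping executor memory is a non-semantic update: the file hash moves,
# but the semantic hash - and hence all downstream data - stays put.
old = {"metaData": {"name": "gb1", "executorMemory": "4G"}, "selects": ["price"]}
new = {"metaData": {"name": "gb1", "executorMemory": "8G"}, "selects": ["price"]}
assert file_hash(old) != file_hash(new)
assert semantic_hash(old) == semantic_hash(new)
```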
## Motivating Example

Legend:

```
"sq" stands for StagingQuery    "j" for Join
"t" stands for table            "m" for Model
"gb" for GroupBy
```

Nodes will be numbered - `gb4`, `m2` etc.

Semantic changes to a node are notated with a plus "+".
E.g., join `j3` becomes `j3+`.

Non-semantic changes with an asterisk "*" - `j3*`.

```mermaid
---
title: Initial state of the example
---

graph TD;
    sq1-->t1;
    t1-->gb1;
    gb1-->j1;
    t2-->gb2;
    gb2-->j1;
    j1-->m1;
    gb1-->j2;
```
### Semantic updates

Say that `sq1` changes semantically to `sq1+`. This is going to change the output of all
nodes downstream of it.

```mermaid
---
title: sq1 is updated semantically
---

graph TD;
    sq1+-->t1+;
    t1+-->gb1+;
    gb1+-->j1+;
    t2-->gb2;
    gb2-->j1+;
    j1+-->m1+;
    gb1+-->j2+;

    style sq1+ fill:wheat,color:black,stroke:#333
    style t1+ fill:wheat,color:black,stroke:#333
    style gb1+ fill:wheat,color:black,stroke:#333
    style j1+ fill:wheat,color:black,stroke:#333
    style j2+ fill:wheat,color:black,stroke:#333
    style m1+ fill:wheat,color:black,stroke:#333
```

> A major concern here is that, if the local repository of the user is behind remote,
> we will pick up a lot more changes than the user intends.

One approach to mitigate this is to make the CLI pick up only changes to files listed as
edited by commits to the git branch.

Another approach is to force the user to rebase on any change to the repo. However, this
does not guarantee that changes made while a job is running are accounted for.
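A semantic change therefore has to be propagated through the DAG before we know what to recompute. A minimal sketch of that propagation, assuming the graph is available as a child-adjacency dict (illustrative only - not the actual `RepoIndex` representation):

```python
from collections import deque

# child-adjacency for the motivating example above
children = {
    "sq1": ["t1"],
    "t1": ["gb1"],
    "gb1": ["j1", "j2"],
    "t2": ["gb2"],
    "gb2": ["j1"],
    "j1": ["m1"],
}


def dirty_set(changed: set) -> set:
    """BFS from the semantically changed nodes: every reachable
    descendant needs a new version of its output as well."""
    dirty = set(changed)
    queue = deque(changed)
    while queue:
        node = queue.popleft()
        for child in children.get(node, []):
            if child not in dirty:
                dirty.add(child)
                queue.append(child)
    return dirty


# sq1+ dirties everything downstream of it, matching the diagram above
assert dirty_set({"sq1"}) == {"sq1", "t1", "gb1", "j1", "j2", "m1"}
```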
### Non-semantic updates

Instead, if `sq1` changes non-semantically to `sq1*`, none of the downstream nodes would change.

```mermaid
---
title: sq1 is updated non-semantically
---

graph TD;
    sq1*-->t1;
    t1-->gb1;
    gb1-->j1;
    t2-->gb2;
    gb2-->j1;
    j1-->m1;
    gb1-->j2;

    style sq1* fill:lavender,color:black,stroke:#333
```

Depending on who is running the job, we need to decide which version of the node to use (a sketch follows this list):

- if the branch author is causing the node to be computed, we need to use `sq1*` instead of `sq1`
- if the prod flow, or other authors who haven't updated `sq1`, are causing the compute, we should use `sq1`
- if another branch is also updating `sq1` non-semantically to `sq1**`, we need to use that version on that branch instead
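A minimal sketch of that resolution rule, assuming each branch keeps a map of the nodes it has overridden (the map shapes here are hypothetical):

```python
from typing import Optional

# per-branch overrides: branch -> node -> the conf version to run
overrides = {
    "branch-a": {"sq1": "sq1*"},
    "branch-b": {"sq1": "sq1**"},
}


def resolve(node: str, requesting_branch: Optional[str]) -> str:
    """Pick the version of `node` to compute with. Prod runs
    (requesting_branch=None) and branches that never touched the node
    fall through to the production version."""
    if requesting_branch is not None:
        override = overrides.get(requesting_branch, {}).get(node)
        if override is not None:
            return override
    return node  # production version


assert resolve("sq1", "branch-a") == "sq1*"   # the branch author's copy
assert resolve("sq1", "branch-b") == "sq1**"  # the other branch's copy
assert resolve("sq1", None) == "sq1"          # prod stays on prod
assert resolve("sq1", "branch-c") == "sq1"    # untouched branches too
```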
### Adding new nodes

Adding new leaf nodes will not impact any of the existing nodes.

```mermaid
---
title: m2 is added
---

graph TD;
    sq1-->t1;
    t1-->gb1;
    gb1-->j1;
    t2-->gb2;
    gb2-->j1;
    j1-->m1;
    gb1-->j2;
    j2-->m2;

    style m2 fill:lightgreen,color:black,stroke:#333
```
But adding a non-leaf node - as a parent of an existing node - would almost always cause
semantic updates to the nodes downstream of it.

```mermaid
---
title: gb3 is added
---

graph TD;
    sq1-->t1;
    t1-->gb1;
    gb1-->j1+;
    t2-->gb2;
    t3-->gb3;
    gb2-->j1+;
    gb3-->j1+;
    j1+-->m1+;
    gb1-->j2;

    style t3 fill:lightgreen,color:black,stroke:#333
    style gb3 fill:lightgreen,color:black,stroke:#333
    style j1+ fill:wheat,color:black,stroke:#333
    style m1+ fill:wheat,color:black,stroke:#333
```
One interesting case here is migrating SQL from an external system into a StagingQuery that
produces an already-used table. Even though this is not a leaf node, absorbing it the same
way as a leaf-node change would be the right thing to do.

```mermaid
---
title: sq2 is added
---

graph TD;
    sq1-->t1;
    t1-->gb1;
    gb1-->j1;
    t2-->gb2;
    gb2-->j1;
    j1-->m1;
    gb1-->j2;
    sq2-->t2;

    style sq2 fill:lightgreen,color:black,stroke:#333
```
### Deleting existing nodes

Deleting leaf nodes is straightforward. We just need to program a cleanup mechanism
to remove the data and pipelines generated by that node.

```mermaid
---
title: m1 is deleted
---

graph TD;
    sq1-->t1;
    t1-->gb1;
    gb1-->j1;
    t2-->gb2;
    gb2-->j1;
    j1-->m1;
    gb1-->j2;
    sq2-->t2;

    style m1 fill:coral,color:black,stroke:#333
```
Indirectly connected components - connected via table references - shouldn't be allowed to
be deleted as long as there are nodes that depend on the table. We will fail this during
the sync step, as sketched below.

```mermaid
---
title: sq1 is deleted (not allowed)
---

graph TD;
    sq1-->t1;
    t1-->gb1;
    gb1-->j1;
    t2-->gb2;
    gb2-->j1;
    j1-->m1;
    gb1-->j2;
    sq2-->t2;

    style sq1 fill:coral,color:black,stroke:#333
```
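A minimal sketch of that sync-step check, assuming we can look up the table a node produces and the nodes reading from each table (both lookups are hypothetical helpers, not the actual repo index):

```python
# node -> table it produces, and table -> nodes that read from it
produces = {"sq1": "t1", "sq2": "t2"}
readers = {"t1": ["gb1"], "t2": ["gb2"]}


def validate_delete(node: str, also_deleted: set) -> None:
    """Fail the sync when the node's output table still has readers
    that are not themselves being deleted."""
    table = produces.get(node)
    if table is None:
        return  # the node produces no table; nothing to protect
    live = [r for r in readers.get(table, []) if r not in also_deleted]
    if live:
        raise ValueError(f"cannot delete {node}: {table} is still read by {live}")


try:
    validate_delete("sq1", also_deleted=set())  # rejected, as in the diagram
except ValueError as err:
    print(err)
```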
When a directly connected parent is deleted, the child node must be updated as well - or
the compilation would fail. In these cases it would be ideal to garbage-collect the
upstream chain of the deleted node.

```mermaid
---
title: gb2 is deleted, j1 is updated
---

graph TD;
    sq2-->t2;
    sq1-->t1;
    t1-->gb1;
    gb1-->j1+;
    t2-->gb2;
    gb2-->j1+;
    j1+-->m1+;
    gb1-->j2;

    style sq2 fill:coral,color:black,stroke:#333
    style t2 fill:coral,color:black,stroke:#333
    style gb2 fill:coral,color:black,stroke:#333
    style j1+ fill:wheat,color:black,stroke:#333
    style m1+ fill:wheat,color:black,stroke:#333
```
### Isolating the changed assets

While development on a branch is in progress, we need to create temporary data assets for
the semantically changed nodes - shown in yellow above. Adds, deletes & semantic updates
could trigger this flow.

#### Logic to achieve isolation

Make a new copy of the conf & update the name (file name & metadata.name) -

`new_name = old_name + '_' + branch_name`

This needs to be followed by changing references in the downstream nodes -
all tables and nodes downstream will have the branch suffix.
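A minimal sketch of that rewrite, assuming each conf carries its upstream references under a hypothetical `inputs` field:

```python
def isolate(confs: dict, changed: set, branch: str) -> dict:
    """Copy every changed conf under a branch-suffixed name and rewrite
    references so downstream nodes point at the suffixed versions."""
    renames = {name: f"{name}_{branch}" for name in changed}
    isolated = {}
    for name, conf in confs.items():
        copy = dict(conf)
        copy["inputs"] = [renames.get(i, i) for i in conf.get("inputs", [])]
        isolated[renames.get(name, name)] = copy
    return isolated


# everything dirtied by the change gets the suffix, per the rule above
confs = {"sq1": {"inputs": []}, "t1": {"inputs": ["sq1"]}, "gb1": {"inputs": ["t1"]}}
branched = isolate(confs, changed={"sq1", "t1", "gb1"}, branch="my_branch")
assert branched["gb1_my_branch"]["inputs"] == ["t1_my_branch"]
```

The sync between the CLI and the remote that triggers this isolation looks roughly like: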
```
# 1. cli sends file_hash_map to remote

local_file_map = repo.compiled.file_hash_map
remote_file_map = remote.file_map

deleted = remote_file_map - local_file_map
added = local_file_map - remote_file_map
updated = [k for k in intersect(local_file_map, remote_file_map)
           if local_file_map[k] != remote_file_map[k]]

# 2. remote marks the changed files it needs

(node, lineage_hash) =>
```
### Merging changes into `main`

- Deletes should actually trigger asset and pipeline clean-up.
- Updates should trigger asset renaming.
- Adds can work as-is - we are not suffixing adds. (One possible dispatch of these rules is sketched below.)
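One plausible reading of those three rules as a merge-time dispatch, where "renaming" means promoting the branch-suffixed asset back to its production name (the handler names here are hypothetical stubs):

```python
def cleanup_assets_and_pipelines(node: str) -> None:
    print(f"dropping data + pipelines for {node}")


def rename_asset(old: str, new: str) -> None:
    print(f"renaming {old} -> {new}")


def merge_branch(branch: str, deletes: set, updates: set, adds: set) -> None:
    """Apply the three merge rules above."""
    for node in deletes:
        cleanup_assets_and_pipelines(node)      # deletes: actual clean-up
    for node in updates:
        rename_asset(f"{node}_{branch}", node)  # updates: asset renaming
    # adds were never suffixed, so they need no work at merge time


merge_branch("my_branch", deletes={"m1"}, updates={"gb1"}, adds={"m2"})
```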

orchestration/src/main/resources/log4j2.properties

Lines changed: 1 addition & 1 deletion

```diff
@@ -7,7 +7,7 @@ appender.console.type = Console
 appender.console.name = console
 appender.console.target = SYSTEM_OUT
 appender.console.layout.type = PatternLayout
-appender.console.layout.pattern = %yellow{%d{yyyy/MM/dd HH:mm:ss}} %highlight{%-5level} %green{%file:%line} - %message%n
+appender.console.layout.pattern = %cyan{%d{yyyy/MM/dd HH:mm:ss}} %highlight{%-5level} %magenta{%file:%line} - %message%n
 
 # Configure specific logger
 logger.chronon.name = ai.chronon
```
