Commit 263a029

varant-zlai, ezvz, tchow-zlai, and thomaschow authored
[airflow] -- add dependencies for airflow to customJson (#648)
## Summary

Setting airflow dependencies in customJson. This is meant to be temporary, until we can ship orchestrator.

## Checklist
- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **New Features**
  - Introduced a new utility for defining staging queries with explicit table dependencies and partition columns.
  - Added support for specifying engine type, scheduling, and advanced metadata when creating staging queries.
  - Added helper functions to automatically generate and set Airflow dependency metadata for Chronon objects.

- **Refactor**
  - Updated sample staging query definitions to use the new dependency and metadata structure, improving clarity and consistency.
  - Replaced nested metadata objects with direct keyword arguments for easier configuration.
  - Integrated automatic setting of Airflow dependencies during configuration parsing.

- **Chores**
  - Enhanced internal handling of Airflow dependencies for relevant objects, ensuring accurate dependency tracking.
  - Updated team configurations to include default partition column settings for improved environment consistency.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: ezvz <[email protected]>
Co-authored-by: tchow-zlai <[email protected]>
Co-authored-by: Thomas Chow <[email protected]>
1 parent c61aa3b commit 263a029

File tree

7 files changed: +314 -23 lines changed

api/python/ai/chronon/airflow_helpers.py

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
import json

import ai.chronon.utils as utils
from ai.chronon.api.ttypes import GroupBy, Join


def create_airflow_dependency(table, partition_column):
    """
    Create an Airflow dependency object for a table.

    Args:
        table: The table name (with namespace)
        partition_column: The partition column to use (defaults to 'ds')

    Returns:
        A dictionary with name and spec for the Airflow dependency
    """
    assert partition_column is not None, """Partition column must be provided via the spark.chronon.partition.column
    config. This can be set as a default in teams.py, or at the individual config level. For example:
    ```
    Team(
        conf=ConfigProperties(
            common={
                "spark.chronon.partition.column": "_test_column",
            }
        )
    )
    ```
    """
    return {
        "name": f"wf_{utils.sanitize(table)}",
        "spec": f"{table}/{partition_column}={{{{ ds }}}}",
    }


def _get_partition_col_from_query(query):
    """Gets partition column from query if available"""
    if query:
        return query.partitionColumn
    return None


def _get_airflow_deps_from_source(source, partition_column=None):
    """
    Given a source, return a list of Airflow dependencies.

    Args:
        source: The source object (events, entities, or joinSource)
        partition_column: The partition column to use

    Returns:
        A list of Airflow dependency objects
    """
    tables = []
    # Assumes source has already been normalized
    if source.events:
        tables = [source.events.table]
        # Use partition column from query if available, otherwise use the provided one
        source_partition_column = _get_partition_col_from_query(source.events.query) or partition_column
    elif source.entities:
        # Given the setup of Query, we currently mandate the same partition column for snapshot and mutations tables
        tables = [source.entities.snapshotTable]
        if source.entities.mutationTable:
            tables.append(source.entities.mutationTable)
        source_partition_column = _get_partition_col_from_query(source.entities.query) or partition_column
    elif source.joinSource:
        # TODO: Handle joinSource -- it doesn't work right now because the metadata isn't set on joinSource at this point
        return []
    else:
        # Unknown source type
        return []

    return [create_airflow_dependency(table, source_partition_column) for table in tables]


def extract_default_partition_column(obj):
    return obj.metaData.executionInfo.conf.common.get("spark.chronon.partition.column")


def _set_join_deps(join):
    default_partition_col = extract_default_partition_column(join)

    deps = []

    # Handle left source
    left_query = utils.get_query(join.left)
    left_partition_column = _get_partition_col_from_query(left_query) or default_partition_col
    deps.extend(_get_airflow_deps_from_source(join.left, left_partition_column))

    # Handle right parts (join parts)
    if join.joinParts:
        for join_part in join.joinParts:
            if join_part.groupBy and join_part.groupBy.sources:
                for source in join_part.groupBy.sources:
                    source_query = utils.get_query(source)
                    source_partition_column = _get_partition_col_from_query(source_query) or default_partition_col
                    deps.extend(_get_airflow_deps_from_source(source, source_partition_column))

    # Handle label parts
    if join.labelParts and join.labelParts.labels:
        for label_part in join.labelParts.labels:
            if label_part.groupBy and label_part.groupBy.sources:
                for source in label_part.groupBy.sources:
                    source_query = utils.get_query(source)
                    source_partition_column = _get_partition_col_from_query(source_query) or default_partition_col
                    deps.extend(_get_airflow_deps_from_source(source, source_partition_column))

    # Update the metadata customJson with dependencies
    _set_airflow_deps_json(join, deps)


def _set_group_by_deps(group_by):
    if not group_by.sources:
        return

    default_partition_col = extract_default_partition_column(group_by)

    deps = []

    # Process each source in the group_by
    for source in group_by.sources:
        source_query = utils.get_query(source)
        source_partition_column = _get_partition_col_from_query(source_query) or default_partition_col
        deps.extend(_get_airflow_deps_from_source(source, source_partition_column))

    # Update the metadata customJson with dependencies
    _set_airflow_deps_json(group_by, deps)


def _set_airflow_deps_json(obj, deps):
    existing_json = obj.metaData.customJson or "{}"
    json_map = json.loads(existing_json)
    json_map["airflowDependencies"] = deps
    obj.metaData.customJson = json.dumps(json_map)


def set_airflow_deps(obj):
    """
    Set Airflow dependencies for a Chronon object.

    Args:
        obj: A Join, GroupBy
    """
    # StagingQuery dependency setting is handled directly in object init
    if isinstance(obj, Join):
        _set_join_deps(obj)
    elif isinstance(obj, GroupBy):
        _set_group_by_deps(obj)
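
For orientation, a minimal usage sketch of the helpers above; the module path `ai.chronon.airflow_helpers` is inferred from the imports elsewhere in this commit, and the table name is illustrative:

```python
import ai.chronon.airflow_helpers as airflow_helpers  # assumed path, matching the imports below

# One dependency entry per upstream table: "name" is "wf_" plus the sanitized table name,
# and "spec" pins that table's partition to the Airflow {{ ds }} macro.
dep = airflow_helpers.create_airflow_dependency("kaggle_outbrain.events", "ds")
print(dep["spec"])  # kaggle_outbrain.events/ds={{ ds }}

# set_airflow_deps(obj) walks a compiled Join or GroupBy, collects one such entry per source
# table, and merges the list into metaData.customJson under "airflowDependencies",
# preserving any JSON already stored there (see _set_airflow_deps_json above).
```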

api/python/ai/chronon/cli/compile/parse_configs.py

Lines changed: 4 additions & 0 deletions
@@ -4,6 +4,7 @@
 import os
 from typing import List

+from ai.chronon import airflow_helpers
 from ai.chronon.cli.compile import parse_teams, serializer
 from ai.chronon.cli.compile.compile_context import CompileContext
 from ai.chronon.cli.compile.display.compiled_obj import CompiledObj
@@ -30,6 +31,9 @@ def from_folder(

     for name, obj in results_dict.items():
         parse_teams.update_metadata(obj, compile_context.teams_dict)
+        # Airflow deps must be set AFTER updating metadata
+        airflow_helpers.set_airflow_deps(obj)
+
         obj.metaData.sourceFile = os.path.relpath(f, compile_context.chronon_root)

         tjson = serializer.thrift_simple_json(obj)
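
Why the ordering comment matters: the dependency helpers assert on a partition column whose default lives in metadata that `update_metadata` fills in. A rough sketch of that chain, assuming team-level conf is merged into `obj.metaData` by `parse_teams.update_metadata` (consistent with the teams.py default mentioned in the summary above):

```python
# Sketch only, not the compiler code: after update_metadata has merged team defaults,
# the helper module can read the default partition column from the object's metadata.
def default_partition_column(obj):
    return obj.metaData.executionInfo.conf.common.get("spark.chronon.partition.column")

# If set_airflow_deps ran first, this lookup could return None, and the assertion in
# create_airflow_dependency would fire for sources whose Query sets no partitionColumn.
```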
api/python/ai/chronon/staging_query.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
import inspect
import json
from dataclasses import dataclass
from typing import Dict, List, Optional

import ai.chronon.airflow_helpers as airflow_helpers
import ai.chronon.api.common.ttypes as common
import ai.chronon.api.ttypes as ttypes


# Wrapper for EngineType
class EngineType:
    SPARK = ttypes.EngineType.SPARK
    BIGQUERY = ttypes.EngineType.BIGQUERY


@dataclass
class TableDependency:
    table: str
    partition_column: Optional[str] = None


def StagingQuery(
    name: str,
    query: str,
    output_namespace: Optional[str] = None,
    start_partition: Optional[str] = None,
    table_properties: Optional[Dict[str, str]] = None,
    setups: Optional[List[str]] = None,
    partition_column: Optional[str] = None,
    engine_type: Optional[EngineType] = None,
    dependencies: Optional[List[TableDependency]] = None,
    tags: Optional[Dict[str, str]] = None,
    # execution params
    offline_schedule: str = "@daily",
    conf: Optional[common.ConfigProperties] = None,
    env_vars: Optional[common.EnvironmentVariables] = None,
    step_days: Optional[int] = None,
) -> ttypes.StagingQuery:
    """
    Creates a StagingQuery object for executing arbitrary SQL queries with templated date parameters.

    :param query:
        Arbitrary spark query that should be written with template parameters:
        - `{{ start_date }}`: Initial run uses start_partition, future runs use latest partition + 1 day
        - `{{ end_date }}`: The end partition of the computing range
        - `{{ latest_date }}`: End partition independent of the computing range (for cumulative sources)
        - `{{ max_date(table=namespace.my_table) }}`: Max partition available for a given table
        These parameters can be modified with offset and bounds:
        - `{{ start_date(offset=-10, lower_bound='2023-01-01', upper_bound='2024-01-01') }}`
    :type query: str
    :param start_partition:
        On the first run, `{{ start_date }}` will be set to this user provided start date,
        future incremental runs will set it to the latest existing partition + 1 day.
    :type start_partition: str
    :param setups:
        Spark SQL setup statements. Used typically to register UDFs.
    :type setups: List[str]
    :param partition_column:
        Only needed for `max_date` template
    :type partition_column: str
    :param engine_type:
        By default, spark is the compute engine. You can specify an override (eg. bigquery, etc.)
        Use the EngineType class constants: EngineType.SPARK, EngineType.BIGQUERY, etc.
    :type engine_type: int
    :param tags:
        Additional metadata that does not directly affect computation, but is useful for management.
    :type tags: Dict[str, str]
    :param offline_schedule:
        The offline schedule interval for batch jobs. Format examples:
        '@hourly': '0 * * * *',
        '@daily': '0 0 * * *',
        '@weekly': '0 0 * * 0',
        '@monthly': '0 0 1 * *',
        '@yearly': '0 0 1 1 *'
    :type offline_schedule: str
    :param conf:
        Configuration properties for the StagingQuery.
    :type conf: common.ConfigProperties
    :param env_vars:
        Environment variables for the StagingQuery.
    :type env_vars: common.EnvironmentVariables
    :param step_days:
        The maximum number of days to process at once
    :type step_days: int
    :return:
        A StagingQuery object
    """
    # Get caller's filename to assign team
    team = inspect.stack()[1].filename.split("/")[-2]

    # Create execution info
    exec_info = common.ExecutionInfo(
        scheduleCron=offline_schedule,
        conf=conf,
        env=env_vars,
        stepDays=step_days,
    )

    airflow_dependencies = (
        [airflow_helpers.create_airflow_dependency(t.table, t.partition_column) for t in dependencies]
        if dependencies
        else []
    )
    custom_json = json.dumps({"airflow_dependencies": airflow_dependencies})

    # Create metadata
    meta_data = ttypes.MetaData(
        name=name,
        outputNamespace=output_namespace,
        team=team,
        executionInfo=exec_info,
        tags=tags,
        customJson=custom_json,
        tableProperties=table_properties,
    )

    # Create and return the StagingQuery object with camelCase parameter names
    staging_query = ttypes.StagingQuery(
        metaData=meta_data,
        query=query,
        startPartition=start_partition,
        setups=setups,
        partitionColumn=partition_column,
        engineType=engine_type,
    )

    return staging_query
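
A minimal sketch of how this wrapper is meant to be called and where the dependencies end up; the module path `ai.chronon.staging_query` matches the updated sample imports below, while the query, table, and names here are illustrative:

```python
import json

from ai.chronon.staging_query import StagingQuery, TableDependency

sq = StagingQuery(
    name="events_rollup",  # illustrative name
    query="SELECT * FROM data.events WHERE ds BETWEEN '{{ start_date }}' AND '{{ end_date }}'",
    output_namespace="data",
    dependencies=[TableDependency(table="data.events", partition_column="ds")],
)

# The declared dependencies are serialized into metaData.customJson under "airflow_dependencies".
print(json.loads(sq.metaData.customJson)["airflow_dependencies"])
# roughly: [{"name": "wf_<sanitized table>", "spec": "data.events/ds={{ ds }}"}]
```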

api/python/test/sample/staging_queries/kaggle/outbrain.py

Lines changed: 7 additions & 5 deletions
@@ -13,9 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from ai.chronon.api.ttypes import MetaData, StagingQuery
+from ai.chronon.staging_query import StagingQuery, TableDependency

 base_table = StagingQuery(
+    name='outbrain_left',
     query="""
         SELECT
             clicks_train.display_id,
@@ -35,8 +36,9 @@
         AND ABS(HASH(clicks_train.display_id)) % 100 < 5
         AND ABS(HASH(events.display_id)) % 100 < 5
     """,
-    metaData=MetaData(
-        name='outbrain_left',
-        outputNamespace="default",
-    )
+    output_namespace="default",
+    dependencies=[
+        TableDependency(table="kaggle_outbrain.clicks_train", partition_column="ds"),
+        TableDependency(table="kaggle_outbrain.events", partition_column="ds")
+    ],
 )

api/python/test/sample/staging_queries/quickstart/checkouts_external.py

Lines changed: 9 additions & 7 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from ai.chronon.api.ttypes import MetaData, StagingQuery
+from ai.chronon.staging_query import StagingQuery, TableDependency

 query = """
 SELECT
@@ -30,11 +30,13 @@
 WHERE purchases.ds BETWEEN '{{ start_date }}' AND '{{ end_date }}'
 """

-staging_query = StagingQuery(
+checkouts_query = StagingQuery(
     query=query,
-    startPartition="2023-10-31",
-    metaData=MetaData(
-        name='checkouts_staging_query',
-        outputNamespace="data"
-    ),
+    start_partition="2023-10-31",
+    name='checkouts_staging_query',
+    output_namespace="data",
+    dependencies=[
+        TableDependency(table="data.purchases", partition_column="ds"),
+        TableDependency(table="data.checkouts_external", partition_column="ds")
+    ],
 )

api/python/test/sample/staging_queries/sample_team/sample_staging_query.py

Lines changed: 9 additions & 10 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from ai.chronon.types import MetaData, StagingQuery
+from ai.chronon.staging_query import StagingQuery, TableDependency

 query = """
 SELECT
@@ -28,15 +28,14 @@

 v1 = StagingQuery(
     query=query,
-    startPartition="2020-03-01",
+    start_partition="2020-03-01",
     setups=[
         "CREATE TEMPORARY FUNCTION S2_CELL AS 'com.sample.hive.udf.S2CellId'",
     ],
-    metaData=MetaData(
-        name="sample_staging_query",
-        outputNamespace="sample_namespace",
-        tableProperties={
-            "sample_config_json": """{"sample_key": "sample value}""",
-        },
-    ),
-)
+    name="sample_staging_query",
+    output_namespace="sample_namespace",
+    table_properties={"sample_config_json": """{"sample_key": "sample value}"""},
+    dependencies=[
+        TableDependency(table="sample_namespace.sample_table", partition_column="ds")
+    ],
+)
