import logging
import os
from datetime import datetime, timedelta

import boto3
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.operators.emr import EmrContainerOperator
from botocore.exceptions import ClientError

from astronomer.providers.amazon.aws.sensors.emr import EmrContainerSensorAsync

# [START howto_operator_emr_eks_env_variables]
VIRTUAL_CLUSTER_ID = os.getenv("VIRTUAL_CLUSTER_ID", "xxxxxxxx")
AWS_CONN_ID = os.getenv("ASTRO_AWS_CONN_ID", "aws_default")
JOB_ROLE_ARN = os.getenv("JOB_ROLE_ARN", "arn:aws:iam::121212121212:role/test_iam_job_execution_role")
# [END howto_operator_emr_eks_env_variables]

# Name of the job execution role and of the policies attached to it
JOB_EXECUTION_ROLE = os.getenv("JOB_EXECUTION_ROLE", "test_iam_job_execution_role")
DEBUGGING_MONITORING_POLICY = os.getenv("DEBUGGING_MONITORING_POLICY", "test_debugging_monitoring_policy")
CONTAINER_SUBMIT_JOB_POLICY = os.getenv(
    "CONTAINER_SUBMIT_JOB_POLICY", "test_emr_container_submit_jobs_policy"
)
JOB_EXECUTION_POLICY = os.getenv("JOB_EXECUTION_POLICY", "test_job_execution_policy")
MANAGE_VIRTUAL_CLUSTERS = os.getenv("MANAGE_VIRTUAL_CLUSTERS", "test_manage_virtual_clusters")

EKS_CONTAINER_PROVIDER_CLUSTER_NAME = os.getenv(
    "EKS_CONTAINER_PROVIDER_CLUSTER_NAME", "providers-team-eks-cluster"
)
KUBECTL_CLUSTER_NAME = os.getenv("KUBECTL_CLUSTER_NAME", "providers-team-eks-namespace")
VIRTUAL_CLUSTER_NAME = os.getenv("EMR_VIRTUAL_CLUSTER_NAME", "providers-team-virtual-eks-cluster")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", "xxxxxxx")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "xxxxxxxx")
AWS_DEFAULT_REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-2")
INSTANCE_TYPE = os.getenv("INSTANCE_TYPE", "m4.large")
AIRFLOW_HOME = os.getenv("AIRFLOW_HOME", "/usr/local/airflow")

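# Default arguments shared by every task: cap each task run at 30 minutes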
default_args = {
    "execution_timeout": timedelta(minutes=30),
}


def create_emr_virtual_cluster_func() -> None:
    """Create an EMR on EKS virtual cluster for the configured EKS container provider."""
    client = boto3.client("emr-containers")
    try:
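        # Register the EKS cluster's namespace as an EMR on EKS virtual cluster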
        response = client.create_virtual_cluster(
            name=VIRTUAL_CLUSTER_NAME,
            containerProvider={
                "id": EKS_CONTAINER_PROVIDER_CLUSTER_NAME,
                "type": "EKS",
                "info": {"eksInfo": {"namespace": KUBECTL_CLUSTER_NAME}},
            },
        )
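        # Expose the new virtual cluster id to this process via an environment variable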
        os.environ["VIRTUAL_CLUSTER_ID"] = response["id"]
    except ClientError:
        logging.exception("Error while creating EMR virtual cluster")
    return None


# [START howto_operator_emr_eks_config]
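# Run the pi.py example bundled with Spark, submitted with a small driver/executor footprint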
JOB_DRIVER_ARG = {
    "sparkSubmitJobDriver": {
        "entryPoint": "local:///usr/lib/spark/examples/src/main/python/pi.py",
        "sparkSubmitParameters": "--conf spark.executor.instances=2 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1",  # noqa: E501
    }
}

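# Use the AWS Glue Data Catalog as the Hive metastore and send job logs to CloudWatch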
CONFIGURATION_OVERRIDES_ARG = {
    "applicationConfiguration": [
        {
            "classification": "spark-defaults",
            "properties": {
                "spark.hadoop.hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",  # noqa: E501
            },
        }
    ],
    "monitoringConfiguration": {
        "cloudWatchMonitoringConfiguration": {
            "logGroupName": "/aws/emr-eks-spark",
            "logStreamNamePrefix": "airflow",
        }
    },
}
# [END howto_operator_emr_eks_config]

with DAG(
    dag_id="emr_eks_pi_job",
    start_date=datetime(2022, 1, 1),
    schedule_interval=None,
    catchup=False,
    default_args=default_args,
    tags=["emr", "example"],
) as dag:
    # Configure AWS CLI credentials so the helper shell scripts below can run;
    # this keeps the example DAG self-sufficient
    setup_aws_config = BashOperator(
        task_id="setup_aws_config",
        bash_command=f"aws configure set aws_access_key_id {AWS_ACCESS_KEY_ID}; "
        f"aws configure set aws_secret_access_key {AWS_SECRET_ACCESS_KEY}; "
        f"aws configure set default.region {AWS_DEFAULT_REGION}; ",
    )

    # Create the EKS cluster, Kubernetes namespace, and job execution role via a helper script
    create_EKS_cluster_kube_namespace_with_role = BashOperator(
        task_id="create_EKS_cluster_kube_namespace_with_role",
        bash_command="sh $AIRFLOW_HOME/dags/example_create_EKS_kube_namespace_with_role.sh ",
    )

    # Task to create EMR virtual cluster
    create_EMR_virtual_cluster = PythonOperator(
        task_id="create_EMR_virtual_cluster",
        python_callable=create_emr_virtual_cluster_func,
    )

    # [START howto_operator_run_emr_container_job]
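    # Submit the Spark job run to the EMR virtual cluster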
    run_emr_container_job = EmrContainerOperator(
        task_id="run_emr_container_job",
        virtual_cluster_id=VIRTUAL_CLUSTER_ID,
        execution_role_arn=JOB_ROLE_ARN,
        release_label="emr-6.2.0-latest",
        job_driver=JOB_DRIVER_ARG,
        configuration_overrides=CONFIGURATION_OVERRIDES_ARG,
        name="pi.py",
    )
    # [END howto_operator_run_emr_container_job]

    # [START howto_sensor_emr_job_container_sensor]
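    # Wait asynchronously (from the triggerer) for the job run to reach a terminal state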
    emr_job_container_sensor = EmrContainerSensorAsync(
        task_id="emr_job_container_sensor",
        job_id=run_emr_container_job.output,
        virtual_cluster_id=VIRTUAL_CLUSTER_ID,
        poll_interval=5,
        aws_conn_id=AWS_CONN_ID,
    )
    # [END howto_sensor_emr_job_container_sensor]

    # Delete clusters, container providers, role, policy
    remove_clusters_container_role_policy = BashOperator(
        task_id="remove_clusters_container_role_policy",
        bash_command="sh $AIRFLOW_HOME/dags/example_delete_eks_cluster_and_role_policies.sh ",
        trigger_rule="all_done",
    )

    (
        setup_aws_config
        >> create_EKS_cluster_kube_namespace_with_role
        >> create_EMR_virtual_cluster
        >> run_emr_container_job
        >> emr_job_container_sensor
        >> remove_clusters_container_role_policy
    )