|
5 | 5 |
|
6 | 6 | from airflow import models
|
7 | 7 | from airflow.providers.google.cloud.operators.dataproc import (
|
8 |
| - ClusterGenerator, |
9 | 8 | DataprocCreateClusterOperator,
|
10 |
| - DataprocCreateWorkflowTemplateOperator, |
11 | 9 | DataprocDeleteClusterOperator,
|
12 |
| - DataprocInstantiateInlineWorkflowTemplateOperator, |
13 |
| - DataprocInstantiateWorkflowTemplateOperator, |
14 |
| - DataprocUpdateClusterOperator, |
15 | 10 | )
|
16 | 11 | from airflow.providers.google.cloud.operators.gcs import (
|
17 | 12 | GCSCreateBucketOperator,
|
|
40 | 35 | "machine_type_uri": "n1-standard-4",
|
41 | 36 | "disk_config": {"boot_disk_type": "pd-standard", "boot_disk_size_gb": 1024},
|
42 | 37 | },
|
43 |
| - "worker_config": { |
44 |
| - "num_instances": 2, |
45 |
| - "machine_type_uri": "n1-standard-4", |
46 |
| - "disk_config": {"boot_disk_type": "pd-standard", "boot_disk_size_gb": 1024}, |
47 |
| - }, |
48 | 38 | }
|
49 | 39 |
|
50 | 40 | # [END how_to_cloud_dataproc_create_cluster]
|
51 | 41 |
|
52 |
| -# Cluster definition: Generating Cluster Config for DataprocCreateClusterOperator |
53 |
| -# [START how_to_cloud_dataproc_create_cluster_generate_cluster_config] |
54 |
| -path = "gs://goog-dataproc-initialization-actions-us-central1/python/pip-install.sh" |
55 |
| - |
56 |
| -CLUSTER_GENERATOR_CONFIG = ClusterGenerator( |
57 |
| - project_id="test", |
58 |
| - zone="us-central1-a", |
59 |
| - master_machine_type="n1-standard-4", |
60 |
| - worker_machine_type="n1-standard-4", |
61 |
| - num_workers=2, |
62 |
| - storage_bucket="test", |
63 |
| - init_actions_uris=[path], |
64 |
| - metadata={"PIP_PACKAGES": "pyyaml requests pandas openpyxl"}, |
65 |
| -).make() |
66 |
| - |
67 |
| -create_cluster_operator = DataprocCreateClusterOperator( |
68 |
| - task_id="create_dataproc_cluster", |
69 |
| - cluster_name="test", |
70 |
| - project_id="test", |
71 |
| - region="us-central1", |
72 |
| - cluster_config=CLUSTER_GENERATOR_CONFIG, |
73 |
| -) |
74 |
| -# [END how_to_cloud_dataproc_create_cluster_generate_cluster_config] |
75 |
| - |
76 |
| -# Update options |
77 |
| -# [START how_to_cloud_dataproc_updatemask_cluster_operator] |
78 |
| -CLUSTER_UPDATE = { |
79 |
| - "config": {"worker_config": {"num_instances": 3}, "secondary_worker_config": {"num_instances": 3}} |
80 |
| -} |
81 |
| -UPDATE_MASK = { |
82 |
| - "paths": ["config.worker_config.num_instances", "config.secondary_worker_config.num_instances"] |
83 |
| -} |
84 |
| -# [END how_to_cloud_dataproc_updatemask_cluster_operator] |
85 |
| - |
86 | 42 | TIMEOUT = {"seconds": 1 * 24 * 60 * 60}
|
87 | 43 |
|
88 | 44 | # Jobs definitions
|
|
183 | 139 | )
|
184 | 140 | # [END howto_create_bucket_task]
|
185 | 141 |
|
186 |
| - # [START how_to_cloud_dataproc_update_cluster_operator] |
187 |
| - scale_cluster = DataprocUpdateClusterOperator( |
188 |
| - task_id="scale_cluster", |
189 |
| - cluster_name=CLUSTER_NAME, |
190 |
| - cluster=CLUSTER_UPDATE, |
191 |
| - update_mask=UPDATE_MASK, |
192 |
| - graceful_decommission_timeout=TIMEOUT, |
193 |
| - project_id=PROJECT_ID, |
194 |
| - region=REGION, |
195 |
| - ) |
196 |
| - # [END how_to_cloud_dataproc_update_cluster_operator] |
197 |
| - |
198 |
| - # [START how_to_cloud_dataproc_create_workflow_template] |
199 |
| - create_workflow_template = DataprocCreateWorkflowTemplateOperator( |
200 |
| - task_id="create_workflow_template", |
201 |
| - template=WORKFLOW_TEMPLATE, |
202 |
| - project_id=PROJECT_ID, |
203 |
| - region=REGION, |
204 |
| - ) |
205 |
| - # [END how_to_cloud_dataproc_create_workflow_template] |
206 |
| - |
207 |
| - # [START how_to_cloud_dataproc_trigger_workflow_template] |
208 |
| - trigger_workflow = DataprocInstantiateWorkflowTemplateOperator( |
209 |
| - task_id="trigger_workflow", region=REGION, project_id=PROJECT_ID, template_id=WORKFLOW_NAME |
210 |
| - ) |
211 |
| - # [END how_to_cloud_dataproc_trigger_workflow_template] |
212 |
| - |
213 |
| - # [START how_to_cloud_dataproc_instantiate_inline_workflow_template] |
214 |
| - instantiate_inline_workflow_template = DataprocInstantiateInlineWorkflowTemplateOperator( |
215 |
| - task_id="instantiate_inline_workflow_template", template=WORKFLOW_TEMPLATE, region=REGION |
216 |
| - ) |
217 |
| - # [END how_to_cloud_dataproc_instantiate_inline_workflow_template] |
218 |
| - |
219 | 142 | # [START howto_DataprocSubmitJobOperatorAsync]
|
220 | 143 | pig_task = DataprocSubmitJobOperatorAsync(
|
221 | 144 | task_id="pig_task", job=PIG_JOB, region=REGION, project_id=PROJECT_ID
|
|
243 | 166 | # [END howto_DataprocSubmitJobOperatorAsync]
|
244 | 167 | # [START how_to_cloud_dataproc_delete_cluster_operator]
|
245 | 168 | delete_cluster = DataprocDeleteClusterOperator(
|
246 |
| - task_id="delete_cluster", project_id=PROJECT_ID, cluster_name=CLUSTER_NAME, region=REGION |
| 169 | + task_id="delete_cluster", |
| 170 | + project_id=PROJECT_ID, |
| 171 | + cluster_name=CLUSTER_NAME, |
| 172 | + region=REGION, |
| 173 | + trigger_rule="all_done", |
247 | 174 | )
|
248 | 175 | # [END how_to_cloud_dataproc_delete_cluster_operator]
|
249 | 176 | # [START howto_delete_buckettask]
|
250 | 177 | delete_bucket = GCSDeleteBucketOperator(
|
251 | 178 | task_id="delete_bucket",
|
252 | 179 | bucket_name=BUCKET,
|
| 180 | + trigger_rule="all_done", |
253 | 181 | )
|
254 | 182 | # [END howto_delete_buckettask]
|
255 | 183 |
|
256 |
| - create_cluster >> scale_cluster >> create_bucket |
257 |
| - scale_cluster >> create_workflow_template >> trigger_workflow >> delete_cluster |
258 |
| - scale_cluster >> hive_task >> delete_cluster >> delete_bucket |
259 |
| - scale_cluster >> pig_task >> delete_cluster >> delete_bucket |
260 |
| - scale_cluster >> spark_sql_task >> delete_cluster >> delete_bucket |
261 |
| - scale_cluster >> spark_task >> delete_cluster >> delete_bucket |
262 |
| - scale_cluster >> hadoop_task >> delete_cluster >> delete_bucket |
| 184 | + create_cluster >> create_bucket |
| 185 | + create_cluster >> pig_task >> hive_task >> delete_cluster >> delete_bucket |
| 186 | + create_cluster >> spark_task >> spark_sql_task >> hadoop_task >> delete_cluster >> delete_bucket |
0 commit comments