Skip to content

Commit 255111f

Browse files
bkossakowskaBeata Kossakowska
authored and committed
Add Dataplex Data Quality operators. (apache#32256)
--------- Co-authored-by: Beata Kossakowska <[email protected]>
1 parent 3fdbc57 commit 255111f

File tree

9 files changed

+2454
-18
lines changed

9 files changed

+2454
-18
lines changed

airflow/providers/google/cloud/hooks/dataplex.py

Lines changed: 498 additions & 2 deletions
Large diffs are not rendered by default.

airflow/providers/google/cloud/operators/dataplex.py

Lines changed: 854 additions & 5 deletions
Large diffs are not rendered by default.

airflow/providers/google/cloud/sensors/dataplex.py

Lines changed: 124 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,22 @@
1717
"""This module contains Google Dataplex sensors."""
1818
from __future__ import annotations
1919

20+
import time
2021
from typing import TYPE_CHECKING, Sequence
2122

2223
if TYPE_CHECKING:
2324
from airflow.utils.context import Context
24-
25+
from google.api_core.exceptions import GoogleAPICallError
2526
from google.api_core.gapic_v1.method import DEFAULT, _MethodDefault
2627
from google.api_core.retry import Retry
28+
from google.cloud.dataplex_v1.types import DataScanJob
2729

2830
from airflow.exceptions import AirflowException
29-
from airflow.providers.google.cloud.hooks.dataplex import DataplexHook
31+
from airflow.providers.google.cloud.hooks.dataplex import (
32+
AirflowDataQualityScanException,
33+
AirflowDataQualityScanResultTimeoutException,
34+
DataplexHook,
35+
)
3036
from airflow.sensors.base import BaseSensorOperator
3137

3238

@@ -114,3 +120,119 @@ def poke(self, context: Context) -> bool:
114120
self.log.info("Current status of the Dataplex task %s => %s", self.dataplex_task_id, task_status)
115121

116122
return task_status == TaskState.ACTIVE
123+
124+
125+
class DataplexDataQualityJobStatusSensor(BaseSensorOperator):
    """
    Check the status of the Dataplex DataQuality job.

    :param project_id: Required. The ID of the Google Cloud project that the task belongs to.
    :param region: Required. The ID of the Google Cloud region that the task belongs to.
    :param data_scan_id: Required. Data Quality scan identifier.
    :param job_id: Required. Job ID.
    :param api_version: The version of the api that will be requested for example 'v3'.
    :param retry: A retry object used to retry requests. If `None` is specified, requests
        will not be retried.
    :param metadata: Additional metadata that is provided to the method.
    :param gcp_conn_id: The connection ID to use when fetching connection info.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    :param result_timeout: Value in seconds for which operator will wait for the Data Quality scan result.
        Throws exception if there is no result found after specified amount of seconds.
    :param fail_on_dq_failure: If set to true and not all Data Quality scan rules have been passed,
        an exception is thrown. If set to false and not all Data Quality scan rules have been passed,
        execution will finish with success.
    :param start_sensor_time: Optional. A ``time.monotonic()`` timestamp marking when the
        ``result_timeout`` clock starts. If not provided it is taken at the first ``poke``.

    :return: Boolean indicating if the job run has reached the ``DataScanJob.State.SUCCEEDED``.
    """

    template_fields = ["job_id"]

    def __init__(
        self,
        project_id: str,
        region: str,
        data_scan_id: str,
        job_id: str,
        api_version: str = "v1",
        retry: Retry | _MethodDefault = DEFAULT,
        metadata: Sequence[tuple[str, str]] = (),
        gcp_conn_id: str = "google_cloud_default",
        impersonation_chain: str | Sequence[str] | None = None,
        fail_on_dq_failure: bool = False,
        result_timeout: float = 60.0 * 10,
        start_sensor_time: float | None = None,
        *args,
        **kwargs,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.project_id = project_id
        self.region = region
        self.data_scan_id = data_scan_id
        self.job_id = job_id
        self.api_version = api_version
        self.retry = retry
        self.metadata = metadata
        self.gcp_conn_id = gcp_conn_id
        self.impersonation_chain = impersonation_chain
        self.fail_on_dq_failure = fail_on_dq_failure
        self.result_timeout = result_timeout
        # BUGFIX: this used to default to ``time.monotonic()`` evaluated in the signature,
        # i.e. once at DAG-file parse time. A sensor scheduled long after parsing would
        # then hit ``result_timeout`` immediately. ``None`` means "start counting at the
        # first poke" (see ``_duration``).
        self.start_sensor_time = start_sensor_time

    def _duration(self) -> float:
        """Return seconds elapsed since the timeout clock started (lazily set on first call)."""
        if self.start_sensor_time is None:
            self.start_sensor_time = time.monotonic()
        return time.monotonic() - self.start_sensor_time

    def poke(self, context: Context) -> bool:
        """Fetch the Data Quality scan job and return True once it has SUCCEEDED.

        Raises ``AirflowDataQualityScanResultTimeoutException`` when ``result_timeout``
        elapses, ``AirflowException`` on FAILED/CANCELLED jobs or API call errors, and
        ``AirflowDataQualityScanException`` when ``fail_on_dq_failure`` is set and the
        scan succeeded but its rules did not all pass.
        """
        self.log.info("Waiting for job %s to be %s", self.job_id, DataScanJob.State.SUCCEEDED)
        if self.result_timeout:
            duration = self._duration()
            if duration > self.result_timeout:
                raise AirflowDataQualityScanResultTimeoutException(
                    f"Timeout: Data Quality scan {self.job_id} is not ready after {self.result_timeout}s"
                )

        hook = DataplexHook(
            gcp_conn_id=self.gcp_conn_id,
            api_version=self.api_version,
            impersonation_chain=self.impersonation_chain,
        )

        try:
            job = hook.get_data_scan_job(
                project_id=self.project_id,
                region=self.region,
                data_scan_id=self.data_scan_id,
                job_id=self.job_id,
                timeout=self.timeout,
                retry=self.retry,
                metadata=self.metadata,
            )
        except GoogleAPICallError as e:
            # Chain the cause so the original API error stays visible in the traceback.
            raise AirflowException(
                f"Error occurred when trying to retrieve Data Quality scan job: {self.data_scan_id}", e
            ) from e

        job_status = job.state
        self.log.info(
            "Current status of the Dataplex Data Quality scan job %s => %s", self.job_id, job_status
        )
        if job_status == DataScanJob.State.FAILED:
            raise AirflowException(f"Data Quality scan job failed: {self.job_id}")
        if job_status == DataScanJob.State.CANCELLED:
            raise AirflowException(f"Data Quality scan job cancelled: {self.job_id}")
        if self.fail_on_dq_failure:
            if job_status == DataScanJob.State.SUCCEEDED and not job.data_quality_result.passed:
                raise AirflowDataQualityScanException(
                    f"Data Quality job {self.job_id} execution failed due to failure of its scanning "
                    f"rules: {self.data_scan_id}"
                )
        return job_status == DataScanJob.State.SUCCEEDED

docs/apache-airflow-providers-google/operators/cloud/dataplex.rst

Lines changed: 164 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,6 @@ With this configuration we can create the lake:
129129
:start-after: [START howto_dataplex_create_lake_operator]
130130
:end-before: [END howto_dataplex_create_lake_operator]
131131

132-
133132
Delete a lake
134133
-------------
135134

@@ -142,3 +141,167 @@ To delete a lake you can use:
142141
:dedent: 4
143142
:start-after: [START howto_dataplex_delete_lake_operator]
144143
:end-before: [END howto_dataplex_delete_lake_operator]
144+
145+
Create or update a Data Quality scan
146+
------------------------------------
147+
148+
Before you create a Dataplex Data Quality scan you need to define its body.
149+
For more information about the available fields to pass when creating a Data Quality scan, visit `Dataplex create data quality API. <https://cloud.google.com/dataplex/docs/reference/rest/v1/projects.locations.dataScans#DataScan>`__
150+
151+
A simple Data Quality scan configuration can look as follows:
152+
153+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
154+
:language: python
155+
:dedent: 0
156+
:start-after: [START howto_dataplex_data_quality_configuration]
157+
:end-before: [END howto_dataplex_data_quality_configuration]
158+
159+
With this configuration we can create or update the Data Quality scan:
160+
161+
:class:`~airflow.providers.google.cloud.operators.dataplex.DataplexCreateOrUpdateDataQualityScanOperator`
162+
163+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
164+
:language: python
165+
:dedent: 4
166+
:start-after: [START howto_dataplex_create_data_quality_operator]
167+
:end-before: [END howto_dataplex_create_data_quality_operator]
168+
169+
Get a Data Quality scan
170+
-----------------------
171+
172+
To get a Data Quality scan you can use:
173+
174+
:class:`~airflow.providers.google.cloud.operators.dataplex.DataplexGetDataQualityScanOperator`
175+
176+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
177+
:language: python
178+
:dedent: 4
179+
:start-after: [START howto_dataplex_get_data_quality_operator]
180+
:end-before: [END howto_dataplex_get_data_quality_operator]
181+
182+
183+
184+
Delete a Data Quality scan
185+
--------------------------
186+
187+
To delete a Data Quality scan you can use:
188+
189+
:class:`~airflow.providers.google.cloud.operators.dataplex.DataplexDeleteDataQualityScanOperator`
190+
191+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
192+
:language: python
193+
:dedent: 4
194+
:start-after: [START howto_dataplex_delete_data_quality_operator]
195+
:end-before: [END howto_dataplex_delete_data_quality_operator]
196+
197+
Run a Data Quality scan
198+
-----------------------
199+
200+
You can run a Dataplex Data Quality scan in asynchronous mode and later check its status using a sensor:
201+
202+
:class:`~airflow.providers.google.cloud.operators.dataplex.DataplexRunDataQualityScanOperator`
203+
204+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
205+
:language: python
206+
:dedent: 4
207+
:start-after: [START howto_dataplex_run_data_quality_operator]
208+
:end-before: [END howto_dataplex_run_data_quality_operator]
209+
210+
To check that running Dataplex Data Quality scan succeeded you can use:
211+
212+
:class:`~airflow.providers.google.cloud.sensors.dataplex.DataplexDataQualityJobStatusSensor`.
213+
214+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
215+
:language: python
216+
:dedent: 4
217+
:start-after: [START howto_dataplex_data_scan_job_state_sensor]
218+
:end-before: [END howto_dataplex_data_scan_job_state_sensor]
219+
220+
Get a Data Quality scan job
221+
---------------------------
222+
223+
To get a Data Quality scan job you can use:
224+
225+
:class:`~airflow.providers.google.cloud.operators.dataplex.DataplexGetDataQualityScanResultOperator`
226+
227+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
228+
:language: python
229+
:dedent: 4
230+
:start-after: [START howto_dataplex_get_data_quality_job_operator]
231+
:end-before: [END howto_dataplex_get_data_quality_job_operator]
232+
233+
Create a zone
234+
-------------
235+
236+
Before you create a Dataplex zone you need to define its body.
237+
238+
For more information about the available fields to pass when creating a zone, visit `Dataplex create zone API. <https://cloud.google.com/dataplex/docs/reference/rest/v1/projects.locations.lakes.zones#Zone>`__
239+
240+
A simple zone configuration can look as follows:
241+
242+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
243+
:language: python
244+
:dedent: 0
245+
:start-after: [START howto_dataplex_zone_configuration]
246+
:end-before: [END howto_dataplex_zone_configuration]
247+
248+
With this configuration we can create a zone:
249+
250+
:class:`~airflow.providers.google.cloud.operators.dataplex.DataplexCreateZoneOperator`
251+
252+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
253+
:language: python
254+
:dedent: 4
255+
:start-after: [START howto_dataplex_create_zone_operator]
256+
:end-before: [END howto_dataplex_create_zone_operator]
257+
258+
Delete a zone
259+
-------------
260+
261+
To delete a zone you can use:
262+
263+
:class:`~airflow.providers.google.cloud.operators.dataplex.DataplexDeleteZoneOperator`
264+
265+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
266+
:language: python
267+
:dedent: 4
268+
:start-after: [START howto_dataplex_delete_zone_operator]
269+
:end-before: [END howto_dataplex_delete_zone_operator]
270+
271+
Create an asset
272+
---------------
273+
274+
Before you create a Dataplex asset you need to define its body.
275+
276+
For more information about the available fields to pass when creating an asset, visit `Dataplex create asset API. <https://cloud.google.com/dataplex/docs/reference/rest/v1/projects.locations.lakes.zones.assets#Asset>`__
277+
278+
A simple asset configuration can look as follows:
279+
280+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
281+
:language: python
282+
:dedent: 0
283+
:start-after: [START howto_dataplex_asset_configuration]
284+
:end-before: [END howto_dataplex_asset_configuration]
285+
286+
With this configuration we can create the asset:
287+
288+
:class:`~airflow.providers.google.cloud.operators.dataplex.DataplexCreateAssetOperator`
289+
290+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
291+
:language: python
292+
:dedent: 4
293+
:start-after: [START howto_dataplex_create_asset_operator]
294+
:end-before: [END howto_dataplex_create_asset_operator]
295+
296+
Delete an asset
297+
---------------
298+
299+
To delete an asset you can use:
300+
301+
:class:`~airflow.providers.google.cloud.operators.dataplex.DataplexDeleteAssetOperator`
302+
303+
.. exampleinclude:: /../../tests/system/providers/google/cloud/dataplex/example_dataplex_dq.py
304+
:language: python
305+
:dedent: 4
306+
:start-after: [START howto_dataplex_delete_asset_operator]
307+
:end-before: [END howto_dataplex_delete_asset_operator]

docs/spelling_wordlist.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,8 @@ datapoint
373373
Dataprep
374374
Dataproc
375375
dataproc
376+
DataScan
377+
dataScans
376378
Dataset
377379
dataset
378380
datasetId
@@ -485,6 +487,7 @@ DOS'ing
485487
DownloadReportV
486488
downscaling
487489
downstreams
490+
dq
488491
Drillbit
489492
Drivy
490493
dropdown

0 commit comments

Comments
 (0)