|
| 1 | +import json |
| 2 | +from unittest import mock |
| 3 | +from unittest.mock import MagicMock |
| 4 | + |
| 5 | +import pytest |
| 6 | +from airflow.exceptions import TaskDeferred |
| 7 | +from airflow.models.dagrun import DagRun |
| 8 | +from airflow.models.taskinstance import TaskInstance |
| 9 | +from airflow.utils.timezone import datetime |
| 10 | +from airflow.utils.types import DagRunType |
| 11 | +from openlineage.client.facet import OutputStatisticsOutputDatasetFacet |
| 12 | +from openlineage.common.dataset import Dataset, Source |
| 13 | +from openlineage.common.provider.bigquery import ( |
| 14 | + BigQueryFacets, |
| 15 | + BigQueryJobRunFacet, |
| 16 | + BigQueryStatisticsDatasetFacet, |
| 17 | +) |
| 18 | + |
| 19 | +from astronomer.providers.google.cloud.extractors.bigquery_async_extractor import ( |
| 20 | + BigQueryAsyncExtractor, |
| 21 | +) |
| 22 | +from astronomer.providers.google.cloud.operators.bigquery import ( |
| 23 | + BigQueryInsertJobOperatorAsync, |
| 24 | +) |
| 25 | + |
# Shared constants for the BigQuery async-extractor tests.
TEST_DATASET_LOCATION = "EU"
TEST_GCP_PROJECT_ID = "test-project"
TEST_DATASET = "test-dataset"
TEST_TABLE = "test-table"
# Fixed execution date keeps the generated SQL deterministic across runs.
EXECUTION_DATE = datetime(2022, 1, 1, 0, 0, 0)
INSERT_DATE = EXECUTION_DATE.strftime("%Y-%m-%d")
# Sample INSERT statement used as the operator's query configuration.
INSERT_ROWS_QUERY = (
    f"INSERT {TEST_DATASET}.{TEST_TABLE} VALUES "
    f"(42, 'monthy python', '{INSERT_DATE}'), "
    f"(42, 'fishy fish', '{INSERT_DATE}');"
)
| 37 | + |
# Canned OpenLineage input dataset returned by the mocked
# BigQueryDatasetsProvider.get_facets in the tests below.
INPUT_STATS = [
    Dataset(
        source=Source(scheme="bigquery"),
        name=f"astronomer-airflow-providers.{TEST_DATASET}.{TEST_TABLE}",
        fields=[],
        custom_facets={},
        input_facets={},
        output_facets={},
    )
]

# Canned OpenLineage output dataset; row count/size match the two-row
# INSERT_ROWS_QUERY above and are asserted on in test_extract_on_complete.
OUTPUT_STATS = Dataset(
    source=Source(scheme="bigquery"),
    name=f"astronomer-airflow-providers.{TEST_DATASET}.{TEST_TABLE}",
    fields=[],
    custom_facets={"stats": BigQueryStatisticsDatasetFacet(rowCount=2, size=0)},
    input_facets={},
    output_facets={"outputStatistics": OutputStatisticsOutputDatasetFacet(rowCount=2, size=0)},
)
| 57 | + |
# Load a recorded BigQuery job-details payload from a test fixture file
# (path is relative to the repository root, where pytest is expected to run).
with open("tests/google/cloud/extractors/job_details.json") as jd_json:
    JOB_PROPERTIES = json.load(jd_json)

# Run facet the mocked dataset provider hands back; properties round-trips
# through json.dumps so tests can json.loads it and compare to JOB_PROPERTIES.
RUN_FACETS = {
    "bigQuery_job": BigQueryJobRunFacet(billedBytes=0, cached=False, properties=json.dumps(JOB_PROPERTIES))
}
| 64 | + |
| 65 | + |
@pytest.fixture
def context():
    """Yield an empty Airflow task context for operator execution."""
    yield {}
| 73 | + |
| 74 | + |
@mock.patch("astronomer.providers.google.cloud.operators.bigquery._BigQueryHook")
@mock.patch("airflow.models.TaskInstance.xcom_pull")
@mock.patch("openlineage.common.provider.bigquery.BigQueryDatasetsProvider.get_facets")
def test_extract_on_complete(mock_bg_dataset_provider, mock_xcom_pull, mock_hook, context):
    """
    Tests that the custom extractor's implementation for the
    BigQueryInsertJobOperatorAsync is able to process the operator's metadata
    that needs to be extracted as per OpenLineage.

    Fix: the ``context`` fixture is now requested as a parameter. Previously
    the name ``context`` inside this test resolved to the module-level fixture
    *function object* (the fixture was never injected), so a function — not
    the empty dict the fixture yields — was passed to ``operator.execute``.
    """
    configuration = {
        "query": {
            "query": INSERT_ROWS_QUERY,
            "useLegacySql": False,
        }
    }
    job_id = "123456"
    # Hook returns a fake job so the operator reaches its deferral point.
    mock_hook.return_value.insert_job.return_value = MagicMock(job_id=job_id, error_result=False)
    # The dataset provider is mocked to hand back the canned lineage facets.
    mock_bg_dataset_provider.return_value = BigQueryFacets(
        run_facets=RUN_FACETS, inputs=INPUT_STATS, output=OUTPUT_STATS
    )

    task_id = "insert_query_job"
    operator = BigQueryInsertJobOperatorAsync(
        task_id=task_id,
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )

    task_instance = TaskInstance(task=operator)
    # The async operator always defers; the TaskDeferred here is expected.
    with pytest.raises(TaskDeferred):
        operator.execute(context)

    bq_extractor = BigQueryAsyncExtractor(operator)
    # extract() yields nothing for this extractor; metadata only arrives
    # via extract_on_complete once the job has finished.
    task_meta_extract = bq_extractor.extract()
    assert task_meta_extract is None

    task_meta = bq_extractor.extract_on_complete(task_instance)

    # The extractor must pull the BigQuery job ID the operator stored in XCOM.
    mock_xcom_pull.assert_called_once_with(task_ids=task_instance.task_id, key="job_id")

    assert task_meta.name == f"adhoc_airflow.{task_id}"

    assert task_meta.inputs[0].facets["dataSource"].name == INPUT_STATS[0].source.scheme
    assert task_meta.inputs[0].name == INPUT_STATS[0].name

    assert task_meta.outputs[0].name == OUTPUT_STATS.name
    assert task_meta.outputs[0].facets["stats"].rowCount == 2
    assert task_meta.outputs[0].facets["stats"].size == 0

    assert task_meta.run_facets["bigQuery_job"].billedBytes == 0
    # properties was serialized with json.dumps; round-trip and compare.
    run_facet_properties = json.loads(task_meta.run_facets["bigQuery_job"].properties)
    assert run_facet_properties == JOB_PROPERTIES
| 128 | + |
| 129 | + |
def test_extractor_works_on_operator():
    """Verify that BigQueryAsyncExtractor registers itself for the BigQueryInsertJobOperatorAsync operator class."""
    operator = BigQueryInsertJobOperatorAsync(task_id="insert_query_job", configuration={})
    registered_names = BigQueryAsyncExtractor.get_operator_classnames()
    assert type(operator).__name__ in registered_names
| 135 | + |
| 136 | + |
@mock.patch("astronomer.providers.google.cloud.operators.bigquery._BigQueryHook")
def test_unavailable_xcom_raises_exception(mock_hook, context):
    """
    Tests that an exception is logged when the custom extractor cannot retrieve
    the required XCOM for the BigQueryInsertJobOperatorAsync Operator.

    Fix: the ``context`` fixture is now requested as a parameter. Previously
    the name ``context`` resolved to the module-level fixture *function
    object*, so a function — not the empty dict the fixture yields — was
    passed to ``operator.execute``.
    """
    configuration = {
        "query": {
            "query": INSERT_ROWS_QUERY,
            "useLegacySql": False,
        }
    }
    job_id = "123456"
    mock_hook.return_value.insert_job.return_value = MagicMock(job_id=job_id, error_result=False)
    task_id = "insert_query_job"
    operator = BigQueryInsertJobOperatorAsync(
        task_id=task_id,
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )

    task_instance = TaskInstance(task=operator)
    # A run_id is set (unlike the happy-path test) so the real xcom_pull is
    # exercised; no XCOM exists for this run, which triggers the logged error.
    execution_date = datetime(2022, 1, 1, 0, 0, 0)
    task_instance.run_id = DagRun.generate_run_id(DagRunType.MANUAL, execution_date)

    # The async operator always defers; the TaskDeferred here is expected.
    with pytest.raises(TaskDeferred):
        operator.execute(context)
    bq_extractor = BigQueryAsyncExtractor(operator)
    with mock.patch.object(bq_extractor.log, "exception") as mock_log_exception:
        task_meta = bq_extractor.extract_on_complete(task_instance)

    # The extractor should log the failure rather than raise, and still
    # return basic task metadata.
    mock_log_exception.assert_called_with("%s", "Could not pull relevant BigQuery job ID from XCOM")
    assert task_meta.name == f"adhoc_airflow.{task_id}"
0 commit comments