Skip to content

Commit dd218dd

Browse files
authored
Improve provider verification pre-commit (#33640)
There were a number of problems with the provider verification pre-commit scripts: * It missed verification of "notifications" * It did not check whether the classes or modules specified in provider yaml raised deprecation warnings * The messages produced by the script when discrepancies were found were pretty cryptic, and it was difficult to guess what kind of action should be taken to fix the problem. This PR fixes all those problems: * verification of notifications is performed * when importing all the classes and modules, a check for AirflowProviderDeprecationWarning is done and any such warning is treated as an error * The messages produced provide clear, actionable instructions on what to do and explain the discrepancies between the expected and current lists in a clear way
1 parent 2ae1c10 commit dd218dd

File tree

2 files changed

+116
-24
lines changed

2 files changed

+116
-24
lines changed

scripts/ci/pre_commit/pre_commit_check_provider_yaml_files.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@
5151
*get_extra_docker_flags(MOUNT_SELECTED),
5252
"-e",
5353
"SKIP_ENVIRONMENT_INITIALIZATION=true",
54+
"-e",
55+
"PYTHONWARNINGS=default",
5456
"--pull",
5557
"never",
5658
airflow_image,

scripts/in_container/run_provider_yaml_files_check.py

Lines changed: 114 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import platform
2727
import sys
2828
import textwrap
29+
import warnings
2930
from collections import Counter
3031
from enum import Enum
3132
from typing import Any, Iterable
@@ -37,18 +38,22 @@
3738
from tabulate import tabulate
3839

3940
from airflow.cli.commands.info_command import Architecture
41+
from airflow.exceptions import AirflowProviderDeprecationWarning
4042
from airflow.providers_manager import ProvidersManager
4143

4244
# Those are deprecated modules that contain removed Hooks/Sensors/Operators that we left in the code
4345
# so that users can get a very specific error message when they try to use them.
4446

45-
EXCLUDED_MODULES = [
47+
DEPRECATED_MODULES = [
4648
"airflow.providers.apache.hdfs.sensors.hdfs",
4749
"airflow.providers.apache.hdfs.hooks.hdfs",
4850
"airflow.providers.cncf.kubernetes.triggers.kubernetes_pod",
4951
"airflow.providers.cncf.kubernetes.operators.kubernetes_pod",
5052
]
5153

54+
KNOWN_DEPRECATED_CLASSES = [
55+
"airflow.providers.google.cloud.links.dataproc.DataprocLink",
56+
]
5257

5358
try:
5459
from yaml import CSafeLoader as SafeLoader
@@ -71,6 +76,13 @@
7176
errors: list[str] = []
7277

7378
console = Console(width=400, color_system="standard")
79+
# you need to enable warnings for all deprecations - needed by importlib library to show deprecations
80+
if os.environ.get("PYTHONWARNINGS") != "default":
81+
console.print(
82+
"[red]Error: PYTHONWARNINGS not set[/]\n"
83+
"You must set `PYTHONWARNINGS=default` environment variable to run this script"
84+
)
85+
sys.exit(1)
7486

7587
suspended_providers: set[str] = set()
7688
suspended_logos: set[str] = set()
@@ -136,7 +148,14 @@ def check_integration_duplicates(yaml_files: dict[str, dict]):
136148
sys.exit(3)
137149

138150

139-
def assert_sets_equal(set1, set2, allow_extra_in_set2=False):
151+
def assert_sets_equal(
152+
set1: set[str],
153+
set_name_1: str,
154+
set2: set[str],
155+
set_name_2: str,
156+
allow_extra_in_set2=False,
157+
extra_message: str = "",
158+
):
140159
try:
141160
difference1 = set1.difference(set2)
142161
except TypeError as e:
@@ -153,6 +172,8 @@ def assert_sets_equal(set1, set2, allow_extra_in_set2=False):
153172

154173
if difference1 or (difference2 and not allow_extra_in_set2):
155174
lines = []
175+
lines.append(f" Left set:{set_name_1}")
176+
lines.append(f" Right set:{set_name_2}")
156177
if difference1:
157178
lines.append(" -- Items in the left set but not the right:")
158179
for item in sorted(difference1):
@@ -163,6 +184,8 @@ def assert_sets_equal(set1, set2, allow_extra_in_set2=False):
163184
lines.append(f" {item!r}")
164185

165186
standard_msg = "\n".join(lines)
187+
if extra_message:
188+
standard_msg += f"\n{extra_message}"
166189
raise AssertionError(standard_msg)
167190

168191

@@ -174,12 +197,37 @@ class ObjectType(Enum):
174197
def check_if_object_exist(object_name: str, resource_type: str, yaml_file_path: str, object_type: ObjectType):
175198
try:
176199
if object_type == ObjectType.CLASS:
177-
module_name, object_name = object_name.rsplit(".", maxsplit=1)
178-
the_class = getattr(importlib.import_module(module_name), object_name)
200+
module_name, class_name = object_name.rsplit(".", maxsplit=1)
201+
with warnings.catch_warnings(record=True) as w:
202+
the_class = getattr(importlib.import_module(module_name), class_name)
203+
for warn in w:
204+
if warn.category == AirflowProviderDeprecationWarning:
205+
if object_name in KNOWN_DEPRECATED_CLASSES:
206+
console.print(
207+
f"[yellow]The {object_name} class is deprecated and we know about it. "
208+
f"It should be removed in the future."
209+
)
210+
continue
211+
errors.append(
212+
f"The `{class_name}` class in {resource_type} list in {yaml_file_path} "
213+
f"is deprecated with this message: '{warn.message}'.\n"
214+
f"[yellow]How to fix it[/]: Please remove it from provider.yaml and replace with "
215+
f"the new class."
216+
)
179217
if the_class and inspect.isclass(the_class):
180218
return
181219
elif object_type == ObjectType.MODULE:
182-
module = importlib.import_module(object_name)
220+
with warnings.catch_warnings(record=True) as w:
221+
module = importlib.import_module(object_name)
222+
for warn in w:
223+
if warn.category == AirflowProviderDeprecationWarning:
224+
errors.append(
225+
f"The `{object_name}` module in {resource_type} list in {yaml_file_path} "
226+
f"is deprecated with this message: '{warn.message}'.\n"
227+
f"[yellow]How to fix it[/]: Please remove it from provider.yaml and replace it "
228+
f"with the new module. If you see warnings in classes - fix the classes so that "
229+
f"they are not raising Deprecation Warnings when module is imported."
230+
)
183231
if inspect.ismodule(module):
184232
return
185233
else:
@@ -231,23 +279,32 @@ def parse_module_data(provider_data, resource_type, yaml_file_path):
231279
return expected_modules, provider_package, resource_data
232280

233281

234-
def check_correctness_of_list_of_sensors_operators_hook_modules(yaml_files: dict[str, dict]):
235-
print("Checking completeness of list of {sensors, hooks, operators, triggers}")
236-
print(" -- {sensors, hooks, operators, triggers} - Expected modules (left) : Current modules (right)")
282+
def check_correctness_of_list_of_sensors_operators_hook_trigger_modules(yaml_files: dict[str, dict]):
283+
print(" -- Checking completeness of list of {sensors, hooks, operators, triggers}")
237284
for (yaml_file_path, provider_data), resource_type in itertools.product(
238285
yaml_files.items(), ["sensors", "operators", "hooks", "triggers"]
239286
):
240287
expected_modules, provider_package, resource_data = parse_module_data(
241288
provider_data, resource_type, yaml_file_path
242289
)
243-
expected_modules = {module for module in expected_modules if module not in EXCLUDED_MODULES}
290+
expected_modules = {module for module in expected_modules if module not in DEPRECATED_MODULES}
244291
current_modules = {str(i) for r in resource_data for i in r.get("python-modules", [])}
245292

246293
check_if_objects_exist_and_belong_to_package(
247294
current_modules, provider_package, yaml_file_path, resource_type, ObjectType.MODULE
248295
)
249296
try:
250-
assert_sets_equal(set(expected_modules), set(current_modules))
297+
package_name = os.fspath(ROOT_DIR.joinpath(yaml_file_path).parent.relative_to(ROOT_DIR)).replace(
298+
"/", "."
299+
)
300+
assert_sets_equal(
301+
set(expected_modules),
302+
f"Found list of {resource_type} modules in provider package: {package_name}",
303+
set(current_modules),
304+
f"Currently configured list of {resource_type} modules in {yaml_file_path}",
305+
extra_message="[yellow]If there are deprecated modules in the list, please add them to "
306+
f"DEPRECATED_MODULES in {pathlib.Path(__file__).relative_to(ROOT_DIR)}[/]",
307+
)
251308
except AssertionError as ex:
252309
nested_error = textwrap.indent(str(ex), " ")
253310
errors.append(
@@ -276,19 +333,27 @@ def check_completeness_of_list_of_transfers(yaml_files: dict[str, dict]):
276333
print("Checking completeness of list of transfers")
277334
resource_type = "transfers"
278335

279-
print(" -- Expected transfers modules(Left): Current transfers Modules(Right)")
336+
print(" -- Checking transfers modules")
280337
for yaml_file_path, provider_data in yaml_files.items():
281338
expected_modules, provider_package, resource_data = parse_module_data(
282339
provider_data, resource_type, yaml_file_path
283340
)
284-
expected_modules = {module for module in expected_modules if module not in EXCLUDED_MODULES}
341+
expected_modules = {module for module in expected_modules if module not in DEPRECATED_MODULES}
285342
current_modules = {r.get("python-module") for r in resource_data}
286343

287344
check_if_objects_exist_and_belong_to_package(
288345
current_modules, provider_package, yaml_file_path, resource_type, ObjectType.MODULE
289346
)
290347
try:
291-
assert_sets_equal(set(expected_modules), set(current_modules))
348+
package_name = os.fspath(ROOT_DIR.joinpath(yaml_file_path).parent.relative_to(ROOT_DIR)).replace(
349+
"/", "."
350+
)
351+
assert_sets_equal(
352+
set(expected_modules),
353+
f"Found list of transfer modules in provider package: {package_name}",
354+
set(current_modules),
355+
f"Currently configured list of transfer modules in {yaml_file_path}",
356+
)
292357
except AssertionError as ex:
293358
nested_error = textwrap.indent(str(ex), " ")
294359
errors.append(
@@ -337,6 +402,18 @@ def check_extra_link_classes(yaml_files: dict[str, dict]):
337402
)
338403

339404

405+
def check_notification_classes(yaml_files: dict[str, dict]):
    """Run the class-level check on every provider's ``notifications`` entries.

    :param yaml_files: mapping of provider.yaml path -> parsed content of that file.

    For each provider that declares a non-empty ``notifications`` list, the list is
    handed to ``check_if_objects_exist_and_belong_to_package`` with
    ``ObjectType.CLASS``. Providers without notifications are skipped.
    """
    print("Checking notifications belong to package, exist and are classes")
    resource_type = "notifications"
    for yaml_file_path, provider_data in yaml_files.items():
        # Dotted package name derived from the directory holding provider.yaml.
        package_dir = pathlib.Path(yaml_file_path).parent
        provider_package = package_dir.as_posix().replace("/", ".")
        notifications = provider_data.get(resource_type)
        if not notifications:
            # Nothing declared (missing key or empty list) - nothing to verify.
            continue
        check_if_objects_exist_and_belong_to_package(
            notifications,
            provider_package,
            yaml_file_path,
            resource_type,
            ObjectType.CLASS,
        )
415+
416+
340417
def check_duplicates_in_list_of_transfers(yaml_files: dict[str, dict]):
341418
print("Checking for duplicates in list of transfers")
342419
errors = []
@@ -435,11 +512,20 @@ def check_doc_files(yaml_files: dict[str, dict]):
435512
}
436513

437514
try:
438-
print(" -- Checking document urls: expected (left), current (right)")
439-
assert_sets_equal(set(expected_doc_urls), set(current_doc_urls))
440-
441-
print(" -- Checking logo urls: expected (left), current (right)")
442-
assert_sets_equal(set(expected_logo_urls), set(current_logo_urls))
515+
print(" -- Checking document urls")
516+
assert_sets_equal(
517+
set(expected_doc_urls),
518+
"Document urls found in airflow/docs",
519+
set(current_doc_urls),
520+
"Document urls configured in provider.yaml files",
521+
)
522+
print(" -- Checking logo urls")
523+
assert_sets_equal(
524+
set(expected_logo_urls),
525+
"Logo urls found in airflow/docs/integration-logos",
526+
set(current_logo_urls),
527+
"Logo urls configured in provider.yaml files",
528+
)
443529
except AssertionError as ex:
444530
print(ex)
445531
sys.exit(1)
@@ -465,12 +551,15 @@ def check_providers_are_mentioned_in_issue_template(yaml_files: dict[str, dict])
465551
issue_template = yaml.safe_load(issue_file)
466552
all_mentioned_providers = [match.value for match in jsonpath_expr.find(issue_template)]
467553
try:
468-
print(
469-
f" -- Checking providers: present in code (left), "
470-
f"mentioned in {PROVIDER_ISSUE_TEMPLATE_PATH} (right)"
471-
)
554+
print(f" -- Checking providers are mentioned in {PROVIDER_ISSUE_TEMPLATE_PATH}")
472555
# in case of suspended providers, we still want to have them in the issue template
473-
assert_sets_equal(set(short_provider_names), set(all_mentioned_providers), allow_extra_in_set2=True)
556+
assert_sets_equal(
557+
set(short_provider_names),
558+
"Provider names found in provider.yaml files",
559+
set(all_mentioned_providers),
560+
f"Provider names mentioned in {PROVIDER_ISSUE_TEMPLATE_PATH}",
561+
allow_extra_in_set2=True,
562+
)
474563
except AssertionError as ex:
475564
print(ex)
476565
sys.exit(1)
@@ -512,7 +601,8 @@ def check_providers_have_all_documentation_files(yaml_files: dict[str, dict]):
512601
check_hook_connection_classes(all_parsed_yaml_files)
513602
check_plugin_classes(all_parsed_yaml_files)
514603
check_extra_link_classes(all_parsed_yaml_files)
515-
check_correctness_of_list_of_sensors_operators_hook_modules(all_parsed_yaml_files)
604+
check_correctness_of_list_of_sensors_operators_hook_trigger_modules(all_parsed_yaml_files)
605+
check_notification_classes(all_parsed_yaml_files)
516606
check_unique_provider_name(all_parsed_yaml_files)
517607
check_providers_have_all_documentation_files(all_parsed_yaml_files)
518608

0 commit comments

Comments
 (0)