 import hashlib
 import os
 
-import dateutil
-import humanize
 import pandas as pd
+import yaml
 from dagster import OpExecutionContext, Output, asset
 from github import Repository
 from orchestrator.logging import sentry
+from orchestrator.models.metadata import LatestMetadataEntry, MetadataDefinition, PartialMetadataDefinition
 from orchestrator.ops.slack import send_slack_message
 from orchestrator.utils.dagster_helpers import OutputDataFrame, output_dataframe
 
 GROUP_NAME = "github"
+TOOLING_TEAM_SLACK_TEAM_ID = "S077R8636CV"
 
 
 def _get_md5_of_github_file(context: OpExecutionContext, github_connector_repo: Repository, path: str) -> str:
@@ -34,6 +35,11 @@ def _get_md5_of_github_file(context: OpExecutionContext, github_connector_repo:
     return base_64_value
 
 
+def _get_content_of_github_file(context: OpExecutionContext, github_connector_repo: Repository, path: str) -> str:
+    context.log.debug(f"retrieving contents of {path}")
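+    # NOTE: Repository.get_contents() returns a PyGithub ContentFile (callers read .decoded_content), not a plain str.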
+    return github_connector_repo.get_contents(path)
+
+
 @asset(required_resource_keys={"github_connectors_directory"}, group_name=GROUP_NAME)
 @sentry.instrument_asset_op
 def github_connector_folders(context):
@@ -65,75 +71,76 @@ def github_metadata_file_md5s(context):
     return Output(metadata_file_paths, metadata={"preview": metadata_file_paths})
 
 
-def _should_publish_have_ran(datetime_string: str) -> bool:
-    """
-    Return true if the datetime is 2 hours old.
-
-    """
-    dt = dateutil.parser.parse(datetime_string)
-    now = datetime.datetime.now(datetime.timezone.utc)
-    two_hours_ago = now - datetime.timedelta(hours=2)
-    return dt < two_hours_ago
-
-
-def _to_time_ago(datetime_string: str) -> str:
+@asset(required_resource_keys={"github_connector_repo", "github_connectors_metadata_files"}, group_name=GROUP_NAME)
+def github_metadata_definitions(context):
     """
-    Return a string of how long ago the datetime is human readable format. 10 min
+    Return a list of all metadata definitions hosted on our GitHub repo.
     """
-    dt = dateutil.parser.parse(datetime_string)
-    return humanize.naturaltime(dt)
+    github_connector_repo = context.resources.github_connector_repo
+    github_connectors_metadata_files = context.resources.github_connectors_metadata_files
 
+    metadata_definitions = []
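+    # Parse each metadata.yaml on master into a MetadataDefinition model, keeping GitHub's last_modified timestamp alongside it.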
+    for metadata_file in github_connectors_metadata_files:
+        metadata_raw = _get_content_of_github_file(context, github_connector_repo, metadata_file["path"])
+        metadata_dict = yaml.safe_load(metadata_raw.decoded_content)
+        metadata_definitions.append(
+            LatestMetadataEntry(
+                metadata_definition=MetadataDefinition.parse_obj(metadata_dict), last_modified=metadata_file["last_modified"]
+            )
+        )
 
-def _is_stale(github_file_info: dict, latest_gcs_metadata_md5s: dict) -> bool:
-    """
-    Return true if the github info is stale.
-    """
-    not_in_gcs = latest_gcs_metadata_md5s.get(github_file_info["md5"]) is None
-    return not_in_gcs and _should_publish_have_ran(github_file_info["last_modified"])
+    return Output(metadata_definitions, metadata={"preview": [md.json() for md in metadata_definitions]})
 
 
-@asset(required_resource_keys={"slack", "latest_metadata_file_blobs"}, group_name=GROUP_NAME)
-def stale_gcs_latest_metadata_file(context, github_metadata_file_md5s: dict) -> OutputDataFrame:
+@asset(required_resource_keys={"slack"}, group_name=GROUP_NAME)
+def stale_gcs_latest_metadata_file(context, github_metadata_definitions: list, metadata_definitions: list) -> OutputDataFrame:
     """
     Return a list of all metadata files in the github repo and denote whether they are stale or not.
 
     Stale means that the file in the github repo is not in the latest metadata file blobs.
     """
-    human_readable_stale_bools = {True: "🚨 YES!!!", False: "No"}
-    latest_gcs_metadata_file_blobs = context.resources.latest_metadata_file_blobs
-    latest_gcs_metadata_md5s = {blob.md5_hash: blob.name for blob in latest_gcs_metadata_file_blobs}
-
-    stale_report = [
-        {
-            "stale": _is_stale(github_file_info, latest_gcs_metadata_md5s),
-            "github_path": github_path,
-            "github_md5": github_file_info["md5"],
-            "github_last_modified": _to_time_ago(github_file_info["last_modified"]),
-            "gcs_md5": latest_gcs_metadata_md5s.get(github_file_info["md5"]),
-            "gcs_path": latest_gcs_metadata_md5s.get(github_file_info["md5"]),
-        }
-        for github_path, github_file_info in github_metadata_file_md5s.items()
-    ]
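+    # Latest versions published to GCS: dockerRepository -> dockerImageTag, skipping archived connectors.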
+    latest_versions_on_gcs = {
+        metadata_entry.metadata_definition.data.dockerRepository: metadata_entry.metadata_definition.data.dockerImageTag
+        for metadata_entry in metadata_definitions
+        if metadata_entry.metadata_definition.data.supportLevel != "archived"
+    }
+
+    now = datetime.datetime.now(datetime.timezone.utc)
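+    # Same mapping for master; last_modified is assumed to be an HTTP-date string (e.g. "Mon, 01 Jan 2024 00:00:00 GMT") from the GitHub API.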
+    latest_versions_on_github = {
+        metadata_entry.metadata_definition.data.dockerRepository: metadata_entry.metadata_definition.data.dockerImageTag
+        for metadata_entry in github_metadata_definitions
+        if metadata_entry.metadata_definition.data.supportLevel != "archived"
+        # We give a 2 hour grace period for the metadata to be updated
+        and datetime.datetime.strptime(metadata_entry.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=datetime.timezone.utc)
+        < now - datetime.timedelta(hours=2)
+    }
 
-    stale_metadata_files_df = pd.DataFrame(stale_report)
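+    # A connector is stale when the tag on master differs from the tag on GCS (or the connector is missing from GCS entirely).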
+    stale_connectors = []
+    for docker_repository, github_docker_image_tag in latest_versions_on_github.items():
+        gcs_docker_image_tag = latest_versions_on_gcs.get(docker_repository)
+        if gcs_docker_image_tag != github_docker_image_tag:
+            stale_connectors.append(
+                {"connector": docker_repository, "master_version": github_docker_image_tag, "gcs_version": gcs_docker_image_tag}
+            )
 
-    # sort by stale true to false, then by github_path
-    stale_metadata_files_df = stale_metadata_files_df.sort_values(
-        by=["stale", "github_path"],
-        ascending=[False, True],
-    )
+    stale_connectors_df = pd.DataFrame(stale_connectors)
 
     # If any stale files exist, report to slack
     channel = os.getenv("STALE_REPORT_CHANNEL")
-    any_stale = stale_metadata_files_df["stale"].any()
-    if channel and any_stale:
-        only_stale_df = stale_metadata_files_df[stale_metadata_files_df["stale"] == True]
-        pretty_stale_df = only_stale_df.replace(human_readable_stale_bools)
-        stale_report_md = pretty_stale_df.to_markdown(index=False)
-        send_slack_message(context, channel, stale_report_md, enable_code_block_wrapping=True)
-
-    stale_metadata_files_df.replace(human_readable_stale_bools, inplace=True)
-    return output_dataframe(stale_metadata_files_df)
+    any_stale = len(stale_connectors_df) > 0
+    if channel:
+        if any_stale:
+            stale_report_md = stale_connectors_df.to_markdown(index=False)
+            send_slack_message(context, channel, f"🚨 Stale metadata detected! (cc. <!subteam^{TOOLING_TEAM_SLACK_TEAM_ID}>)")
+            send_slack_message(context, channel, stale_report_md, enable_code_block_wrapping=True)
+        else:
+            message = f"""
+            Analyzed {len(github_metadata_definitions)} metadata files on our master branch and {len(metadata_definitions)} latest metadata files hosted in GCS.
+            All metadata versions on master match the latest versions on GCS.
+            No stale metadata: GCS is up to date with the metadata hosted on our master branch.
+            """
+            send_slack_message(context, channel, message)
+    return output_dataframe(stale_connectors_df)
 
 
 @asset(required_resource_keys={"github_connector_nightly_workflow_successes"}, group_name=GROUP_NAME)