Skip to content

Commit c0492b0

Browse files
authored
Upload git info with metadata file (#1) (#37802)
## What

Adds git commit info to the metadata file during upload.

![image.png](https://graphite-user-uploaded-assets-prod.s3.amazonaws.com/PTsI7qAmiIMkhFQg04QF/b7de4cce-ffe8-4506-a13d-027b1ba21a34.png)

Spun out of #32715 as a stack.
1 parent 99ab869 commit c0492b0

21 files changed

+1417
-487
lines changed

airbyte-ci/connectors/metadata_service/lib/metadata_service/gcs_upload.py

+86-14
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@
55
import base64
66
import hashlib
77
import json
8+
import logging
89
import os
910
import re
11+
import tempfile
1012
from dataclasses import dataclass
1113
from pathlib import Path
1214
from typing import List, Optional, Tuple
1315

16+
import git
1417
import yaml
1518
from google.cloud import storage
1619
from google.oauth2 import service_account
@@ -23,8 +26,10 @@
2326
METADATA_FOLDER,
2427
)
2528
from metadata_service.models.generated.ConnectorMetadataDefinitionV0 import ConnectorMetadataDefinitionV0
29+
from metadata_service.models.generated.GitInfo import GitInfo
2630
from metadata_service.models.transform import to_json_sanitized_dict
2731
from metadata_service.validators.metadata_validator import POST_UPLOAD_VALIDATORS, ValidatorOptions, validate_and_load
32+
from pydash import set_
2833
from pydash.objects import get
2934

3035

@@ -172,27 +177,95 @@ def _doc_upload(
172177
return doc_uploaded, doc_blob_id
173178

174179

175-
def create_prerelease_metadata_file(metadata_file_path: Path, validator_opts: ValidatorOptions) -> Path:
176-
metadata, error = validate_and_load(metadata_file_path, [], validator_opts)
177-
if metadata is None:
178-
raise ValueError(f"Metadata file {metadata_file_path} is invalid for uploading: {error}")
180+
def _apply_prerelease_overrides(metadata_dict: dict, validator_opts: ValidatorOptions) -> dict:
181+
"""Apply any prerelease overrides to the metadata file before uploading it to GCS."""
182+
if validator_opts.prerelease_tag is None:
183+
return metadata_dict
179184

180185
# replace any dockerImageTag references with the actual tag
181186
# this includes metadata.data.dockerImageTag, metadata.data.registries[].dockerImageTag
182187
# where registries is a dictionary of registry name to registry object
183-
metadata_dict = to_json_sanitized_dict(metadata, exclude_none=True)
184188
metadata_dict["data"]["dockerImageTag"] = validator_opts.prerelease_tag
185189
for registry in get(metadata_dict, "data.registries", {}).values():
186190
if "dockerImageTag" in registry:
187191
registry["dockerImageTag"] = validator_opts.prerelease_tag
188192

189-
# write metadata to yaml file in system tmp folder
190-
tmp_metadata_file_path = Path("/tmp") / metadata.data.dockerRepository / validator_opts.prerelease_tag / METADATA_FILE_NAME
191-
tmp_metadata_file_path.parent.mkdir(parents=True, exist_ok=True)
192-
with open(tmp_metadata_file_path, "w") as f:
193-
yaml.dump(metadata_dict, f)
193+
return metadata_dict
194+
195+
196+
def _commit_to_git_info(commit: git.Commit) -> GitInfo:
    """Build a ``GitInfo`` model from a GitPython commit.

    The commit's hex sha, authored datetime, author name and author email are
    copied onto the matching ``GitInfo`` fields.
    """
    git_info_fields = {
        "commit_sha": commit.hexsha,
        "commit_timestamp": commit.authored_datetime,
        "commit_author": commit.author.name,
        "commit_author_email": commit.author.email,
    }
    return GitInfo(**git_info_fields)
203+
204+
205+
def _get_git_info_for_file(original_metadata_file_path: Path) -> Optional[GitInfo]:
    """Return git details of the last commit that modified the metadata file.

    Args:
        original_metadata_file_path: Path of the metadata file to look up in
            git history.

    Returns:
        A ``GitInfo`` describing that commit, or ``None`` (after logging a
        warning) when no git repository is found or git has no history for
        the file.

    Raises:
        git.exc.GitCommandError: If ``git log`` fails for any reason other
            than the path being unknown to git.
    """
    try:
        # NOTE(review): the repository is discovered from the current working
        # directory, not from the metadata file's own location - confirm this
        # is intended for callers that run from outside the repository.
        repo = git.Repo(search_parent_directories=True)

        # Get the commit hash for the last commit that modified the metadata
        # file. "--" makes git treat the argument strictly as a path, never
        # as a revision.
        commit_sha = repo.git.log("-1", "--format=%H", "--", str(original_metadata_file_path)).strip()

        # git exits successfully with empty output when the path has no
        # history (e.g. a new, uncommitted file); there is no commit to
        # resolve in that case.
        if not commit_sha:
            logging.warning(f"Metadata file {original_metadata_file_path} is not tracked by git, skipping author info attachment.")
            return None

        commit = repo.commit(commit_sha)
        return _commit_to_git_info(commit)
    except git.exc.InvalidGitRepositoryError:
        logging.warning(f"Metadata file {original_metadata_file_path} is not in a git repository, skipping author info attachment.")
        return None
    except git.exc.GitCommandError as e:
        if "unknown revision or path not in the working tree" in str(e):
            logging.warning(f"Metadata file {original_metadata_file_path} is not tracked by git, skipping author info attachment.")
            return None
        # Any other git failure is unexpected; re-raise with the original traceback.
        raise
229+
194230

195-
return tmp_metadata_file_path
231+
def _apply_author_info_to_metadata_file(metadata_dict: dict, original_metadata_file_path: Path) -> dict:
    """Attach git commit details for the metadata file to the metadata dict.

    When git info is available for ``original_metadata_file_path`` it is
    stored at the nested, optional path ``data.generated.git``; otherwise the
    dict is returned as received.
    """
    git_info = _get_git_info_for_file(original_metadata_file_path)
    if not git_info:
        return metadata_dict

    serialized_git_info = to_json_sanitized_dict(git_info, exclude_none=True)
    return set_(metadata_dict, "data.generated.git", serialized_git_info)
239+
240+
241+
def _write_metadata_to_tmp_file(metadata_dict: dict) -> Path:
    """Dump the metadata dict as YAML into a new temporary file.

    The file is created with ``delete=False`` and a ``.yaml`` suffix, so it is
    still on disk after this function returns. Returns the file's path.
    """
    tmp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
    with tmp_file:
        yaml.dump(metadata_dict, tmp_file)
    return Path(tmp_file.name)
246+
247+
248+
def _safe_load_metadata_file(metadata_file_path: Path) -> dict:
    """Read the metadata file and parse it as a YAML mapping.

    Args:
        metadata_file_path: Path of the YAML metadata file to load.

    Returns:
        The parsed top-level mapping.

    Raises:
        ValueError: If the file cannot be read or parsed, or if its top-level
            value is not a mapping (which includes an empty document).
    """
    try:
        raw_text = metadata_file_path.read_text()
        metadata = yaml.safe_load(raw_text)
    except Exception as e:
        # Every read/parse failure is surfaced as ValueError, with the
        # original exception chained as the cause.
        raise ValueError(f"Validation error: Metadata file {metadata_file_path} is invalid yaml: {e}") from e

    # Checked outside the try block so this error is not caught above and
    # wrapped a second time. isinstance() also rejects None.
    if not isinstance(metadata, dict):
        raise ValueError(f"Validation error: Metadata file {metadata_file_path} is invalid yaml.")
    return metadata
256+
257+
258+
def _apply_modifications_to_metadata_file(original_metadata_file_path: Path, validator_opts: ValidatorOptions) -> Path:
    """Produce the modified copy of the metadata file that is uploaded to GCS.

    The original file is loaded, then prerelease overrides and git author
    info are applied in that order, and the result is written to a temporary
    file.

    Returns the path of that temporary file.
    """
    loaded_metadata = _safe_load_metadata_file(original_metadata_file_path)

    with_overrides = _apply_prerelease_overrides(loaded_metadata, validator_opts)
    with_git_info = _apply_author_info_to_metadata_file(with_overrides, original_metadata_file_path)

    return _write_metadata_to_tmp_file(with_git_info)
196269

197270

198271
def upload_metadata_to_gcs(bucket_name: str, metadata_file_path: Path, validator_opts: ValidatorOptions) -> MetadataUploadInfo:
@@ -209,11 +282,10 @@ def upload_metadata_to_gcs(bucket_name: str, metadata_file_path: Path, validator
209282
Returns:
210283
Tuple[bool, str]: Whether the metadata file was uploaded and its blob id.
211284
"""
212-
if validator_opts.prerelease_tag:
213-
metadata_file_path = create_prerelease_metadata_file(metadata_file_path, validator_opts)
214285

215-
metadata, error = validate_and_load(metadata_file_path, POST_UPLOAD_VALIDATORS, validator_opts)
286+
metadata_file_path = _apply_modifications_to_metadata_file(metadata_file_path, validator_opts)
216287

288+
metadata, error = validate_and_load(metadata_file_path, POST_UPLOAD_VALIDATORS, validator_opts)
217289
if metadata is None:
218290
raise ValueError(f"Metadata file {metadata_file_path} is invalid for uploading: {error}")
219291

airbyte-ci/connectors/metadata_service/lib/metadata_service/models/generated/ConnectorMetadataDefinitionV0.py

+29-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from __future__ import annotations
55

6-
from datetime import date
6+
from datetime import date, datetime
77
from typing import Any, Dict, List, Optional
88
from uuid import UUID
99

@@ -104,6 +104,28 @@ class Config:
104104
packageName: str = Field(..., description="The name of the package on PyPi.")
105105

106106

107+
class GitInfo(BaseModel):
    # Git details of the last commit that modified a metadata file.
    # Every field is optional and defaults to None.
    class Config:
        # Reject any field that is not declared on this model.
        extra = Extra.forbid

    commit_sha: Optional[str] = Field(
        None,
        description="The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_timestamp: Optional[datetime] = Field(
        None,
        description="The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author: Optional[str] = Field(
        None,
        description="The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author_email: Optional[str] = Field(
        None,
        description="The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
127+
128+
107129
class JobTypeResourceLimit(BaseModel):
108130
class Config:
109131
extra = Extra.forbid
@@ -123,6 +145,10 @@ class Config:
123145
pypi: Optional[PyPi] = None
124146

125147

148+
class GeneratedFields(BaseModel):
    # Container for generated metadata fields; currently only the optional git info.
    git: Optional[GitInfo] = None
150+
151+
126152
class ActorDefinitionResourceRequirements(BaseModel):
127153
class Config:
128154
extra = Extra.forbid
@@ -232,7 +258,8 @@ class Config:
232258
resourceRequirements: Optional[ActorDefinitionResourceRequirements] = None
233259
ab_internal: Optional[AirbyteInternal] = None
234260
remoteRegistries: Optional[RemoteRegistries] = None
235-
supportsRefreshes: Optional[bool] = None
261+
supportsRefreshes: Optional[bool] = False
262+
generated: Optional[GeneratedFields] = None
236263

237264

238265
class ConnectorMetadataDefinitionV0(BaseModel):

airbyte-ci/connectors/metadata_service/lib/metadata_service/models/generated/ConnectorRegistryDestinationDefinition.py

+1
Original file line numberDiff line numberDiff line change
@@ -171,3 +171,4 @@ class Config:
171171
allowedHosts: Optional[AllowedHosts] = None
172172
releases: Optional[ConnectorReleases] = None
173173
ab_internal: Optional[AirbyteInternal] = None
174+
supportsRefreshes: Optional[bool] = False

airbyte-ci/connectors/metadata_service/lib/metadata_service/models/generated/ConnectorRegistryV0.py

+1
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ class Config:
213213
allowedHosts: Optional[AllowedHosts] = None
214214
releases: Optional[ConnectorReleases] = None
215215
ab_internal: Optional[AirbyteInternal] = None
216+
supportsRefreshes: Optional[bool] = False
216217

217218

218219
class ConnectorRegistryV0(BaseModel):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# generated by datamodel-codegen:
2+
# filename: GeneratedFields.yaml
3+
4+
from __future__ import annotations
5+
6+
from datetime import datetime
7+
from typing import Optional
8+
9+
from pydantic import BaseModel, Extra, Field
10+
11+
12+
class GitInfo(BaseModel):
    # Git details of the last commit that modified a metadata file.
    # Every field is optional and defaults to None.
    class Config:
        # Reject any field that is not declared on this model.
        extra = Extra.forbid

    commit_sha: Optional[str] = Field(
        None,
        description="The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_timestamp: Optional[datetime] = Field(
        None,
        description="The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author: Optional[str] = Field(
        None,
        description="The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author_email: Optional[str] = Field(
        None,
        description="The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
32+
33+
34+
class GeneratedFields(BaseModel):
    # Container for generated metadata fields; currently only the optional git info.
    git: Optional[GitInfo] = None
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# generated by datamodel-codegen:
2+
# filename: GitInfo.yaml
3+
4+
from __future__ import annotations
5+
6+
from datetime import datetime
7+
from typing import Optional
8+
9+
from pydantic import BaseModel, Extra, Field
10+
11+
12+
class GitInfo(BaseModel):
    # Git details of the last commit that modified a metadata file.
    # Every field is optional and defaults to None.
    class Config:
        # Reject any field that is not declared on this model.
        extra = Extra.forbid

    commit_sha: Optional[str] = Field(
        None,
        description="The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_timestamp: Optional[datetime] = Field(
        None,
        description="The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author: Optional[str] = Field(
        None,
        description="The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author_email: Optional[str] = Field(
        None,
        description="The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )

airbyte-ci/connectors/metadata_service/lib/metadata_service/models/generated/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from .ConnectorRegistrySourceDefinition import *
99
from .ConnectorRegistryV0 import *
1010
from .ConnectorReleases import *
11+
from .GeneratedFields import *
12+
from .GitInfo import *
1113
from .JobType import *
1214
from .NormalizationDestinationDefinitionConfig import *
1315
from .RegistryOverrides import *

airbyte-ci/connectors/metadata_service/lib/metadata_service/models/src/ConnectorMetadataDefinitionV0.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,5 @@ properties:
116116
supportsRefreshes:
117117
type: boolean
118118
default: false
119+
generated:
120+
"$ref": GeneratedFields.yaml
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
"$schema": http://json-schema.org/draft-07/schema#
3+
"$id": https://github.com/airbytehq/airbyte/airbyte-ci/connectors_ci/metadata_service/lib/models/src/GeneratedFields.yaml
4+
title: GeneratedFields
5+
description: Optional schema for fields generated at metadata upload time
6+
type: object
7+
properties:
8+
git:
9+
"$ref": GitInfo.yaml
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
---
2+
"$schema": http://json-schema.org/draft-07/schema#
3+
"$id": https://github.com/airbytehq/airbyte/airbyte-ci/connectors/metadata_service/lib/metadata_service/models/src/GitInfo.yaml
4+
title: GitInfo
5+
description: Information about the last commit that modified this file (sha, timestamp, author name and author email)
6+
type: object
7+
additionalProperties: false
8+
properties:
9+
commit_sha:
10+
type: string
11+
description: The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
12+
commit_timestamp:
13+
type: string
14+
format: date-time
15+
description: The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
16+
commit_author:
17+
type: string
18+
description: The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
19+
commit_author_email:
20+
type: string
21+
description: The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.

0 commit comments

Comments
 (0)