Skip to content

Upload git info with metadata file (#1) #37802

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
import base64
import hashlib
import json
import logging
import os
import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

import git
import yaml
from google.cloud import storage
from google.oauth2 import service_account
Expand All @@ -23,8 +26,10 @@
METADATA_FOLDER,
)
from metadata_service.models.generated.ConnectorMetadataDefinitionV0 import ConnectorMetadataDefinitionV0
from metadata_service.models.generated.GitInfo import GitInfo
from metadata_service.models.transform import to_json_sanitized_dict
from metadata_service.validators.metadata_validator import POST_UPLOAD_VALIDATORS, ValidatorOptions, validate_and_load
from pydash import set_
from pydash.objects import get


Expand Down Expand Up @@ -172,27 +177,95 @@ def _doc_upload(
return doc_uploaded, doc_blob_id


def create_prerelease_metadata_file(metadata_file_path: Path, validator_opts: ValidatorOptions) -> Path:
metadata, error = validate_and_load(metadata_file_path, [], validator_opts)
if metadata is None:
raise ValueError(f"Metadata file {metadata_file_path} is invalid for uploading: {error}")
def _apply_prerelease_overrides(metadata_dict: dict, validator_opts: ValidatorOptions) -> dict:
"""Apply any prerelease overrides to the metadata file before uploading it to GCS."""
if validator_opts.prerelease_tag is None:
return metadata_dict

# replace any dockerImageTag references with the actual tag
# this includes metadata.data.dockerImageTag, metadata.data.registries[].dockerImageTag
# where registries is a dictionary of registry name to registry object
metadata_dict = to_json_sanitized_dict(metadata, exclude_none=True)
metadata_dict["data"]["dockerImageTag"] = validator_opts.prerelease_tag
for registry in get(metadata_dict, "data.registries", {}).values():
if "dockerImageTag" in registry:
registry["dockerImageTag"] = validator_opts.prerelease_tag

# write metadata to yaml file in system tmp folder
tmp_metadata_file_path = Path("/tmp") / metadata.data.dockerRepository / validator_opts.prerelease_tag / METADATA_FILE_NAME
tmp_metadata_file_path.parent.mkdir(parents=True, exist_ok=True)
with open(tmp_metadata_file_path, "w") as f:
yaml.dump(metadata_dict, f)
return metadata_dict


def _commit_to_git_info(commit: git.Commit) -> GitInfo:
    """Build a GitInfo record from the sha, authored datetime and author of *commit*."""
    git_info_fields = {
        "commit_sha": commit.hexsha,
        "commit_timestamp": commit.authored_datetime,
        "commit_author": commit.author.name,
        "commit_author_email": commit.author.email,
    }
    return GitInfo(**git_info_fields)


def _get_git_info_for_file(original_metadata_file_path: Path) -> Optional[GitInfo]:
    """Return git info for the last commit that modified the given metadata file.

    The commit is located with ``git log -1 --format=%H <path>`` in the repository
    found by ``git.Repo(search_parent_directories=True)``.

    Args:
        original_metadata_file_path: Path of the metadata file to look up in the git history.

    Returns:
        The GitInfo built from that commit, or None (after logging a warning) when no
        git repository is found or git reports no commit for the file.

    Raises:
        git.exc.GitCommandError: If the git command fails for any reason other than
            the path being unknown to git.
    """
    try:
        repo = git.Repo(search_parent_directories=True)

        # get the commit hash for the last commit that modified the metadata file
        commit_sha = repo.git.log("-1", "--format=%H", str(original_metadata_file_path)).strip()

        if not commit_sha:
            # `git log` succeeds with empty output when the path has no commit history;
            # an empty string must not be handed to repo.commit() as a revision.
            logging.warning(f"Metadata file {original_metadata_file_path} is not tracked by git, skipping author info attachment.")
            return None

        commit = repo.commit(commit_sha)
        return _commit_to_git_info(commit)
    except git.exc.InvalidGitRepositoryError:
        logging.warning(f"Metadata file {original_metadata_file_path} is not in a git repository, skipping author info attachment.")
        return None
    except git.exc.GitCommandError as e:
        if "unknown revision or path not in the working tree" in str(e):
            logging.warning(f"Metadata file {original_metadata_file_path} is not tracked by git, skipping author info attachment.")
            return None
        # Any other git failure is unexpected: re-raise with the original traceback.
        raise


return tmp_metadata_file_path
def _apply_author_info_to_metadata_file(metadata_dict: dict, original_metadata_file_path: Path) -> dict:
    """Attach git author info for the metadata file to the metadata dict.

    When git info is found for ``original_metadata_file_path`` it is stored at
    ``data.generated.git``; otherwise the dict is returned as received.
    """
    last_commit_info = _get_git_info_for_file(original_metadata_file_path)
    if not last_commit_info:
        return metadata_dict

    # The target is a nested / optional field, hence the path-based setter.
    sanitized_info = to_json_sanitized_dict(last_commit_info, exclude_none=True)
    return set_(metadata_dict, "data.generated.git", sanitized_info)


def _write_metadata_to_tmp_file(metadata_dict: dict) -> Path:
    """Dump the metadata dict as YAML into a new named temporary file and return its path.

    The file is created with ``delete=False`` and a ``.yaml`` suffix.
    """
    handle = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
    with handle:
        yaml.dump(metadata_dict, handle)
        written_path = Path(handle.name)
    return written_path


def _safe_load_metadata_file(metadata_file_path: Path) -> dict:
    """Read and parse a metadata YAML file, requiring a mapping at the top level.

    Args:
        metadata_file_path: Path of the YAML file to load.

    Returns:
        The parsed top-level mapping.

    Raises:
        ValueError: If the file cannot be read or parsed, or if its content is
            not a mapping (this includes an empty document, which parses to None).
    """
    try:
        metadata = yaml.safe_load(metadata_file_path.read_text())
    except Exception as e:
        # Any read or parse failure is reported as a validation error, with the cause chained.
        raise ValueError(f"Validation error: Metadata file {metadata_file_path} is invalid yaml: {e}") from e

    # Checked outside the try block so this error is not caught and wrapped a second time.
    if not isinstance(metadata, dict):
        raise ValueError(f"Validation error: Metadata file {metadata_file_path} is invalid yaml.")
    return metadata


def _apply_modifications_to_metadata_file(original_metadata_file_path: Path, validator_opts: ValidatorOptions) -> Path:
    """Load the metadata file, apply the pre-upload modifications and write the result to a temp file.

    The modifications are applied in this order: prerelease overrides, then git
    author info for the original metadata file.

    Returns the path of the temporary file holding the modified metadata.
    """
    loaded = _safe_load_metadata_file(original_metadata_file_path)
    with_overrides = _apply_prerelease_overrides(loaded, validator_opts)
    with_git_info = _apply_author_info_to_metadata_file(with_overrides, original_metadata_file_path)
    return _write_metadata_to_tmp_file(with_git_info)


def upload_metadata_to_gcs(bucket_name: str, metadata_file_path: Path, validator_opts: ValidatorOptions) -> MetadataUploadInfo:
Expand All @@ -209,11 +282,10 @@ def upload_metadata_to_gcs(bucket_name: str, metadata_file_path: Path, validator
Returns:
Tuple[bool, str]: Whether the metadata file was uploaded and its blob id.
"""
if validator_opts.prerelease_tag:
metadata_file_path = create_prerelease_metadata_file(metadata_file_path, validator_opts)

metadata, error = validate_and_load(metadata_file_path, POST_UPLOAD_VALIDATORS, validator_opts)
metadata_file_path = _apply_modifications_to_metadata_file(metadata_file_path, validator_opts)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would find it more explicit and maintainable to decouple metadata mutation from metadata upload. Wdyt about creating a separate step in publish that would perform this apply_modifications_to_metadata_file?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I don't think I like the logic living in the pipeline.

Simply because right now we have metadata logic living already in two places

  1. Metadata service (validation and upload)
  2. Orchestrator (Parsing and generation)

Currently they are at least contained in the metadata_service folder

But if we add the git logic of

for a given metadata file, get the author of the last commit that modified it

Then we end up with 3 locations to look for logic, 1 of which is now outside the area in the codebase


metadata, error = validate_and_load(metadata_file_path, POST_UPLOAD_VALIDATORS, validator_opts)
if metadata is None:
raise ValueError(f"Metadata file {metadata_file_path} is invalid for uploading: {error}")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from __future__ import annotations

from datetime import date
from datetime import date, datetime
from typing import Any, Dict, List, Optional
from uuid import UUID

Expand Down Expand Up @@ -104,6 +104,28 @@ class Config:
packageName: str = Field(..., description="The name of the package on PyPi.")


class GitInfo(BaseModel):
class Config:
extra = Extra.forbid

commit_sha: Optional[str] = Field(
None,
description="The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
commit_timestamp: Optional[datetime] = Field(
None,
description="The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
commit_author: Optional[str] = Field(
None,
description="The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
commit_author_email: Optional[str] = Field(
None,
description="The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)


class JobTypeResourceLimit(BaseModel):
class Config:
extra = Extra.forbid
Expand All @@ -123,6 +145,10 @@ class Config:
pypi: Optional[PyPi] = None


class GeneratedFields(BaseModel):
git: Optional[GitInfo] = None


class ActorDefinitionResourceRequirements(BaseModel):
class Config:
extra = Extra.forbid
Expand Down Expand Up @@ -232,7 +258,8 @@ class Config:
resourceRequirements: Optional[ActorDefinitionResourceRequirements] = None
ab_internal: Optional[AirbyteInternal] = None
remoteRegistries: Optional[RemoteRegistries] = None
supportsRefreshes: Optional[bool] = None
supportsRefreshes: Optional[bool] = False
generated: Optional[GeneratedFields] = None


class ConnectorMetadataDefinitionV0(BaseModel):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,4 @@ class Config:
allowedHosts: Optional[AllowedHosts] = None
releases: Optional[ConnectorReleases] = None
ab_internal: Optional[AirbyteInternal] = None
supportsRefreshes: Optional[bool] = False
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ class Config:
allowedHosts: Optional[AllowedHosts] = None
releases: Optional[ConnectorReleases] = None
ab_internal: Optional[AirbyteInternal] = None
supportsRefreshes: Optional[bool] = False


class ConnectorRegistryV0(BaseModel):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# generated by datamodel-codegen:
# filename: GeneratedFields.yaml

from __future__ import annotations

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Extra, Field


class GitInfo(BaseModel):
class Config:
extra = Extra.forbid

commit_sha: Optional[str] = Field(
None,
description="The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
commit_timestamp: Optional[datetime] = Field(
None,
description="The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
commit_author: Optional[str] = Field(
None,
description="The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
commit_author_email: Optional[str] = Field(
None,
description="The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)


class GeneratedFields(BaseModel):
git: Optional[GitInfo] = None
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# generated by datamodel-codegen:
# filename: GitInfo.yaml

from __future__ import annotations

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Extra, Field


class GitInfo(BaseModel):
class Config:
extra = Extra.forbid

commit_sha: Optional[str] = Field(
None,
description="The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
commit_timestamp: Optional[datetime] = Field(
None,
description="The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
commit_author: Optional[str] = Field(
None,
description="The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
commit_author_email: Optional[str] = Field(
None,
description="The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
)
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from .ConnectorRegistrySourceDefinition import *
from .ConnectorRegistryV0 import *
from .ConnectorReleases import *
from .GeneratedFields import *
from .GitInfo import *
from .JobType import *
from .NormalizationDestinationDefinitionConfig import *
from .RegistryOverrides import *
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,5 @@ properties:
supportsRefreshes:
type: boolean
default: false
generated:
"$ref": GeneratedFields.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
"$schema": http://json-schema.org/draft-07/schema#
"$id": https://github.com/airbytehq/airbyte/airbyte-ci/connectors_ci/metadata_service/lib/models/src/GeneratedFields.yaml
title: GeneratedFields
description: Optional schema for fields generated at metadata upload time
type: object
properties:
git:
"$ref": GitInfo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
"$schema": http://json-schema.org/draft-07/schema#
"$id": https://github.com/airbytehq/airbyte/airbyte-ci/connectors/metadata_service/lib/metadata_service/models/src/GitInfo.yaml
title: GitInfo
description: Information about the author of the last commit that modified this file
type: object
additionalProperties: false
properties:
commit_sha:
type: string
description: The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
commit_timestamp:
type: string
format: date-time
description: The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
commit_author:
type: string
description: The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
commit_author_email:
type: string
description: The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
Loading