Skip to content

tech spec: demo of common base image for all python connectors #29477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Python base images used for connector built are declared [here](https://github.com/airbytehq/airbyte/blob/8328c9dd89f6417295a20f7c0f5b823a2f02ee8e/airbyte-ci/connectors/pipelines/pipelines/builds/base_images/python.py)

# Changelog for airbyte-python-base

|Version| Changelog |
|-------|-------------------------------------------------------------------------------------------------|
| 1.0.0 | Upgrades the base image to Python 3.10.12. |
| 0.0.2 | Adds git to the base image. |
| 0.0.1 |Declares the legacy base image with mandatory debian packages, pip upgrade, timezone settings ...|
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#
import inspect
import sys
from abc import ABC, abstractmethod
from enum import Enum
from typing import Type

import dagger
from py_markdown_table.markdown_table import markdown_table


class PythonBase(Enum):
    """Upstream Python images that the Airbyte base images can be built from.

    Every value is pinned by digest (``<name>:<tag>@sha256:<hex>``) so that the
    image content cannot change underneath a given tag.
    """

    # Using image digest to ensure that the image is not changed
    PYTHON_3_9 = "python:3.9@sha256:0596c508fdfdf28fd3b98e170f7e3d4708d01df6e6d4bffa981fd6dd22dbd1a5"
    # The digest separator is "sha256:" (colon); "sha256-" is not a valid digest reference.
    PYTHON_3_10_12 = "python:3.10.12@sha256:527cc6f230cf7de1f972fbb0ffc850035e91fb4a52058b44906ea706b3018bb6"


class VersionError(Exception):
    """Raised when a base image class name does not encode a valid 'x.y.z' version."""


class AirbytePythonBase(ABC):
    """Abstract definition of one version of the ``airbyte-python-base`` image.

    The version is derived from the subclass name: a class named ``_1_0_0``
    is version ``1.0.0``. Concrete subclasses must provide ``python_base_image``,
    ``changelog`` and ``container``.
    """

    # Image name; combined with the version to form the "name:version" identifier.
    name = "airbyte-python-base"

    # Timezone name made available to subclasses when they configure the container.
    TIMEZONE = "Etc/UTC"

    @property
    @abstractmethod
    def python_base_image(cls) -> PythonBase:
        """Upstream Python image (a ``PythonBase`` member) this version is built from."""
        raise NotImplementedError("Subclasses must define a 'python_base_image'.")

    @property
    @abstractmethod
    def changelog(cls) -> str:
        """Human readable description of what this version changes."""
        raise NotImplementedError("Subclasses must define a 'changelog' attribute.")

    def __init__(self, dagger_client: dagger.Client):
        """Store the dagger client and validate the version encoded in the class name.

        Raises:
            VersionError: if the class name does not encode an 'x.y.z' version.
        """
        self.dagger_client = dagger_client
        self.validate_version()

    @classmethod
    def name_and_version(cls) -> str:
        """Return the ``<name>:<version>`` identifier of this base image."""
        return f"{cls.name}:{cls.version()}"

    @classmethod
    def version(cls) -> str:
        """Return the version encoded in the class name (``_1_0_0`` -> ``1.0.0``)."""
        return ".".join(cls.__name__.split("_")[1:])

    @property
    def base(self) -> dagger.Container:
        """Container created from ``python_base_image`` with the ``BASE_IMAGE`` env variable set."""
        return self.dagger_client.container().from_(self.python_base_image.value).with_env_variable("BASE_IMAGE", self.name_and_version())

    @property
    @abstractmethod
    def container(self):
        """Fully configured container for this base image version."""
        raise NotImplementedError("Subclasses must define a 'container' property.")

    def validate_version(self):
        """Check that the version derived from the class name is three numeric parts.

        Raises:
            VersionError: if there are not exactly three parts or any part is not made of digits.
        """
        version_parts = self.version().split(".")
        # Both conditions must hold for a version to be valid. The previous form
        # `not len(...) == 3 and all(...)` negated only the length check, so
        # invalid versions such as "1.x.0" or "1.0" with a non-digit part slipped through.
        if len(version_parts) != 3 or not all(part.isdigit() for part in version_parts):
            raise VersionError("Version must be in the format 'x.y.z' and each part must be a digit.")


class _0_0_1(AirbytePythonBase):
    """Version 0.0.1: Python 3.9 image with timezone link, debian packages and upgraded pip."""

    changelog = "Declares the legacy base image with mandatory debian packages, pip upgrade, timezone settings ..."

    python_base_image = PythonBase.PYTHON_3_9

    # Debian packages installed with apt-get on top of the upstream Python image.
    apt_packages = [
        "curl",
        "bash",
        "build-essential",
        "cmake",
        "g++",
        "libffi-dev",
        "libstdc++6",
    ]

    @property
    def container(self) -> dagger.Container:
        """Return the base container after the timezone, apt and pip steps have been applied in order."""
        image = self.base
        # Point /etc/localtime at the configured timezone.
        image = image.with_exec(["ln", "-snf", f"/usr/share/zoneinfo/{self.TIMEZONE}", "/etc/localtime"])
        image = image.with_exec(["apt-get", "update"])
        install_command = ["apt-get", "install", "-y"]
        install_command.extend(self.apt_packages)
        image = image.with_exec(install_command)
        return image.with_exec(["pip", "install", "--upgrade", "pip"])


class _0_0_2(_0_0_1):
    """Version 0.0.2: everything from version 0.0.1, plus git.

    This class showcases how inheritance can help in patching a previous version
    with maximal code reuse. Nothing in the pattern forces a new version to
    inherit from the previous one: the ``container`` property can be
    re-implemented from scratch on an ``AirbytePythonBase`` subclass if we don't
    want to build on top of the previous version.
    """

    python_base_image = PythonBase.PYTHON_3_9

    changelog = "Adds git to the base image."

    @property
    def container(self) -> dagger.Container:
        """Return the parent version's container with git installed on top."""
        # `super()` is a proxy to the parent class, not a container: the parent's
        # `container` property has to be read before chaining `with_exec`.
        return super().container.with_exec(["apt-get", "install", "-y", "git"])


class _1_0_0(_0_0_2):
    """Version 1.0.0: same build steps as its parent class, on the Python 3.10.12 upstream image."""

    changelog = "Upgrades the base image to Python 3.10.12."

    python_base_image = PythonBase.PYTHON_3_10_12


def get_all_base_images() -> dict[str, Type[AirbytePythonBase]]:
    """Discover every base image version declared in this module.

    This function is "just" sugar to automatically discover the image versions
    declared in the module. It saves us from hardcoding the list of base image
    versions: implementing a new class should be the only step to make a new
    base version available.

    Returns:
        A dict mapping ``"<name>:<version>"`` to the matching class, ordered so
        that the latest version comes first.
    """

    def version_key(image_cls: Type[AirbytePythonBase]) -> tuple:
        # Compare versions numerically so that e.g. 10.0.0 ranks above 2.0.0;
        # name-based (lexicographic) ordering would get that wrong.
        # Non-numeric parts rank lowest: validation only happens on instantiation.
        return tuple(int(part) if part.isdigit() else -1 for part in image_cls.version().split("."))

    # Select real subclasses of AirbytePythonBase. Comparing metaclasses instead
    # would match any ABC-based class that happens to live in this module.
    image_classes = [
        cls_member
        for _, cls_member in inspect.getmembers(sys.modules[__name__], inspect.isclass)
        if issubclass(cls_member, AirbytePythonBase) and cls_member is not AirbytePythonBase
    ]
    image_classes.sort(key=version_key, reverse=True)
    return {image_cls.name_and_version(): image_cls for image_cls in image_classes}


# Mapping of "<name>:<version>" to base image class, computed once when the module is imported.
ALL_BASE_IMAGES = get_all_base_images()


def write_changelog_file():
    """Write PYTHON_BASE_IMAGES_CHANGELOG.md from the discovered base images.

    The file is written relative to the current working directory and holds a
    link to this module, a title, and a markdown table with one
    ``Version`` / ``Changelog`` row per entry of ``ALL_BASE_IMAGES``, in the
    order of that mapping.
    """
    # Only the classes are needed, the "name:version" keys are not used here.
    entries = [{"Version": base_cls.version(), "Changelog": base_cls.changelog} for base_cls in ALL_BASE_IMAGES.values()]
    markdown = markdown_table(entries).set_params(row_sep="markdown", quote=False).get_markdown()
    # Explicit encoding so the generated file does not depend on the platform locale.
    with open("PYTHON_BASE_IMAGES_CHANGELOG.md", "w", encoding="utf-8") as f:
        f.write(
            "Python base images used for connector built are declared [here](https://github.com/airbytehq/airbyte/blob/8328c9dd89f6417295a20f7c0f5b823a2f02ee8e/airbyte-ci/connectors/pipelines/pipelines/builds/base_images/python.py)\n\n"
        )
        f.write(f"# Changelog for {AirbytePythonBase.name}\n\n")
        f.write(markdown)


# NOTE(review): this call sits at module top level, so every import of this module
# rewrites PYTHON_BASE_IMAGES_CHANGELOG.md in the current working directory.
# Confirm this is intended; otherwise guard it with `if __name__ == "__main__":`.
write_changelog_file()
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from dagger import QueryError
from pipelines.actions.environments import with_airbyte_python_connector
import importlib.util
from pathlib import Path

from dagger import CacheVolume, Container, QueryError
from pipelines.actions.environments import find_local_python_dependencies
from pipelines.bases import StepResult, StepStatus
from pipelines.builds.base_images.python import ALL_BASE_IMAGES
from pipelines.builds.common import BuildConnectorImageBase, BuildConnectorImageForAllPlatformsBase
from pipelines.contexts import ConnectorContext

Expand All @@ -15,13 +19,102 @@ class BuildConnectorImage(BuildConnectorImageBase):
A spec command is run on the container to validate it was built successfully.
"""

# Command set as the image entrypoint; _build_from_base also exports it, space-joined, as AIRBYTE_ENTRYPOINT.
entrypoint = ["python", "/airbyte/integration_code/main.py"]

async def _run(self) -> StepResult:
    """Build the connector image from a declared base image and run ``spec`` on it.

    Returns:
        The step result of the ``spec`` execution, or a FAILURE result when the
        base image is unknown or when dagger raises a ``QueryError``.
    """
    # The connector is built exactly once, from the selected base image, by
    # _build_from_base below. The former `with_airbyte_python_connector` build
    # was dropped because its result was overwritten before being used.

    # connector_base_image = self.context.connector.metadata["connectorBaseImage"]
    # Hardcode the base image for demo purposes
    connector_base_image = "airbyte-python-base:0.0.1"

    if connector_base_image not in ALL_BASE_IMAGES:
        # Pass the message as `stderr`, like the QueryError branch below does,
        # rather than relying on the positional order of StepResult's fields.
        return StepResult(
            self,
            StepStatus.FAILURE,
            stderr=f"Connector base image {connector_base_image} does not exists. " f"Supported connector base images are {ALL_BASE_IMAGES}",
        )
    base: Container = ALL_BASE_IMAGES[connector_base_image](self.dagger_client).container
    connector: Container = await self._build_from_base(base)
    try:
        return await self.get_step_result(connector.with_exec(["spec"]))
    except QueryError as e:
        return StepResult(self, StepStatus.FAILURE, stderr=str(e))

async def _build_from_base(self, base: Container) -> Container:
    """Layer the connector on top of ``base`` and return the finalized container.

    Steps, in order: set the workdir and pip cache, copy ``setup.py``, mount
    the local dependencies found in ``setup.py``, ``pip install`` the
    connector, copy ``main.py`` and the connector package, set the entrypoint
    and labels, then hand over to ``finalize_build``.
    """
    pip_cache: CacheVolume = self.context.dagger_client.cache_volume("pip_cache")
    # The python package directory is the connector technical name with dashes replaced by underscores.
    package_dir_name = self.context.connector.technical_name.replace("-", "_")

    local_dependency_paths = await find_local_python_dependencies(
        self.context,
        str(self.context.connector.code_directory),
        search_dependencies_in_setup_py=True,
        search_dependencies_in_requirements_txt=False,
    )

    # Build environment and setup.py.
    image: Container = base.with_env_variable("DAGGER_BUILD", "True")
    image = image.with_workdir("/airbyte/integration_code")
    image = image.with_mounted_cache("/root/.cache/pip", pip_cache)
    setup_only_dir = await self.context.get_connector_dir(include="setup.py")
    image = image.with_file("setup.py", setup_only_dir.file("setup.py"))

    # Local dependencies are mounted under /local_dependencies/<directory name>.
    for dependency_path in local_dependency_paths:
        mount_point = f"/local_dependencies/{Path(dependency_path).name}"
        image = image.with_mounted_directory(mount_point, self.context.get_repo_dir(dependency_path))

    image = image.with_exec(["pip", "install", "--prefix=/usr/local", "."])

    # Connector code: main.py first, then the package directory.
    whole_connector_dir = await self.context.get_connector_dir()
    image = image.with_file("main.py", whole_connector_dir.file("main.py"))
    package_only_dir = await self.context.get_connector_dir(include=package_dir_name)
    image = image.with_directory(package_dir_name, package_only_dir.directory(package_dir_name))

    # Entrypoint and image labels.
    image = image.with_env_variable("AIRBYTE_ENTRYPOINT", " ".join(self.entrypoint))
    image = image.with_entrypoint(self.entrypoint)
    image = image.with_label("io.airbyte.version", self.context.metadata["dockerImageTag"])
    image = image.with_label("io.airbyte.name", self.context.metadata["dockerRepository"])

    return await self.finalize_build(self.context, image)

@staticmethod
async def finalize_build(context: ConnectorContext, connector_container: Container) -> Container:
    """Finalize build by running finalize_build.sh or finalize_build.py if present in the connector directory.

    Args:
        context: The connector context, used to read the connector directory and to log.
        connector_container: The built connector container to finalize.

    Returns:
        The input container when no finalize script exists, otherwise the
        finalized container with its original entrypoint restored.

    Raises:
        Exception: if both scripts are present, if finalize_build.py cannot be
            loaded, or if it does not define a ``finalize_build`` function.
    """
    connector_dir_with_finalize_script = await context.get_connector_dir(include=["finalize_build.sh", "finalize_build.py"])
    finalize_scripts = await connector_dir_with_finalize_script.entries()
    if not finalize_scripts:
        return connector_container

    # We don't want finalize scripts to override the entrypoint so we keep it in memory to reset it after finalization
    original_entrypoint = await connector_container.entrypoint()

    has_finalize_bash_script = "finalize_build.sh" in finalize_scripts
    has_finalize_python_script = "finalize_build.py" in finalize_scripts
    if has_finalize_python_script and has_finalize_bash_script:
        raise Exception("Connector has both finalize_build.sh and finalize_build.py, please remove one of them")

    if has_finalize_python_script:
        context.logger.info(f"{context.connector.technical_name} has a finalize_build.py script, running it to finalize build...")
        module_path = context.connector.code_directory / "finalize_build.py"
        connector_finalize_module_spec = importlib.util.spec_from_file_location(
            f"{context.connector.code_directory.name}_finalize", module_path
        )
        # spec_from_file_location can return None, and a spec can have no loader.
        if connector_finalize_module_spec is None or connector_finalize_module_spec.loader is None:
            raise Exception(f"Could not load the finalize_build.py script located at {module_path}.")
        connector_finalize_module = importlib.util.module_from_spec(connector_finalize_module_spec)
        connector_finalize_module_spec.loader.exec_module(connector_finalize_module)
        # Look the function up before calling it: wrapping the call itself in
        # `except AttributeError` would misreport any AttributeError raised
        # inside the connector's own finalize_build as a missing function.
        connector_finalize_function = getattr(connector_finalize_module, "finalize_build", None)
        if connector_finalize_function is None:
            raise Exception("Connector has a finalize_build.py script but it doesn't have a finalize_build function.")
        connector_container = await connector_finalize_function(context, connector_container)

    if has_finalize_bash_script:
        context.logger.info(f"{context.connector.technical_name} has finalize_build.sh script, running it to finalize build...")
        connector_container = (
            connector_container.with_file("/tmp/finalize_build.sh", connector_dir_with_finalize_script.file("finalize_build.sh"))
            # The entrypoint is a list of arguments, as everywhere else in this file.
            .with_entrypoint(["sh"])
            .with_exec(["/tmp/finalize_build.sh"])
        )

    return connector_container.with_entrypoint(original_entrypoint)


class BuildConnectorImageForAllPlatforms(BuildConnectorImageForAllPlatformsBase):
"""Build a Python connector image for all platforms."""
Expand Down
Loading