
[Arch] Shrink runtime image size #3051


Merged: 13 commits, Jul 21, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -210,10 +210,13 @@ cache

# configuration
config.toml
config.toml_
config.toml.bak

containers/agnostic_sandbox

# swe-bench-eval
image_build_logs
run_instance_logs

od_runtime_*.tar
3 changes: 3 additions & 0 deletions opendevin/core/config.py
@@ -142,6 +142,8 @@ class SandboxConfig(metaclass=Singleton):
enable_auto_lint: Whether to enable auto-lint.
use_host_network: Whether to use the host network.
initialize_plugins: Whether to initialize plugins.
update_source_code: Whether to update the source code in the EventStreamRuntime.
Used for development of EventStreamRuntime.
"""

box_type: str = 'ssh'
@@ -157,6 +159,7 @@ class SandboxConfig(metaclass=Singleton):
)
use_host_network: bool = False
initialize_plugins: bool = True
update_source_code: bool = False

def defaults_to_dict(self) -> dict:
"""Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
11 changes: 11 additions & 0 deletions opendevin/runtime/client/client.py
@@ -1,3 +1,14 @@
"""
This is the main file for the runtime client.
It is responsible for executing actions received from the OpenDevin backend and producing observations.

NOTE: this will be executed inside the docker sandbox.

If you already have a pre-built docker image but have changed the code in this file OR its dependencies, you need to rebuild the docker image so it picks up the new source code.

You should add SANDBOX_UPDATE_SOURCE_CODE=True to any `python XXX.py` command you run to update the source code.
"""

import argparse
import asyncio
import os
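As a rough illustration of how the SANDBOX_UPDATE_SOURCE_CODE flag described in the docstring above could be consumed, here is a minimal sketch; the actual OpenDevin config loader may differ, and the parsing below is an assumption:

    import os

    # Assumption: the env var maps onto the new SandboxConfig.update_source_code field.
    update_source_code = os.environ.get('SANDBOX_UPDATE_SOURCE_CODE', 'false').lower() in ('1', 'true')
    print(f'Update source code inside the sandbox image: {update_source_code}')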
5 changes: 3 additions & 2 deletions opendevin/runtime/client/runtime.py
@@ -81,14 +81,15 @@ async def ainit(self, env_vars: dict[str, str] | None = None):
# NOTE: You may need to set DEBUG=true to update the source code
# inside the container. This is useful when you want to test/debug the
# latest code in the runtime docker container.
update_source_code=config.debug,
update_source_code=self.sandbox_config.update_source_code,
Collaborator: This will be helpful! 👍

)
self.container = await self._init_container(
self.sandbox_workspace_dir,
mount_dir=config.workspace_mount_path,
plugins=self.plugins,
)
# Initialize the env vars
# MUST call super().ainit() to initialize both default env vars
# AND the ones in env vars!
await super().ainit(env_vars)

@staticmethod
5 changes: 3 additions & 2 deletions opendevin/runtime/runtime.py
@@ -74,8 +74,9 @@ async def ainit(self, env_vars: dict[str, str] | None = None) -> None:

This method should be called after the runtime's constructor.
"""
logger.debug(f'Adding default env vars: {self.DEFAULT_ENV_VARS}')
await self.add_env_vars(self.DEFAULT_ENV_VARS)
if self.DEFAULT_ENV_VARS:
logger.debug(f'Adding default env vars: {self.DEFAULT_ENV_VARS}')
await self.add_env_vars(self.DEFAULT_ENV_VARS)
if env_vars is not None:
logger.debug(f'Adding provided env vars: {env_vars}')
await self.add_env_vars(env_vars)
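As a usage sketch of the env-var flow above (the runtime object and the variable name here are placeholders, not part of the PR):

    import asyncio

    async def start(runtime):
        # DEFAULT_ENV_VARS (if any) are applied first inside ainit(), then the provided ones.
        await runtime.ainit(env_vars={'MY_EXTRA_VAR': '1'})

    # asyncio.run(start(runtime))  # with an already-constructed EventStreamRuntime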
140 changes: 106 additions & 34 deletions opendevin/runtime/utils/runtime_build.py
@@ -3,14 +3,23 @@
import shutil
import subprocess
import tempfile
from importlib.metadata import version

import docker
import toml

import opendevin
from opendevin.core.logger import opendevin_logger as logger


def _get_package_version():
"""Read the version from pyproject.toml as the other one may be outdated."""
project_root = os.path.dirname(os.path.dirname(os.path.abspath(opendevin.__file__)))
pyproject_path = os.path.join(project_root, 'pyproject.toml')
with open(pyproject_path, 'r') as f:
pyproject_data = toml.load(f)
return pyproject_data['tool']['poetry']['version']
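
To illustrate why the helper reads pyproject.toml instead of the installed package metadata, a small comparison (assumes it is run from the project root):

    from importlib.metadata import version
    import toml

    # The installed distribution's metadata is frozen at install time ...
    print(version('opendevin'))
    # ... while pyproject.toml always reflects the current checkout.
    print(toml.load('pyproject.toml')['tool']['poetry']['version'])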


def _create_project_source_dist():
"""Create a source distribution of the project. Return the path to the tarball."""
# Copy the project directory to the container
@@ -24,8 +33,10 @@ def _create_project_source_dist():
logger.error(f'Build failed: {result}')
raise Exception(f'Build failed: {result}')

# Fetch the correct version from pyproject.toml
package_version = _get_package_version()
tarball_path = os.path.join(
project_root, 'dist', f'opendevin-{version("opendevin")}.tar.gz'
project_root, 'dist', f'opendevin-{package_version}.tar.gz'
)
if not os.path.exists(tarball_path):
logger.error(f'Source distribution not found at {tarball_path}')
@@ -60,44 +71,64 @@ def _generate_dockerfile(
if skip_init:
dockerfile_content = f'FROM {base_image}\n'
else:
# Ubuntu 22.x has libgl1-mesa-glx, but 24.x and above have libgl1!
if 'ubuntu' in base_image and (
base_image.endswith(':latest') or base_image.endswith(':24.04')
):
LIBGL_MESA = 'libgl1'
else:
LIBGL_MESA = 'libgl1-mesa-glx'

Collaborator: Would add this at the top:

        dockerfile_content = (
            'ENV DEBIAN_FRONTEND=noninteractive '
            'POETRY_NO_INTERACTION=1 '
            'POETRY_VIRTUALENVS_IN_PROJECT=1 '
            'POETRY_VIRTUALENVS_CREATE=1 '
            'POETRY_CACHE_DIR=/tmp/poetry_cache\n'
        )

(and use "+=" in the line below then)

Collaborator Author: Is there any particular reason why we should add this when not adding it still works? I'm trying to see if we can make things as simple as possible.

Collaborator Author: 🤔 https://python-poetry.org/docs/configuration/#virtualenvsin-project

    If not set explicitly, poetry by default will create a virtual environment under {cache-dir}/virtualenvs or use the {project-dir}/.venv directory if one already exists.

Now I see why the previous run fails: after rm -rf /root/.cache/pypoetry, the virtual env is removed.

I don't think we need this for the sandbox: we rm -rf /opendevin/code and copy a new version of the codebase there, so if we enabled POETRY_VIRTUALENVS_IN_PROJECT, the virtual env would be deleted along with /opendevin/code and we would have to re-install everything from scratch whenever we update the source code.

Collaborator Author (xingyaoww, Jul 21, 2024): And POETRY_VIRTUALENVS_CREATE also defaults to true, so we probably don't need to set it either. https://python-poetry.org/docs/configuration/#virtualenvscreate

dockerfile_content = (
f'FROM {base_image}\n'
# FIXME: make this more generic / cross-platform
# Install necessary packages
# libgl1-mesa-glx is extra dependency for OpenCV
'RUN apt-get update && apt-get install -y wget sudo libgl1-mesa-glx\n'
'RUN apt-get clean && rm -rf /var/lib/apt/lists/*\n' # Clean up the apt cache to reduce image size
'RUN mkdir -p /opendevin && mkdir -p /opendevin/logs && chmod 777 /opendevin/logs\n'
'RUN echo "" > /opendevin/bash.bashrc\n'
'RUN if [ ! -d /opendevin/miniforge3 ]; then \\\n'
' wget --progress=bar:force -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" && \\\n'
' bash Miniforge3.sh -b -p /opendevin/miniforge3 && \\\n'
' rm Miniforge3.sh && \\\n'
' chmod -R g+w /opendevin/miniforge3 && \\\n'
' bash -c ". /opendevin/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"; \\\n'
' fi\n'
# Install necessary packages and clean up in one layer
f'RUN apt-get update && apt-get install -y wget sudo apt-utils {LIBGL_MESA} libasound2-plugins && \\\n'
f' apt-get clean && rm -rf /var/lib/apt/lists/*\n'
# Create necessary directories
f'RUN mkdir -p /opendevin && mkdir -p /opendevin/logs && chmod 777 /opendevin/logs && \\\n'
f' echo "" > /opendevin/bash.bashrc\n'
# Install Miniforge3
f'RUN if [ ! -d /opendevin/miniforge3 ]; then \\\n'
f' wget --progress=bar:force -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" && \\\n'
f' bash Miniforge3.sh -b -p /opendevin/miniforge3 && \\\n'
f' rm Miniforge3.sh && \\\n'
f' chmod -R g+w /opendevin/miniforge3 && \\\n'
f' bash -c ". /opendevin/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"; \\\n'
f' fi\n'
'RUN /opendevin/miniforge3/bin/mamba install python=3.11 -y\n'
'RUN /opendevin/miniforge3/bin/mamba install conda-forge::poetry -y\n'
)

# Copy the project directory to the container
dockerfile_content += 'COPY project.tar.gz /opendevin\n'
# remove /opendevin/code if it exists
# Remove /opendevin/code if it exists
dockerfile_content += (
'RUN if [ -d /opendevin/code ]; then rm -rf /opendevin/code; fi\n'
)
# unzip the tarball to /opendevin/code
# Unzip the tarball to /opendevin/code
dockerfile_content += (
'RUN cd /opendevin && tar -xzvf project.tar.gz && rm project.tar.gz\n'
)
dockerfile_content += f'RUN mv /opendevin/{source_code_dirname} /opendevin/code\n'
# install (or update) the dependencies

# ALTERNATIVE, but maybe not complete? (toml error!)
dockerfile_content += (
'RUN cd /opendevin/code && '
'/opendevin/miniforge3/bin/mamba run -n base poetry env use python3.11 && '
'/opendevin/miniforge3/bin/mamba run -n base poetry install\n'
# for browser (update if needed)
'RUN apt-get update && cd /opendevin/code && /opendevin/miniforge3/bin/mamba run -n base poetry run playwright install --with-deps chromium\n'
'/opendevin/miniforge3/bin/mamba run -n base poetry install --no-interaction --no-root\n'

Collaborator: Should add --only main here.

Collaborator: There is no pyproject.toml file in the sandbox with this version of the method.

Collaborator Author: I think we do have pyproject.toml inside the sandbox.

Collaborator: hmm... how did those get there, the "tar" doesn't copy them over, does it?

Collaborator Author: Yes, the .tar does copy that over as part of the python source build.

Collaborator: Ok, that I must have missed. Can you point me to where exactly that happens, or do you mean "make build" here? 🤔

'RUN /opendevin/miniforge3/bin/mamba run -n base poetry cache clear --all . && \\\n'
'apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* &&\\\n'
'/opendevin/miniforge3/bin/mamba clean --all\n'
)

# For browser (update if needed)
dockerfile_content += (
'RUN apt-get update && \\\n'
' cd /opendevin/code && \\\n'
' /opendevin/miniforge3/bin/mamba run -n base poetry run pip install playwright && \\\n'
' /opendevin/miniforge3/bin/mamba run -n base poetry run playwright install --with-deps chromium && \\\n'
' apt-get clean && \\\n'
' rm -rf /var/lib/apt/lists/*\n'
)
return dockerfile_content
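
Regarding the review exchange above on whether pyproject.toml exists in the sandbox: the project tarball is a source distribution, and source distributions bundle pyproject.toml. A quick illustrative check (run from the project root after the sdist has been built; the glob pattern is an assumption):

    import glob
    import tarfile

    # The newest source distribution produced by the build step under dist/
    sdist = sorted(glob.glob('dist/opendevin-*.tar.gz'))[-1]
    with tarfile.open(sdist) as tar:
        # pyproject.toml ships inside the sdist, which is why it shows up under
        # /opendevin/code after the Dockerfile extracts the tarball.
        print([name for name in tar.getnames() if name.endswith('pyproject.toml')])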

@@ -176,69 +207,110 @@ def _get_new_image_name(base_image: str, dev_mode: bool = False) -> str:
base_image = base_image + ':latest'
[repo, tag] = base_image.split(':')
repo = repo.replace('/', '___')
return f'{prefix}:{repo}_tag_{tag}'

od_version = _get_package_version()
return f'{prefix}:od_v{od_version}_image_{repo}_tag_{tag}'
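
For illustration, the new naming scheme produces tags like the following; the version number is hypothetical, and the 'od_runtime' prefix matches the examples in the comments further down:

    # _get_new_image_name('ubuntu:22.04')
    #   -> 'od_runtime:od_v0.8.0_image_ubuntu_tag_22.04'   (version number is illustrative)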


def _check_image_exists(image_name: str, docker_client: docker.DockerClient) -> bool:
images = docker_client.images.list()
for image in images:
if image_name in image.tags:
return True
if images:
for image in images:
if image_name in image.tags:
return True
return False


def build_runtime_image(
base_image: str,
docker_client: docker.DockerClient,
update_source_code: bool = False,
save_to_local_store: bool = False, # New parameter to control saving to local store
) -> str:
"""Build the runtime image for the OpenDevin runtime.

This is only used for **eventstream runtime**.
"""
new_image_name = _get_new_image_name(base_image)
logger.info(f'New image name: {new_image_name}')

# Ensure new_image_name contains a colon
if ':' not in new_image_name:
raise ValueError(
f'Invalid image name: {new_image_name}. Expected format "repository:tag".'
)

# Try to pull the new image from the registry
try:
docker_client.images.pull(new_image_name)
except Exception as e:
logger.info(f'Error pulling image {new_image_name}, building it from scratch')
logger.info(f'Non-fatal error: {e}')
except Exception:
logger.info(f'Cannot pull image {new_image_name} directly')

# Detect if the sandbox image is built
image_exists = _check_image_exists(new_image_name, docker_client)
if image_exists:
logger.info(f'Image {new_image_name} exists')
else:
logger.info(f'Image {new_image_name} does not exist')

skip_init = False
if image_exists and not update_source_code:
# If (1) Image exists & we are not updating the source code, we can reuse the existing production image
logger.info('No image build done (not updating source code)')
return new_image_name
elif image_exists and update_source_code:
# If (2) Image exists & we plan to update the source code (in dev mode), we need to rebuild the image
# and give it a special name
# e.g., od_runtime:ubuntu_tag_latest -> od_runtime_dev:ubuntu_tag_latest
logger.info('Image exists, but updating source code requested')
base_image = new_image_name
new_image_name = _get_new_image_name(base_image, dev_mode=True)

skip_init = True # since we only need to update the source code
else:
# If (3) Image does not exist, we need to build it from scratch
# e.g., ubuntu:latest -> od_runtime:ubuntu_tag_latest
skip_init = False # since we need to build the image from scratch

logger.info(f'Building image [{new_image_name}] from scratch')
# This snippet would allow loading the image from an archive:
# tar_path = f'{new_image_name.replace(":", "_")}.tar'
# if os.path.exists(tar_path):
# logger.info(f'Loading image from {tar_path}')
# load_command = ['docker', 'load', '-i', tar_path]
# subprocess.run(load_command, check=True)
# logger.info(f'Image {new_image_name} loaded from {tar_path}')
# return new_image_name
skip_init = False

if not skip_init:
logger.info(f'Building image [{new_image_name}] from scratch')

_build_sandbox_image(base_image, new_image_name, docker_client, skip_init=skip_init)

# Only for development: allow saving the image as an archive:
if not image_exists and save_to_local_store:
tar_path = f'{new_image_name.replace(":", "_")}.tar'
save_command = ['docker', 'save', '-o', tar_path, new_image_name]
subprocess.run(save_command, check=True)
logger.info(f'Image saved to {tar_path}')

load_command = ['docker', 'load', '-i', tar_path]
subprocess.run(load_command, check=True)
logger.info(f'Image {new_image_name} loaded back into Docker from {tar_path}')

return new_image_name


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--base_image', type=str, default='ubuntu:latest')
parser.add_argument('--base_image', type=str, default='ubuntu:22.04')
parser.add_argument('--update_source_code', type=bool, default=False)
parser.add_argument('--save_to_local_store', type=bool, default=False)
args = parser.parse_args()

client = docker.from_env()
image_name = build_runtime_image(
args.base_image, client, update_source_code=args.update_source_code
args.base_image,
client,
update_source_code=args.update_source_code,
save_to_local_store=args.save_to_local_store,
)
print(f'\nBUILT Image: {image_name}\n')
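
For completeness, a minimal sketch of invoking the builder programmatically rather than via the CLI above (the base image and flag values are only examples):

    import docker

    from opendevin.runtime.utils.runtime_build import build_runtime_image

    client = docker.from_env()
    # Reuses an existing od_runtime image when possible; with update_source_code=True
    # the latest local source is layered on top as a dev image instead.
    image_name = build_runtime_image('ubuntu:22.04', client, update_source_code=True)
    print(image_name)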