BioCoder integration #2076

Merged
merged 16 commits on Jun 10, 2024
Changes from 6 commits
15 changes: 15 additions & 0 deletions evaluation/biocoder/README.md
@@ -0,0 +1,15 @@
# BioCoder Evaluation with OpenDevin

Implements evaluation of agents on the BioCoder benchmark introduced in [BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models](https://arxiv.org/abs/2308.16458). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.

## Setup Environment

Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up your local development environment for OpenDevin.


## Configure OpenDevin and your LLM


## Run Inference

Collaborator comment:

Can you add more details about how to run your benchmark and also add run_infer.sh?
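
Until a run_infer.sh is added, here is a minimal sketch of what driving inference per instance could look like, using only the sandbox helper introduced in this PR. The import path, the placeholder instance list, and the workspace path are assumptions for illustration, not part of the benchmark.

```python
# Hypothetical sketch: set up the BioCoder sandbox for each instance.
# The import path and the placeholder instances below are assumptions.
from evaluation.biocoder.biocoder_env_box import BiocoderSSHBox

instances = [
    {'repo': 'example/repo', 'instance_id': 'example__repo-0001'},  # placeholder
]

for instance in instances:
    sandbox = BiocoderSSHBox.get_box_for_instance(
        instance=instance,
        workspace_mount_path='/tmp/biocoder_workspace',  # placeholder path
    )
    try:
        # An agent/controller would run here; this only checks that the repo landed.
        exit_code, output = sandbox.execute('ls')
        print(exit_code, output)
    finally:
        sandbox.close()
```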

## Examples
176 changes: 176 additions & 0 deletions evaluation/biocoder/biocoder_env_box.py
@@ -0,0 +1,176 @@
import sys

from opendevin.core.config import config
from opendevin.core.logger import opendevin_logger as logger
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.plugins import JupyterRequirement, SWEAgentCommandsRequirement

BIOCODER_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.0'
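# NOTE: the image above is the SWE-bench eval image, reused here for the BioCoder sandbox.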


class BiocoderSSHBox(DockerSSHBox):
    def __init__(
        self,
        container_image: str,
        timeout: int = 120,
        sid: str | None = None,
        biocoder_instance_id: str | None = None,
        biocoder_instance: dict | None = None,
        skip_workspace_mount: bool = True,
    ):
        if biocoder_instance_id is None:
            raise ValueError('biocoder_instance_id must be provided')
        self.biocoder_instance_id = biocoder_instance_id
        self.biocoder_instance = biocoder_instance
        self.skip_workspace_mount = skip_workspace_mount

        assert (
            container_image is not None
        ), 'container_image is required for BiocoderSSHBox!'

        # initialize the underlying Docker SSH sandbox
        super().__init__(container_image, timeout, sid)

    @property
    def volumes(self):
        if self.skip_workspace_mount:
            return {
                k: v
                for k, v in super().volumes.items()
                if v['bind'] != self.sandbox_workspace_dir
            }
        return super().volumes

    def execute_and_check(self, cmd: str, error_msg: str) -> tuple[int, str]:
        exit_code, output = self.execute(cmd)
        if exit_code != 0:
            logger.error(error_msg)
            sys.exit(1)
        return exit_code, output

    @classmethod
    def get_box_for_instance(
        cls,
        instance,
        n_tries=5,
Contributor comment:

It seems that n_tries is not being used. If you need to retry, you can use the retry decorator from the tenacity library, like

@retry(stop=stop_after_attempt(5), wait=wait_fixed(5))

(See the sketch after this method.)

        skip_workspace_mount: bool = True,
        workspace_mount_path: str | None = None,
    ) -> 'BiocoderSSHBox':
        """Initialize the container image, then run some initialization commands."""
        config.workspace_base = workspace_mount_path
        config.workspace_mount_path = workspace_mount_path

        # linting python after editing helps LLM fix indentations
        config.enable_auto_lint = True

        sandbox = cls(
            container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
            biocoder_instance_id=instance['instance_id'],
            biocoder_instance=instance,
            skip_workspace_mount=skip_workspace_mount,
        )

        logger.info(f"SSH box started for instance {instance['instance_id']}.")
        # cd to the workspace
        exit_code, output = sandbox.execute_and_check(
            'cd /workspace', 'Failed to cd to workspace'
        )
        logger.info(f'cd to workspace: {output}')

        # download the repository archive
        repository_url = (
            f"https://biocoder.lilbillbiscuit.com/repositories/{instance['repo']}.zip"
        )
        exit_code, output = sandbox.execute_and_check(
            'wget -O repo.zip ' + repository_url, 'Failed to download the repository'
        )
        logger.info(f'Downloaded the repository: {output}')
        exit_code, output = sandbox.execute_and_check(
            'unzip repo.zip', 'Failed to unzip the repository'
        )
        logger.info(f'Unzipped the repository: {output}')

        return sandbox
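
As the review comment above suggests, the currently unused n_tries parameter could be replaced by tenacity's retry decorator. A minimal sketch, assuming a fixed five-attempt policy; the wrapper function below is hypothetical and not part of this PR:

```python
# Hypothetical wrapper illustrating the reviewer's tenacity suggestion.
# Assumes BiocoderSSHBox (defined above) is in scope.
from tenacity import retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(5), wait=wait_fixed(5))
def get_box_with_retries(instance, workspace_mount_path=None):
    # Retries the whole setup (container start, wget, unzip) on exceptions,
    # up to 5 attempts with 5 seconds between them.
    return BiocoderSSHBox.get_box_for_instance(
        instance=instance,
        workspace_mount_path=workspace_mount_path,
    )
```

Note that failures routed through execute_and_check call sys.exit(1), which tenacity does not retry by default, so in this sketch the retries only cover exceptions raised during setup.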


if __name__ == '__main__':
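    # NOTE: EXAMPLE_INSTANCE follows the SWE-bench instance schema
    # (base_commit, patch, test_patch, FAIL_TO_PASS/PASS_TO_PASS) and is used
    # here to exercise the sandbox, not an actual BioCoder task.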
    EXAMPLE_INSTANCE = {
        'repo': 'django/django',
        'instance_id': 'django__django-11099',
        'base_commit': 'd26b2424437dabeeca94d7900b37d2df4410da0c',
'patch': "diff --git a/django/contrib/auth/validators.py b/django/contrib/auth/validators.py\n--- a/django/contrib/auth/validators.py\n+++ b/django/contrib/auth/validators.py\n@@ -7,7 +7,7 @@\n \n @deconstructible\n class ASCIIUsernameValidator(validators.RegexValidator):\n- regex = r'^[\\w.@+-]+$'\n+ regex = r'^[\\w.@+-]+\\Z'\n message = _(\n 'Enter a valid username. This value may contain only English letters, '\n 'numbers, and @/./+/-/_ characters.'\n@@ -17,7 +17,7 @@ class ASCIIUsernameValidator(validators.RegexValidator):\n \n @deconstructible\n class UnicodeUsernameValidator(validators.RegexValidator):\n- regex = r'^[\\w.@+-]+$'\n+ regex = r'^[\\w.@+-]+\\Z'\n message = _(\n 'Enter a valid username. This value may contain only letters, '\n 'numbers, and @/./+/-/_ characters.'\n",
'test_patch': "diff --git a/tests/auth_tests/test_validators.py b/tests/auth_tests/test_validators.py\n--- a/tests/auth_tests/test_validators.py\n+++ b/tests/auth_tests/test_validators.py\n@@ -237,7 +237,7 @@ def test_unicode_validator(self):\n invalid_usernames = [\n \"o'connell\", \"عبد ال\",\n \"zerowidth\\u200Bspace\", \"nonbreaking\\u00A0space\",\n- \"en\\u2013dash\",\n+ \"en\\u2013dash\", 'trailingnewline\\u000A',\n ]\n v = validators.UnicodeUsernameValidator()\n for valid in valid_usernames:\n@@ -250,7 +250,7 @@ def test_unicode_validator(self):\n \n def test_ascii_validator(self):\n valid_usernames = ['glenn', 'GLEnN', 'jean-marc']\n- invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\"]\n+ invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\", 'trailingnewline\\n']\n v = validators.ASCIIUsernameValidator()\n for valid in valid_usernames:\n with self.subTest(valid=valid):\n",
'problem_statement': "UsernameValidator allows trailing newline in usernames\nDescription\n\t\nASCIIUsernameValidator and UnicodeUsernameValidator use the regex \nr'^[\\w.@+-]+$'\nThe intent is to only allow alphanumeric characters as well as ., @, +, and -. However, a little known quirk of Python regexes is that $ will also match a trailing newline. Therefore, the user name validators will accept usernames which end with a newline. You can avoid this behavior by instead using \\A and \\Z to terminate regexes. For example, the validator regex could be changed to\nr'\\A[\\w.@+-]+\\Z'\nin order to reject usernames that end with a newline.\nI am not sure how to officially post a patch, but the required change is trivial - using the regex above in the two validators in contrib.auth.validators.\n",
        'hints_text': '',
        'created_at': '2019-03-20T03:46:18Z',
        'version': '3.0',
'FAIL_TO_PASS': '["test_ascii_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_unicode_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_help_text (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)"]',
'PASS_TO_PASS': '["test_help_text (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_validate (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_help_text (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_validate_property (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_empty_password_validator_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_get_default_password_validators (auth_tests.test_validators.PasswordValidationTest)", "test_get_password_validators_custom (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed_with_custom_validator (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html_escaping (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_texts (auth_tests.test_validators.PasswordValidationTest)", "test_validate_password (auth_tests.test_validators.PasswordValidationTest)", "test_help_text (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_custom_list (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_django_supplied_file (auth_tests.test_validators.CommonPasswordValidatorTest)"]',
        'environment_setup_commit': '419a78300f7cd27611196e1e464d50fd0385ff27',
    }

    sandbox = BiocoderSSHBox.get_box_for_instance(instance=EXAMPLE_INSTANCE)

    # in actual eval, this will be initialized by the controller
    sandbox.init_plugins([JupyterRequirement(), SWEAgentCommandsRequirement()])

    # PRE TEST
    exit_code, output = sandbox.execute('cd $REPO_PATH')
    assert exit_code == 0, 'Failed to cd $REPO_PATH'
    logger.info(f'cd $REPO_PATH: {output}')

    # apply test patch
    exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/test.patch')
    assert exit_code == 0, 'Failed to apply test patch'
    logger.info(f'git apply $SWE_TASK_DIR/test.patch: {output}')

    # TEST
    exit_code, output = sandbox.execute(
        './tests/runtests.py --verbosity 2 auth_tests.test_validators'
    )
    assert exit_code == 1, 'Expected exit code 1 (since this is a FAIL_TO_PASS)'
    logger.info(f'$TEST_CMD:\n{output}')

    # apply gold patch
    exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/gold.patch')
    logger.info('exit code: %d', exit_code)
    logger.info(f'git apply $SWE_TASK_DIR/gold.patch: {output}')

    # TEST
    exit_code, output = sandbox.execute(
        './tests/runtests.py --verbosity 2 auth_tests.test_validators'
    )
    assert exit_code == 0, 'Expected exit code 0 (since we applied the gold patch)'
    logger.info(f'$TEST_CMD:\n{output}')

    # Reset the repo
    exit_code, output = sandbox.execute('git reset --hard')
    assert exit_code == 0, 'Failed to reset the repo'
    logger.info(f'git reset --hard: {output}')

    bg_cmd = sandbox.execute_in_background(
        "while true; do echo 'dot ' && sleep 10; done"
    )

    sys.stdout.flush()
    try:
        while True:
            try:
                user_input = input('>>> ')
            except EOFError:
                logger.info('Exiting...')
                break
            if user_input.lower() == 'exit':
                logger.info('Exiting...')
                break
            if user_input.lower() == 'kill':
                sandbox.kill_background(bg_cmd.pid)
                logger.info('Background process killed')
                continue
            exit_code, output = sandbox.execute(user_input)
            logger.info('exit code: %d', exit_code)
            logger.info(output)
            if bg_cmd.pid in sandbox.background_commands:
                logs = sandbox.read_logs(bg_cmd.pid)
                logger.info('background logs: %s', logs)
            sys.stdout.flush()
    except KeyboardInterrupt:
        logger.info('Exiting...')
    sandbox.close()