Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BioCoder integration #2076

Merged
merged 16 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions evaluation/biocoder/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# BioCoder Evaluation with OpenDevin

Implements evaluation of agents on the BioCoder benchmark, introduced in [BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models](https://arxiv.org/abs/2308.16458). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.

## Setup Environment

Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.


## Configure OpenDevin and your LLM
Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.


## Start the evaluation


```bash
./evaluation/biocoder/scripts/run_infer.sh [model_config] [agent] [eval_limit]
```

where `model_config` is mandatory, while `agent` and `eval_limit` are optional.

- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
LLM settings, as defined in your `config.toml`.

- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
to `CodeActAgent`.

- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.

Let's say you'd like to run 1 instance using `eval_gpt4o_2024_05_13` and CodeActAgent,
then your command would be:

## Examples

```bash
./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent 1
```

## Reference
```
@misc{tang2024biocoder,
title={BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models},
author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein},
year={2024},
eprint={2308.16458},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
```
236 changes: 236 additions & 0 deletions evaluation/biocoder/biocoder_env_box.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
import json
import os
import sys

from opendevin.core.config import config
from opendevin.core.logger import opendevin_logger as logger
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.plugins import (
JupyterRequirement,
PluginRequirement,
SWEAgentCommandsRequirement,
)

BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'


class BiocoderSSHBox(DockerSSHBox):
    """SSH sandbox used to evaluate a single BioCoder benchmark instance.

    Build instances with :meth:`get_box_for_instance`, which stages the
    instance's test files on the host, starts the container and downloads the
    instance's repository archive inside it.
    """

    # Maps the lower-cased ``instance['language']`` to the file extension used
    # for the staged context/generated/golden source files.
    _FILE_EXTENSIONS = {
        'python': 'py',
        'java': 'java',
        'c': 'c',
        'cpp': 'cpp',
        'javascript': 'js',
        'typescript': 'ts',
    }

    def __init__(
        self,
        container_image: str,
        timeout: int = 120,
        sid: str | None = None,
        biocoder_instance_id: str | None = None,
        biocoder_instance: dict | None = None,
        skip_workspace_mount: bool = True,
        sandbox_plugins: list[PluginRequirement] | None = None,
    ):
        """Create the sandbox and initialise its plugins.

        Args:
            container_image: Image passed through to ``DockerSSHBox``.
            timeout: Passed through to ``DockerSSHBox``.
            sid: Passed through to ``DockerSSHBox``.
            biocoder_instance_id: Identifier of the BioCoder instance; required.
            biocoder_instance: The instance record itself, stored for later use.
            skip_workspace_mount: When true, :attr:`volumes` omits the volume
                bound to ``self.sandbox_workspace_dir``.
            sandbox_plugins: Plugins handed to ``init_plugins``; ``None`` means
                no plugins.

        Raises:
            ValueError: If ``biocoder_instance_id`` or ``container_image`` is
                ``None``.
        """
        if biocoder_instance_id is None:
            raise ValueError('biocoder_instance_id must be provided')
        # Raise instead of assert so the check survives ``python -O``.
        if container_image is None:
            raise ValueError('container_image is required for BiocoderSSHBox!')

        # These attributes are set before super().__init__ because the
        # ``volumes`` property reads ``skip_workspace_mount``.
        # NOTE(review): presumably the base constructor consults ``volumes``
        # while starting the container -- confirm against DockerSSHBox.
        self.biocoder_instance_id = biocoder_instance_id
        self.biocoder_instance = biocoder_instance
        self.skip_workspace_mount = skip_workspace_mount

        super().__init__(container_image, timeout, sid)
        self.init_plugins([] if sandbox_plugins is None else sandbox_plugins)

    @property
    def volumes(self):
        """Return the parent's volume mapping, optionally without the workspace.

        When ``skip_workspace_mount`` is true, every entry whose ``'bind'``
        equals ``self.sandbox_workspace_dir`` is filtered out.
        """
        volumes = super().volumes
        if not self.skip_workspace_mount:
            return volumes
        return {
            name: spec
            for name, spec in volumes.items()
            if spec['bind'] != self.sandbox_workspace_dir
        }

    def get_changed_code(self):
        """Placeholder; currently does nothing and returns ``None``."""
        # use biocoder_instance to get all data
        pass

    def execute_and_check(self, cmd: str, error_msg: str) -> tuple[int, str]:
        """Run ``cmd`` in the sandbox and abort the process if it fails.

        Args:
            cmd: Command passed to ``self.execute``.
            error_msg: Message logged when the command exits non-zero.

        Returns:
            The ``(exit_code, output)`` pair from ``self.execute``; the exit
            code is always ``0`` when this method returns.

        Raises:
            SystemExit: With status 1 when the command exits non-zero.
        """
        exit_code, output = self.execute(cmd)
        if exit_code != 0:
            # Include the exit code and output so the failure can be diagnosed.
            logger.error('%s (exit code %s): %s', error_msg, exit_code, output)
            sys.exit(1)
        return exit_code, output

    @classmethod
    def _stage_test_files(cls, instance, cache_dir: str) -> None:
        """Write the instance's source files and test-case metadata to ``cache_dir``."""
        language = instance['language'].lower()
        # An unsupported language raises KeyError here, before anything is written.
        file_ext = cls._FILE_EXTENSIONS[language]

        os.makedirs(cache_dir, exist_ok=True)

        # NOTE(review): ``generated`` is seeded with the golden solution --
        # presumably a placeholder that is later overwritten with the agent's
        # output; confirm against the evaluation harness.
        sources = {
            'context': instance['contextCode'],
            'generated': instance['goldenCode'],
            'golden': instance['goldenCode'],
        }
        for stem, code in sources.items():
            source_path = os.path.join(cache_dir, f'{stem}.{file_ext}')
            with open(source_path, 'w', encoding='utf-8') as f:
                f.write(code)

        testcase_json = {
            'test_case_id': instance['test_case_id'],
            'num_cases': 1000,
            'language': language,
        }
        testcase_path = os.path.join(cache_dir, 'testcase_biocoder.json')
        with open(testcase_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(testcase_json, indent=4))

    @classmethod
    def get_box_for_instance(
        cls,
        instance,
        n_tries=5,
        skip_workspace_mount: bool = False,
        workspace_mount_path: str | None = None,
        sandbox_plugins: list[PluginRequirement] | None = None,
    ) -> 'BiocoderSSHBox':
        """Initialize a container for ``instance`` and run its setup commands.

        The method points the global ``config`` at ``workspace_mount_path``,
        enables ``config.enable_auto_lint``, writes the instance's test files
        into ``<workspace_mount_path>/biocoder_cache``, starts the sandbox,
        downloads and unzips the instance's repository archive in
        ``/workspace``, and copies the cache folder to ``/testing_files/``.

        Args:
            instance: BioCoder record; reads the ``language``,
                ``contextCode``, ``goldenCode``, ``test_case_id`` and
                ``repository`` keys.
            n_tries: Currently unused; kept for interface compatibility.
                NOTE(review): a reviewer pointed out that it is unused and
                suggested tenacity's
                ``@retry(stop=stop_after_attempt(5), wait=wait_fixed(5))``
                if retries are needed.
            skip_workspace_mount: Forwarded to the constructor.
            workspace_mount_path: Host workspace directory; required.
            sandbox_plugins: Forwarded to the constructor.

        Returns:
            The started and prepared sandbox.

        Raises:
            ValueError: If ``workspace_mount_path`` is ``None``.
            KeyError: If ``instance['language']`` is not a supported language.
            SystemExit: If any setup command in the sandbox fails.
        """
        # Fail early with a clear message, before mutating the global config;
        # os.path.join would otherwise raise an opaque TypeError.
        if workspace_mount_path is None:
            raise ValueError('workspace_mount_path must be provided')

        config.workspace_base = workspace_mount_path
        config.workspace_mount_path = workspace_mount_path

        workspace_test_folder = 'biocoder_cache'
        cls._stage_test_files(
            instance, os.path.join(workspace_mount_path, workspace_test_folder)
        )

        # linting python after editing helps LLM fix indentations
        config.enable_auto_lint = True

        sandbox = cls(
            container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
            biocoder_instance_id=instance['test_case_id'],
            biocoder_instance=instance,
            skip_workspace_mount=skip_workspace_mount,
            sandbox_plugins=sandbox_plugins,
        )

        logger.info(f"SSH box started for instance {instance['test_case_id']}.")
        # cd to the workspace
        _, output = sandbox.execute_and_check(
            'cd /workspace', 'Failed to cd to workspace'
        )
        logger.info(f'cd to workspace: {output}')

        # download repository archive
        repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance['repository'].split('/')[1]}.zip"
        _, output = sandbox.execute_and_check(
            'wget -O repo.zip ' + repository_url, 'Failed to download the repository'
        )
        logger.info(f'Downloaded the repository: {output}')
        _, output = sandbox.execute_and_check(
            'unzip -o -q repo.zip', 'Failed to unzip the repository'
        )
        logger.info(f'Unzipped the repository: {output}')

        # copy the context, generated and golden files to the /testing_files folder
        sandbox.execute_and_check(
            f'cp -r /workspace/{workspace_test_folder}/ /testing_files/',
            'Failed to copy the files',
        )

        return sandbox


if __name__ == '__main__':
EXAMPLE_INSTANCE = {
'signature': 'def sanitize_tex(original_text)',
'numLines': 27,
'repository': 'pgxcentre/genipe',
'lineEnd': 84,
'promptSummaryOnly': 'This is in python\nwrite a function that takes in a string as an argument and returns a sanitized version of the string suitable for LaTeX formatting. The function should follow these steps:\n\n1. Replace any occurrences of four backslashes (\\\\\\\\) with the string \\\\textbackslash.\n2. Escape certain characters, including $, %, _, }, {, &, and #, by adding a backslash before them.\n3. Replace special characters such as tilde (~) with the corresponding LaTeX equivalent (e.g. $\\\\sim$).\nThe function should be named sanitize_tex and should have one argument named original_text. It should return the sanitized version of the string.',
'content': "def sanitize_tex(original_text):\n \"\"\"Sanitize TeX text.\n\n Args:\n original_text (str): the text to sanitize for LaTeX\n\n Text is sanitized by following these steps:\n\n 1. Replaces ``\\\\\\\\`` by ``\\\\textbackslash``\n 2. Escapes certain characters (such as ``$``, ``%``, ``_``, ``}``, ``{``,\n ``&`` and ``#``) by adding a backslash (*e.g.* from ``&`` to ``\\\\&``).\n 3. Replaces special characters such as ``~`` by the LaTeX equivalent\n (*e.g.* from ``~`` to ``$\\\\sim$``).\n\n \"\"\"\n sanitized_tex = original_text.replace(\\'\\\\\\', \\'\\\\textbackslash \\')\n sanitized_tex = re.sub(\\'([{}])\\'.format(\\'\\'.join(_escaped_char)),\n \\'\\\\\\\\\\\\g<1>\\', sanitized_tex)\n for character, mod in _char_mod.items():\n sanitized_tex = sanitized_tex.replace(character, mod)\n return sanitized_tex\n",
'comment': 'Sanitize TeX text.\n\nArgs:\n original_text (str): the text to sanitize for LaTeX\n\nText is sanitized by following these steps:\n\n1. Replaces ``\\\\`` by ``\\textbackslash``\n2. Escapes certain characters (such as ``$``, ``%``, ``_``, ``}``, ``{``,\n ``&`` and ``#``) by adding a backslash (*e.g.* from ``&`` to ``\\&``).\n3. Replaces special characters such as ``~`` by the LaTeX equivalent\n (*e.g.* from ``~`` to ``$\\sim$``).',
'filePath': 'genipe/reporting/utils.py',
'contextCode': "import random\nimport hashlib\nimport numpy as np\nimport skimage\nimport skimage.measure\nimport scipy.ndimage\nimport os\nimport logging\nfrom functools import wraps\nfrom scipy import stats\nimport sys\nimport math\nimport subprocess\nfrom pathlib import PurePath\nfrom itertools import islice\nimport pysam\nimport pandas as pd\nfrom scipy.signal import savgol_coeffs, savgol_filter\nfrom scipy.stats import norm\nimport re\nimport fileinput\nimport warnings\nfrom scipy.stats import scoreatpercentile, chisquare\nfrom sklearn.preprocessing import scale\nfrom sklearn.cluster import KMeans, AgglomerativeClustering\n_char_mod = {\\'~\\': \\'$\\\\sim$\\'}\n_escaped_char = [\\'$\\', \\'%\\', \\'_\\', \\'}\\', \\'{\\', \\'&\\', \\'#\\']\ndef format_time(total_seconds, written_time=False):\n \"\"\"Format time (either \"HH:MM:SS\" or \"H hours, M minutes and S seconds\".\n Args:\n total_seconds (int): the total number of seconds\n written_time (bool): whether to write time in written language\n Returns:\n str: a string representation of the total time\n If ``written_time`` is ``True``, time will be displayed as \"H hours, M\n minutes and S seconds\". 
Otherwise, the time will be represented as\n HH:MM:SS.\n \"\"\"\n time_fmt = \\'{hours:02d}:{minutes:02d}:{seconds:02d}\\'\n minutes, seconds = divmod(total_seconds, 60)\n hours, minutes = divmod(minutes, 60)\n if not written_time:\n return time_fmt.format(seconds=seconds, minutes=minutes, hours=hours)\n written_time = []\n if hours > 0:\n written_time.append(\\'{} hour{}\\'.format(hours, \\'s\\' if hours > 1 else \\'\\')\n )\n if minutes > 0:\n written_time.append(\\'{} minute{}\\'.format(minutes, \\'s\\' if minutes > \n 1 else \\'\\'))\n if seconds > 0:\n written_time.append(\\'{} second{}\\'.format(seconds, \\'s\\' if seconds > \n 1 else \\'\\'))\n if len(written_time) == 0:\n return \\'no time\\'\n if len(written_time) == 1:\n return written_time[0]\n return \\', \\'.join(written_time[:-1]) + \\' and \\' + written_time[-1]\ndef colorize_time(total_seconds):\n \"\"\"Colorize the time.\n Args:\n total_seconds (int): the total number of seconds\n Returns:\n str: a colorized LaTeX string representation of time\n The time is displayed as ``HH:MM:SS``, but insignificant zeros are\n grayed-out.\n \"\"\"\n formatted_time = format_time(total_seconds)\n colored_time = formatted_time\n to_color = re.match(\\'([0:]+)\\', formatted_time)\n if to_color is not None:\n colored_time = \\'{\\\\color{light_gray}\\'\n colored_time += formatted_time[:to_color.end()]\n colored_time += \\'}\\' + formatted_time[to_color.end():]\n return colored_time\n<<insert solution here>>\ndef main():\n random.seed(<|int;range=0,100|>)\n argString = \\'\\'.join([random.choice(_escaped_char) for _ in range(100)])\n print(sanitize_tex(argString))\nif __name__ == \"__main__\":\n main()\n",
'goldenCode': "def sanitize_tex(original_text):\n \"\"\"Sanitize TeX text.\n\n Args:\n original_text (str): the text to sanitize for LaTeX\n\n Text is sanitized by following these steps:\n\n 1. Replaces ``\\\\\\\\`` by ``\\\\textbackslash``\n 2. Escapes certain characters (such as ``$``, ``%``, ``_``, ``}``, ``{``,\n ``&`` and ``#``) by adding a backslash (*e.g.* from ``&`` to ``\\\\&``).\n 3. Replaces special characters such as ``~`` by the LaTeX equivalent\n (*e.g.* from ``~`` to ``$\\\\sim$``).\n\n \"\"\"\n sanitized_tex = original_text.replace(\\'\\\\\\', \\'\\\\textbackslash \\')\n sanitized_tex = re.sub(\\'([{}])\\'.format(\\'\\'.join(_escaped_char)),\n \\'\\\\\\\\\\\\g<1>\\', sanitized_tex)\n for character, mod in _char_mod.items():\n sanitized_tex = sanitized_tex.replace(character, mod)\n return sanitized_tex",
'test_case_id': '61beb3529846e024cdff01d3e2ba1a1ec4212dd64426028deb1065f1975bd376',
'lineStart': 58,
'language': 'Python',
}

# Manual smoke test: build a sandbox for the example instance, run the
# BioCoder test harness once, then drop into a small interactive shell.
sandbox = BiocoderSSHBox.get_box_for_instance(
    instance=EXAMPLE_INSTANCE,
    workspace_mount_path='/home/ubuntu/OpenDevinBioCoder/workspace',
    skip_workspace_mount=False,
    sandbox_plugins=[JupyterRequirement(), SWEAgentCommandsRequirement()],
)

# Everything after sandbox creation runs under try/finally so the sandbox is
# closed even when a check fails (AssertionError, or SystemExit raised by
# execute_and_check).
try:
    # PRE TEST
    _, output = sandbox.execute_and_check(
        'cd /testing',
        'Failed to cd /testing',
    )
    logger.info(f'cd /testing: {output}')

    _, output = sandbox.execute_and_check(
        'whoami',
        'Failed to run whoami',
    )
    logger.info(f'whoami: {output}')

    # TEST
    exit_code, output = sandbox.execute(
        '/home/devin/mambaforge/bin/mamba run -n base python3 /testing/start_test_opendevin.py'
    )
    assert exit_code == 0, 'Expected exit code 0 (this should have passed)'
    logger.info(f'$TEST_CMD:\n{output}')

    _, output = sandbox.execute_and_check(
        'cat /testing/result_biocoder.json', 'Failed to read the result file'
    )
    # Log the result file so the read is not silently discarded.
    logger.info(f'result_biocoder.json:\n{output}')

    bg_cmd = sandbox.execute_in_background(
        "while true; do echo 'dot ' && sleep 10; done"
    )

    sys.stdout.flush()
    try:
        # Interactive loop: 'exit' or EOF quits, 'kill' stops the background
        # command, anything else is executed inside the sandbox.
        while True:
            try:
                user_input = input('>>> ')
            except EOFError:
                logger.info('Exiting...')
                break
            if user_input.lower() == 'exit':
                logger.info('Exiting...')
                break
            if user_input.lower() == 'kill':
                sandbox.kill_background(bg_cmd.pid)
                logger.info('Background process killed')
                continue
            exit_code, output = sandbox.execute(user_input)
            logger.info('exit code: %d', exit_code)
            logger.info(output)
            if bg_cmd.pid in sandbox.background_commands:
                logs = sandbox.read_logs(bg_cmd.pid)
                logger.info('background logs: %s', logs)
            sys.stdout.flush()
    except KeyboardInterrupt:
        logger.info('Exiting...')
finally:
    sandbox.close()
Loading