
Commit 6b16a5d

xingyaoww and neubig authored
[Eval,Arch] Update GPTQ eval and add headless_mode for Controller (#2994)
* update and polish gptq eval
* fix typo
* Update evaluation/gpqa/README.md
  Co-authored-by: Graham Neubig <[email protected]>
* Update evaluation/gpqa/run_infer.py
  Co-authored-by: Graham Neubig <[email protected]>
* add headless mode to all appropriate agent controller call
* delegate set to error when in headless mode
* try to deduplicate a bit
* make headless_mode default to True and only change it to false for AgentSession

---------

Co-authored-by: Graham Neubig <[email protected]>
1 parent dada004 · commit 6b16a5d

File tree

8 files changed: +172 -74 lines changed


agenthub/codeact_agent/action_parser.py

+1 -1
@@ -98,7 +98,7 @@ def parse(self, action_str: str) -> Action:
             # a command was found
             command_group = self.bash_command.group(1).strip()
             if command_group.strip() == 'exit':
-                return AgentFinishAction()
+                return AgentFinishAction(thought=thought)
             return CmdRunAction(command=command_group, thought=thought)
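
For context, a minimal self-contained sketch (assumed classes, not the repository's exact code) of why the parser now passes `thought` through: the GPQA evaluation below inspects the finish action's `thought` to recover the final `<<FINAL_ANSWER||...||FINAL_ANSWER>>` block, which an empty `AgentFinishAction()` would have dropped.

```python
# Hypothetical illustration: with `thought` propagated, the finishing action still
# carries the agent's last message for downstream consumers to parse.
from dataclasses import dataclass


@dataclass
class AgentFinishAction:
    thought: str = ''  # previously left empty when the agent ran `exit`


def final_report(events: list) -> str | None:
    """Return the thought attached to the most recent finish action, if any."""
    for event in reversed(events):
        if isinstance(event, AgentFinishAction) and event.thought:
            return event.thought
    return None


history = [AgentFinishAction(thought='<<FINAL_ANSWER||\nB\n||FINAL_ANSWER>>')]
assert final_report(history) is not None
```
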

evaluation/browsing_delegation/run_infer.py

+1 -4
@@ -67,10 +67,7 @@ def process_instance(

     state: State | None = asyncio.run(
         run_agent_controller(
-            agent,
-            instruction,
-            max_iterations=metadata.max_iterations,
-            sid=env_id,
+            agent, instruction, max_iterations=metadata.max_iterations, sid=env_id
         )
     )

evaluation/gpqa/README.md

+1 -13
@@ -15,10 +15,6 @@ Further references:
 - https://paperswithcode.com/dataset/gpqa
 - https://github.com/idavidrein/gpqa

-## TODOs
-- [ ] Add support for other agents (currently only tested on `CodeActAgent`)
-- [ ] Complete full benchmark evaluation
-- [ ] Fix intermittent `BrowserException: Failed to start browser environment` error

 ## Setup Environment

@@ -27,19 +23,11 @@ Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/D

 ## Configure OpenDevin and your LLM

-Create a `config.toml` file if it does not exist at the root of the workspace.
+Create a `config.toml` file (you can copy from `config.template.toml`) if it does not exist at the root of the workspace.

 Add the following configurations:

 ```toml
-[core]
-max_iterations = 100
-cache_dir = "/tmp/cache"
-ssh_hostname = "localhost"
-
-[sandbox]
-enable_auto_lint = true
-
 # TODO: Change these to the model you want to evaluate
 [llm.eval_gpt4_1106_preview]
 model = "gpt-4-1106-preview"

evaluation/gpqa/run_infer.py

+145 -52
@@ -22,6 +22,7 @@
 import pathlib
 import random
 import re
+from typing import Callable

 import pandas as pd
 from datasets import load_dataset
@@ -39,51 +40,82 @@
 from opendevin.core.logger import get_console_handler
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.main import run_agent_controller
+from opendevin.events.action import Action, AgentFinishAction, MessageAction
+from opendevin.events.observation import Observation
 from opendevin.llm.llm import LLM

-AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
-    'CodeActAgent': codeact_user_response,
-}
+ACTION_FORMAT = """
+<<FINAL_ANSWER||
+<insert correct answer here, must be one of A, B, C, D> (Please dont use any additional characters. Just the letter of the correct answer (A/B/C/D).)
+||FINAL_ANSWER>>
+""".strip()
+
+
+def gpqa_codeact_user_response(
+    state: State,
+    encapsulate_solution: bool = False,
+    try_parse: Callable[[Action], str] | None = None,
+) -> str:
+    msg = (
+        'Please continue working on the task on whatever approach you think is suitable.\n'
+        'Feel free to use all tools for calculations and solving the problem, and web-search for finding relevant facts during the process if needed\n'
+        'If you have finished reporting the answer in the expected format, (and only once that is done), please run the following command to submit: <execute_bash> exit </execute_bash>.\n'
+        'Again you are being told a million times to first report the answer in the requested format (see again below for reference) before exiting. DO NOT EXIT WITHOUT REPORTING THE ANSWER FIRST.\n'
+        'That is, when you have decided on the answer report in the following format:\n'
+        f'{ACTION_FORMAT}\n'
+        '<execute_bash> exit </execute_bash>\n'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP TO SOLVE THIS TASK.\n'
+    )
+
+    return msg
+
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {'CodeActAgent': codeact_user_response}

 AGENT_CLS_TO_INST_SUFFIX = {
     'CodeActAgent': '\n\n SUPER IMPORTANT: When you think you have solved the question, first report it back to the user in the requested format. Only once that is done, in the next turn, please run the following command: <execute_bash> exit </execute_bash>.\n'
 }


-def parse_final_answer(final_answer: str) -> str:
+def parse_final_answer(final_answer: str | None) -> str | None:
     """Parse the final answer from the final message generated by the agent
     to extract the final answer. The final answer is usually enclosed in the format:
     <<FINAL_ANSWER||
     <insert correct answer here>
     ||FINAL_ANSWER>>
     """
+    # to do this first extract the part enclosed in the format <<FINAL_ANSWER|| ... ||FINAL_ANSWER>>
     pattern = re.compile(r'<<FINAL_ANSWER\|\|(.*?)\|\|FINAL_ANSWER>>', re.DOTALL)
     match = pattern.search(final_answer)

-    if match:
-        return match.group(1).strip()
-    else:
-        return 'No final answer found in the provided string.'
+    # and then strip it, remove any leading/trailing spaces line breaks etc.
+    answer = match.group(1).strip()
+    # finally capitalize it
+    answer = answer.upper()
+    # and then return A, B, C, D depending on whether the answer A, B, C, D is found in the final answer
+    for letter in ['A', 'B', 'C', 'D']:
+        if letter in answer:
+            return letter


-def compare_answers(predicted_answer, ground_truth):
+def compare_answers(model_output: str | None, ground_truth: str):
     """Compare the predicted answer with the ground truth answer"""
+    try:
+        # parse the final answer from model output
+        predicted_answer = parse_final_answer(model_output)
+    except Exception as e:
+        # Log the exception
+        logger.error(f'An error occurred: {e}\n defaulting to random guess ...')
+        # choose a random answer if the model output is not in the correct format
+        predicted_answer = random.choice(['A', 'B', 'C', 'D'])
+
+    logger.info('#############################################')
+    logger.info(f'Predicted answer: {predicted_answer}')
+    logger.info(f'Ground truth answer: {ground_truth}')
+    logger.info('#############################################')
     return predicted_answer == ground_truth


-def get_test_result(model_output, ground_truth):
-    """Implements the evaluation logic for GPQA
-    Checks if the output of a given instance is correct (as per the ground truth)
-    """
-    # parse the final answer from model output
-    predicted_answer = parse_final_answer(model_output)
-
-    # check if the model output matches the ground truth
-    result = compare_answers(predicted_answer, ground_truth)
-
-    return result
-
-
 def convert_instance_dict(instance):
     """Used for preprocessing the hf dataset into a format that can be used by the agent.
     Reads and extracts relevant information from the dataset instance.
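
A short, self-contained sketch of how the parsing and comparison above behave end to end (the example message and letters are made up; the real `compare_answers` additionally falls back to a random choice when parsing raises):

```python
import re


def parse_letter(final_answer: str) -> str | None:
    """Mirror of the extraction logic: pull the block, normalize, return A/B/C/D."""
    match = re.search(
        r'<<FINAL_ANSWER\|\|(.*?)\|\|FINAL_ANSWER>>', final_answer, re.DOTALL
    )
    answer = match.group(1).strip().upper()  # raises AttributeError if no block is found
    for letter in ('A', 'B', 'C', 'D'):
        if letter in answer:
            return letter
    return None


msg = 'Some reasoning...\n<<FINAL_ANSWER||\n (c) \n||FINAL_ANSWER>>'
assert parse_letter(msg) == 'C'   # normalized to a single letter
assert parse_letter(msg) != 'B'   # comparison against ground truth is plain equality
```
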
@@ -163,27 +195,33 @@ def process_instance(
        # ======= Run the agent on the instance =======
        # Prepare instruction for the agent using suggested format in gpqa codebase
        instruction = f"""
-What is the correct answer to this question:\n
-{instance['question']}\n
+What is the correct answer to this question:\n
+{instance['question']}\n

-Choices:\n
-(A) {instance['choices'][0]}\n
-(B) {instance['choices'][1]}\n
-(C) {instance['choices'][2]}\n
-(D) {instance['choices'][3]}\n
-\n\n
+Choices:\n
+(A) {instance['choices'][0]}\n
+(B) {instance['choices'][1]}\n
+(C) {instance['choices'][2]}\n
+(D) {instance['choices'][3]}\n
+\n\n

-MOST IMPORTANT: Format your response as follows:
-<<FINAL_ANSWER||
-<insert correct answer here, must be one of A, B, C, D> (Please dont use any additional characters. Just the letter of the correct answer (A/B/C/D).)
-||FINAL_ANSWER>>
+MOST IMPORTANT: Format your response as follows:
+{ACTION_FORMAT}

-Additional Instructions:
-- You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.
-"""
+Additional Instructions:
+- Do not try to solve the question in a single step. Break it down into smaller steps.
+- You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.

-        # NOTE: You can actually set slightly different instruction for different agents
-        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
+- SUPER IMPORTANT: When you have reported the answer to the user in the requested format, (and only once that is done) in the next turn, please run the following command: <execute_bash> exit </execute_bash>.
+- Again you are being told a million times to first report the answer in the requested format (see again below for reference) before exiting. DO NOT EXIT WITHOUT REPORTING THE ANSWER FIRST.
+That is, when you have decided on the answer report in the following format:
+
+{ACTION_FORMAT}
+<execute_bash> exit </execute_bash>
+
+Again do not quit without reporting the answer first.
+Ok now its time to start solving the question. Good luck!
+"""

        # Here's how you can run the agent (similar to the `main` function) and get the final task state
        state: State | None = asyncio.run(
@@ -194,18 +232,69 @@ def process_instance(
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                    agent.__class__.__name__
                ),
-                sid=instance.instance_id,
+                sid=f'gptq_{str(instance.instance_id)}',
            )
        )
        assert state is not None, 'State should not be None.'

        # ======= Attempt to evaluate the agent's edits =======
-        # get the final message from the state history (default to empty if not found)
-        final_message = state.history.get_last_agent_message()

+        question_choices = {
+            'A': instance['choices'][0],
+            'B': instance['choices'][1],
+            'C': instance['choices'][2],
+            'D': instance['choices'][3],
+        }
+        # get the final message from the state history (default to empty if not found)
+        found_answers = {
+            'A': False,
+            'B': False,
+            'C': False,
+            'D': False,
+        }
+        for event in state.history.get_events(reverse=True):
+            if (
+                isinstance(event, AgentFinishAction)
+                and event.source != 'user'
+                and '<<FINAL_ANSWER||' in event.thought
+            ):
+                final_message = event.thought
+                break
+            elif (
+                isinstance(event, MessageAction)
+                and event.source != 'user'
+                and '<<FINAL_ANSWER||' in event.content
+            ):
+                final_message = event.content
+                break
+            elif isinstance(event, Observation):
+                for option, option_text in question_choices.items():
+                    if option_text in event.content:
+                        found_answers[option] = True
+        else:
+            final_message = None
+
+        found_options = [option for option, found in found_answers.items() if found]
+        logger.info('#############################################')
        logger.info(f'Final message generated by the agent: {final_message}')
-
-        test_result = get_test_result(final_message, instance.correct_solution)
+        logger.info('#############################################')
+
+        # check if the model output matches the ground truth
+        test_result = compare_answers(final_message, instance.correct_solution)
+        if final_message is None and len(found_options) > 0:
+            _selected = random.choice(found_options)
+            # if the final message is None, then the agent did not report the answer in the correct format
+            # so we randomly select one of the found options and compare it with the correct solution
+            test_result = _selected == instance.correct_solution
+            logger.info('#############################################')
+            logger.info('Agent did not report the answer in the correct format.')
+            logger.info(f'Found options: {found_options}')
+            logger.info(f'Selected option: {_selected}')
+            logger.info('#############################################')
+
+        logger.info('#############################################')
+        logger.info(f'Test result: {test_result}')
+        logger.info('#############################################')

        # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
        # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
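
The fallback path above can be illustrated with a toy, self-contained example (values invented): when no `<<FINAL_ANSWER||...||FINAL_ANSWER>>` block is found in the history, the evaluation guesses randomly among the options whose text appeared in some observation.

```python
import random

# Suppose the agent never reported an answer, but observations mentioned options B and C.
found_answers = {'A': False, 'B': True, 'C': True, 'D': False}
found_options = [option for option, found in found_answers.items() if found]

final_message = None
if final_message is None and found_options:
    guess = random.choice(found_options)  # 'B' or 'C', chosen uniformly at random
    test_result = guess == 'C'            # compared against a hypothetical ground truth of 'C'
    print(guess, test_result)
```
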
@@ -214,21 +303,20 @@ def process_instance(

        metrics = state.metrics.get() if state.metrics else None

-        # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
-        # for compatibility with the existing output format, we can remake the pairs here
-        # remove when it becomes unnecessary
-        histories = state.history.compatibility_for_eval_history_pairs()
-
        # Save the output
        output = {
            'task_id': instance.task_id,
            'instance_id': instance.instance_id,
            'instruction': instruction,
            'metadata': metadata.model_dump(),
-            'history': histories,
+            'history': state.history.compatibility_for_eval_history_pairs(),
            'metrics': metrics,
            'error': state.last_error if state and state.last_error else None,
-            'test_result': test_result,
+            'test_result': {
+                'result': test_result,
+                'found_answers': found_answers,
+                'last_message': final_message,
+            },
        }

    except Exception:
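
For reference, the richer `test_result` payload written per instance now has roughly this shape (field names from the diff above; the values here are invented):

```python
# Illustrative only: one instance's test_result entry in the output file.
example_test_result = {
    'result': True,  # whether the parsed (or fallback) answer matched the ground truth
    'found_answers': {'A': False, 'B': True, 'C': False, 'D': False},
    'last_message': '<<FINAL_ANSWER||\nB\n||FINAL_ANSWER>>',
}
```
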
@@ -267,9 +355,14 @@ def process_instance(
    gpqa_dataset['task_id'] = gpqa_dataset.index
    # gpqa_dataset = dataset['train'].to_pandas().sort_values(by='id').reset_index(drop=True)

+    if args.agent_cls != 'CodeActAgent':
+        raise ValueError(
+            f'Agent class {args.agent_cls} not supported for GPQA evaluation.'
+        )
+
    metadata = make_metadata(
        llm_config=llm_config,
-        dataset_name='gpqa',
+        dataset_name=args.data_split,
        agent_class=args.agent_cls,
        max_iterations=args.max_iterations,
        eval_note=args.eval_note,

evaluation/gpqa/scripts/run_infer.sh

File mode changed (100644 -> 100755).

opendevin/controller/agent_controller.py

+18 -4
@@ -68,6 +68,7 @@ def __init__(
        max_budget_per_task: float | None = MAX_BUDGET_PER_TASK,
        initial_state: State | None = None,
        is_delegate: bool = False,
+        headless_mode: bool = True,
    ):
        """Initializes a new instance of the AgentController class.
@@ -79,10 +80,12 @@ def __init__(
            max_budget_per_task: The maximum budget (in USD) allowed per task, beyond which the agent will stop.
            initial_state: The initial state of the controller.
            is_delegate: Whether this controller is a delegate.
+            headless_mode: Whether the agent is run in headless mode.
        """
        self._step_lock = asyncio.Lock()
        self.id = sid
        self.agent = agent
+        self.headless_mode = headless_mode

        # subscribe to the event stream
        self.event_stream = event_stream
@@ -293,6 +296,9 @@ async def _step(self) -> None:
            logger.debug(f'[Agent Controller {self.id}] Delegate step done')
            assert self.delegate is not None
            delegate_state = self.delegate.get_agent_state()
+            logger.debug(
+                f'[Agent Controller {self.id}] Delegate state: {delegate_state}'
+            )
            if delegate_state == AgentState.ERROR:
                # close the delegate upon error
                await self.delegate.close()
@@ -345,10 +351,18 @@ async def _step(self) -> None:
                self.state.traffic_control_state = TrafficControlState.NORMAL
            else:
                self.state.traffic_control_state = TrafficControlState.THROTTLING
-                await self.report_error(
-                    f'Agent reached maximum number of iterations, task paused. {TRAFFIC_CONTROL_REMINDER}'
-                )
-                await self.set_agent_state_to(AgentState.PAUSED)
+                if self.headless_mode:
+                    # set to ERROR state if running in headless mode
+                    # since user cannot resume on the web interface
+                    await self.report_error(
+                        'Agent reached maximum number of iterations in headless mode, task stopped.'
+                    )
+                    await self.set_agent_state_to(AgentState.ERROR)
+                else:
+                    await self.report_error(
+                        f'Agent reached maximum number of iterations, task paused. {TRAFFIC_CONTROL_REMINDER}'
+                    )
+                    await self.set_agent_state_to(AgentState.PAUSED)
            return
        elif self.max_budget_per_task is not None:
            current_cost = self.state.metrics.accumulated_cost
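
A minimal, self-contained sketch (not the repository's classes) of the behavioral difference introduced by `headless_mode`: when the iteration limit is hit, a headless run cannot be resumed from the web UI, so it terminates with ERROR instead of pausing.

```python
from enum import Enum


class AgentState(Enum):
    PAUSED = 'paused'
    ERROR = 'error'


def state_on_iteration_limit(headless_mode: bool) -> AgentState:
    """Headless runs stop with ERROR; interactive sessions pause and wait for the user."""
    return AgentState.ERROR if headless_mode else AgentState.PAUSED


assert state_on_iteration_limit(True) is AgentState.ERROR    # eval / CLI runs use the default True
assert state_on_iteration_limit(False) is AgentState.PAUSED  # AgentSession explicitly opts out
```
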

0 commit comments
