Skip to content

Commit 48151bd

Browse files
authored
[feat] WebArena benchmark, MiniWoB++ benchmark and related arch changes (#2170)
* add webarena, and revamp messaging for webarena eval * add changes for browsergym * update infer script * fix unit tests * update * add multiple run for miniwob * update instruction, remove personal path * update * add code for getting final reward, fix integration, add results * add avg cost calculation
1 parent 99c6333 commit 48151bd

File tree

23 files changed

+952
-55
lines changed

23 files changed

+952
-55
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -209,3 +209,4 @@ evaluation/outputs
209209
evaluation/evaluation_outputs
210210
test_results*
211211
/_test_files_tmp/
212+
evaluation/webarena/scripts/webarena_env.sh

agenthub/browsing_agent/browsing_agent.py

+94-29
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import ast
2+
import os
23

34
from browsergym.core.action.highlevel import HighLevelActionSet
45
from browsergym.utils.obs import flatten_axtree_to_str
@@ -12,28 +13,20 @@
1213
BrowseInteractiveAction,
1314
MessageAction,
1415
)
16+
from opendevin.events.event import EventSource
1517
from opendevin.events.observation import BrowserOutputObservation
1618
from opendevin.llm.llm import LLM
1719
from opendevin.runtime.plugins import (
1820
PluginRequirement,
1921
)
2022
from opendevin.runtime.tools import RuntimeTool
2123

22-
23-
def parse_response(response: str) -> Action:
24-
if '```' not in response:
25-
# unexpected response format, message back to user
26-
return MessageAction(response)
27-
thought = response.split('```')[0].strip()
28-
action_str = response.split('```')[1].strip()
29-
# handle send message to user function call in BrowserGym
30-
for sub_action in action_str.split('\n'):
31-
if 'send_msg_to_user(' in sub_action:
32-
tree = ast.parse(sub_action)
33-
args = tree.body[0].value.args # type: ignore
34-
return MessageAction(args[0].value)
35-
36-
return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
24+
USE_NAV = (
25+
os.environ.get('USE_NAV', 'true') == 'true'
26+
) # only disable NAV actions when running webarena and miniwob benchmarks
27+
USE_CONCISE_ANSWER = (
28+
os.environ.get('USE_CONCISE_ANSWER', 'false') == 'true'
29+
) # only return concise answer when running webarena and miniwob benchmarks
3730

3831

3932
class BrowsingAgent(Agent):
@@ -56,13 +49,13 @@ def __init__(
5649
- llm (LLM): The llm to be used by this agent
5750
"""
5851
super().__init__(llm)
52+
# define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
53+
# see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
54+
action_subsets = ['chat', 'bid']
55+
if USE_NAV:
56+
action_subsets.append('nav')
5957
self.action_space = HighLevelActionSet(
60-
# see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
61-
subsets=[
62-
'chat',
63-
'bid',
64-
'nav',
65-
], # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
58+
subsets=action_subsets,
6659
strict=False, # less strict on the parsing of the actions
6760
multiaction=True, # enable to agent to take multiple actions at once
6861
)
@@ -75,6 +68,32 @@ def reset(self) -> None:
7568
"""
7669
super().reset()
7770
self.cost_accumulator = 0
71+
self.error_accumulator = 0
72+
73+
def parse_response(self, response: str) -> Action:
74+
if '```' not in response:
75+
# unexpected response format, message back to user
76+
action_str = f'send_msg_to_user("""{response}""")'
77+
return BrowseInteractiveAction(
78+
browser_actions=action_str,
79+
thought=response,
80+
browsergym_send_msg_to_user=response,
81+
)
82+
thought = response.split('```')[0].strip()
83+
action_str = response.split('```')[1].strip()
84+
# handle send message to user function call in BrowserGym
85+
msg_content = ''
86+
for sub_action in action_str.split('\n'):
87+
if 'send_msg_to_user(' in sub_action:
88+
tree = ast.parse(sub_action)
89+
args = tree.body[0].value.args # type: ignore
90+
msg_content = args[0].value
91+
92+
return BrowseInteractiveAction(
93+
browser_actions=action_str,
94+
thought=thought,
95+
browsergym_send_msg_to_user=msg_content,
96+
)
7897

7998
def step(self, state: State) -> Action:
8099
"""
@@ -91,26 +110,57 @@ def step(self, state: State) -> Action:
91110
"""
92111
goal = state.get_current_user_intent()
93112
messages = []
94-
prev_actions = ''
113+
prev_actions = []
95114
cur_axtree_txt = ''
96115
error_prefix = ''
97116
last_obs = None
117+
last_action = None
118+
if len(state.history) == 1:
119+
# initialize and retrieve the first observation by issuing an noop OP
120+
# TODO: need more elegant way of doing this
121+
return BrowseInteractiveAction(browser_actions='noop()')
98122
for prev_action, obs in state.history:
99123
if isinstance(prev_action, BrowseInteractiveAction):
100-
prev_actions += f'{prev_action.browser_actions}\n'
124+
prev_actions.append(prev_action.browser_actions)
101125
last_obs = obs
126+
last_action = prev_action
102127
elif (
103-
isinstance(prev_action, MessageAction) and prev_action.source != 'user'
128+
isinstance(prev_action, MessageAction)
129+
and prev_action.source == EventSource.AGENT
104130
):
105131
# agent has responded, task finish.
106132
return AgentFinishAction()
107133

134+
prev_action_str = '\n'.join(prev_actions[1:])
135+
# if the final BrowserInteractiveAction exec BrowserGym's send_msg_to_user,
136+
# we should also send a message back to the user in OpenDevin and call it a day
137+
if (
138+
isinstance(last_action, BrowseInteractiveAction)
139+
and last_action.browsergym_send_msg_to_user
140+
):
141+
return MessageAction(last_action.browsergym_send_msg_to_user)
142+
108143
if isinstance(last_obs, BrowserOutputObservation):
109144
if last_obs.error:
110145
# add error recovery prompt prefix
111146
error_prefix = f'IMPORTANT! Last action is incorrect:\n{last_obs.last_browser_action}\nThink again with the current observation of the page.\n'
112-
cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
113-
147+
try:
148+
cur_axtree_txt = flatten_axtree_to_str(
149+
last_obs.axtree_object,
150+
extra_properties=last_obs.extra_element_properties,
151+
with_clickable=True,
152+
filter_visible_only=True,
153+
)
154+
except Exception as e:
155+
logger.error(
156+
'Error when trying to process the accessibility tree: %s', e
157+
)
158+
return MessageAction('Error encountered when browsing.')
159+
160+
if error_prefix:
161+
self.error_accumulator += 1
162+
if self.error_accumulator > 5:
163+
return MessageAction('Too many errors encountered. Task failed.')
114164
system_msg = f"""\
115165
# Instructions
116166
Review the current state of the page and all other information to find the best
@@ -133,24 +183,39 @@ def step(self, state: State) -> Action:
133183
{cur_axtree_txt}
134184
135185
# Previous Actions
136-
{prev_actions}
186+
{prev_action_str}
137187
138188
Here is an example with chain of thought of a valid action when clicking on a button:
139189
"
140190
In order to accomplish my goal I need to click on the button with bid 12
141191
```click("12")```
142192
"
143193
""".strip()
194+
195+
if USE_CONCISE_ANSWER:
196+
concise_instruction = """\
197+
198+
Here is another example with chain of thought of a valid action when providing a concise answer to user:
199+
"
200+
In order to accomplish my goal I need to send the information asked back to the user. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I will send a message back to user with the answer.
201+
```send_msg_to_user("$279.49")```
202+
"
203+
"""
204+
prompt += concise_instruction
144205
messages.append({'role': 'user', 'content': prompt})
145206
response = self.llm.completion(
146207
messages=messages,
147208
temperature=0.0,
209+
stop=[')```', ')\n```'],
148210
)
149211
self.log_cost(response)
150-
action_resp = response['choices'][0]['message']['content']
212+
action_resp = response['choices'][0]['message']['content'].strip()
213+
if not action_resp.endswith('```'):
214+
action_resp = action_resp + ')```'
215+
151216
logger.info(prompt)
152217
logger.info(action_resp)
153-
return parse_response(action_resp)
218+
return self.parse_response(action_resp)
154219

155220
def search_memory(self, query: str) -> list[str]:
156221
raise NotImplementedError('Implement this abstract method')

evaluation/miniwob/README.md

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# MiniWoB++ Evaluation with OpenDevin Browsing Agents
2+
3+
This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym) for easy evaluation of how well an agent capable of browsing can perform on synthetic web browsing tasks.
4+
5+
## Setup OpenDevin Environment
6+
7+
Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
8+
9+
## Configure OpenDevin and your LLM
10+
11+
Create a `config.toml` file if it does not exist at the root of the workspace.
12+
13+
Add the following configurations:
14+
15+
```toml
16+
[core]
17+
max_iterations = 100
18+
cache_dir = "/tmp/cache"
19+
sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
20+
sandbox_type = "ssh"
21+
ssh_hostname = "localhost"
22+
sandbox_timeout = 120
23+
24+
# TODO: Change these to the model you want to evaluate
25+
[eval_gpt4_1106_preview]
26+
model = "gpt-4-1106-preview"
27+
api_key = "XXX"
28+
temperature = 0.0
29+
30+
[eval_some_openai_compatible_model]
31+
model = "openai/MODEL_NAME"
32+
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
33+
api_key = "XXX"
34+
temperature = 0.0
35+
```
36+
37+
## Setup MiniWoB++ Environment and Environment Variables of MiniWoB++
38+
MiniWoB++ requires you to set up a static website that is accessible via URL from the machine running the OpenDevin agents.
39+
40+
- Clone miniwob (use a specific frozen commit for reproducibility)
41+
```sh
42+
git clone git@github.com:Farama-Foundation/miniwob-plusplus.git
43+
git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
44+
```
45+
46+
- Setup Miniwob URL (change `PATH_TO_MINIWOB_CLONED_REPO` here to the absolute path to your `miniwob-plusplus` folder) in `evaluation/miniwob/scripts/run_infer.sh`
47+
```sh
48+
export MINIWOB_URL="file://<PATH_TO_MINIWOB_CLONED_REPO>/miniwob/html/miniwob/"
49+
```
50+
51+
## Test if your environment works
52+
53+
Open the MiniWoB URL above in a browser and check that the pages load correctly.
54+
55+
## Run Evaluation
56+
57+
```sh
58+
bash evaluation/miniwob/scripts/run_infer.sh
59+
```
60+
61+
Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
62+
63+
To calculate the average reward, run:
64+
65+
```sh
66+
poetry run python evaluation/miniwob/get_avg_reward.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl
67+
```
68+
69+
## Submit your evaluation results
70+
71+
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
72+
73+
74+
## BrowsingAgent V1.0 result
75+
76+
Tested on BrowsingAgent V1.0
77+
78+
MiniWoB++, 125 tasks (3 runs due to random init task), max step 10
79+
80+
- GPT4o: 0.384, 0.416, 0.424, avg: 0.408
81+
- GPT3.5: 0.288, 0.256, 0.272, avg: 0.272

evaluation/miniwob/__init__.py

Whitespace-only changes.

evaluation/miniwob/get_avg_reward.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import argparse
2+
import json
3+
4+
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
5+
import gymnasium as gym
6+
7+
def summarize_results(lines, total_num: int) -> tuple[float, float, int]:
    """Aggregate reward and cost over the records of an ``output.jsonl``.

    Args:
        lines: Iterable of JSON strings, one evaluation record per item.
            Each record must provide ``data['metrics']['accumulated_cost']``
            and ``data['test_result']``. Blank lines are ignored.
        total_num: Number of tasks the reward is averaged over. Tasks with
            no record therefore count as zero reward.

    Returns:
        ``(avg_reward, avg_cost, actual_num)`` where ``avg_reward`` is the
        summed reward divided by ``total_num``, ``avg_cost`` is the summed
        cost divided by the number of records read, and ``actual_num`` is
        that number of records. An average whose divisor is zero is
        reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total_reward = 0
    total_cost = 0
    actual_num = 0
    for line in lines:
        if not line.strip():
            # tolerate trailing / empty lines in the jsonl file
            continue
        data = json.loads(line)
        actual_num += 1
        total_cost += data['metrics']['accumulated_cost']
        total_reward += data['test_result']

    avg_reward = total_reward / total_num if total_num else 0.0
    avg_cost = total_cost / actual_num if actual_num else 0.0
    return avg_reward, avg_cost, actual_num


def main() -> None:
    """Parse the command line and print the reward/cost summary."""
    # Parse arguments here rather than at import time so that importing
    # this module never touches sys.argv.
    parser = argparse.ArgumentParser(description='Calculate average reward.')
    parser.add_argument('output_path', type=str, help='path to output.jsonl')
    args = parser.parse_args()

    env_ids = [
        env_id
        for env_id in gym.envs.registry.keys()
        if env_id.startswith('browsergym/miniwob')
    ]
    total_num = len(env_ids)
    print('Total number of tasks: ', total_num)

    with open(args.output_path, 'r') as f:
        avg_reward, avg_cost, actual_num = summarize_results(f, total_num)

    print('Avg Reward: ', avg_reward)
    print('Avg Cost: ', avg_cost)
    print('Actual number of tasks finished: ', actual_num)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)