All-Hands-AI · yufansong · Jun 6, 2024 · Jun 1, 2024 · Jun 1, 2024 · Jun 2, 2024
diff --git a/.gitignore b/.gitignore
@@ -209,3 +209,4 @@ evaluation/outputs
 evaluation/evaluation_outputs
 test_results*
 /_test_files_tmp/
+evaluation/webarena/scripts/webarena_env.sh
diff --git a/agenthub/browsing_agent/browsing_agent.py b/agenthub/browsing_agent/browsing_agent.py
@@ -1,4 +1,5 @@
 import ast
+import os
 
 from browsergym.core.action.highlevel import HighLevelActionSet
 from browsergym.utils.obs import flatten_axtree_to_str
@@ -12,28 +13,20 @@
     BrowseInteractiveAction,
     MessageAction,
 )
+from opendevin.events.event import EventSource
 from opendevin.events.observation import BrowserOutputObservation
 from opendevin.llm.llm import LLM
 from opendevin.runtime.plugins import (
     PluginRequirement,
 )
 from opendevin.runtime.tools import RuntimeTool
 
-
-def parse_response(response: str) -> Action:
-    if '```' not in response:
-        # unexpected response format, message back to user
-        return MessageAction(response)
-    thought = response.split('```')[0].strip()
-    action_str = response.split('```')[1].strip()
-    # handle send message to user function call in BrowserGym
-    for sub_action in action_str.split('\n'):
-        if 'send_msg_to_user(' in sub_action:
-            tree = ast.parse(sub_action)
-            args = tree.body[0].value.args  # type: ignore
-            return MessageAction(args[0].value)
-
-    return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
+USE_NAV = (
+    os.environ.get('USE_NAV', 'true') == 'true'
+)  # only disable NAV actions when running webarena and miniwob benchmarks
+USE_CONCISE_ANSWER = (
+    os.environ.get('USE_CONCISE_ANSWER', 'false') == 'true'
+)  # only return concise answer when running webarena and miniwob benchmarks
 
 
 class BrowsingAgent(Agent):
@@ -56,13 +49,13 @@ def __init__(
         - llm (LLM): The llm to be used by this agent
         """
         super().__init__(llm)
+        # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
+        # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
+        action_subsets = ['chat', 'bid']
+        if USE_NAV:
+            action_subsets.append('nav')
         self.action_space = HighLevelActionSet(
-            # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
-            subsets=[
-                'chat',
-                'bid',
-                'nav',
-            ],  # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
+            subsets=action_subsets,
             strict=False,  # less strict on the parsing of the actions
             multiaction=True,  # enable to agent to take multiple actions at once
         )
@@ -75,6 +68,32 @@ def reset(self) -> None:
         """
         super().reset()
         self.cost_accumulator = 0
+        self.error_accumulator = 0
+
+    def parse_response(self, response: str) -> Action:
+        if '```' not in response:
+            # unexpected response format, message back to user
+            action_str = f'send_msg_to_user("""{response}""")'
+            return BrowseInteractiveAction(
+                browser_actions=action_str,
+                thought=response,
+                browsergym_send_msg_to_user=response,
+            )
+        thought = response.split('```')[0].strip()
+        action_str = response.split('```')[1].strip()
+        # handle send message to user function call in BrowserGym
+        msg_content = ''
+        for sub_action in action_str.split('\n'):
+            if 'send_msg_to_user(' in sub_action:
+                tree = ast.parse(sub_action)
+                args = tree.body[0].value.args  # type: ignore
+                msg_content = args[0].value
+
+        return BrowseInteractiveAction(
+            browser_actions=action_str,
+            thought=thought,
+            browsergym_send_msg_to_user=msg_content,
+        )
 
     def step(self, state: State) -> Action:
         """
@@ -91,26 +110,57 @@ def step(self, state: State) -> Action:
         """
         goal = state.get_current_user_intent()
         messages = []
-        prev_actions = ''
+        prev_actions = []
         cur_axtree_txt = ''
         error_prefix = ''
         last_obs = None
+        last_action = None
+        if len(state.history) == 1:
+            # initialize and retrieve the first observation by issuing a noop action
+            # TODO: need more elegant way of doing this
+            return BrowseInteractiveAction(browser_actions='noop()')
         for prev_action, obs in state.history:
             if isinstance(prev_action, BrowseInteractiveAction):
-                prev_actions += f'{prev_action.browser_actions}\n'
+                prev_actions.append(prev_action.browser_actions)
                 last_obs = obs
+                last_action = prev_action
             elif (
-                isinstance(prev_action, MessageAction) and prev_action.source != 'user'
+                isinstance(prev_action, MessageAction)
+                and prev_action.source == EventSource.AGENT
             ):
                 # agent has responded, task finish.
                 return AgentFinishAction()
 
+        prev_action_str = '\n'.join(prev_actions[1:])
+        # if the final BrowseInteractiveAction executed BrowserGym's send_msg_to_user,
+        # we should also send a message back to the user in OpenDevin and call it a day
+        if (
+            isinstance(last_action, BrowseInteractiveAction)
+            and last_action.browsergym_send_msg_to_user
+        ):
+            return MessageAction(last_action.browsergym_send_msg_to_user)
+
         if isinstance(last_obs, BrowserOutputObservation):
             if last_obs.error:
                 # add error recovery prompt prefix
                 error_prefix = f'IMPORTANT! Last action is incorrect:\n{last_obs.last_browser_action}\nThink again with the current observation of the page.\n'
-            cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
-
+            try:
+                cur_axtree_txt = flatten_axtree_to_str(
+                    last_obs.axtree_object,
+                    extra_properties=last_obs.extra_element_properties,
+                    with_clickable=True,
+                    filter_visible_only=True,
+                )
+            except Exception as e:
+                logger.error(
+                    'Error when trying to process the accessibility tree: %s', e
+                )
+                return MessageAction('Error encountered when browsing.')
+
+        if error_prefix:
+            self.error_accumulator += 1
+            if self.error_accumulator > 5:
+                return MessageAction('Too many errors encountered. Task failed.')
         system_msg = f"""\
 # Instructions
 Review the current state of the page and all other information to find the best
@@ -133,24 +183,39 @@ def step(self, state: State) -> Action:
 {cur_axtree_txt}
 
 # Previous Actions
-{prev_actions}
+{prev_action_str}
 
 Here is an example with chain of thought of a valid action when clicking on a button:
 "
 In order to accomplish my goal I need to click on the button with bid 12
 ```click("12")```
 "
 """.strip()
+
+        if USE_CONCISE_ANSWER:
+            concise_instruction = """\
+
+Here is another example with chain of thought of a valid action when providing a concise answer to user:
+"
+In order to accomplish my goal I need to send the information asked back to the user. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I will send a message back to user with the answer.
+```send_msg_to_user("$279.49")```
+"
+"""
+            prompt += concise_instruction
         messages.append({'role': 'user', 'content': prompt})
         response = self.llm.completion(
             messages=messages,
             temperature=0.0,
+            stop=[')```', ')\n```'],
         )
         self.log_cost(response)
-        action_resp = response['choices'][0]['message']['content']
+        action_resp = response['choices'][0]['message']['content'].strip()
+        if not action_resp.endswith('```'):
+            action_resp = action_resp + ')```'
+
         logger.info(prompt)
         logger.info(action_resp)
-        return parse_response(action_resp)
+        return self.parse_response(action_resp)
 
     def search_memory(self, query: str) -> list[str]:
         raise NotImplementedError('Implement this abstract method')

diff --git a/evaluation/miniwob/README.md b/evaluation/miniwob/README.md
@@ -0,0 +1,81 @@
+# MiniWoB++ Evaluation with OpenDevin Browsing Agents
+
+This folder contains the evaluation harness for the [MiniWoB++](https://miniwob.farama.org/) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym), for easy evaluation of how well a browsing-capable agent performs on synthetic web browsing tasks.
+
+## Setup OpenDevin Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace.
+
+Add the following configurations:
+
+```toml
+[core]
+max_iterations = 100
+cache_dir = "/tmp/cache"
+sandbox_container_image = "ghcr.io/opendevin/sandbox:latest"
+sandbox_type = "ssh"
+ssh_hostname = "localhost"
+sandbox_timeout = 120
+
+# TODO: Change these to the model you want to evaluate
+[eval_gpt4_1106_preview]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[eval_some_openai_compatible_model]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+## Setup MiniWoB++ Environment and Environment Variables of MiniWoB++
+MiniWoB++ requires you to set up a static website that is accessible via URL from the machine running the OpenDevin agents.
+
+- Clone miniwob (use a specific frozen commit for reproducibility)
+```sh
+git clone git@github.com:Farama-Foundation/miniwob-plusplus.git
+git -C "./miniwob-plusplus" reset --hard 7fd85d71a4b60325c6585396ec4f48377d049838
+```
+
+- Set up the MiniWoB URL in `evaluation/miniwob/scripts/run_infer.sh` (replace `<PATH_TO_MINIWOB_CLONED_REPO>` below with the absolute path to your `miniwob-plusplus` folder)
+```sh
+export MINIWOB_URL="file://<PATH_TO_MINIWOB_CLONED_REPO>/miniwob/html/miniwob/"
+```
+
+## Test if your environment works
+
+Open the MiniWoB URL above in a browser and check that it loads correctly.
+
+## Run Evaluation
+
+```sh
+bash evaluation/miniwob/scripts/run_infer.sh
+```
+
+Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
+
+To calculate the average reward, run:
+
+```sh
+poetry run python evaluation/miniwob/get_avg_reward.py evaluation/evaluation_outputs/outputs/miniwob/SOME_AGENT/EXP_NAME/output.jsonl
+```
+
+## Submit your evaluation results
+
+You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
+
+
+## BrowsingAgent V1.0 result
+
+Tested on BrowsingAgent V1.0
+
+MiniWoB++, 125 tasks (3 runs due to random init task), max step 10
+
+- GPT4o: 0.384, 0.416, 0.424, avg: 0.408
+- GPT3.5: 0.288, 0.256, 0.272, avg: 0.272
diff --git a/evaluation/miniwob/__init__.py b/evaluation/miniwob/__init__.py
diff --git a/evaluation/miniwob/get_avg_reward.py b/evaluation/miniwob/get_avg_reward.py
@@ -0,0 +1,25 @@
+import argparse
+import json
+
+import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
+import gymnasium as gym
+
+parser = argparse.ArgumentParser(description='Calculate average reward.')
+parser.add_argument('output_path', type=str, help='path to output.jsonl')
+
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    env_ids = [
+        id for id in gym.envs.registry.keys() if id.startswith('browsergym/miniwob')
+    ]
+    total_num = len(env_ids)
+    print('Total number of tasks: ', total_num)
+    total_reward = 0
+    with open(args.output_path, 'r') as f:
+        for line in f:
+            data = json.loads(line)
+            total_reward += data['test_result']
+
+    avg_reward = total_reward / total_num
+    print('Avg Reward: ', avg_reward)