All-Hands-AI · neubig · May 15, 2024 · May 15, 2024 · May 15, 2024 · May 15, 2024
diff --git a/agenthub/codeact_agent/codeact_agent.py b/agenthub/codeact_agent/codeact_agent.py
@@ -13,11 +13,13 @@
 from opendevin.events.action import (
     Action,
     AgentFinishAction,
+    BrowseInteractiveAction,
     CmdRunAction,
     IPythonRunCellAction,
     MessageAction,
 )
 from opendevin.events.observation import (
+    BrowserOutputObservation,
     CmdOutputObservation,
     IPythonRunCellObservation,
 )
@@ -33,7 +35,7 @@
 
 def parse_response(response) -> str:
     action = response.choices[0].message.content
-    for lang in ['bash', 'ipython']:
+    for lang in ['bash', 'ipython', 'browse']:
         if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
             action += f'</execute_{lang}>'
     return action
@@ -85,7 +87,7 @@ def swe_agent_edit_hack(bash_command: str) -> str:
 
 
 class CodeActAgent(Agent):
-    VERSION = '1.2'
+    VERSION = '1.3'
     """
     The Code Act Agent is a minimalist agent.
     The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -171,6 +173,7 @@ def step(self, state: State) -> Action:
         Returns:
         - CmdRunAction(command) - bash command to run
         - IPythonRunCellAction(code) - IPython code to run
+        - BrowseInteractiveAction(browser_actions) - BrowserGym actions to run
         - MessageAction(content) - Message action to run (e.g. ask for clarification)
         - AgentFinishAction() - end the interaction
         """
@@ -205,6 +208,9 @@ def step(self, state: State) -> Action:
                     content = '\n'.join(splitted)
                     content = truncate_observation(content)
                     self.messages.append({'role': 'user', 'content': content})
+                elif isinstance(obs, BrowserOutputObservation):
+                    content = 'OBSERVATION:\n' + truncate_observation(obs.content)
+                    self.messages.append({'role': 'user', 'content': content})
 
         latest_user_message = [m for m in self.messages if m['role'] == 'user'][-1]
         if latest_user_message:
@@ -217,6 +223,7 @@ def step(self, state: State) -> Action:
             stop=[
                 '</execute_ipython>',
                 '</execute_bash>',
+                '</execute_browse>',
             ],
             temperature=0.0,
         )
@@ -251,6 +258,15 @@ def step(self, state: State) -> Action:
             code_group = python_code.group(1).strip()
             thought = action_str.replace(python_code.group(0), '').strip()
             return IPythonRunCellAction(code=code_group, thought=thought)
+        elif browse_command := re.search(
+            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
+        ):
+            # BrowserGym actions were found
+            browse_actions = browse_command.group(1).strip()
+            thought = action_str.replace(browse_command.group(0), '').strip()
+            return BrowseInteractiveAction(
+                browser_actions=browse_actions, thought=thought
+            )
         else:
             # We assume the LLM is GOOD enough that when it returns pure natural language
             # it want to talk to the user

diff --git a/agenthub/codeact_agent/prompt.py b/agenthub/codeact_agent/prompt.py
@@ -34,6 +34,8 @@
 </execute_ipython>
 The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
 For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
+For example, you can browse a given URL by <execute_browse> goto("<URL>") </execute_browse>.
 The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
 The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
 The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
@@ -49,8 +51,8 @@
 If you require access to GitHub but $GITHUB_TOKEN is not set, ask the user to set it for you."""
 
 SYSTEM_SUFFIX = """The assistant's response should be concise.
-You should include <execute_ipython> or <execute_bash> in every one of your responses, unless you are finished with the task or need more input or action from the user in order to proceed.
-IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+You should include <execute_ipython> or <execute_bash> or <execute_browse> in every one of your responses, unless you are finished with the task or need more input or action from the user in order to proceed.
+IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
 """
 
 EXAMPLES = """
@@ -154,6 +156,21 @@ def index():
 ASSISTANT:
 The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
 
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+goto("http://127.0.0.1:5000")
+</execute_browse>
+
+USER:
+Observation:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
 USER: Now kill the server, make it display the numbers in a table format.
 
 ASSISTANT:
@@ -230,4 +247,5 @@ def index():
     "I don't understand your input. \n"
     'If you want to execute a bash command, please use <execute_bash> YOUR_COMMAND_HERE </execute_bash>.\n'
     'If you want to execute a block of Python code, please use <execute_ipython> YOUR_COMMAND_HERE </execute_ipython>.\n'
+    'If you want to browse the Internet, please use <execute_browse> YOUR_COMMAND_HERE </execute_browse>.\n'
 )
@@ -17,6 +17,11 @@ const messageActions = {
     store.dispatch(setUrl(url));
     store.dispatch(setScreenshotSrc(screenshotSrc));
   },
+  [ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => {
+    const { url, screenshotSrc } = message.args;
+    store.dispatch(setUrl(url));
+    store.dispatch(setScreenshotSrc(screenshotSrc));
+  },
   [ActionType.WRITE]: (message: ActionMessage) => {
     const { path, content } = message.args;
     store.dispatch(updatePath(path));

@@ -23,6 +23,9 @@ enum ActionType {
   // Opens a web page.
   BROWSE = "browse",
 
+  // Interact with the browser instance.
+  BROWSE_INTERACTIVE = "browse_interactive",
+
   // Searches long-term memory.
   RECALL = "recall",
 

diff --git a/opendevin/core/schema/action.py b/opendevin/core/schema/action.py
@@ -40,6 +40,10 @@ class ActionTypeSchema(BaseModel):
     """Opens a web page.
     """
 
+    BROWSE_INTERACTIVE: str = Field(default='browse_interactive')
+    """Interact with the browser instance.
+    """
+
     RECALL: str = Field(default='recall')
     """Searches long-term memory
     """

diff --git a/opendevin/events/action/__init__.py b/opendevin/events/action/__init__.py
@@ -7,7 +7,7 @@
     AgentSummarizeAction,
     ChangeAgentStateAction,
 )
-from .browse import BrowseURLAction
+from .browse import BrowseInteractiveAction, BrowseURLAction
 from .commands import CmdKillAction, CmdRunAction, IPythonRunCellAction
 from .empty import NullAction
 from .files import FileReadAction, FileWriteAction
@@ -20,6 +20,7 @@
     'CmdRunAction',
     'CmdKillAction',
     'BrowseURLAction',
+    'BrowseInteractiveAction',
     'FileReadAction',
     'FileWriteAction',
     'AgentRecallAction',

diff --git a/opendevin/events/action/browse.py b/opendevin/events/action/browse.py
@@ -16,3 +16,15 @@ class BrowseURLAction(Action):
     @property
     def message(self) -> str:
         return f'Browsing URL: {self.url}'
+
+
+@dataclass
+class BrowseInteractiveAction(Action):  # run BrowserGym actions in the browser
+    browser_actions: str  # BrowserGym action code, sent as-is to browser.step()
+    thought: str = ''  # text CodeActAgent found outside <execute_browse>
+    action: str = ActionType.BROWSE_INTERACTIVE
+    runnable: ClassVar[bool] = True
+
+    @property
+    def message(self) -> str:
+        return f'Executing browser actions: {self.browser_actions}'
diff --git a/opendevin/events/serialization/action.py b/opendevin/events/serialization/action.py
@@ -7,7 +7,7 @@
     AgentRejectAction,
     ChangeAgentStateAction,
 )
-from opendevin.events.action.browse import BrowseURLAction
+from opendevin.events.action.browse import BrowseInteractiveAction, BrowseURLAction
 from opendevin.events.action.commands import (
     CmdKillAction,
     CmdRunAction,
@@ -22,6 +22,7 @@
     CmdRunAction,
     IPythonRunCellAction,
     BrowseURLAction,
+    BrowseInteractiveAction,
     FileReadAction,
     FileWriteAction,
     AgentRecallAction,

diff --git a/opendevin/runtime/runtime.py b/opendevin/runtime/runtime.py
@@ -5,6 +5,7 @@
 from opendevin.events.action import (
     Action,
     AgentRecallAction,
+    BrowseInteractiveAction,
     BrowseURLAction,
     CmdKillAction,
     CmdRunAction,
@@ -154,6 +155,10 @@ async def write(self, action: FileWriteAction) -> Observation:
     async def browse(self, action: BrowseURLAction) -> Observation:
         pass
 
+    @abstractmethod
+    async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
+        pass
+
     @abstractmethod
     async def recall(self, action: AgentRecallAction) -> Observation:
         pass
diff --git a/opendevin/runtime/server/browse.py b/opendevin/runtime/server/browse.py
@@ -1,15 +1,23 @@
 import os
 
+from opendevin.core.schema import ActionType
 from opendevin.events.observation import BrowserOutputObservation
 
 
 async def browse(action, browser) -> BrowserOutputObservation:  # type: ignore
-    asked_url = action.url
-    if not asked_url.startswith('http'):
-        asked_url = os.path.abspath(os.curdir) + action.url
-    try:
-        # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py
+    if action.action == ActionType.BROWSE:
+        # legacy BrowseURLAction
+        asked_url = action.url
+        if not asked_url.startswith('http'):
+            asked_url = os.path.abspath(os.curdir) + action.url
         action_str = f'goto("{asked_url}")'
+    elif action.action == ActionType.BROWSE_INTERACTIVE:
+        # new BrowseInteractiveAction: supports full-featured BrowserGym actions
+        # available BrowserGym actions: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py
+        action_str = action.browser_actions
+    else:
+        raise ValueError(f'Invalid action type: {action.action}')
+    try:
         # obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
         obs = browser.step(action_str)
         return BrowserOutputObservation(
@@ -21,9 +29,12 @@ async def browse(action, browser) -> BrowserOutputObservation:  # type: ignore
             last_browser_action=obs['last_action'],  # last browser env action performed
             focused_element_bid=obs['focused_element_bid'],  # focused element bid
             screenshot=obs['screenshot'],  # base64-encoded screenshot, png
-            url=asked_url,
+            url=obs['url'],  # URL of the page
         )
     except Exception as e:
         return BrowserOutputObservation(
-            content=str(e), screenshot='', error=True, url=asked_url
+            content=str(e),
+            screenshot='',
+            error=True,
+            url=asked_url if action.action == ActionType.BROWSE else '',
         )
diff --git a/opendevin/runtime/server/runtime.py b/opendevin/runtime/server/runtime.py
@@ -1,5 +1,6 @@
 from opendevin.events.action import (
     AgentRecallAction,
+    BrowseInteractiveAction,
     BrowseURLAction,
     CmdKillAction,
     CmdRunAction,
@@ -58,6 +59,9 @@ async def write(self, action: FileWriteAction) -> Observation:
     async def browse(self, action: BrowseURLAction) -> Observation:
         return await browse(action, self.browser)
 
+    async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
+        return await browse(action, self.browser)
+
     async def recall(self, action: AgentRecallAction) -> Observation:
         return NullObservation('')
 

diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_001.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_001.log
@@ -9,6 +9,8 @@ print("Hello World!")
 </execute_ipython>
 The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
 For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
+For example, you can browse a given URL by <execute_browse> goto("<URL>") </execute_browse>.
 The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
 The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
 The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
@@ -38,8 +40,8 @@ end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with th
 Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 
 The assistant's response should be concise.
-You should include <execute_ipython> or <execute_bash> in every one of your responses, unless you are finished with the task or need more input or action from the user in order to proceed.
-IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+You should include <execute_ipython> or <execute_bash> or <execute_browse> in every one of your responses, unless you are finished with the task or need more input or action from the user in order to proceed.
+IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
 
 
 ----------
@@ -146,6 +148,21 @@ Press CTRL+C to quit
 ASSISTANT:
 The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
 
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+goto("http://127.0.0.1:5000")
+</execute_browse>
+
+USER:
+Observation:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
 USER: Now kill the server, make it display the numbers in a table format.
 
 ASSISTANT:

diff --git a/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log b/tests/integration/mock/CodeActAgent/test_edits/prompt_002.log
@@ -9,6 +9,8 @@ print("Hello World!")
 </execute_ipython>
 The assistant can execute bash commands on behalf of the user by wrapping them with <execute_bash> and </execute_bash>.
 For example, you can list the files in the current directory by <execute_bash> ls </execute_bash>.
+The assistant can browse the Internet with commands on behalf of the user by wrapping them with <execute_browse> and </execute_browse>.
+For example, you can browse a given URL by <execute_browse> goto("<URL>") </execute_browse>.
 The assistant should attempt fewer things at a time instead of putting too much commands OR code in one "execute" block.
 The assistant can install Python packages through bash by <execute_bash> pip install [package needed] </execute_bash> and should always import packages and define variables before starting to use them.
 The assistant should stop <execute> and provide an answer when they have already obtained the answer from the execution result.
@@ -38,8 +40,8 @@ end_of_edit - replaces lines <start_line> through <end_line> (inclusive) with th
 Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 
 The assistant's response should be concise.
-You should include <execute_ipython> or <execute_bash> in every one of your responses, unless you are finished with the task or need more input or action from the user in order to proceed.
-IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> instead of providing it.
+You should include <execute_ipython> or <execute_bash> or <execute_browse> in every one of your responses, unless you are finished with the task or need more input or action from the user in order to proceed.
+IMPORTANT: Whenever possible, execute the code for the user using <execute_ipython> or <execute_bash> or <execute_browse> instead of providing it.
 
 
 ----------
@@ -146,6 +148,21 @@ Press CTRL+C to quit
 ASSISTANT:
 The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
 
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+goto("http://127.0.0.1:5000")
+</execute_browse>
+
+USER:
+Observation:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
 USER: Now kill the server, make it display the numbers in a table format.
 
 ASSISTANT:
@@ -228,20 +245,19 @@ ENVIRONMENT REMINDER: You have 9 turns left to complete the task.
 
 ----------
 
-Certainly! I will start by opening the file `bad.txt` to see the content and then proceed to fix the typos.
+Let's start by examining the contents of `bad.txt` to identify the typos. I'll read the file first.
 
 <execute_bash>
-open bad.txt
+cat bad.txt
 </execute_bash>
 
 ----------
 
 OBSERVATION:
-[File: /workspace/bad.txt (4 lines total)]
-1:This is a stupid typoo.
-2:Really?
-3:No mor typos!
-4:Enjoy!
+This is a stupid typoo.
+Really?
+No mor typos!
+Enjoy!
 [Command -1 finished with exit code 0]]
 
 ENVIRONMENT REMINDER: You have 8 turns left to complete the task.