Skip to content

Commit 9097f48

Browse files
Move get_agent_obs_text function to browser utils and add return_all option (All-Hands-AI#9019)
Co-authored-by: openhands <[email protected]>
1 parent fd921a4 commit 9097f48

File tree

13 files changed

+145
-85
lines changed

13 files changed

+145
-85
lines changed

openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ def step(self, state: State) -> Action:
208208
# for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
209209
# initialize and retrieve the first observation by issuing a noop action
210210
# For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
211-
return BrowseInteractiveAction(browser_actions='noop(1000)')
211+
return BrowseInteractiveAction(browser_actions='noop(1000)', return_axtree=True)
212212

213213
for event in state.view:
214214
if isinstance(event, BrowseInteractiveAction):

openhands/events/action/browse.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ class BrowseURLAction(Action):
1212
action: str = ActionType.BROWSE
1313
runnable: ClassVar[bool] = True
1414
security_risk: ActionSecurityRisk | None = None
15+
return_axtree: bool = False
1516

1617
@property
1718
def message(self) -> str:
@@ -33,6 +34,7 @@ class BrowseInteractiveAction(Action):
3334
action: str = ActionType.BROWSE_INTERACTIVE
3435
runnable: ClassVar[bool] = True
3536
security_risk: ActionSecurityRisk | None = None
37+
return_axtree: bool = False
3638

3739
@property
3840
def message(self) -> str:
Lines changed: 2 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
from dataclasses import dataclass, field
22
from typing import Any
33

4-
from browsergym.utils.obs import flatten_axtree_to_str
5-
6-
from openhands.core.schema import ActionType, ObservationType
4+
from openhands.core.schema import ObservationType
75
from openhands.events.observation.observation import Observation
86

97

@@ -53,69 +51,5 @@ def __str__(self) -> str:
5351
if self.screenshot_path:
5452
ret += f'Screenshot saved to: {self.screenshot_path}\n'
5553
ret += '--- Agent Observation ---\n'
56-
ret += self.get_agent_obs_text()
54+
ret += self.content
5755
return ret
58-
59-
def get_agent_obs_text(self) -> str:
60-
"""Get a concise text that will be shown to the agent."""
61-
if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
62-
text = f'[Current URL: {self.url}]\n'
63-
text += f'[Focused element bid: {self.focused_element_bid}]\n'
64-
65-
# Add screenshot path information if available
66-
if self.screenshot_path:
67-
text += f'[Screenshot saved to: {self.screenshot_path}]\n'
68-
69-
text += '\n'
70-
71-
if self.error:
72-
text += (
73-
'================ BEGIN error message ===============\n'
74-
'The following error occurred when executing the last action:\n'
75-
f'{self.last_browser_action_error}\n'
76-
'================ END error message ===============\n'
77-
)
78-
else:
79-
text += '[Action executed successfully.]\n'
80-
try:
81-
# We do not filter visible only here because we want to show the full content
82-
# of the web page to the agent for simplicity.
83-
# FIXME: handle the case when the web page is too large
84-
cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
85-
text += (
86-
f'============== BEGIN accessibility tree ==============\n'
87-
f'{cur_axtree_txt}\n'
88-
f'============== END accessibility tree ==============\n'
89-
)
90-
except Exception as e:
91-
text += (
92-
f'\n[Error encountered when processing the accessibility tree: {e}]'
93-
)
94-
return text
95-
96-
elif self.trigger_by_action == ActionType.BROWSE:
97-
text = f'[Current URL: {self.url}]\n'
98-
99-
if self.error:
100-
text += (
101-
'================ BEGIN error message ===============\n'
102-
'The following error occurred when trying to visit the URL:\n'
103-
f'{self.last_browser_action_error}\n'
104-
'================ END error message ===============\n'
105-
)
106-
text += '============== BEGIN webpage content ==============\n'
107-
text += self.content
108-
text += '\n============== END webpage content ==============\n'
109-
return text
110-
else:
111-
raise ValueError(f'Invalid trigger_by_action: {self.trigger_by_action}')
112-
113-
def get_axtree_str(self, filter_visible_only: bool = False) -> str:
114-
cur_axtree_txt = flatten_axtree_to_str(
115-
self.axtree_object,
116-
extra_properties=self.extra_element_properties,
117-
with_clickable=True,
118-
skip_generic=False,
119-
filter_visible_only=filter_visible_only,
120-
)
121-
return str(cur_axtree_txt)

openhands/memory/conversation_memory.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@ def _process_observation(
391391
role='user', content=[TextContent(text=obs.content)]
392392
) # Content is already truncated by openhands-aci
393393
elif isinstance(obs, BrowserOutputObservation):
394-
text = obs.get_agent_obs_text()
394+
text = obs.content
395395
if (
396396
obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
397397
and enable_som_visual_browsing

openhands/runtime/browser/utils.py

Lines changed: 104 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
import datetime
33
import os
44
from pathlib import Path
5+
from typing import Any
56

7+
from browsergym.utils.obs import flatten_axtree_to_str
68
from PIL import Image
79

810
from openhands.core.exceptions import BrowserUnavailableException
@@ -14,6 +16,78 @@
1416
from openhands.utils.async_utils import call_sync_from_async
1517

1618

19+
def get_axtree_str(
20+
axtree_object: dict[str, Any],
21+
extra_element_properties: dict[str, Any],
22+
filter_visible_only: bool = False,
23+
) -> str:
24+
cur_axtree_txt = flatten_axtree_to_str(
25+
axtree_object,
26+
extra_properties=extra_element_properties,
27+
with_clickable=True,
28+
skip_generic=False,
29+
filter_visible_only=filter_visible_only,
30+
)
31+
return str(cur_axtree_txt)
32+
33+
34+
def get_agent_obs_text(obs: BrowserOutputObservation) -> str:
35+
"""Get a concise text that will be shown to the agent."""
36+
if obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
37+
text = f'[Current URL: {obs.url}]\n'
38+
text += f'[Focused element bid: {obs.focused_element_bid}]\n'
39+
40+
# Add screenshot path information if available
41+
if obs.screenshot_path:
42+
text += f'[Screenshot saved to: {obs.screenshot_path}]\n'
43+
44+
text += '\n'
45+
46+
if obs.error:
47+
text += (
48+
'================ BEGIN error message ===============\n'
49+
'The following error occurred when executing the last action:\n'
50+
f'{obs.last_browser_action_error}\n'
51+
'================ END error message ===============\n'
52+
)
53+
else:
54+
text += '[Action executed successfully.]\n'
55+
try:
56+
# We do not filter visible only here because we want to show the full content
57+
# of the web page to the agent for simplicity.
58+
# FIXME: handle the case when the web page is too large
59+
cur_axtree_txt = get_axtree_str(
60+
obs.axtree_object,
61+
obs.extra_element_properties,
62+
filter_visible_only=False,
63+
)
64+
text += (
65+
f'============== BEGIN accessibility tree ==============\n'
66+
f'{cur_axtree_txt}\n'
67+
f'============== END accessibility tree ==============\n'
68+
)
69+
except Exception as e:
70+
text += f'\n[Error encountered when processing the accessibility tree: {e}]'
71+
return text
72+
73+
elif obs.trigger_by_action == ActionType.BROWSE:
74+
text = f'[Current URL: {obs.url}]\n'
75+
76+
if obs.error:
77+
text += (
78+
'================ BEGIN error message ===============\n'
79+
'The following error occurred when trying to visit the URL:\n'
80+
f'{obs.last_browser_action_error}\n'
81+
'================ END error message ===============\n'
82+
)
83+
text += '============== BEGIN webpage content ==============\n'
84+
text += obs.content
85+
text += '\n============== END webpage content ==============\n'
86+
return text
87+
else:
88+
raise ValueError(f'Invalid trigger_by_action: {obs.trigger_by_action}')
89+
90+
1791
async def browse(
1892
action: BrowseURLAction | BrowseInteractiveAction,
1993
browser: BrowserEnv | None,
@@ -78,7 +152,8 @@ async def browse(
78152
image = png_base64_url_to_image(obs.get('screenshot'))
79153
image.save(screenshot_path, format='PNG', optimize=True)
80154

81-
return BrowserOutputObservation(
155+
# Create the observation with all data
156+
observation = BrowserOutputObservation(
82157
content=obs['text_content'], # text content of the page
83158
url=obs.get('url', ''), # URL of the page
84159
screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png
@@ -103,13 +178,37 @@ async def browse(
103178
error=True if obs.get('last_action_error', '') else False, # error flag
104179
trigger_by_action=action.action,
105180
)
181+
182+
# Build the agent-facing content first, while axtree_object is still populated
183+
observation.content = get_agent_obs_text(observation)
184+
185+
# If return_axtree is False, clear dom_object, axtree_object and extra_element_properties to save space
186+
if not action.return_axtree:
187+
observation.dom_object = {}
188+
observation.axtree_object = {}
189+
observation.extra_element_properties = {}
190+
191+
return observation
106192
except Exception as e:
107-
return BrowserOutputObservation(
108-
content=str(e),
193+
error_message = str(e)
194+
error_url = asked_url if action.action == ActionType.BROWSE else ''
195+
196+
# Create error observation
197+
observation = BrowserOutputObservation(
198+
content=error_message,
109199
screenshot='',
110200
screenshot_path=None,
111201
error=True,
112-
last_browser_action_error=str(e),
113-
url=asked_url if action.action == ActionType.BROWSE else '',
202+
last_browser_action_error=error_message,
203+
url=error_url,
114204
trigger_by_action=action.action,
115205
)
206+
207+
# Process the content using get_agent_obs_text regardless of return_axtree value
208+
try:
209+
observation.content = get_agent_obs_text(observation)
210+
except Exception:
211+
# If get_agent_obs_text fails, keep the original error message
212+
pass
213+
214+
return observation

openhands/security/invariant/parser.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def parse_action(trace: list[TraceElement], action: Action) -> list[TraceElement
5050
event_dict = event_to_dict(action)
5151
args = event_dict.get('args', {})
5252
thought = args.pop('thought', None)
53+
5354
function = Function(name=action.action, arguments=args)
5455
if thought is not None:
5556
inv_trace.append(Message(role='assistant', content=thought))

tests/runtime/test_browsergym_envs.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
4343
)
4444

4545
# Test browse
46-
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
46+
action = BrowseInteractiveAction(
47+
browser_actions=BROWSER_EVAL_GET_GOAL_ACTION, return_axtree=False
48+
)
4749
logger.info(action, extra={'msg_type': 'ACTION'})
4850
obs = runtime.run_action(action)
4951
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -54,7 +56,7 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
5456
assert 'from the list and click Submit' in obs.content
5557

5658
# Make sure the browser can produce observation in eval env
57-
action = BrowseInteractiveAction(browser_actions='noop()')
59+
action = BrowseInteractiveAction(browser_actions='noop()', return_axtree=False)
5860
logger.info(action, extra={'msg_type': 'ACTION'})
5961
obs = runtime.run_action(action)
6062
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -64,7 +66,9 @@ def test_browsergym_eval_env(runtime_cls, temp_dir):
6466
)
6567

6668
# Make sure the rewards are working
67-
action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
69+
action = BrowseInteractiveAction(
70+
browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION, return_axtree=False
71+
)
6872
logger.info(action, extra={'msg_type': 'ACTION'})
6973
obs = runtime.run_action(action)
7074
logger.info(obs, extra={'msg_type': 'OBSERVATION'})

tests/runtime/test_browsing.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def test_simple_browse(temp_dir, runtime_cls, run_as_openhands):
4545
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
4646
assert obs.exit_code == 0
4747

48-
action_browse = BrowseURLAction(url='http://localhost:8000')
48+
action_browse = BrowseURLAction(url='http://localhost:8000', return_axtree=False)
4949
logger.info(action_browse, extra={'msg_type': 'ACTION'})
5050
obs = runtime.run_action(action_browse)
5151
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -116,7 +116,9 @@ def test_read_pdf_browse(temp_dir, runtime_cls, run_as_openhands):
116116

117117
# Browse to the PDF file
118118
pdf_url = f'{server_url}/view?path=/workspace/test_document.pdf'
119-
action_browse = BrowseInteractiveAction(browser_actions=f'goto("{pdf_url}")')
119+
action_browse = BrowseInteractiveAction(
120+
browser_actions=f'goto("{pdf_url}")', return_axtree=False
121+
)
120122
logger.info(action_browse, extra={'msg_type': 'ACTION'})
121123
obs = runtime.run_action(action_browse)
122124
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -185,7 +187,9 @@ def test_read_png_browse(temp_dir, runtime_cls, run_as_openhands):
185187

186188
# Browse to the PNG file
187189
png_url = f'{server_url}/view?path=/workspace/test_image.png'
188-
action_browse = BrowseInteractiveAction(browser_actions=f'goto("{png_url}")')
190+
action_browse = BrowseInteractiveAction(
191+
browser_actions=f'goto("{png_url}")', return_axtree=False
192+
)
189193
logger.info(action_browse, extra={'msg_type': 'ACTION'})
190194
obs = runtime.run_action(action_browse)
191195
logger.info(obs, extra={'msg_type': 'OBSERVATION'})

tests/unit/test_action_serialization.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,11 @@ def test_cmd_run_action_serialization_deserialization():
108108
def test_browse_url_action_serialization_deserialization():
109109
original_action_dict = {
110110
'action': 'browse',
111-
'args': {'thought': '', 'url': 'https://www.example.com'},
111+
'args': {
112+
'thought': '',
113+
'url': 'https://www.example.com',
114+
'return_axtree': False,
115+
},
112116
}
113117
serialization_deserialization(original_action_dict, BrowseURLAction)
114118

@@ -120,6 +124,7 @@ def test_browse_interactive_action_serialization_deserialization():
120124
'thought': '',
121125
'browser_actions': 'goto("https://www.example.com")',
122126
'browsergym_send_msg_to_user': '',
127+
'return_axtree': False,
123128
},
124129
}
125130
serialization_deserialization(original_action_dict, BrowseInteractiveAction)

tests/unit/test_browsing_agent_parser.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,4 @@ def test_parse_action(
8080
assert action.browser_actions == expected_browser_actions
8181
assert action.thought == expected_thought
8282
assert action.browsergym_send_msg_to_user == expected_msg_content
83+
assert action.return_axtree is False # Default value should be False

tests/unit/test_conversation_memory.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -457,11 +457,13 @@ def test_process_events_with_file_read_observation(conversation_memory):
457457

458458

459459
def test_process_events_with_browser_output_observation(conversation_memory):
460+
formatted_content = '[Current URL: http://example.com]\n\n============== BEGIN webpage content ==============\nPage loaded\n============== END webpage content =============='
461+
460462
obs = BrowserOutputObservation(
461463
url='http://example.com',
462464
trigger_by_action='browse',
463465
screenshot='',
464-
content='Page loaded',
466+
content=formatted_content,
465467
error=False,
466468
)
467469

tests/unit/test_function_calling.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ def test_browser_valid():
178178
assert len(actions) == 1
179179
assert isinstance(actions[0], BrowseInteractiveAction)
180180
assert actions[0].browser_actions == "click('button-1')"
181+
assert actions[0].return_axtree is False # Default value should be False
181182

182183

183184
def test_browser_missing_code():

0 commit comments

Comments
 (0)