Skip to content

Commit a593d9b

Browse files
Visual browsing in CodeAct using set-of-marks annotated webpage screenshots (#6464)
1 parent eb8d160 commit a593d9b

File tree

4 files changed

+50
-5
lines changed

4 files changed

+50
-5
lines changed

openhands/agenthub/codeact_agent/codeact_agent.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from openhands.core.config import AgentConfig
1212
from openhands.core.logger import openhands_logger as logger
1313
from openhands.core.message import ImageContent, Message, TextContent
14+
from openhands.core.schema import ActionType
1415
from openhands.events.action import (
1516
Action,
1617
AgentDelegateAction,
@@ -304,10 +305,30 @@ def get_observation_message(
304305
) # Content is already truncated by openhands-aci
305306
elif isinstance(obs, BrowserOutputObservation):
306307
text = obs.get_agent_obs_text()
307-
message = Message(
308-
role='user',
309-
content=[TextContent(text=text)],
310-
)
308+
if (
309+
obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
310+
and obs.set_of_marks is not None
311+
and len(obs.set_of_marks) > 0
312+
and self.config.enable_som_visual_browsing
313+
and self.llm.vision_is_active()
314+
and (
315+
self.mock_function_calling
316+
or self.llm.is_visual_browser_tool_active()
317+
)
318+
):
319+
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
320+
message = Message(
321+
role='user',
322+
content=[
323+
TextContent(text=text),
324+
ImageContent(image_urls=[obs.set_of_marks]),
325+
],
326+
)
327+
else:
328+
message = Message(
329+
role='user',
330+
content=[TextContent(text=text)],
331+
)
311332
elif isinstance(obs, AgentDelegateObservation):
312333
text = truncate_content(
313334
obs.outputs['content'] if 'content' in obs.outputs else '',

openhands/core/config/agent_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class AgentConfig(BaseModel):
2121
"""
2222

2323
codeact_enable_browsing: bool = Field(default=True)
24+
enable_som_visual_browsing: bool = Field(default=False)
2425
codeact_enable_llm_editor: bool = Field(default=False)
2526
codeact_enable_jupyter: bool = Field(default=True)
2627
micro_agent_name: str | None = Field(default=None)

openhands/core/message.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,11 @@ def _list_serializer(self) -> dict:
101101
# See discussion here for details: https://github.com/BerriAI/litellm/issues/6422#issuecomment-2438765472
102102
if self.role == 'tool' and item.cache_prompt:
103103
role_tool_with_prompt_caching = True
104-
d.pop('cache_control')
104+
if isinstance(d, dict):
105+
d.pop('cache_control')
106+
elif isinstance(d, list):
107+
for d_item in d:
108+
d_item.pop('cache_control')
105109
if isinstance(item, TextContent):
106110
content.append(d)
107111
elif isinstance(item, ImageContent) and self.vision_enabled:

openhands/llm/llm.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,16 @@
7373
'o1-2024-12-17',
7474
]
7575

76+
# visual browsing tool supported models
77+
# This allow-list is needed because gpt-4o and gpt-4o-mini do not allow passing image_urls with role='tool'
78+
VISUAL_BROWSING_TOOL_SUPPORTED_MODELS = [
79+
'claude-3-5-sonnet',
80+
'claude-3-5-sonnet-20240620',
81+
'claude-3-5-sonnet-20241022',
82+
'o1-2024-12-17',
83+
]
84+
85+
7686
REASONING_EFFORT_SUPPORTED_MODELS = [
7787
'o1-2024-12-17',
7888
]
@@ -466,6 +476,15 @@ def is_function_calling_active(self) -> bool:
466476
"""
467477
return self._function_calling_active
468478

479+
def is_visual_browser_tool_active(self) -> bool:
    """Report whether the configured model is on the visual-browsing allow-list.

    The model name from ``self.config.model`` is accepted when any of the
    following holds, checked in this order:

    1. it is itself an entry of ``VISUAL_BROWSING_TOOL_SUPPORTED_MODELS``;
    2. the part after its last ``/`` is such an entry;
    3. some entry occurs as a substring of the full model name.

    Returns:
        True if one of the checks above matches, otherwise False.
    """
    model_name = self.config.model
    supported = VISUAL_BROWSING_TOOL_SUPPORTED_MODELS

    # Exact match on the full configured name.
    if model_name in supported:
        return True

    # Match on the segment following the last '/' in the name.
    if model_name.split('/')[-1] in supported:
        return True

    # Fall back to a substring match against every supported entry.
    return any(entry in model_name for entry in supported)
487+
469488
def _post_completion(self, response: ModelResponse) -> float:
470489
"""Post-process the completion response.
471490

0 commit comments

Comments
 (0)