Visual browsing in CodeAct using set-of-marks annotated webpage screenshots (#6464)

adityasoni9998 · web-flow · commit a593d9bc6dda · 2025-02-02T04:56:11.000+08:00
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -11,6 +11,7 @@
 from openhands.core.config import AgentConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
+from openhands.core.schema import ActionType
 from openhands.events.action import (
     Action,
     AgentDelegateAction,
@@ -304,10 +305,30 @@ def get_observation_message(
             )  # Content is already truncated by openhands-aci
         elif isinstance(obs, BrowserOutputObservation):
             text = obs.get_agent_obs_text()
-            message = Message(
-                role='user',
-                content=[TextContent(text=text)],
-            )
+            if (
+                obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
+                and obs.set_of_marks is not None
+                and len(obs.set_of_marks) > 0
+                and self.config.enable_som_visual_browsing
+                and self.llm.vision_is_active()
+                and (
+                    self.mock_function_calling
+                    or self.llm.is_visual_browser_tool_active()
+                )
+            ):
+                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
+                message = Message(
+                    role='user',
+                    content=[
+                        TextContent(text=text),
+                        ImageContent(image_urls=[obs.set_of_marks]),
+                    ],
+                )
+            else:
+                message = Message(
+                    role='user',
+                    content=[TextContent(text=text)],
+                )
         elif isinstance(obs, AgentDelegateObservation):
             text = truncate_content(
                 obs.outputs['content'] if 'content' in obs.outputs else '',
diff --git a/openhands/core/config/agent_config.py b/openhands/core/config/agent_config.py
@@ -21,6 +21,7 @@ class AgentConfig(BaseModel):
     """
 
     codeact_enable_browsing: bool = Field(default=True)
+    enable_som_visual_browsing: bool = Field(default=False)
     codeact_enable_llm_editor: bool = Field(default=False)
     codeact_enable_jupyter: bool = Field(default=True)
     micro_agent_name: str | None = Field(default=None)
diff --git a/openhands/core/message.py b/openhands/core/message.py
@@ -101,7 +101,11 @@ def _list_serializer(self) -> dict:
             # See discussion here for details: https://github.com/BerriAI/litellm/issues/6422#issuecomment-2438765472
             if self.role == 'tool' and item.cache_prompt:
                 role_tool_with_prompt_caching = True
-                d.pop('cache_control')
+                if isinstance(d, dict):
+                    d.pop('cache_control')
+                elif isinstance(d, list):
+                    for d_item in d:
+                        d_item.pop('cache_control')
             if isinstance(item, TextContent):
                 content.append(d)
             elif isinstance(item, ImageContent) and self.vision_enabled:
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
@@ -73,6 +73,16 @@
     'o1-2024-12-17',
 ]
 
+# Models for which the visual browsing tool is considered supported.
+# This list is needed since gpt-4o and gpt-4o-mini do not allow passing image_urls with role='tool'
+VISUAL_BROWSING_TOOL_SUPPORTED_MODELS = [
+    'claude-3-5-sonnet',
+    'claude-3-5-sonnet-20240620',
+    'claude-3-5-sonnet-20241022',
+    'o1-2024-12-17',
+]
+
+
 REASONING_EFFORT_SUPPORTED_MODELS = [
     'o1-2024-12-17',
 ]
@@ -466,6 +476,15 @@ def is_function_calling_active(self) -> bool:
         """
         return self._function_calling_active
 
+    def is_visual_browser_tool_active(self) -> bool:
+        """Tell whether the configured model matches the visual browsing list."""
+        name, known = self.config.model, VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+        # Try the exact name first, then the part after the last '/'.
+        if name in known or name.split('/')[-1] in known:
+            return True
+        # Fall back to any listed model appearing inside the configured name.
+        return any(candidate in name for candidate in known)
+
     def _post_completion(self, response: ModelResponse) -> float:
         """Post-process the completion response.