oraichain
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎dev_config/python/.pre-commit-config.yaml
+9-9 b/‎dev_config/python/.pre-commit-config.yaml
+9-9
diff --git a/‎openhands/agenthub/codeact_agent/codeact_agent.py
+4-18 b/‎openhands/agenthub/codeact_agent/codeact_agent.py
+4-18
diff --git a/‎openhands/controller/agent.py
+2 b/‎openhands/controller/agent.py
+2
diff --git a/‎openhands/core/schema/action.py
+1-1 b/‎openhands/core/schema/action.py
+1-1
diff --git a/‎openhands/core/schema/observation.py
+5-1 b/‎openhands/core/schema/observation.py
+5-1
diff --git a/‎openhands/core/setup.py
+11-6 b/‎openhands/core/setup.py
+11-6
diff --git a/‎openhands/events/observation/__init__.py
+1 b/‎openhands/events/observation/__init__.py
+1
diff --git a/‎openhands/events/observation/playwright_mcp.py
+22 b/‎openhands/events/observation/playwright_mcp.py
+22
diff --git a/‎openhands/events/serialization/observation.py
+4 b/‎openhands/events/serialization/observation.py
+4
diff --git a/‎openhands/events/stream.py
+1-1 b/‎openhands/events/stream.py
+1-1
diff --git a/‎openhands/memory/conversation_memory.py
+14 b/‎openhands/memory/conversation_memory.py
+14
@@ -233,3 +233,4 @@ containers/runtime/Dockerfile
 containers/runtime/project.tar.gz
 containers/runtime/code
 **/node_modules/
+yes/
@@ -1,13 +1,13 @@
 repos:
-  # - repo: https://github.com/pre-commit/pre-commit-hooks
-  #   rev: v4.5.0
-  #   hooks:
-  #     - id: trailing-whitespace
-  #       exclude: docs/modules/python
-  #     - id: end-of-file-fixer
-  #       exclude: docs/modules/python
-  #     - id: check-yaml
-  #     - id: debug-statements
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: trailing-whitespace
+        exclude: docs/modules/python
+      - id: end-of-file-fixer
+        exclude: docs/modules/python
+      - id: check-yaml
+      - id: debug-statements
 
   - repo: https://github.com/tox-dev/pyproject-fmt
     rev: 1.7.0
 
@@ -13,7 +13,6 @@
 )
 from openhands.events.event import Event
 from openhands.llm.llm import LLM
-from openhands.mcp.mcp_agent import MCPAgent, convert_mcp_agents_to_tools
 from openhands.memory.condenser import Condenser
 from openhands.memory.condenser.condenser import Condensation, View
 from openhands.memory.conversation_memory import ConversationMemory
@@ -54,12 +53,7 @@ class CodeActAgent(Agent):
         JupyterRequirement(),
     ]
 
-    def __init__(
-        self,
-        llm: LLM,
-        config: AgentConfig,
-        mcp_agents: list[MCPAgent] | None = None,
-    ) -> None:
+    def __init__(self, llm: LLM, config: AgentConfig, mcp_tools: list[dict]) -> None:
         """Initializes a new instance of the CodeActAgent class.
 
         Parameters:
@@ -68,7 +62,7 @@ def __init__(
         super().__init__(llm, config)
         self.pending_actions: deque[Action] = deque()
         self.reset()
-        logger.info(f'MCP agents: {mcp_agents}')
+        logger.debug(f'MCP tools: {mcp_tools}')
 
         built_in_tools = codeact_function_calling.get_tools(
             codeact_enable_browsing=self.config.codeact_enable_browsing,
@@ -78,14 +72,6 @@ def __init__(
         )
 
         # initialize MCP agents
-        self.mcp_agents = mcp_agents if mcp_agents is not None else []
-        logger.info(f'MCP agents: {self.mcp_agents}')
-        try:
-            mcp_tools = convert_mcp_agents_to_tools(self.mcp_agents)
-            logger.info(f'MCP tools: {mcp_tools}')
-        except Exception as e:
-            logger.error(f"Error converting MCP agents to tools: {e}")
-            mcp_tools = []
         self.tools = built_in_tools + mcp_tools
 
         # Retrieve the enabled tools
@@ -155,9 +141,9 @@ def step(self, state: State) -> Action:
         # log to litellm proxy if possible
         params['extra_body'] = {'metadata': state.to_llm_metadata(agent_name=self.name)}
         response = self.llm.completion(**params)
-        logger.error(f'Response from LLM: {response}')
+        logger.debug(f'Response from LLM: {response}')
         actions = codeact_function_calling.response_to_actions(response)
-        logger.error(f'Actions after response_to_actions: {actions}')
+        logger.debug(f'Actions after response_to_actions: {actions}')
         for action in actions:
             self.pending_actions.append(action)
         return self.pending_actions.popleft()
 
@@ -32,11 +32,13 @@ def __init__(
         self,
         llm: LLM,
         config: 'AgentConfig',
+        mcp_tools: list[dict] | None = None,
     ):
         self.llm = llm
         self.config = config
         self._complete = False
         self.prompt_manager: 'PromptManager' | None = None
+        self.mcp_tools = mcp_tools
 
     @property
     def complete(self) -> bool:
 
@@ -38,7 +38,7 @@ class ActionType(str, Enum):
     """Interact with the browser instance.
     """
 
-    MCP = 'mcp'
+    MCP = 'call_tool_mcp'
     """Interact with the MCP server.
     """
 
 
@@ -51,4 +51,8 @@ class ObservationType(str, Enum):
     """Result of a recall operation. This can be the workspace context, a microagent, or other types of information."""
 
     MCP = 'mcp'
-    """Result of a MCP Server operation. This can be the result of a MCP action."""
+    """Result of a MCP Server operation"""
+
+    PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = 'playwright_mcp_browser_screenshot'
+    """Result of a Playwright MCP Browser Screenshot operation. The response is a base64 encoded string of the screenshot, which should be streamed to the client using the correct format matching
+    browsergym's screenshot format."""
@@ -17,7 +17,7 @@
 from openhands.events.event import Event
 from openhands.integrations.provider import ProviderToken, ProviderType, SecretStore
 from openhands.llm.llm import LLM
-from openhands.mcp.mcp_agent import MCPAgent
+from openhands.mcp.mcp_agent import MCPAgent, convert_mcp_agents_to_tools
 from openhands.memory.memory import Memory
 from openhands.microagent.microagent import BaseMicroAgent
 from openhands.runtime import get_runtime_cls
@@ -176,21 +176,26 @@ async def create_agent(config: AppConfig) -> Agent:
     agent_cls: Type[Agent] = Agent.get_cls(config.default_agent)
     agent_config = config.get_agent_config(config.default_agent)
     llm_config = config.get_llm_config_from_agent(config.default_agent)
-    # FIXME: Need to close the agents after calling the tool
     mcp_agents = await create_mcp_agents(
         config.mcp.sse.mcp_servers, config.mcp.stdio.commands, config.mcp.stdio.args
     )
+    mcp_tools = convert_mcp_agents_to_tools(mcp_agents)
     agent = agent_cls(
         llm=LLM(config=llm_config),
         config=agent_config,
-        mcp_agents=mcp_agents,
+        mcp_tools=mcp_tools,
     )
 
+    # We only need the tool definitions from the MCP agents, so it is safe to close them here;
+    # the actual tool calls will be made in a sandbox environment, not here.
+    for mcp_agent in mcp_agents:
+        await mcp_agent.cleanup()
+
     return agent
 
 
 async def create_mcp_agents(
-    sse_mcp_server: List[str], commands: List[str], args: List[str]
+    sse_mcp_server: List[str], commands: List[str], args: List[List[str]]
 ) -> List[MCPAgent]:
     mcp_agents: List[MCPAgent] = []
     # Initialize SSE connections
@@ -211,15 +216,15 @@ async def create_mcp_agents(
 
     # Initialize stdio connections
     if commands:
-        for command, args in zip(commands, args):
+        for command, command_args in zip(commands, args):
             logger.info(
                 f'Initializing MCP agent for {command} with stdio connection...'
             )
 
             agent = MCPAgent()
             try:
                 await agent.initialize(
-                    connection_type='stdio', command=command, args=args
+                    connection_type='stdio', command=command, args=command_args
                 )
                 mcp_agents.append(agent)
                 logger.info(f'Connected to MCP server via stdio with command {command}')
 
@@ -45,4 +45,5 @@
     'RecallObservation',
     'RecallType',
     'MCPObservation',
+    'PlaywrightMcpBrowserScreenshotObservation',
 ]
@@ -0,0 +1,22 @@
+from dataclasses import dataclass, field
+
+from openhands.core.schema import ObservationType
+from openhands.events.observation.observation import Observation
+
+
+@dataclass
+class PlaywrightMcpBrowserScreenshotObservation(Observation):
+    """Result of a Playwright MCP browser screenshot operation.
+
+    The MCP response is a dict {"data": <base64-encoded screenshot, to be streamed to the client in
+    browsergym's screenshot format>, "url": <URL of the current webpage>}.
+    """
+
+    url: str  # required fields go first: a non-default field after a defaulted one is a dataclass TypeError
+    trigger_by_action: str
+    screenshot: str = field(repr=False, default='')  # don't show in repr
+    observation: str = ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT
+
+    @property
+    def message(self) -> str:
+        return self.content
@@ -27,6 +27,9 @@
 )
 from openhands.events.observation.mcp import MCPObservation
 from openhands.events.observation.observation import Observation
+from openhands.events.observation.playwright_mcp import (
+    PlaywrightMcpBrowserScreenshotObservation,
+)
 from openhands.events.observation.reject import UserRejectObservation
 from openhands.events.observation.success import SuccessObservation
 
@@ -47,6 +50,7 @@
     AgentThinkObservation,
     RecallObservation,
     MCPObservation,
+    PlaywrightMcpBrowserScreenshotObservation,
 )
 
 OBSERVATION_TYPE_TO_CLASS = {
 
@@ -166,7 +166,7 @@ def add_event(self, event: Event, source: EventSource) -> None:
         logger.debug(f'Adding {type(event).__name__} id={event.id} from {source.name}')
         event._timestamp = datetime.now().isoformat()
         event._source = source  # type: ignore [attr-defined]
-        logger.warning(f'Event to add: {event}')
+        logger.debug(f'Event to add: {event}')
         data = event_to_dict(event)
         data = self._replace_secrets(data)
         event = event_from_dict(data)
 
@@ -39,6 +39,9 @@
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.mcp import MCPObservation
 from openhands.events.observation.observation import Observation
+from openhands.events.observation.playwright_mcp import (
+    PlaywrightMcpBrowserScreenshotObservation,
+)
 from openhands.events.serialization.event import truncate_content
 from openhands.utils.prompt import PromptManager, RepositoryInfo, RuntimeInfo
 
@@ -333,6 +336,17 @@ def _process_observation(
         elif isinstance(obs, MCPObservation):
             logger.warning(f'MCPObservation: {obs}')
             message = Message(role='user', content=[TextContent(text=obs.content)])
+        elif isinstance(obs, PlaywrightMcpBrowserScreenshotObservation):
+            text = obs.content
+            # Append the screenshot caption and send the captioned text (not the bare obs.content).
+            text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
+            message = Message(
+                role='user',
+                content=[
+                    TextContent(text=text),
+                    ImageContent(image_urls=[obs.url]),  # NOTE(review): url is documented as the page URL; confirm it is a usable image URL (vs. obs.screenshot)
+                ],
+            )
         elif isinstance(obs, IPythonRunCellObservation):
             text = obs.content
             # replace base64 images with a placeholder
Original file line number	Diff line number	Diff line change
`@@ -45,4 +45,5 @@`
`45`	`45`	`'RecallObservation',`
`46`	`46`	`'RecallType',`
`47`	`47`	`'MCPObservation',`
	`48`	`+ 'PlaywrightMcpBrowserScreenshotObservation',`
`48`	`49`	`]`