Skip to content

Commit 3980b76

Browse files
trungbachducphamle2
authored and committed
feat: convert mcp agents to tool in agent_session
chore: reenable precommit hooks & fix lint; chore: reduce log verbose; feat: add playwright mcp browser screenshot stream
1 parent 930b1d4 commit 3980b76

File tree

20 files changed

+284
-75
lines changed

20 files changed

+284
-75
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -233,3 +233,4 @@ containers/runtime/Dockerfile
233233
containers/runtime/project.tar.gz
234234
containers/runtime/code
235235
**/node_modules/
236+
yes/

dev_config/python/.pre-commit-config.yaml

+9-9
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
repos:
2-
# - repo: https://github.com/pre-commit/pre-commit-hooks
3-
# rev: v4.5.0
4-
# hooks:
5-
# - id: trailing-whitespace
6-
# exclude: docs/modules/python
7-
# - id: end-of-file-fixer
8-
# exclude: docs/modules/python
9-
# - id: check-yaml
10-
# - id: debug-statements
2+
- repo: https://github.com/pre-commit/pre-commit-hooks
3+
rev: v4.5.0
4+
hooks:
5+
- id: trailing-whitespace
6+
exclude: docs/modules/python
7+
- id: end-of-file-fixer
8+
exclude: docs/modules/python
9+
- id: check-yaml
10+
- id: debug-statements
1111

1212
- repo: https://github.com/tox-dev/pyproject-fmt
1313
rev: 1.7.0

openhands/agenthub/codeact_agent/codeact_agent.py

+4-18
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
)
1414
from openhands.events.event import Event
1515
from openhands.llm.llm import LLM
16-
from openhands.mcp.mcp_agent import MCPAgent, convert_mcp_agents_to_tools
1716
from openhands.memory.condenser import Condenser
1817
from openhands.memory.condenser.condenser import Condensation, View
1918
from openhands.memory.conversation_memory import ConversationMemory
@@ -54,12 +53,7 @@ class CodeActAgent(Agent):
5453
JupyterRequirement(),
5554
]
5655

57-
def __init__(
58-
self,
59-
llm: LLM,
60-
config: AgentConfig,
61-
mcp_agents: list[MCPAgent] | None = None,
62-
) -> None:
56+
def __init__(self, llm: LLM, config: AgentConfig, mcp_tools: list[dict]) -> None:
6357
"""Initializes a new instance of the CodeActAgent class.
6458
6559
Parameters:
@@ -68,7 +62,7 @@ def __init__(
6862
super().__init__(llm, config)
6963
self.pending_actions: deque[Action] = deque()
7064
self.reset()
71-
logger.info(f'MCP agents: {mcp_agents}')
65+
logger.debug(f'MCP tools: {mcp_tools}')
7266

7367
built_in_tools = codeact_function_calling.get_tools(
7468
codeact_enable_browsing=self.config.codeact_enable_browsing,
@@ -78,14 +72,6 @@ def __init__(
7872
)
7973

8074
# initialize MCP agents
81-
self.mcp_agents = mcp_agents if mcp_agents is not None else []
82-
logger.info(f'MCP agents: {self.mcp_agents}')
83-
try:
84-
mcp_tools = convert_mcp_agents_to_tools(self.mcp_agents)
85-
logger.info(f'MCP tools: {mcp_tools}')
86-
except Exception as e:
87-
logger.error(f"Error converting MCP agents to tools: {e}")
88-
mcp_tools = []
8975
self.tools = built_in_tools + mcp_tools
9076

9177
# Retrieve the enabled tools
@@ -155,9 +141,9 @@ def step(self, state: State) -> Action:
155141
# log to litellm proxy if possible
156142
params['extra_body'] = {'metadata': state.to_llm_metadata(agent_name=self.name)}
157143
response = self.llm.completion(**params)
158-
logger.error(f'Response from LLM: {response}')
144+
logger.debug(f'Response from LLM: {response}')
159145
actions = codeact_function_calling.response_to_actions(response)
160-
logger.error(f'Actions after response_to_actions: {actions}')
146+
logger.debug(f'Actions after response_to_actions: {actions}')
161147
for action in actions:
162148
self.pending_actions.append(action)
163149
return self.pending_actions.popleft()

openhands/controller/agent.py

+2
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@ def __init__(
3232
self,
3333
llm: LLM,
3434
config: 'AgentConfig',
35+
mcp_tools: list[dict] | None = None,
3536
):
3637
self.llm = llm
3738
self.config = config
3839
self._complete = False
3940
self.prompt_manager: 'PromptManager' | None = None
41+
self.mcp_tools = mcp_tools
4042

4143
@property
4244
def complete(self) -> bool:

openhands/core/schema/action.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ class ActionType(str, Enum):
3838
"""Interact with the browser instance.
3939
"""
4040

41-
MCP = 'mcp'
41+
MCP = 'call_tool_mcp'
4242
"""Interact with the MCP server.
4343
"""
4444

openhands/core/schema/observation.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,8 @@ class ObservationType(str, Enum):
5151
"""Result of a recall operation. This can be the workspace context, a microagent, or other types of information."""
5252

5353
MCP = 'mcp'
54-
"""Result of a MCP Server operation. This can be the result of a MCP action."""
54+
"""Result of a MCP Server operation"""
55+
56+
PLAYWRIGHT_MCP_BROWSER_SCREENSHOT = 'playwright_mcp_browser_screenshot'
57+
"""Result of a Playwright MCP Browser Screenshot operation. The response is a base64 encoded string of the screenshot, which should be streamed to the client using the correct format matching
58+
browsergym's screenshot format."""

openhands/core/setup.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from openhands.events.event import Event
1818
from openhands.integrations.provider import ProviderToken, ProviderType, SecretStore
1919
from openhands.llm.llm import LLM
20-
from openhands.mcp.mcp_agent import MCPAgent
20+
from openhands.mcp.mcp_agent import MCPAgent, convert_mcp_agents_to_tools
2121
from openhands.memory.memory import Memory
2222
from openhands.microagent.microagent import BaseMicroAgent
2323
from openhands.runtime import get_runtime_cls
@@ -176,21 +176,26 @@ async def create_agent(config: AppConfig) -> Agent:
176176
agent_cls: Type[Agent] = Agent.get_cls(config.default_agent)
177177
agent_config = config.get_agent_config(config.default_agent)
178178
llm_config = config.get_llm_config_from_agent(config.default_agent)
179-
# FIXME: Need to close the agents after calling the tool
180179
mcp_agents = await create_mcp_agents(
181180
config.mcp.sse.mcp_servers, config.mcp.stdio.commands, config.mcp.stdio.args
182181
)
182+
mcp_tools = convert_mcp_agents_to_tools(mcp_agents)
183183
agent = agent_cls(
184184
llm=LLM(config=llm_config),
185185
config=agent_config,
186-
mcp_agents=mcp_agents,
186+
mcp_tools=mcp_tools,
187187
)
188188

189+
# We only need to get the tools from the MCP agents, so we can safely close them after that
190+
# the actual calls will be done in a sandbox environment, not here
191+
for mcp_agent in mcp_agents:
192+
await mcp_agent.cleanup()
193+
189194
return agent
190195

191196

192197
async def create_mcp_agents(
193-
sse_mcp_server: List[str], commands: List[str], args: List[str]
198+
sse_mcp_server: List[str], commands: List[str], args: List[List[str]]
194199
) -> List[MCPAgent]:
195200
mcp_agents: List[MCPAgent] = []
196201
# Initialize SSE connections
@@ -211,15 +216,15 @@ async def create_mcp_agents(
211216

212217
# Initialize stdio connections
213218
if commands:
214-
for command, args in zip(commands, args):
219+
for command, command_args in zip(commands, args):
215220
logger.info(
216221
f'Initializing MCP agent for {command} with stdio connection...'
217222
)
218223

219224
agent = MCPAgent()
220225
try:
221226
await agent.initialize(
222-
connection_type='stdio', command=command, args=args
227+
connection_type='stdio', command=command, args=command_args
223228
)
224229
mcp_agents.append(agent)
225230
logger.info(f'Connected to MCP server via stdio with command {command}')

openhands/events/observation/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,5 @@
4545
'RecallObservation',
4646
'RecallType',
4747
'MCPObservation',
48+
'PlaywrightMcpBrowserScreenshotObservation',
4849
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from dataclasses import dataclass, field
2+
3+
from openhands.core.schema import ObservationType
4+
from openhands.events.observation.observation import Observation
5+
6+
7+
@dataclass
8+
class PlaywrightMcpBrowserScreenshotObservation(Observation):
9+
"""This data class represents the result of a Playwright MCP Browser Screenshot operation.
10+
11+
The response is a dict {"data": "base64 encoded string of the screenshot, which should be streamed to the client using the correct format matching
12+
browsergym's screenshot format", "url": "url of the current webpage"}.
13+
"""
14+
15+
observation: str = ObservationType.PLAYWRIGHT_MCP_BROWSER_SCREENSHOT
16+
url: str
17+
trigger_by_action: str
18+
screenshot: str = field(repr=False, default='') # don't show in repr
19+
20+
@property
21+
def message(self) -> str:
22+
return self.content

openhands/events/serialization/observation.py

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
)
2828
from openhands.events.observation.mcp import MCPObservation
2929
from openhands.events.observation.observation import Observation
30+
from openhands.events.observation.playwright_mcp import (
31+
PlaywrightMcpBrowserScreenshotObservation,
32+
)
3033
from openhands.events.observation.reject import UserRejectObservation
3134
from openhands.events.observation.success import SuccessObservation
3235

@@ -47,6 +50,7 @@
4750
AgentThinkObservation,
4851
RecallObservation,
4952
MCPObservation,
53+
PlaywrightMcpBrowserScreenshotObservation,
5054
)
5155

5256
OBSERVATION_TYPE_TO_CLASS = {

openhands/events/stream.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def add_event(self, event: Event, source: EventSource) -> None:
166166
logger.debug(f'Adding {type(event).__name__} id={event.id} from {source.name}')
167167
event._timestamp = datetime.now().isoformat()
168168
event._source = source # type: ignore [attr-defined]
169-
logger.warning(f'Event to add: {event}')
169+
logger.debug(f'Event to add: {event}')
170170
data = event_to_dict(event)
171171
data = self._replace_secrets(data)
172172
event = event_from_dict(data)

openhands/memory/conversation_memory.py

+14
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939
from openhands.events.observation.error import ErrorObservation
4040
from openhands.events.observation.mcp import MCPObservation
4141
from openhands.events.observation.observation import Observation
42+
from openhands.events.observation.playwright_mcp import (
43+
PlaywrightMcpBrowserScreenshotObservation,
44+
)
4245
from openhands.events.serialization.event import truncate_content
4346
from openhands.utils.prompt import PromptManager, RepositoryInfo, RuntimeInfo
4447

@@ -333,6 +336,17 @@ def _process_observation(
333336
elif isinstance(obs, MCPObservation):
334337
logger.warning(f'MCPObservation: {obs}')
335338
message = Message(role='user', content=[TextContent(text=obs.content)])
339+
elif isinstance(obs, PlaywrightMcpBrowserScreenshotObservation):
340+
text = obs.content
341+
342+
text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
343+
message = Message(
344+
role='user',
345+
content=[
346+
TextContent(text=obs.content),
347+
ImageContent(image_urls=[obs.url]),
348+
],
349+
)
336350
elif isinstance(obs, IPythonRunCellObservation):
337351
text = obs.content
338352
# replace base64 images with a placeholder

0 commit comments

Comments
 (0)