Skip to content

Commit ac8b5e7

Browse files
xingyaoww, openhands-agent, li-boxuan
authored
[agent, browsing] Support viewing pdf and png/jpg via browser (#7457)
Co-authored-by: openhands <[email protected]> Co-authored-by: Boxuan Li <[email protected]>
1 parent 2350557 commit ac8b5e7

File tree

8 files changed

+366
-32
lines changed

8 files changed

+366
-32
lines changed

openhands/agenthub/codeact_agent/tools/browser.py

+5
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@
1818
fill('a12', 'example with "quotes"')
1919
click('a51')
2020
click('48', button='middle', modifiers=['Shift'])
21+
22+
You can also use the browser to view pdf, png, jpg files.
23+
You should first check the content of /tmp/oh-server-url to get the server url, and then use it to view the file by `goto("{server_url}/view?path={absolute_file_path}")`.
24+
For example: `goto("http://localhost:8000/view?path=/workspace/test_document.pdf")`
25+
Note: The file should be downloaded to the local machine first before using the browser to view it.
2126
"""
2227

2328
_BROWSER_TOOL_DESCRIPTION = """

openhands/agenthub/codeact_agent/tools/web_read.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
22

3-
_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).
3+
_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.) OR read a webpage that contains images.
44
5-
You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
5+
You may use the `web_read` tool to read text content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
66
"""
77

88
WebReadTool = ChatCompletionToolParam(

openhands/core/config/agent_config.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class AgentConfig(BaseModel):
2929
enable_prompt_extensions: bool = Field(default=True)
3030
disabled_microagents: list[str] = Field(default_factory=list)
3131
enable_history_truncation: bool = Field(default=True)
32-
enable_som_visual_browsing: bool = Field(default=False)
32+
enable_som_visual_browsing: bool = Field(default=True)
3333
condenser: CondenserConfig = Field(
3434
default_factory=lambda: NoOpCondenserConfig(type='noop')
3535
)

openhands/memory/conversation_memory.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -347,8 +347,6 @@ def _process_observation(
347347
text = obs.get_agent_obs_text()
348348
if (
349349
obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
350-
and obs.set_of_marks is not None
351-
and len(obs.set_of_marks) > 0
352350
and enable_som_visual_browsing
353351
and vision_is_active
354352
):
@@ -357,14 +355,27 @@ def _process_observation(
357355
role='user',
358356
content=[
359357
TextContent(text=text),
360-
ImageContent(image_urls=[obs.set_of_marks]),
358+
ImageContent(
359+
image_urls=[
360+
# show set of marks if it exists
361+
# otherwise, show raw screenshot when using vision-supported model
362+
obs.set_of_marks
363+
if obs.set_of_marks is not None
364+
and len(obs.set_of_marks) > 0
365+
else obs.screenshot
366+
]
367+
),
361368
],
362369
)
370+
logger.debug(
371+
f'Vision enabled for browsing, showing {"set of marks" if obs.set_of_marks and len(obs.set_of_marks) > 0 else "screenshot"}'
372+
)
363373
else:
364374
message = Message(
365375
role='user',
366376
content=[TextContent(text=text)],
367377
)
378+
logger.debug('Vision disabled for browsing, showing text')
368379
elif isinstance(obs, AgentDelegateObservation):
369380
text = truncate_content(
370381
obs.outputs['content'] if 'content' in obs.outputs else '',

openhands/runtime/action_execution_server.py

+55-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
2222
from fastapi.exceptions import RequestValidationError
23-
from fastapi.responses import FileResponse, JSONResponse
23+
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
2424
from fastapi.security import APIKeyHeader
2525
from openhands_aci.editor.editor import OHEditor
2626
from openhands_aci.editor.exceptions import ToolError
@@ -58,6 +58,7 @@
5858
from openhands.runtime.browser.browser_env import BrowserEnv
5959
from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin
6060
from openhands.runtime.utils.bash import BashSession
61+
from openhands.runtime.utils.file_viewer import generate_file_viewer_html
6162
from openhands.runtime.utils.files import insert_lines, read_lines
6263
from openhands.runtime.utils.memory_monitor import MemoryMonitor
6364
from openhands.runtime.utils.runtime_init import init_user_and_working_directory
@@ -531,6 +532,11 @@ def close(self):
531532
# example: python client.py 8000 --working-dir /workspace --plugins JupyterRequirement
532533
args = parser.parse_args()
533534

535+
port_path = '/tmp/oh-server-url'
536+
os.makedirs(os.path.dirname(port_path), exist_ok=True)
537+
with open(port_path, 'w') as f:
538+
f.write(f'http://127.0.0.1:{args.port}')
539+
534540
plugins_to_load: list[Plugin] = []
535541
if args.plugins:
536542
for plugin in args.plugins:
@@ -811,5 +817,53 @@ async def list_files(request: Request):
811817
logger.error(f'Error listing files: {e}')
812818
return []
813819

820+
@app.get('/view')
async def view_file(path: str, request: Request):
    """View a file using an embedded viewer.

    Every piece of request-derived or exception-derived text is HTML-escaped
    before being placed in a response, so a crafted ``path`` value cannot
    inject markup or script into the error pages.

    Args:
        path (str): The absolute path of the file to view.
        request (Request): The FastAPI request object.

    Returns:
        HTMLResponse: An HTML page with an appropriate viewer for the file, or
            an HTML error page with status 403 (non-local client), 400
            (relative path or directory), 404 (missing path) or 500 (the
            viewer could not be generated).
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    import html

    def _error_page(heading: str, status_code: int, *details: str) -> HTMLResponse:
        # `heading` is always a fixed literal; `details` may carry untrusted
        # text (the requested path, exception messages) and is escaped.
        paragraphs = ''.join(f'<p>{html.escape(detail)}</p>' for detail in details)
        return HTMLResponse(
            content=f'<h1>{heading}</h1>{paragraphs}',
            status_code=status_code,
        )

    # Security check: Only allow requests from localhost
    client_host = request.client.host if request.client else None
    if client_host not in ['127.0.0.1', 'localhost', '::1']:
        logger.warning(f'Unauthorized file view attempt from {client_host}')
        return _error_page(
            'Access Denied',
            403,
            'This endpoint is only accessible from localhost',
        )

    if not os.path.isabs(path):
        return _error_page('Error: Path must be absolute', 400, path)

    if not os.path.exists(path):
        return _error_page('Error: File not found', 404, path)

    if os.path.isdir(path):
        return _error_page('Error: Path is a directory', 400, path)

    try:
        html_content = generate_file_viewer_html(path)
        return HTMLResponse(content=html_content)

    except Exception as e:
        # Endpoint boundary: report any failure (unsupported extension,
        # unreadable file, ...) as an error page instead of propagating.
        logger.error(f'Error serving file viewer: {str(e)}')
        return _error_page('Error viewing file', 500, path, str(e))
814868
logger.debug(f'Starting action execution API on port {args.port}')
815869
run(app, host='0.0.0.0', port=args.port)
openhands/runtime/utils/file_viewer.py

+146
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
"""
2+
Utility module for generating file viewer HTML content.
3+
"""
4+
5+
import base64
6+
import mimetypes
7+
import os
8+
9+
10+
def generate_file_viewer_html(file_path: str) -> str:
    """
    Generate HTML content for viewing a PDF or image file.

    The file is read as bytes, base64-encoded and embedded in the returned
    page. PDFs are rendered page by page onto canvases with pdf.js (loaded
    from cdnjs); images are shown through a ``data:`` URL in an ``<img>``.

    Values derived from ``file_path`` are escaped for the context they are
    placed in (HTML text for the title, a JavaScript string literal inside
    the script), so quotes, backslashes or ``<`` in a file name cannot break
    or inject into the page.

    Args:
        file_path: The absolute path to the file

    Returns:
        str: HTML content for viewing the file

    Raises:
        ValueError: If the file extension is not supported, or if the file
            does not exist.
        OSError: If the file exists but cannot be opened or read.
    """
    # Function-scope imports so this block does not depend on the module's
    # import section.
    import html
    import json

    def _js_string(value: str) -> str:
        # json.dumps yields a valid, quoted JS string literal. '<', '>' and
        # '&' are additionally \u-escaped so the value can never terminate
        # the surrounding <script> element or be read as markup.
        return (
            json.dumps(value)
            .replace('<', '\\u003c')
            .replace('>', '\\u003e')
            .replace('&', '\\u0026')
        )

    file_extension = os.path.splitext(file_path)[1].lower()

    # Define supported file extensions
    supported_extensions = [
        '.pdf',
        '.png',
        '.jpg',
        '.jpeg',
        '.gif',
    ]

    # Check if the file extension is supported
    if file_extension not in supported_extensions:
        raise ValueError(
            f'Unsupported file extension: {file_extension}. '
            f"Supported extensions are: {', '.join(supported_extensions)}"
        )

    # Check if the file exists
    if not os.path.exists(file_path):
        raise ValueError(
            f'File not found locally: {file_path}. Please download the file to the local machine and try again.'
        )

    mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'

    # Every supported type is binary, so the content is always embedded as
    # base64 (whose alphabet is safe inside a double-quoted JS string).
    with open(file_path, 'rb') as file:
        file_base64 = base64.b64encode(file.read()).decode('utf-8')

    title_text = html.escape(os.path.basename(file_path))
    js_file_path = _js_string(file_path)
    js_file_extension = _js_string(file_extension)
    js_mime_type = _js_string(mime_type)

    return f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>File Viewer - {title_text}</title>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
    <style>
        body, html {{ margin: 0; padding: 0; height: 100%; overflow: hidden; font-family: Arial, sans-serif; }}
        #viewer-container {{ width: 100%; height: 100vh; overflow: auto; }}
        .page {{ margin: 10px auto; box-shadow: 0 0 10px rgba(0,0,0,0.3); }}
        .error {{ color: red; margin: 20px; }}
        img {{ max-width: 100%; margin: 20px auto; display: block; }}
    </style>
</head>
<body>
    <div id="viewer-container"></div>
    <script>
    const filePath = {js_file_path};
    const fileExtension = {js_file_extension};
    const fileBase64 = "{file_base64}";
    const mimeType = {js_mime_type};
    const container = document.getElementById('viewer-container');

    async function loadContent() {{
        try {{
            if (fileExtension === '.pdf') {{
                pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
                const binaryString = atob(fileBase64);
                const bytes = new Uint8Array(binaryString.length);
                for (let i = 0; i < binaryString.length; i++) {{
                    bytes[i] = binaryString.charCodeAt(i);
                }}

                const loadingTask = pdfjsLib.getDocument({{data: bytes.buffer}});
                const pdf = await loadingTask.promise;

                // Get total number of pages
                const numPages = pdf.numPages;

                // Render each page
                for (let pageNum = 1; pageNum <= numPages; pageNum++) {{
                    const page = await pdf.getPage(pageNum);

                    // Set scale for rendering
                    const viewport = page.getViewport({{ scale: 1.5 }});

                    // Create canvas for rendering
                    const canvas = document.createElement('canvas');
                    canvas.className = 'page';
                    canvas.width = viewport.width;
                    canvas.height = viewport.height;
                    container.appendChild(canvas);

                    // Render PDF page into canvas context
                    const context = canvas.getContext('2d');
                    const renderContext = {{
                        canvasContext: context,
                        viewport: viewport
                    }};

                    await page.render(renderContext).promise;
                }}
            }} else if (['.png', '.jpg', '.jpeg', '.gif'].includes(fileExtension)) {{
                const img = document.createElement('img');
                img.src = `data:${{mimeType}};base64,${{fileBase64}}`;
                img.alt = filePath.split('/').pop();
                container.appendChild(img);
            }}
        }} catch (error) {{
            console.error('Error:', error);
            // Build the error box from DOM nodes so the message is shown as
            // text and never parsed as HTML.
            const errorBox = document.createElement('div');
            errorBox.className = 'error';
            const heading = document.createElement('h2');
            heading.textContent = 'Error loading file';
            const detail = document.createElement('p');
            detail.textContent = error.message;
            errorBox.appendChild(heading);
            errorBox.appendChild(detail);
            container.innerHTML = '';
            container.appendChild(errorBox);
        }}
    }}

    window.onload = loadContent;
    </script>
</body>
</html>"""

0 commit comments

Comments
 (0)