[agent, browsing] Support viewing pdf and png/jpg via browser (#7457)

xingyaoww · openhands-agent · li-boxuan · web-flow · commit ac8b5e79342f · 2025-03-28T07:07:33.000Z
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
diff --git a/openhands/agenthub/codeact_agent/tools/browser.py b/openhands/agenthub/codeact_agent/tools/browser.py
@@ -18,6 +18,11 @@
 fill('a12', 'example with "quotes"')
 click('a51')
 click('48', button='middle', modifiers=['Shift'])
+
+You can also use the browser to view pdf, png, jpg files.
+You should first check the content of /tmp/oh-server-url to get the server url, and then use it to view the file by `goto("{server_url}/view?path={absolute_file_path}")`.
+For example: `goto("http://localhost:8000/view?path=/workspace/test_document.pdf")`
+Note: The file should be downloaded to the local machine first before using the browser to view it.
 """
 
 _BROWSER_TOOL_DESCRIPTION = """
diff --git a/openhands/agenthub/codeact_agent/tools/web_read.py b/openhands/agenthub/codeact_agent/tools/web_read.py
@@ -1,8 +1,8 @@
 from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk
 
-_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).
+_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.) OR read a webpage that contains images.
 
-You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
+You may use the `web_read` tool to read text content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
 """
 
 WebReadTool = ChatCompletionToolParam(
diff --git a/openhands/core/config/agent_config.py b/openhands/core/config/agent_config.py
@@ -29,7 +29,7 @@ class AgentConfig(BaseModel):
     enable_prompt_extensions: bool = Field(default=True)
     disabled_microagents: list[str] = Field(default_factory=list)
     enable_history_truncation: bool = Field(default=True)
-    enable_som_visual_browsing: bool = Field(default=False)
+    enable_som_visual_browsing: bool = Field(default=True)
     condenser: CondenserConfig = Field(
         default_factory=lambda: NoOpCondenserConfig(type='noop')
     )
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
@@ -347,8 +347,6 @@ def _process_observation(
             text = obs.get_agent_obs_text()
             if (
                 obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
-                and obs.set_of_marks is not None
-                and len(obs.set_of_marks) > 0
                 and enable_som_visual_browsing
                 and vision_is_active
             ):
@@ -357,14 +355,27 @@ def _process_observation(
                     role='user',
                     content=[
                         TextContent(text=text),
-                        ImageContent(image_urls=[obs.set_of_marks]),
+                        ImageContent(
+                            image_urls=[
+                                # show set of marks if it exists
+                                # otherwise, show raw screenshot when using vision-supported model
+                                obs.set_of_marks
+                                if obs.set_of_marks is not None
+                                and len(obs.set_of_marks) > 0
+                                else obs.screenshot
+                            ]
+                        ),
                     ],
                 )
+                logger.debug(
+                    f'Vision enabled for browsing, showing {"set of marks" if obs.set_of_marks and len(obs.set_of_marks) > 0 else "screenshot"}'
+                )
             else:
                 message = Message(
                     role='user',
                     content=[TextContent(text=text)],
                 )
+                logger.debug('Vision disabled for browsing, showing text')
         elif isinstance(obs, AgentDelegateObservation):
             text = truncate_content(
                 obs.outputs['content'] if 'content' in obs.outputs else '',
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
@@ -20,7 +20,7 @@
 
 from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
 from fastapi.exceptions import RequestValidationError
-from fastapi.responses import FileResponse, JSONResponse
+from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
 from fastapi.security import APIKeyHeader
 from openhands_aci.editor.editor import OHEditor
 from openhands_aci.editor.exceptions import ToolError
@@ -58,6 +58,7 @@
 from openhands.runtime.browser.browser_env import BrowserEnv
 from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin
 from openhands.runtime.utils.bash import BashSession
+from openhands.runtime.utils.file_viewer import generate_file_viewer_html
 from openhands.runtime.utils.files import insert_lines, read_lines
 from openhands.runtime.utils.memory_monitor import MemoryMonitor
 from openhands.runtime.utils.runtime_init import init_user_and_working_directory
@@ -531,6 +532,11 @@ def close(self):
     # example: python client.py 8000 --working-dir /workspace --plugins JupyterRequirement
     args = parser.parse_args()
 
+    port_path = '/tmp/oh-server-url'
+    os.makedirs(os.path.dirname(port_path), exist_ok=True)
+    with open(port_path, 'w') as f:
+        f.write(f'http://127.0.0.1:{args.port}')
+
     plugins_to_load: list[Plugin] = []
     if args.plugins:
         for plugin in args.plugins:
@@ -811,5 +817,63 @@ async def list_files(request: Request):
             logger.error(f'Error listing files: {e}')
             return []
 
+    @app.get('/view')
+    async def view_file(path: str, request: Request):
+        """View a file using an embedded viewer.
+
+        Args:
+            path (str): The absolute path of the file to view.
+            request (Request): The FastAPI request object.
+
+        Returns:
+            HTMLResponse: An HTML page with an appropriate viewer for the file,
+                or an HTML error page (status 403, 400, 404 or 500).
+        """
+        # Imported here so the endpoint does not rely on the module import block.
+        import html
+
+        # Security check: Only allow requests from localhost
+        client_host = request.client.host if request.client else None
+        if client_host not in ['127.0.0.1', 'localhost', '::1']:
+            logger.warning(f'Unauthorized file view attempt from {client_host}')
+            return HTMLResponse(
+                content='<h1>Access Denied</h1><p>This endpoint is only accessible from localhost</p>',
+                status_code=403,
+            )
+
+        # `path` is request input that gets echoed into the error pages below;
+        # escape it so markup in the query string is shown as text, not rendered.
+        safe_path = html.escape(path)
+
+        if not os.path.isabs(path):
+            return HTMLResponse(
+                content=f'<h1>Error: Path must be absolute</h1><p>{safe_path}</p>',
+                status_code=400,
+            )
+
+        if not os.path.exists(path):
+            return HTMLResponse(
+                content=f'<h1>Error: File not found</h1><p>{safe_path}</p>',
+                status_code=404,
+            )
+
+        if os.path.isdir(path):
+            return HTMLResponse(
+                content=f'<h1>Error: Path is a directory</h1><p>{safe_path}</p>',
+                status_code=400,
+            )
+
+        try:
+            html_content = generate_file_viewer_html(path)
+            return HTMLResponse(content=html_content)
+
+        except Exception as e:
+            logger.error(f'Error serving file viewer: {str(e)}')
+            # The exception text can contain the path too, so escape it as well.
+            return HTMLResponse(
+                content=f'<h1>Error viewing file</h1><p>{safe_path}</p><p>{html.escape(str(e))}</p>',
+                status_code=500,
+            )
+
     logger.debug(f'Starting action execution API on port {args.port}')
     run(app, host='0.0.0.0', port=args.port)
diff --git a/openhands/runtime/utils/file_viewer.py b/openhands/runtime/utils/file_viewer.py
@@ -0,0 +1,151 @@
+"""
+Utility module for generating file viewer HTML content.
+"""
+
+import base64
+import html
+import json
+import mimetypes
+import os
+
+
+def generate_file_viewer_html(file_path: str) -> str:
+    """
+    Generate a self-contained HTML page that displays a PDF or image file.
+
+    The file is read, base64-encoded and embedded in the page. PDFs are drawn
+    page by page with pdf.js (loaded from cdnjs); images use a data URL.
+
+    Args:
+        file_path: The absolute path to the file
+
+    Returns:
+        str: HTML content for viewing the file
+
+    Raises:
+        ValueError: If the file extension is not supported or the file does
+            not exist
+    """
+    file_extension = os.path.splitext(file_path)[1].lower()
+    file_name = os.path.basename(file_path)
+
+    # Define supported file extensions
+    supported_extensions = [
+        '.pdf',
+        '.png',
+        '.jpg',
+        '.jpeg',
+        '.gif',
+    ]
+
+    # Check if the file extension is supported
+    if file_extension not in supported_extensions:
+        raise ValueError(
+            f"Unsupported file extension: {file_extension}. "
+            f"Supported extensions are: {', '.join(supported_extensions)}"
+        )
+
+    # Check if the file exists
+    if not os.path.exists(file_path):
+        raise ValueError(
+            f'File not found locally: {file_path}. Please download the file to the local machine and try again.'
+        )
+
+    mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'
+
+    # Every supported type is binary, so the bytes are always embedded as base64.
+    with open(file_path, 'rb') as file:
+        file_base64 = base64.b64encode(file.read()).decode('utf-8')
+
+    # file_path is caller-supplied, so never paste it raw into the page:
+    # HTML-escape the name for <title>, and hand the path to JavaScript as a
+    # JSON string literal with '<' escaped so it cannot close the <script> tag.
+    title = html.escape(file_name)
+    js_file_path = json.dumps(file_path).replace('<', '\\u003c')
+
+    return f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>File Viewer - {title}</title>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
+    <style>
+        body, html {{ margin: 0; padding: 0; height: 100%; overflow: hidden; font-family: Arial, sans-serif; }}
+        #viewer-container {{ width: 100%; height: 100vh; overflow: auto; }}
+        .page {{ margin: 10px auto; box-shadow: 0 0 10px rgba(0,0,0,0.3); }}
+        .text-content {{ margin: 20px; white-space: pre-wrap; font-family: monospace; line-height: 1.5; }}
+        .error {{ color: red; margin: 20px; }}
+        img {{ max-width: 100%; margin: 20px auto; display: block; }}
+    </style>
+</head>
+<body>
+    <div id="viewer-container"></div>
+    <script>
+    const filePath = {js_file_path};
+    const fileExtension = "{file_extension}";
+    const fileContent = "";
+    const fileBase64 = "{file_base64}";
+    const mimeType = "{mime_type}";
+    const container = document.getElementById('viewer-container');
+
+    async function loadContent() {{
+        try {{
+            if (fileExtension === '.pdf') {{
+                pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
+                const binaryString = atob(fileBase64);
+                const bytes = new Uint8Array(binaryString.length);
+                for (let i = 0; i < binaryString.length; i++) {{
+                    bytes[i] = binaryString.charCodeAt(i);
+                }}
+
+                const loadingTask = pdfjsLib.getDocument({{data: bytes.buffer}});
+                const pdf = await loadingTask.promise;
+
+                // Get total number of pages
+                const numPages = pdf.numPages;
+
+                // Render each page
+                for (let pageNum = 1; pageNum <= numPages; pageNum++) {{
+                    const page = await pdf.getPage(pageNum);
+
+                    // Set scale for rendering
+                    const viewport = page.getViewport({{ scale: 1.5 }});
+
+                    // Create canvas for rendering
+                    const canvas = document.createElement('canvas');
+                    canvas.className = 'page';
+                    canvas.width = viewport.width;
+                    canvas.height = viewport.height;
+                    container.appendChild(canvas);
+
+                    // Render PDF page into canvas context
+                    const context = canvas.getContext('2d');
+                    const renderContext = {{
+                        canvasContext: context,
+                        viewport: viewport
+                    }};
+
+                    await page.render(renderContext).promise;
+                }}
+            }} else if (['.png', '.jpg', '.jpeg', '.gif', '.bmp'].includes(fileExtension)) {{
+                const img = document.createElement('img');
+                img.src = `data:${{mimeType}};base64,${{fileBase64}}`;
+                img.alt = filePath.split('/').pop();
+                container.appendChild(img);
+            }} else {{
+                const pre = document.createElement('pre');
+                pre.className = 'text-content';
+                pre.textContent = fileContent;
+                container.appendChild(pre);
+            }}
+        }} catch (error) {{
+            console.error('Error:', error);
+            container.innerHTML = `<div class="error"><h2>Error loading file</h2><p>${{error.message}}</p></div>`;
+        }}
+    }}
+
+    window.onload = loadContent;
+    </script>
+</body>
+</html>"""
diff --git a/openhands/server/routes/settings.py b/openhands/server/routes/settings.py
diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ class AgentConfig(BaseModel):`
`29`	`29`	`enable_prompt_extensions: bool = Field(default=True)`
`30`	`30`	`disabled_microagents: list[str] = Field(default_factory=list)`
`31`	`31`	`enable_history_truncation: bool = Field(default=True)`
`32`		`- enable_som_visual_browsing: bool = Field(default=False)`
	`32`	`+ enable_som_visual_browsing: bool = Field(default=True)`
`33`	`33`	`condenser: CondenserConfig = Field(`
`34`	`34`	`default_factory=lambda: NoOpCondenserConfig(type='noop')`
`35`	`35`	`)`