Compare commits

...

1 Commits

Author SHA1 Message Date
kshitijk4poor
15c75b1018 feat: add compositor-level coordinate click to browser_click
Add optional x/y parameters to browser_click for viewport-coordinate
clicking via CDP Input.dispatchMouseEvent. This dispatches mouse events
at the browser compositor level — Chrome does its own hit-testing, so
clicks pass through iframes, shadow DOM (open/closed), cross-origin
boundaries, canvas/WebGL elements, and overlays.

Inspired by browser-harness's click_at_xy() strategy.

When x+y are provided:
  1. Prefer CDP path (direct WebSocket to Input.dispatchMouseEvent)
  2. Fall back to agent-browser mouse move/down/up subcommands

Existing ref-based clicks are unchanged — the two modes are mutually
exclusive (provide ref OR x+y, not both).

Schema change: ref is no longer required; either ref or x+y must be
provided. Validation catches all edge cases (partial coords, both
modes, neither mode, non-numeric coords).

21 new tests covering:
  - Input validation (6 tests)
  - CDP coordinate click with mock server, including failure paths (4 tests)
  - agent-browser mouse fallback, including per-step failures (4 tests)
  - Ref-based click preserved (2 tests)
  - Schema correctness (3 tests)
  - Registry dispatch integration (2 tests)
2026-05-03 17:00:30 +05:30
2 changed files with 663 additions and 8 deletions

View File

@@ -0,0 +1,468 @@
"""Tests for compositor-level coordinate click (browser_click with x/y params).
Covers:
- Input validation (ref vs x/y mutually exclusive)
- CDP coordinate click path (via mock CDP server)
- agent-browser mouse fallback path
- Ref-based click path still works (exercised with Camofox mode disabled)
"""
from __future__ import annotations
import asyncio
import json
import threading
from typing import Any, Dict, List
import pytest
import websockets
from websockets.asyncio.server import serve
# ---------------------------------------------------------------------------
# In-process CDP mock server (reused from test_browser_cdp_tool.py)
# ---------------------------------------------------------------------------
class _CDPServer:
"""Tiny CDP mock — replies to registered method handlers."""
def __init__(self) -> None:
self._handlers: Dict[str, Any] = {}
self._responses: List[Dict[str, Any]] = []
self._loop: asyncio.AbstractEventLoop | None = None
self._server: Any = None
self._thread: threading.Thread | None = None
self._host = "127.0.0.1"
self._port = 0
def on(self, method: str, handler):
self._handlers[method] = handler
def start(self) -> str:
ready = threading.Event()
def _run() -> None:
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
async def _handler(ws):
try:
async for raw in ws:
msg = json.loads(raw)
call_id = msg.get("id")
method = msg.get("method", "")
params = msg.get("params", {}) or {}
session_id = msg.get("sessionId")
self._responses.append(msg)
fn = self._handlers.get(method)
if fn is None:
reply = {
"id": call_id,
"error": {"code": -32601, "message": f"No handler for {method}"},
}
else:
try:
result = fn(params, session_id)
reply = {"id": call_id, "result": result}
except Exception as exc:
reply = {"id": call_id, "error": {"code": -1, "message": str(exc)}}
if session_id:
reply["sessionId"] = session_id
await ws.send(json.dumps(reply))
except websockets.exceptions.ConnectionClosed:
pass
async def _serve() -> None:
self._server = await serve(_handler, self._host, 0)
sock = next(iter(self._server.sockets))
self._port = sock.getsockname()[1]
ready.set()
await self._server.wait_closed()
try:
self._loop.run_until_complete(_serve())
finally:
self._loop.close()
self._thread = threading.Thread(target=_run, daemon=True)
self._thread.start()
if not ready.wait(timeout=5.0):
raise RuntimeError("CDP mock server failed to start")
return f"ws://{self._host}:{self._port}/devtools/browser/mock"
def stop(self) -> None:
if self._loop and self._server:
self._loop.call_soon_threadsafe(self._server.close)
if self._thread:
self._thread.join(timeout=3.0)
def received(self) -> List[Dict[str, Any]]:
return list(self._responses)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def cdp_server(monkeypatch):
    """Yield a running ``_CDPServer`` with the CDP endpoint resolver pointed at it.

    ``tools.browser_cdp_tool._resolve_cdp_endpoint`` is replaced with a stub
    that returns the mock server's WebSocket URL. The server is stopped on
    teardown, including when the import or the patch below fails, so a broken
    setup does not leave the server thread running.
    """
    server = _CDPServer()
    ws_url = server.start()
    try:
        import tools.browser_cdp_tool as cdp_mod

        monkeypatch.setattr(cdp_mod, "_resolve_cdp_endpoint", lambda: ws_url)
        yield server
    finally:
        server.stop()
# ---------------------------------------------------------------------------
# Input validation
# ---------------------------------------------------------------------------
class TestClickInputValidation:
    """Argument checks: browser_click needs exactly one of ``ref`` or ``x``+``y``."""

    @staticmethod
    def _rejection(**kwargs):
        """Call browser_click, assert it reported failure, return the lowercased error."""
        from tools.browser_tool import browser_click

        outcome = json.loads(browser_click(**kwargs))
        assert outcome["success"] is False
        return outcome["error"].lower()

    def test_neither_ref_nor_coords(self):
        message = self._rejection()
        assert "ref" in message or "x" in message

    def test_both_ref_and_coords(self):
        assert "not both" in self._rejection(ref="@e1", x=100, y=200)

    def test_x_without_y(self):
        assert "both" in self._rejection(x=100)

    def test_y_without_x(self):
        assert "both" in self._rejection(y=200)

    def test_empty_ref_treated_as_missing(self):
        message = self._rejection(ref="")
        assert "ref" in message or "x" in message

    def test_non_numeric_coordinates(self):
        assert "number" in self._rejection(x="abc", y="def")
# ---------------------------------------------------------------------------
# CDP coordinate click (happy path via mock server)
# ---------------------------------------------------------------------------
class TestCDPCoordinateClick:
    """x/y clicks routed through CDP ``Input.dispatchMouseEvent`` on the mock server."""

    @staticmethod
    def _expose_page(server, target_id, url, session_id_for):
        """Make the mock list one attached page target and answer attach requests."""
        listing = {
            "targetInfos": [
                {"targetId": target_id, "type": "page", "attached": True, "url": url},
            ]
        }
        server.on("Target.getTargets", lambda p, s: listing)
        server.on("Target.attachToTarget", lambda p, s: {"sessionId": session_id_for(p)})

    def test_cdp_click_dispatches_press_and_release(self, cdp_server):
        from tools.browser_tool import browser_click

        self._expose_page(
            cdp_server, "page-1", "https://example.com",
            lambda p: f"sess-{p['targetId']}",
        )
        cdp_server.on("Input.dispatchMouseEvent", lambda p, s: {})

        outcome = json.loads(browser_click(x=150, y=300))
        assert outcome["success"] is True
        assert outcome["clicked_at"] == {"x": 150, "y": 300}
        assert outcome["method"] == "cdp_compositor"

        # The mock must have seen the target lookup and the mouse events.
        seen = cdp_server.received()
        methods = [message["method"] for message in seen]
        assert "Target.getTargets" in methods
        assert "Input.dispatchMouseEvent" in methods

        # Exactly one press followed by one release, at the requested point.
        mouse_events = [m for m in seen if m["method"] == "Input.dispatchMouseEvent"]
        assert len(mouse_events) == 2
        pressed, released = (event["params"] for event in mouse_events)
        assert pressed["type"] == "mousePressed"
        assert pressed["x"] == 150
        assert pressed["y"] == 300
        assert pressed["button"] == "left"
        assert released["type"] == "mouseReleased"

    def test_cdp_click_rounds_float_coordinates(self, cdp_server):
        from tools.browser_tool import browser_click

        self._expose_page(cdp_server, "p1", "...", lambda p: "s1")
        cdp_server.on("Input.dispatchMouseEvent", lambda p, s: {})

        outcome = json.loads(browser_click(x=150.7, y=299.3))
        assert outcome["success"] is True
        assert outcome["clicked_at"] == {"x": 151, "y": 299}

    def test_cdp_click_no_page_target_still_works(self, cdp_server):
        """When Target.getTargets returns no page targets, click proceeds without target_id."""
        from tools.browser_tool import browser_click

        only_service_worker = {"targetInfos": [{"targetId": "sw1", "type": "service_worker"}]}
        cdp_server.on("Target.getTargets", lambda p, s: only_service_worker)
        # Target.attachToTarget is deliberately left unregistered: with no page
        # target selected, the click is expected to be sent without attaching.
        cdp_server.on("Input.dispatchMouseEvent", lambda p, s: {})

        outcome = json.loads(browser_click(x=50, y=50))
        assert outcome["success"] is True
        assert outcome["clicked_at"] == {"x": 50, "y": 50}

    def test_cdp_dispatch_mouse_event_failure(self, cdp_server):
        """When Input.dispatchMouseEvent returns a CDP error, return failure."""
        from tools.browser_tool import browser_click

        self._expose_page(cdp_server, "p1", "...", lambda p: "s1")
        # Input.dispatchMouseEvent has no handler, so the mock answers with an error.

        outcome = json.loads(browser_click(x=100, y=200))
        assert outcome["success"] is False
        assert "CDP coordinate click failed" in outcome["error"]
# ---------------------------------------------------------------------------
# agent-browser mouse fallback
# ---------------------------------------------------------------------------
class TestAgentBrowserMouseFallback:
    """With no CDP endpoint resolved, x/y clicks go through agent-browser mouse commands."""

    @staticmethod
    def _use_agent_browser(monkeypatch, run_command):
        """Blank the CDP endpoint, stub the command runner, and return ``browser_tool``."""
        from tools import browser_tool, browser_cdp_tool

        monkeypatch.setattr(browser_cdp_tool, "_resolve_cdp_endpoint", lambda: "")
        monkeypatch.setattr(browser_tool, "_run_browser_command", run_command)
        monkeypatch.setattr(browser_tool, "_last_session_key", lambda tid: tid)
        return browser_tool

    @staticmethod
    def _failing_at(step, error):
        """Build a command runner that fails only for the given mouse sub-command."""
        def run_command(task_id, command, args=None, timeout=None):
            if args and args[0] == step:
                return {"success": False, "error": error}
            return {"success": True}

        return run_command

    def test_falls_back_to_agent_browser_mouse(self, monkeypatch):
        issued = []

        def recording_runner(task_id, command, args=None, timeout=None):
            issued.append((command, args))
            return {"success": True}

        browser_tool = self._use_agent_browser(monkeypatch, recording_runner)
        outcome = json.loads(browser_tool.browser_click(x=200, y=400))

        assert outcome["success"] is True
        assert outcome["clicked_at"] == {"x": 200, "y": 400}
        assert outcome["method"] == "agent_browser_mouse"
        # Expected sequence: mouse move, mouse down, mouse up.
        assert len(issued) == 3
        assert issued[0] == ("mouse", ["move", "200", "400"])
        assert issued[1] == ("mouse", ["down"])
        assert issued[2] == ("mouse", ["up"])

    def test_mouse_move_failure_returns_error(self, monkeypatch):
        runner = self._failing_at("move", "mouse move not supported")
        browser_tool = self._use_agent_browser(monkeypatch, runner)
        outcome = json.loads(browser_tool.browser_click(x=100, y=100))
        assert outcome["success"] is False
        assert "mouse move" in outcome["error"]

    def test_mouse_down_failure_returns_error(self, monkeypatch):
        runner = self._failing_at("down", "mouse down failed")
        browser_tool = self._use_agent_browser(monkeypatch, runner)
        outcome = json.loads(browser_tool.browser_click(x=100, y=100))
        assert outcome["success"] is False
        assert "mouse down" in outcome["error"]

    def test_mouse_up_failure_returns_error(self, monkeypatch):
        runner = self._failing_at("up", "mouse up failed")
        browser_tool = self._use_agent_browser(monkeypatch, runner)
        outcome = json.loads(browser_tool.browser_click(x=100, y=100))
        assert outcome["success"] is False
        assert "mouse up" in outcome["error"]
# ---------------------------------------------------------------------------
# Ref-based click unchanged
# ---------------------------------------------------------------------------
class TestRefClickPreserved:
    """The ref-based click path behaves as before (Camofox mode forced off)."""

    @staticmethod
    def _stub_backend(monkeypatch, run_command):
        """Disable Camofox mode, stub the command runner, and return ``browser_tool``."""
        from tools import browser_tool

        monkeypatch.setattr(browser_tool, "_is_camofox_mode", lambda: False)
        monkeypatch.setattr(browser_tool, "_last_session_key", lambda tid: tid)
        monkeypatch.setattr(browser_tool, "_run_browser_command", run_command)
        return browser_tool

    def test_ref_click_still_works(self, monkeypatch):
        def always_ok(task_id, command, args=None, timeout=None):
            return {"success": True}

        browser_tool = self._stub_backend(monkeypatch, always_ok)
        outcome = json.loads(browser_tool.browser_click(ref="@e5"))
        assert outcome["success"] is True
        assert outcome["clicked"] == "@e5"

    def test_ref_without_at_prefix_auto_added(self, monkeypatch):
        seen_args = []

        def recording_runner(task_id, command, args=None, timeout=None):
            seen_args.append(args)
            return {"success": True}

        browser_tool = self._stub_backend(monkeypatch, recording_runner)
        browser_tool.browser_click(ref="e12")
        # The bare ref must reach the runner with the '@' prefix added.
        assert seen_args[0] == ["@e12"]
# ---------------------------------------------------------------------------
# Schema check
# ---------------------------------------------------------------------------
class TestSchemaUpdated:
    """The browser_click schema exposes x/y and no longer requires ``ref``."""

    @staticmethod
    def _click_parameters():
        """Return the ``parameters`` section of the browser_click schema."""
        from tools.browser_tool import _BROWSER_SCHEMA_MAP

        return _BROWSER_SCHEMA_MAP["browser_click"]["parameters"]

    def test_schema_has_x_y_properties(self):
        props = self._click_parameters()["properties"]
        for axis in ("x", "y"):
            assert axis in props
        for axis in ("x", "y"):
            assert props[axis]["type"] == "number"

    def test_schema_no_required_fields(self):
        # Either ref or x+y is acceptable, so nothing may be listed as required:
        # the key is absent, or it is present and exactly an empty list.
        assert self._click_parameters().get("required", []) == []

    def test_schema_ref_still_present(self):
        assert "ref" in self._click_parameters()["properties"]
# ---------------------------------------------------------------------------
# Registry integration
# ---------------------------------------------------------------------------
class TestRegistryIntegration:
    """Both click modes are reachable through ``registry.dispatch``."""

    def test_dispatch_with_coordinates(self, monkeypatch, cdp_server):
        from tools.registry import registry

        page = {"targetId": "p1", "type": "page", "attached": True, "url": "..."}
        canned_results = {
            "Target.getTargets": {"targetInfos": [page]},
            "Target.attachToTarget": {"sessionId": "s1"},
            "Input.dispatchMouseEvent": {},
        }
        for method, payload in canned_results.items():
            # Bind payload as a default so each handler keeps its own result.
            cdp_server.on(method, lambda p, s, payload=payload: payload)

        raw = registry.dispatch("browser_click", {"x": 42, "y": 84}, task_id="t1")
        outcome = json.loads(raw)
        assert outcome["success"] is True
        assert outcome["clicked_at"] == {"x": 42, "y": 84}

    def test_dispatch_with_ref(self, monkeypatch):
        from tools import browser_tool
        from tools.registry import registry

        def always_ok(tid, cmd, args=None, timeout=None):
            return {"success": True}

        monkeypatch.setattr(browser_tool, "_is_camofox_mode", lambda: False)
        monkeypatch.setattr(browser_tool, "_last_session_key", lambda tid: tid)
        monkeypatch.setattr(browser_tool, "_run_browser_command", always_ok)

        raw = registry.dispatch("browser_click", {"ref": "@e3"}, task_id="t1")
        outcome = json.loads(raw)
        assert outcome["success"] is True
        assert outcome["clicked"] == "@e3"

View File

@@ -1024,16 +1024,23 @@ BROWSER_TOOL_SCHEMAS = [
},
{
"name": "browser_click",
"description": "Click on an element identified by its ref ID from the snapshot (e.g., '@e5'). The ref IDs are shown in square brackets in the snapshot output. Requires browser_navigate and browser_snapshot to be called first.",
"description": "Click on an element identified by its ref ID from the snapshot (e.g., '@e5'). The ref IDs are shown in square brackets in the snapshot output. Requires browser_navigate and browser_snapshot to be called first.\n\nAlternatively, click at exact viewport coordinates (x, y) using compositor-level input. This bypasses DOM selectors entirely — clicks pass through iframes, shadow DOM, cross-origin boundaries, and canvas elements. Use browser_vision with annotate=true to find coordinates, or browser_console to evaluate getBoundingClientRect(). Provide EITHER ref OR (x + y), not both.",
"parameters": {
"type": "object",
"properties": {
"ref": {
"type": "string",
"description": "The element reference from the snapshot (e.g., '@e5', '@e12')"
},
"x": {
"type": "number",
"description": "Viewport X coordinate for compositor-level click. Use with y instead of ref to click through iframes, shadow DOM, or canvas elements."
},
"y": {
"type": "number",
"description": "Viewport Y coordinate for compositor-level click. Use with x instead of ref."
}
},
"required": ["ref"]
}
}
},
{
@@ -1922,17 +1929,197 @@ def browser_snapshot(
}, ensure_ascii=False)
def browser_click(ref: str, task_id: Optional[str] = None) -> str:
def _cdp_coordinate_click(
x: float,
y: float,
task_id: str,
button: str = "left",
) -> str:
"""Compositor-level click at viewport coordinates via CDP Input.dispatchMouseEvent.
This dispatches mouse events at the browser compositor level — Chrome does
its own hit-testing to route the event to the correct renderer process.
Works through iframes (same-origin and cross-origin OOPIFs), shadow DOM
(open and closed), canvas/WebGL elements, and overlays — anything visible
at those coordinates gets clicked.
Inspired by browser-harness's ``click_at_xy()`` strategy.
"""
Click on an element.
try:
from tools.browser_cdp_tool import _cdp_call, _run_async, _resolve_cdp_endpoint, _WS_AVAILABLE
except ImportError:
return json.dumps({
"success": False,
"error": "browser_cdp_tool not available — coordinate clicks require the CDP tool module.",
}, ensure_ascii=False)
if not _WS_AVAILABLE:
return json.dumps({
"success": False,
"error": "The 'websockets' package is required for coordinate clicks. Install with: pip install websockets",
}, ensure_ascii=False)
endpoint = _resolve_cdp_endpoint()
if not endpoint:
# Fall back to agent-browser mouse commands (3 subprocess calls)
return _coordinate_click_via_agent_browser(x, y, task_id, button)
if not endpoint.startswith(("ws://", "wss://")):
return _coordinate_click_via_agent_browser(x, y, task_id, button)
# Find the page target to scope the input events to.
# Input.dispatchMouseEvent is a page-level method — we need a session.
try:
targets_result = _run_async(
_cdp_call(endpoint, "Target.getTargets", {}, None, 10.0)
)
page_target = None
for t in targets_result.get("targetInfos", []):
if t.get("type") == "page" and t.get("attached", True):
page_target = t["targetId"]
break
if not page_target:
# No attached page target — try without target_id
# (some CDP endpoints scope Input to the default page)
page_target = None
except Exception:
page_target = None
ix, iy = int(round(x)), int(round(y))
try:
# mousePressed
_run_async(_cdp_call(endpoint, "Input.dispatchMouseEvent", {
"type": "mousePressed",
"x": ix,
"y": iy,
"button": button,
"clickCount": 1,
}, page_target, 10.0))
# mouseReleased
_run_async(_cdp_call(endpoint, "Input.dispatchMouseEvent", {
"type": "mouseReleased",
"x": ix,
"y": iy,
"button": button,
"clickCount": 1,
}, page_target, 10.0))
except Exception as exc:
return json.dumps({
"success": False,
"error": f"CDP coordinate click failed: {type(exc).__name__}: {exc}",
}, ensure_ascii=False)
return json.dumps({
"success": True,
"clicked_at": {"x": ix, "y": iy},
"method": "cdp_compositor",
}, ensure_ascii=False)
def _coordinate_click_via_agent_browser(
x: float,
y: float,
task_id: str,
button: str = "left",
) -> str:
"""Fallback: coordinate click via agent-browser mouse subcommands."""
effective_task_id = _last_session_key(task_id)
ix, iy = int(round(x)), int(round(y))
# agent-browser mouse move <x> <y> + mouse down + mouse up
move_result = _run_browser_command(effective_task_id, "mouse", ["move", str(ix), str(iy)])
if not move_result.get("success"):
return json.dumps({
"success": False,
"error": f"mouse move failed: {move_result.get('error', 'unknown')}",
}, ensure_ascii=False)
btn_arg = [] if button == "left" else [button]
down_result = _run_browser_command(effective_task_id, "mouse", ["down"] + btn_arg)
if not down_result.get("success"):
return json.dumps({
"success": False,
"error": f"mouse down failed: {down_result.get('error', 'unknown')}",
}, ensure_ascii=False)
up_result = _run_browser_command(effective_task_id, "mouse", ["up"] + btn_arg)
if not up_result.get("success"):
return json.dumps({
"success": False,
"error": f"mouse up failed: {up_result.get('error', 'unknown')}",
}, ensure_ascii=False)
return json.dumps({
"success": True,
"clicked_at": {"x": ix, "y": iy},
"method": "agent_browser_mouse",
}, ensure_ascii=False)
def browser_click(
ref: Optional[str] = None,
x: Optional[float] = None,
y: Optional[float] = None,
task_id: Optional[str] = None,
) -> str:
"""
Click on an element by ref ID, or at exact viewport coordinates.
Provide EITHER ``ref`` (selector-based, via agent-browser) OR ``x`` + ``y``
(compositor-level, via CDP Input.dispatchMouseEvent). Coordinate clicks
bypass DOM selectors entirely — they pass through iframes, shadow DOM,
cross-origin boundaries, and canvas elements.
Args:
ref: Element reference (e.g., "@e5")
x: Viewport X coordinate for compositor-level click
y: Viewport Y coordinate for compositor-level click
task_id: Task identifier for session isolation
Returns:
JSON string with click result
"""
# --- Input validation ---------------------------------------------------
has_ref = ref is not None and str(ref).strip() != ""
has_coords = x is not None and y is not None
if has_ref and has_coords:
return json.dumps({
"success": False,
"error": "Provide either 'ref' or 'x'+'y', not both.",
}, ensure_ascii=False)
if (x is not None) != (y is not None):
return json.dumps({
"success": False,
"error": "Both 'x' and 'y' are required for coordinate clicks.",
}, ensure_ascii=False)
if not has_ref and not has_coords:
return json.dumps({
"success": False,
"error": "Provide either 'ref' (element reference) or 'x'+'y' (viewport coordinates).",
}, ensure_ascii=False)
# --- Coordinate-based click (compositor-level) --------------------------
if has_coords:
try:
fx, fy = float(x), float(y) # type: ignore[arg-type]
except (TypeError, ValueError):
return json.dumps({
"success": False,
"error": f"x and y must be numbers, got x={x!r} y={y!r}",
}, ensure_ascii=False)
return _cdp_coordinate_click(fx, fy, task_id or "default")
# --- Ref-based click (existing path) ------------------------------------
if not has_ref or ref is None:
# Defensive guard — validation above should ensure we never reach here
return json.dumps({
"success": False,
"error": "Internal error: expected ref parameter.",
}, ensure_ascii=False)
if _is_camofox_mode():
from tools.browser_camofox import camofox_click
return camofox_click(ref, task_id)
@@ -2928,7 +3115,7 @@ registry.register(
name="browser_click",
toolset="browser",
schema=_BROWSER_SCHEMA_MAP["browser_click"],
handler=lambda args, **kw: browser_click(ref=args.get("ref", ""), task_id=kw.get("task_id")),
handler=lambda args, **kw: browser_click(ref=args.get("ref"), x=args.get("x"), y=args.get("y"), task_id=kw.get("task_id")),
check_fn=check_browser_requirements,
emoji="👆",
)