mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 15:01:34 +08:00
149 lines
5.4 KiB
Python
149 lines
5.4 KiB
Python
|
|
"""Agent-facing tool: respond to a native JS dialog captured by the CDP supervisor.
|
||
|
|
|
||
|
|
This tool is response-only — the agent first reads ``pending_dialogs`` from
|
||
|
|
``browser_snapshot`` output, then calls ``browser_dialog(action=...)`` to
|
||
|
|
accept or dismiss.
|
||
|
|
|
||
|
|
Gated on the same ``_browser_cdp_check`` as ``browser_cdp`` so it only
|
||
|
|
appears when a CDP endpoint is reachable (Browserbase with a
|
||
|
|
``connectUrl``, local Chrome via ``/browser connect``, or
|
||
|
|
``browser.cdp_url`` set in config).
|
||
|
|
|
||
|
|
See ``website/docs/developer-guide/browser-supervisor.md`` for the full
|
||
|
|
design.
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import logging
|
||
|
|
from typing import Any, Dict, Optional
|
||
|
|
|
||
|
|
from tools.browser_supervisor import SUPERVISOR_REGISTRY
|
||
|
|
from tools.registry import registry
|
||
|
|
|
||
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
# Tool schema for ``browser_dialog``: the tool name, the description text,
# and a JSON-Schema style ``parameters`` object.
BROWSER_DIALOG_SCHEMA: Dict[str, Any] = {
    "name": "browser_dialog",
    # One tuple entry per paragraph; paragraphs are separated by a blank line.
    "description": "\n\n".join(
        (
            # What the tool does.
            "Respond to a native JavaScript dialog "
            "(alert / confirm / prompt / beforeunload) "
            "that is currently blocking the page.",
            # How to discover a dialog and answer it.
            "**Workflow:** call ``browser_snapshot`` first — "
            "if a dialog is open, it appears in the ``pending_dialogs`` "
            "field with ``id``, ``type``, and ``message``. "
            "Then call this tool with ``action='accept'`` or "
            "``action='dismiss'``.",
            # prompt() specifics.
            "**Prompt dialogs:** pass ``prompt_text`` to supply the "
            "response string. "
            "Ignored for alert/confirm/beforeunload.",
            # Disambiguation when several dialogs are queued.
            "**Multiple dialogs:** if more than one dialog is queued "
            "(rare — happens when a second dialog fires while the first "
            "is still open), pass ``dialog_id`` from the snapshot to "
            "disambiguate.",
            # Which backends expose the tool.
            "**Availability:** only present when a CDP-capable backend "
            "is attached — Browserbase sessions, local Chrome via "
            "``/browser connect``, or ``browser.cdp_url`` in "
            "config.yaml. Not available on Camofox (REST-only) or the "
            "default Playwright local browser (CDP port is hidden).",
        )
    ),
    "parameters": {
        "type": "object",
        "properties": {
            # The only mandatory argument (see "required" below).
            "action": {
                "type": "string",
                "enum": ["accept", "dismiss"],
                "description": (
                    "'accept' clicks OK / returns the prompt text. 'dismiss' "
                    "clicks Cancel / returns null from prompt(). For "
                    "``beforeunload`` dialogs: 'accept' allows the navigation, "
                    "'dismiss' keeps the page."
                ),
            },
            "prompt_text": {
                "type": "string",
                "description": (
                    "Response string for a ``prompt()`` dialog. "
                    "Ignored for other dialog types. "
                    "Defaults to empty string."
                ),
            },
            "dialog_id": {
                "type": "string",
                "description": (
                    "Specific dialog to respond to, "
                    "from ``browser_snapshot.pending_dialogs[].id``. "
                    "Required only when multiple dialogs are queued."
                ),
            },
        },
        "required": ["action"],
    },
}
|
||
|
|
|
||
|
|
|
||
|
|
def browser_dialog(
    action: str,
    prompt_text: Optional[str] = None,
    dialog_id: Optional[str] = None,
    task_id: Optional[str] = None,
) -> str:
    """Respond to a pending dialog on the active task's CDP supervisor.

    Args:
        action: Either ``"accept"`` or ``"dismiss"``. Any other value
            (including the empty string the registered handler substitutes
            for a missing argument) is rejected before the supervisor is
            looked up.
        prompt_text: Passed through unchanged to the supervisor's
            ``respond_to_dialog``.
        dialog_id: Passed through unchanged to the supervisor's
            ``respond_to_dialog``.
        task_id: Key used to look up the supervisor in
            ``SUPERVISOR_REGISTRY``; a falsy value means ``"default"``.

    Returns:
        A JSON-encoded object. On success it carries ``success: true``,
        the ``action`` and the supervisor's ``dialog`` entry (``{}`` when
        absent). On failure it carries ``success: false`` and an ``error``
        message.
    """
    # Validate up front so a missing or misspelled action produces a clear,
    # actionable error instead of being forwarded to the supervisor.
    if action not in ("accept", "dismiss"):
        return json.dumps(
            {
                "success": False,
                "error": (
                    f"Invalid action {action!r}: expected 'accept' or 'dismiss'."
                ),
            }
        )

    effective_task_id = task_id or "default"
    supervisor = SUPERVISOR_REGISTRY.get(effective_task_id)
    if supervisor is None:
        return json.dumps(
            {
                "success": False,
                "error": (
                    "No CDP supervisor is attached to this task. Either the "
                    "browser backend doesn't expose CDP (Camofox, default "
                    "Playwright) or no browser session has been started yet. "
                    "Call browser_navigate or /browser connect first."
                ),
            }
        )

    result = supervisor.respond_to_dialog(
        action=action,
        prompt_text=prompt_text,
        dialog_id=dialog_id,
    )
    if result.get("ok"):
        return json.dumps(
            {
                "success": True,
                "action": action,
                "dialog": result.get("dialog", {}),
            }
        )
    # The supervisor reported a failure; surface its message when it gave one.
    return json.dumps({"success": False, "error": result.get("error", "unknown error")})
|
||
|
|
|
||
|
|
|
||
|
|
def _browser_dialog_check() -> bool:
    """Availability gate for ``browser_dialog``; mirrors ``browser_cdp``.

    The answer is delegated to ``_browser_cdp_check`` so both tools appear
    and disappear together. The supervisor itself is started lazily by
    ``browser_navigate`` / ``/browser connect`` / Browserbase session
    creation, so a reachable CDP URL is enough to commit to showing the
    tool.

    Returns:
        Whatever ``_browser_cdp_check()`` returns, or ``False`` when
        ``tools.browser_cdp_tool`` cannot be imported.
    """
    try:
        # Imported at call time; a failed import just hides the tool.
        from tools.browser_cdp_tool import (  # type: ignore[import-not-found]
            _browser_cdp_check as cdp_gate,
        )
    except Exception as import_error:  # pragma: no cover — defensive
        logger.debug(
            "browser_dialog check: browser_cdp_tool import failed: %s",
            import_error,
        )
        available = False
    else:
        available = cdp_gate()
    return available
|
||
|
|
|
||
|
|
|
||
|
|
def _dispatch_browser_dialog(args, **kw):
    """Forward a tool call to ``browser_dialog``.

    ``action``, ``prompt_text`` and ``dialog_id`` are read from ``args``
    (a missing ``action`` becomes ``""``); ``task_id`` is read from the
    keyword extras.
    """
    return browser_dialog(
        action=args.get("action", ""),
        prompt_text=args.get("prompt_text"),
        dialog_id=args.get("dialog_id"),
        task_id=kw.get("task_id"),
    )


# Register the tool in the "browser-cdp" toolset, gated by _browser_dialog_check.
registry.register(
    name="browser_dialog",
    toolset="browser-cdp",
    schema=BROWSER_DIALOG_SCHEMA,
    handler=_dispatch_browser_dialog,
    check_fn=_browser_dialog_check,
    emoji="💬",
)
|