fix(computer-use): vision capture returns an image on cua-driver >=0.5.x

Vision mode called a `screenshot` MCP tool that cua-driver dropped in 0.5.x (full-window PNG capture was folded into `get_window_state`). The driver replied "Unknown tool: screenshot", so `images` came back empty, `png_b64` stayed None, and capture returned a 0x0 result with no image on every call. `som`/`ax` were unaffected because they already use `get_window_state`, which masked the regression. Route vision by capability: - driver advertises `screenshot` (older builds) -> use it (no AX walk) - otherwise -> call `get_window_state` but discard the AX tree/elements, returning only the PNG so vision stays free of element noise - capabilities not yet discovered -> try `screenshot`, fall back to `get_window_state` on an empty image, so the path self-heals Add `_image_from_tool_result` to pull the PNG from either an MCP image content-part or `structuredContent.screenshot_png_b64`, and use it on the som path too so the image won't silently drop on driver builds that deliver it via structuredContent instead of a content part. Verified live (vision: 1568x954, 0 elements; som: image + 527 elements) and with unit coverage of all four routing cases.
2026-06-24 02:43:18 +08:00 · 2026-06-22 17:40:18 -05:00
1 changed files with 118 additions and 21 deletions
--- a/tools/computer_use/cua_backend.py
+++ b/tools/computer_use/cua_backend.py
@@ -723,6 +723,28 @@ class _CuaDriverSession:
            return capability in self._capabilities.get(tool, set())
        return any(capability in caps for caps in self._capabilities.values())

+    def _has_tool(self, name: str) -> bool:
+        """Return True when ``tools/list`` advertised a tool by this name.
+
+        Used to route capture(): cua-driver dropped the standalone
+        ``screenshot`` tool and folded full-window PNG capture into
+        ``get_window_state`` (whose own description notes it "Also captures
+        a PNG screenshot of the specified window"). Older drivers that still
+        expose ``screenshot`` keep using it; newer ones fall through to
+        ``get_window_state``.
+
+        Returns False when discovery hasn't populated the map yet — callers
+        treat that as "unknown" and probe defensively rather than trusting it.
+        """
+        return name in self._capabilities
+
+    @property
+    def capabilities_discovered(self) -> bool:
+        """True once ``tools/list`` populated the per-tool map. When False,
+        ``_has_tool`` answers are not trustworthy (discovery failed or the
+        session hasn't started) and capture() should probe defensively."""
+        return bool(self._capabilities)
+
    @property
    def capability_version(self) -> str:
        """Driver-advertised capability vocabulary version (empty string
@@ -825,6 +847,45 @@ def _extract_tool_result(mcp_result: Any) -> Dict[str, Any]:
    }


+def _image_from_tool_result(out: Dict[str, Any]) -> tuple[Optional[str], Optional[str]]:
+    """Pull a (png_b64, mime_type) pair out of a flattened tool result.
+
+    cua-driver delivers window screenshots in two shapes depending on tool +
+    transport:
+
+      * As an MCP ``image`` content part — surfaced by ``_extract_tool_result``
+        in ``out["images"]`` with a parallel ``image_mime_types`` entry. This
+        is what ``get_window_state`` emits over the stdio MCP transport.
+      * As a base64 field inside ``structuredContent`` —
+        ``screenshot_png_b64`` (+ ``screenshot_mime_type``). This is what
+        ``get_window_state`` returns when its structured payload carries the
+        image instead of a content part (newer driver builds; also the shape
+        seen via the ``cua-driver call`` CLI surface).
+
+    Checking both makes capture() robust to either delivery shape, so the
+    image never silently drops just because the driver moved it between the
+    content list and structuredContent. Returns ``(None, None)`` when neither
+    location carries an image.
+    """
+    images = out.get("images") or []
+    if images and images[0]:
+        mimes = out.get("image_mime_types") or []
+        mime = mimes[0] if mimes and mimes[0] else None
+        return images[0], mime
+
+    structured = out.get("structuredContent") or {}
+    b64 = structured.get("screenshot_png_b64") or structured.get("png_b64")
+    if b64:
+        mime = (
+            structured.get("screenshot_mime_type")
+            or structured.get("mime_type")
+            or None
+        )
+        return b64, mime
+
+    return None, None
+
+
 # ---------------------------------------------------------------------------
 # The backend itself
 # ---------------------------------------------------------------------------
@@ -1003,25 +1064,61 @@ class CuaDriverBackend(ComputerUseBackend):
        window_title = ""

        if mode == "vision":
-            # screenshot tool: just the PNG, no AX walk.
-            sc_out = self._session.call_tool(
-                "screenshot",
-                {
-                    "window_id": self._active_window_id,
-                    "format": "jpeg",
-                    "quality": 85,
-                    "session": self._session_id,
-                },
+            # Plain screenshot, no AX walk. cua-driver dropped the standalone
+            # `screenshot` tool (≥0.5.x) and folded full-window PNG capture
+            # into `get_window_state`. Route accordingly:
+            #   * Driver advertises `screenshot` (older builds) → use it; it's
+            #     the cheapest path (no AX tree walked server-side).
+            #   * Otherwise (current drivers) → call `get_window_state` but
+            #     DISCARD the AX tree/elements, returning only the PNG. Vision
+            #     mode's whole contract is "just the pixels, no element noise",
+            #     so we drop everything but the image.
+            # When capability discovery hasn't run (empty map), we don't trust
+            # a negative `_has_tool` answer — we still try `screenshot` first
+            # and fall back if the driver rejects it, so the path self-heals on
+            # any driver version.
+            use_screenshot = (
+                self._session._has_tool("screenshot")
+                or not self._session.capabilities_discovered
            )
-            if sc_out["images"]:
-                png_b64 = sc_out["images"][0]
-                # Pick up the explicit mimeType cua-driver attaches to image
-                # parts (Surface 7). Empty string means the driver didn't
-                # carry one — callers will fall back to magic-byte sniffing.
-                mimes = sc_out.get("image_mime_types") or []
-                image_mime_type = mimes[0] if mimes and mimes[0] else None
+            sc_out: Optional[Dict[str, Any]] = None
+            if use_screenshot:
+                sc_out = self._session.call_tool(
+                    "screenshot",
+                    {
+                        "window_id": self._active_window_id,
+                        "format": "jpeg",
+                        "quality": 85,
+                        "session": self._session_id,
+                    },
+                )
+                png_b64, image_mime_type = _image_from_tool_result(sc_out)
+                if not png_b64:
+                    # Driver had no usable `screenshot` (e.g. "Unknown tool:
+                    # screenshot" on ≥0.5.x, or an empty image part). Fall
+                    # through to the get_window_state path below.
+                    sc_out = None
+
+            if sc_out is None:
+                gws_out = self._session.call_tool(
+                    "get_window_state",
+                    {
+                        "pid": self._active_pid,
+                        "window_id": self._active_window_id,
+                        "session": self._session_id,
+                    },
+                )
+                png_b64, image_mime_type = _image_from_tool_result(gws_out)
+                # Still grab the window title — it's cheap and useful in the
+                # vision response — but deliberately leave `elements` empty so
+                # vision stays free of AX-tree noise.
+                text = gws_out["data"] if isinstance(gws_out["data"], str) else ""
+                _, tree = _split_tree_text(text)
+                wt = re.search(r'AXWindow\s+"([^"]+)"', tree)
+                if wt:
+                    window_title = wt.group(1)
        else:
-            # get_window_state: AX tree + optional screenshot.
+            # get_window_state: AX tree + screenshot.
            gws_out = self._session.call_tool(
                "get_window_state",
                {
@@ -1058,10 +1155,10 @@ class CuaDriverBackend(ComputerUseBackend):
                if e.element_token
            }

-            if gws_out["images"]:
-                png_b64 = gws_out["images"][0]
-                mimes = gws_out.get("image_mime_types") or []
-                image_mime_type = mimes[0] if mimes and mimes[0] else None
+            # Image may arrive as an MCP image part or inside
+            # structuredContent (screenshot_png_b64) depending on the driver
+            # build — _image_from_tool_result handles both.
+            png_b64, image_mime_type = _image_from_tool_result(gws_out)

            # Extract window title from the AX tree first AXWindow line.
            wt = re.search(r'AXWindow\s+"([^"]+)"', tree)