feat(delegate): diagnostic dump when a subagent times out with 0 API calls (#15105)

When a subagent in delegate_task times out before making its first LLM request, write a structured diagnostic file under ~/.hermes/logs/subagent-timeout-<sid>-<ts>.log capturing enough state for the user (and us) to debug the hang. The old error message — 'Subagent timed out after Ns with no response. The child may be stuck on a slow API call or unresponsive network request.' — gave no observability for the 0-API-call case, which is the hardest to reason about remotely. The diagnostic captures: - timeout config vs actual duration - goal (truncated to 1000 chars) - child config: model, provider, api_mode, base_url, max_iterations, quiet_mode, platform, _delegate_role, _delegate_depth - enabled_toolsets + loaded tool names - system prompt byte/char count (catches oversized prompts that providers silently choke on) - tool schema count + byte size - child's get_activity_summary() snapshot - Python stack of the worker thread at the moment of timeout (reveals whether the hang is in credential resolution, transport, prompt construction, etc.) Wiring: - _run_single_child captures the worker thread via a small wrapper around child.run_conversation so we can look up its stack at timeout. - After a FuturesTimeoutError, we pull child.get_activity_summary() to read api_call_count. If 0 AND it was a timeout (not a raise), _dump_subagent_timeout_diagnostic() is invoked. - The returned path is surfaced in the error string so the parent agent (and therefore the user / gateway) sees exactly where to look. - api_calls > 0 timeouts keep the old 'stuck on slow API call' phrasing since that's the correct diagnosis for those. This does NOT change any behavior for successful subagent runs, non-timeout errors, or subagents that made at least one API call before hanging. Tests: 7 cases (tests/tools/test_delegate_subagent_timeout_diagnostic.py) - output format + required sections + field values - long-goal truncation with [truncated] marker - missing / already-exited worker thread branches - unwritable HERMES_HOME/logs/ returns None without raising - _run_single_child wiring: 0 API calls → dump + diagnostic_path in error - _run_single_child wiring: N>0 API calls → no dump, old message Refs: #14726
2026-04-28 06:51:16 +08:00 · 2026-04-24 04:58:32 -07:00
parent 3cb43df2cd
commit 7634c1386f
2 changed files with 490 additions and 14 deletions
--- a/tests/tools/test_delegate_subagent_timeout_diagnostic.py
+++ b/tests/tools/test_delegate_subagent_timeout_diagnostic.py
@@ -0,0 +1,286 @@
+"""Regression tests for subagent timeout diagnostic dump (issue #14726).
+
+When delegate_task's child subagent times out without having made any API
+call, a structured diagnostic file is written under
+``~/.hermes/logs/subagent-timeout-<sid>-<ts>.log``. This gives users a
+concrete artifact to inspect (worker thread stack, system prompt size,
+tool schema bytes, credential pool state, etc.) instead of the previous
+opaque "subagent timed out" error.
+
+These tests pin:
+- the diagnostic writer's output format and content
+- the timeout branch in _run_single_child only dumps when api_calls == 0
+- the error message surfaces the diagnostic path
+- api_calls > 0 timeouts do NOT write a dump (the old "stuck on slow API
+  call" explanation still applies)
+"""
+from __future__ import annotations
+
+import os
+import threading
+import time
+from pathlib import Path
+from typing import Optional
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+@pytest.fixture
+def hermes_home(tmp_path, monkeypatch):
+    home = tmp_path / ".hermes"
+    home.mkdir()
+    monkeypatch.setenv("HERMES_HOME", str(home))
+    return home
+
+
+class _StubChild:
+    """Minimal stand-in for an AIAgent subagent."""
+    def __init__(
+        self,
+        *,
+        api_call_count: int = 0,
+        hang_seconds: float = 5.0,
+        subagent_id: str = "sa-0-stubabc",
+        tool_schema=None,
+    ):
+        self._subagent_id = subagent_id
+        self._delegate_depth = 1
+        self._delegate_role = "leaf"
+        self.model = "test/model"
+        self.provider = "testprov"
+        self.api_mode = "chat_completions"
+        self.base_url = "https://example.test/v1"
+        self.max_iterations = 30
+        self.quiet_mode = True
+        self.skip_memory = True
+        self.skip_context_files = True
+        self.platform = "cli"
+        self.ephemeral_system_prompt = "sys prompt"
+        self.enabled_toolsets = ["web", "terminal"]
+        self.valid_tool_names = {"web_search", "terminal"}
+        self.tools = tool_schema if tool_schema is not None else [
+            {"name": "web_search", "description": "search"},
+            {"name": "terminal", "description": "shell"},
+        ]
+        self._api_call_count = api_call_count
+        self._hang = threading.Event()
+        self._hang_seconds = hang_seconds
+
+    def get_activity_summary(self):
+        return {
+            "api_call_count": self._api_call_count,
+            "max_iterations": self.max_iterations,
+            "current_tool": None,
+            "seconds_since_activity": 60,
+        }
+
+    def run_conversation(self, user_message, task_id=None):
+        self._hang.wait(self._hang_seconds)
+        return {"final_response": "", "completed": False, "api_calls": self._api_call_count}
+
+    def interrupt(self):
+        self._hang.set()
+
+
+# ── _dump_subagent_timeout_diagnostic ──────────────────────────────────
+
+class TestDumpSubagentTimeoutDiagnostic:
+
+    def test_writes_log_with_expected_sections(self, hermes_home):
+        from tools.delegate_tool import _dump_subagent_timeout_diagnostic
+        child = _StubChild(subagent_id="sa-7-abc123")
+
+        worker = threading.Thread(
+            target=lambda: child.run_conversation("test"),
+            daemon=True,
+        )
+        worker.start()
+        time.sleep(0.1)
+        try:
+            path = _dump_subagent_timeout_diagnostic(
+                child=child,
+                task_index=7,
+                timeout_seconds=300.0,
+                duration_seconds=300.01,
+                worker_thread=worker,
+                goal="Research something long",
+            )
+        finally:
+            child.interrupt()
+            worker.join(timeout=2.0)
+
+        assert path is not None
+        p = Path(path)
+        assert p.is_file()
+        # File lives under HERMES_HOME/logs/
+        assert p.parent == hermes_home / "logs"
+        assert p.name.startswith("subagent-timeout-sa-7-abc123-")
+        assert p.suffix == ".log"
+
+        content = p.read_text()
+        # Header references the issue for future grep-ability
+        assert "issue #14726" in content
+        # Timeout facts
+        assert "task_index:        7" in content
+        assert "subagent_id:       sa-7-abc123" in content
+        assert "configured_timeout: 300.0s" in content
+        assert "actual_duration:   300.01s" in content
+        # Goal
+        assert "Research something long" in content
+        # Child config
+        assert "model: 'test/model'" in content
+        assert "provider: 'testprov'" in content
+        assert "base_url: 'https://example.test/v1'" in content
+        assert "max_iterations: 30" in content
+        # Toolsets
+        assert "enabled_toolsets:  ['web', 'terminal']" in content
+        assert "loaded tool count: 2" in content
+        # Prompt / schema sizes
+        assert "system_prompt_bytes:" in content
+        assert "tool_schema_count: 2" in content
+        assert "tool_schema_bytes:" in content
+        # Activity summary
+        assert "api_call_count: 0" in content
+        # Worker stack
+        assert "Worker thread stack at timeout" in content
+        # The thread is parked inside _hang.wait → cond.wait → waiter.acquire
+        assert "acquire" in content or "wait" in content
+
+    def test_truncates_very_long_goal(self, hermes_home):
+        from tools.delegate_tool import _dump_subagent_timeout_diagnostic
+        child = _StubChild()
+        huge_goal = "x" * 5000
+
+        path = _dump_subagent_timeout_diagnostic(
+            child=child,
+            task_index=0,
+            timeout_seconds=300.0,
+            duration_seconds=300.0,
+            worker_thread=None,
+            goal=huge_goal,
+        )
+        child.interrupt()
+
+        content = Path(path).read_text()
+        assert "[truncated]" in content
+        # Goal section trimmed to 1000 chars + suffix
+        goal_block = content.split("## Goal", 1)[1].split("## Child config", 1)[0]
+        assert len(goal_block) < 1200
+
+    def test_missing_worker_thread_is_handled(self, hermes_home):
+        from tools.delegate_tool import _dump_subagent_timeout_diagnostic
+        child = _StubChild()
+        path = _dump_subagent_timeout_diagnostic(
+            child=child,
+            task_index=0,
+            timeout_seconds=300.0,
+            duration_seconds=300.0,
+            worker_thread=None,
+            goal="x",
+        )
+        child.interrupt()
+        content = Path(path).read_text()
+        assert "<no worker thread handle>" in content
+
+    def test_exited_worker_thread_is_handled(self, hermes_home):
+        from tools.delegate_tool import _dump_subagent_timeout_diagnostic
+        child = _StubChild()
+        # A thread that has already finished
+        t = threading.Thread(target=lambda: None)
+        t.start()
+        t.join()
+        assert not t.is_alive()
+        path = _dump_subagent_timeout_diagnostic(
+            child=child,
+            task_index=0,
+            timeout_seconds=300.0,
+            duration_seconds=300.0,
+            worker_thread=t,
+            goal="x",
+        )
+        child.interrupt()
+        content = Path(path).read_text()
+        assert "<worker thread already exited>" in content
+
+    def test_returns_none_on_unwritable_logs_dir(self, tmp_path, monkeypatch):
+        # Point HERMES_HOME at an unwritable path so logs/ can't be created
+        # (simulates permission-denied). Helper must not raise.
+        from tools.delegate_tool import _dump_subagent_timeout_diagnostic
+        bogus = tmp_path / "does-not-exist" / ".hermes"
+        monkeypatch.setenv("HERMES_HOME", str(bogus))
+        child = _StubChild()
+
+        # Make the logs dir itself unwritable by creating it as a FILE
+        # so mkdir(exist_ok=True) → NotADirectoryError and we fall through.
+        bogus.parent.mkdir(parents=True, exist_ok=True)
+        bogus.mkdir()
+        (bogus / "logs").write_text("not a dir")
+        result = _dump_subagent_timeout_diagnostic(
+            child=child,
+            task_index=0,
+            timeout_seconds=300.0,
+            duration_seconds=300.0,
+            worker_thread=None,
+            goal="x",
+        )
+        child.interrupt()
+        # Either None (mkdir failed) or a real path; must never raise.
+        # We assert no exception propagates — the return value is advisory.
+        assert result is None or Path(result).exists()
+
+
+# ── _run_single_child timeout branch wiring ───────────────────────────
+
+class TestRunSingleChildTimeoutDump:
+    """The timeout branch in _run_single_child must emit the diagnostic
+    dump when api_calls == 0, and must NOT emit it when api_calls > 0."""
+
+    def _invoke_with_short_timeout(self, child, monkeypatch):
+        """Run _run_single_child with a tiny timeout to force the timeout branch."""
+        from tools import delegate_tool
+        # Force a 0.3s timeout so the test is fast
+        monkeypatch.setattr(delegate_tool, "_get_child_timeout", lambda: 0.3)
+
+        parent = MagicMock()
+        parent._touch_activity = MagicMock()
+        parent._current_task_id = None
+        return delegate_tool._run_single_child(
+            task_index=0,
+            goal="test goal",
+            child=child,
+            parent_agent=parent,
+        )
+
+    def test_zero_api_calls_writes_dump_and_surfaces_path(self, hermes_home, monkeypatch):
+        child = _StubChild(api_call_count=0, hang_seconds=10.0)
+        result = self._invoke_with_short_timeout(child, monkeypatch)
+
+        assert result["status"] == "timeout"
+        assert result["api_calls"] == 0
+        assert result["diagnostic_path"] is not None
+        dump_path = Path(result["diagnostic_path"])
+        assert dump_path.is_file()
+        assert dump_path.parent == hermes_home / "logs"
+
+        # Error message surfaces the path and the "no API call" phrasing
+        assert "without making any API call" in result["error"]
+        assert "Diagnostic:" in result["error"]
+        assert str(dump_path) in result["error"]
+
+    def test_nonzero_api_calls_skips_dump_and_uses_old_message(self, hermes_home, monkeypatch):
+        child = _StubChild(api_call_count=5, hang_seconds=10.0)
+        result = self._invoke_with_short_timeout(child, monkeypatch)
+
+        assert result["status"] == "timeout"
+        assert result["api_calls"] == 5
+        # No diagnostic file should be written for timeouts that made
+        # actual API calls — the old generic "stuck on slow call" message
+        # still applies.
+        assert result.get("diagnostic_path") is None
+        assert "stuck on a slow API call" in result["error"]
+        # And no subagent-timeout-* file should exist under logs/
+        logs_dir = hermes_home / "logs"
+        if logs_dir.is_dir():
+            dumps = list(logs_dir.glob("subagent-timeout-*.log"))
+            assert dumps == []
--- a/tools/delegate_tool.py
+++ b/tools/delegate_tool.py
@@ -1016,6 +1016,150 @@ def _build_child_agent(
    return child


+def _dump_subagent_timeout_diagnostic(
+    *,
+    child: Any,
+    task_index: int,
+    timeout_seconds: float,
+    duration_seconds: float,
+    worker_thread: Optional[threading.Thread],
+    goal: str,
+) -> Optional[str]:
+    """Write a structured diagnostic dump for a subagent that timed out
+    before making any API call.
+
+    See issue #14726: users hit "subagent timed out after 300s with no response"
+    with zero API calls and no way to inspect what happened. This helper
+    writes a dedicated log under ``~/.hermes/logs/subagent-<sid>-<ts>.log``
+    capturing the child's config, system-prompt / tool-schema sizes, activity
+    tracker snapshot, and the worker thread's Python stack at timeout.
+
+    Returns the absolute path to the diagnostic file, or None on failure.
+    """
+    try:
+        from hermes_constants import get_hermes_home
+        import datetime as _dt
+        import sys as _sys
+        import traceback as _traceback
+
+        hermes_home = get_hermes_home()
+        logs_dir = hermes_home / "logs"
+        try:
+            logs_dir.mkdir(parents=True, exist_ok=True)
+        except Exception:
+            return None
+
+        subagent_id = getattr(child, "_subagent_id", None) or f"idx{task_index}"
+        ts = _dt.datetime.now().strftime("%Y%m%d_%H%M%S")
+        dump_path = logs_dir / f"subagent-timeout-{subagent_id}-{ts}.log"
+
+        lines: List[str] = []
+        def _w(line: str = "") -> None:
+            lines.append(line)
+
+        _w(f"# Subagent timeout diagnostic — issue #14726")
+        _w(f"# Generated: {_dt.datetime.now().isoformat()}")
+        _w("")
+        _w("## Timeout")
+        _w(f"  task_index:        {task_index}")
+        _w(f"  subagent_id:       {subagent_id}")
+        _w(f"  configured_timeout: {timeout_seconds}s")
+        _w(f"  actual_duration:   {duration_seconds:.2f}s")
+        _w("")
+
+        _w("## Goal")
+        _goal_preview = (goal or "").strip()
+        if len(_goal_preview) > 1000:
+            _goal_preview = _goal_preview[:1000] + " ...[truncated]"
+        _w(_goal_preview or "(empty)")
+        _w("")
+
+        _w("## Child config")
+        for attr in (
+            "model", "provider", "api_mode", "base_url", "max_iterations",
+            "quiet_mode", "skip_memory", "skip_context_files", "platform",
+            "_delegate_role", "_delegate_depth",
+        ):
+            try:
+                val = getattr(child, attr, None)
+                # Redact api_key-shaped values defensively
+                if isinstance(val, str) and attr == "base_url":
+                    pass
+                _w(f"  {attr}: {val!r}")
+            except Exception:
+                _w(f"  {attr}: <unreadable>")
+        _w("")
+
+        _w("## Toolsets")
+        enabled = getattr(child, "enabled_toolsets", None)
+        _w(f"  enabled_toolsets:  {enabled!r}")
+        tool_names = getattr(child, "valid_tool_names", None)
+        if tool_names:
+            _w(f"  loaded tool count: {len(tool_names)}")
+            try:
+                _w(f"  loaded tools:      {sorted(list(tool_names))}")
+            except Exception:
+                pass
+        _w("")
+
+        _w("## Prompt / schema sizes")
+        try:
+            sys_prompt = getattr(child, "ephemeral_system_prompt", None) \
+                or getattr(child, "system_prompt", None) \
+                or ""
+            _w(f"  system_prompt_bytes: {len(sys_prompt.encode('utf-8')) if isinstance(sys_prompt, str) else 'n/a'}")
+            _w(f"  system_prompt_chars: {len(sys_prompt) if isinstance(sys_prompt, str) else 'n/a'}")
+        except Exception as exc:
+            _w(f"  system_prompt: <error: {exc}>")
+        try:
+            tools_schema = getattr(child, "tools", None)
+            if tools_schema is not None:
+                _schema_json = json.dumps(tools_schema, default=str)
+                _w(f"  tool_schema_count: {len(tools_schema)}")
+                _w(f"  tool_schema_bytes: {len(_schema_json.encode('utf-8'))}")
+        except Exception as exc:
+            _w(f"  tool_schema: <error: {exc}>")
+        _w("")
+
+        _w("## Activity summary")
+        try:
+            summary = child.get_activity_summary()
+            for k, v in summary.items():
+                _w(f"  {k}: {v!r}")
+        except Exception as exc:
+            _w(f"  <get_activity_summary failed: {exc}>")
+        _w("")
+
+        _w("## Worker thread stack at timeout")
+        if worker_thread is not None and worker_thread.is_alive():
+            frames = _sys._current_frames()
+            worker_frame = frames.get(worker_thread.ident)
+            if worker_frame is not None:
+                stack = _traceback.format_stack(worker_frame)
+                for frame_line in stack:
+                    for sub in frame_line.rstrip().split("\n"):
+                        _w(f"  {sub}")
+            else:
+                _w("  <worker frame not available>")
+        elif worker_thread is None:
+            _w("  <no worker thread handle>")
+        else:
+            _w("  <worker thread already exited>")
+        _w("")
+
+        _w("## Notes")
+        _w("  This file is written ONLY when a subagent times out with 0 API calls.")
+        _w("  0-API-call timeouts mean the child never reached its first LLM request.")
+        _w("  Common causes: oversized prompt rejected by provider, transport hang,")
+        _w("  credential resolution stuck. See issue #14726 for context.")
+
+        dump_path.write_text("\n".join(lines), encoding="utf-8")
+        return str(dump_path)
+    except Exception as exc:
+        logger.warning("Subagent timeout diagnostic dump failed: %s", exc)
+        return None
+
+
 def _run_single_child(
    task_index: int,
    goal: str,
@@ -1168,11 +1312,18 @@ def _run_single_child(
        # when the child's API call or tool-level HTTP request hangs.
        child_timeout = _get_child_timeout()
        _timeout_executor = ThreadPoolExecutor(max_workers=1)
-        _child_future = _timeout_executor.submit(
-            child.run_conversation,
-            user_message=goal,
-            task_id=child_task_id,
-        )
+        # Capture the worker thread so the timeout diagnostic can dump its
+        # Python stack (see #14726 — 0-API-call hangs are opaque without it).
+        _worker_thread_holder: Dict[str, Optional[threading.Thread]] = {"t": None}
+
+        def _run_with_thread_capture():
+            _worker_thread_holder["t"] = threading.current_thread()
+            return child.run_conversation(
+                user_message=goal,
+                task_id=child_task_id,
+            )
+
+        _child_future = _timeout_executor.submit(_run_with_thread_capture)
        try:
            result = _child_future.result(timeout=child_timeout)
        except Exception as _timeout_exc:
@@ -1194,6 +1345,32 @@ def _run_single_child(
                duration,
            )

+            # When a subagent times out BEFORE making any API call, dump a
+            # diagnostic to help users (and us) see what the child was doing.
+            # See #14726 — without this, 0-API-call hangs are black boxes.
+            diagnostic_path: Optional[str] = None
+            child_api_calls = 0
+            try:
+                _summary = child.get_activity_summary()
+                child_api_calls = int(_summary.get("api_call_count", 0) or 0)
+            except Exception:
+                pass
+            if is_timeout and child_api_calls == 0:
+                diagnostic_path = _dump_subagent_timeout_diagnostic(
+                    child=child,
+                    task_index=task_index,
+                    timeout_seconds=float(child_timeout),
+                    duration_seconds=float(duration),
+                    worker_thread=_worker_thread_holder.get("t"),
+                    goal=goal,
+                )
+                if diagnostic_path:
+                    logger.warning(
+                        "Subagent %d 0-API-call timeout — diagnostic written to %s",
+                        task_index,
+                        diagnostic_path,
+                    )
+
            if child_progress_cb:
                try:
                    child_progress_cb(
@@ -1210,22 +1387,35 @@ def _run_single_child(
                except Exception:
                    pass

+            if is_timeout:
+                if child_api_calls == 0:
+                    _err = (
+                        f"Subagent timed out after {child_timeout}s without "
+                        f"making any API call — the child never reached its "
+                        f"first LLM request (prompt construction, credential "
+                        f"resolution, or transport may be stuck)."
+                    )
+                    if diagnostic_path:
+                        _err += f" Diagnostic: {diagnostic_path}"
+                else:
+                    _err = (
+                        f"Subagent timed out after {child_timeout}s with "
+                        f"{child_api_calls} API call(s) completed — likely "
+                        f"stuck on a slow API call or unresponsive network request."
+                    )
+            else:
+                _err = str(_timeout_exc)
+
            return {
                "task_index": task_index,
                "status": "timeout" if is_timeout else "error",
                "summary": None,
-                "error": (
-                    (
-                        f"Subagent timed out after {child_timeout}s with no response. "
-                        "The child may be stuck on a slow API call or unresponsive network request."
-                    )
-                    if is_timeout
-                    else str(_timeout_exc)
-                ),
+                "error": _err,
                "exit_reason": "timeout" if is_timeout else "error",
-                "api_calls": 0,
+                "api_calls": child_api_calls,
                "duration_seconds": duration,
                "_child_role": getattr(child, "_delegate_role", None),
+                "diagnostic_path": diagnostic_path,
            }
        finally:
            # Shut down executor without waiting — if the child thread