diff --git a/tests/tools/test_delegate_subagent_timeout_diagnostic.py b/tests/tools/test_delegate_subagent_timeout_diagnostic.py new file mode 100644 index 0000000000..9bb49125a1 --- /dev/null +++ b/tests/tools/test_delegate_subagent_timeout_diagnostic.py @@ -0,0 +1,286 @@ +"""Regression tests for subagent timeout diagnostic dump (issue #14726). + +When delegate_task's child subagent times out without having made any API +call, a structured diagnostic file is written under +``~/.hermes/logs/subagent-timeout--.log``. This gives users a +concrete artifact to inspect (worker thread stack, system prompt size, +tool schema bytes, credential pool state, etc.) instead of the previous +opaque "subagent timed out" error. + +These tests pin: +- the diagnostic writer's output format and content +- the timeout branch in _run_single_child only dumps when api_calls == 0 +- the error message surfaces the diagnostic path +- api_calls > 0 timeouts do NOT write a dump (the old "stuck on slow API + call" explanation still applies) +""" +from __future__ import annotations + +import os +import threading +import time +from pathlib import Path +from typing import Optional +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture +def hermes_home(tmp_path, monkeypatch): + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(home)) + return home + + +class _StubChild: + """Minimal stand-in for an AIAgent subagent.""" + def __init__( + self, + *, + api_call_count: int = 0, + hang_seconds: float = 5.0, + subagent_id: str = "sa-0-stubabc", + tool_schema=None, + ): + self._subagent_id = subagent_id + self._delegate_depth = 1 + self._delegate_role = "leaf" + self.model = "test/model" + self.provider = "testprov" + self.api_mode = "chat_completions" + self.base_url = "https://example.test/v1" + self.max_iterations = 30 + self.quiet_mode = True + self.skip_memory = True + self.skip_context_files = True + self.platform = "cli" + self.ephemeral_system_prompt = "sys prompt" + self.enabled_toolsets = ["web", "terminal"] + self.valid_tool_names = {"web_search", "terminal"} + self.tools = tool_schema if tool_schema is not None else [ + {"name": "web_search", "description": "search"}, + {"name": "terminal", "description": "shell"}, + ] + self._api_call_count = api_call_count + self._hang = threading.Event() + self._hang_seconds = hang_seconds + + def get_activity_summary(self): + return { + "api_call_count": self._api_call_count, + "max_iterations": self.max_iterations, + "current_tool": None, + "seconds_since_activity": 60, + } + + def run_conversation(self, user_message, task_id=None): + self._hang.wait(self._hang_seconds) + return {"final_response": "", "completed": False, "api_calls": self._api_call_count} + + def interrupt(self): + self._hang.set() + + +# ── _dump_subagent_timeout_diagnostic ────────────────────────────────── + +class TestDumpSubagentTimeoutDiagnostic: + + def test_writes_log_with_expected_sections(self, hermes_home): + from tools.delegate_tool import _dump_subagent_timeout_diagnostic + child = _StubChild(subagent_id="sa-7-abc123") + + worker = threading.Thread( + target=lambda: child.run_conversation("test"), + daemon=True, + ) + worker.start() + time.sleep(0.1) + try: + path = _dump_subagent_timeout_diagnostic( + child=child, + task_index=7, + timeout_seconds=300.0, + duration_seconds=300.01, + worker_thread=worker, + goal="Research something long", + ) + finally: + child.interrupt() + worker.join(timeout=2.0) + + assert path is not None + p = Path(path) + assert p.is_file() + # File lives under HERMES_HOME/logs/ + assert p.parent == hermes_home / "logs" + assert p.name.startswith("subagent-timeout-sa-7-abc123-") + assert p.suffix == ".log" + + content = p.read_text() + # Header references the issue for future grep-ability + assert "issue #14726" in content + # Timeout facts + assert "task_index: 7" in content + assert "subagent_id: sa-7-abc123" in content + assert "configured_timeout: 300.0s" in content + assert "actual_duration: 300.01s" in content + # Goal + assert "Research something long" in content + # Child config + assert "model: 'test/model'" in content + assert "provider: 'testprov'" in content + assert "base_url: 'https://example.test/v1'" in content + assert "max_iterations: 30" in content + # Toolsets + assert "enabled_toolsets: ['web', 'terminal']" in content + assert "loaded tool count: 2" in content + # Prompt / schema sizes + assert "system_prompt_bytes:" in content + assert "tool_schema_count: 2" in content + assert "tool_schema_bytes:" in content + # Activity summary + assert "api_call_count: 0" in content + # Worker stack + assert "Worker thread stack at timeout" in content + # The thread is parked inside _hang.wait → cond.wait → waiter.acquire + assert "acquire" in content or "wait" in content + + def test_truncates_very_long_goal(self, hermes_home): + from tools.delegate_tool import _dump_subagent_timeout_diagnostic + child = _StubChild() + huge_goal = "x" * 5000 + + path = _dump_subagent_timeout_diagnostic( + child=child, + task_index=0, + timeout_seconds=300.0, + duration_seconds=300.0, + worker_thread=None, + goal=huge_goal, + ) + child.interrupt() + + content = Path(path).read_text() + assert "[truncated]" in content + # Goal section trimmed to 1000 chars + suffix + goal_block = content.split("## Goal", 1)[1].split("## Child config", 1)[0] + assert len(goal_block) < 1200 + + def test_missing_worker_thread_is_handled(self, hermes_home): + from tools.delegate_tool import _dump_subagent_timeout_diagnostic + child = _StubChild() + path = _dump_subagent_timeout_diagnostic( + child=child, + task_index=0, + timeout_seconds=300.0, + duration_seconds=300.0, + worker_thread=None, + goal="x", + ) + child.interrupt() + content = Path(path).read_text() + assert "" in content + + def test_exited_worker_thread_is_handled(self, hermes_home): + from tools.delegate_tool import _dump_subagent_timeout_diagnostic + child = _StubChild() + # A thread that has already finished + t = threading.Thread(target=lambda: None) + t.start() + t.join() + assert not t.is_alive() + path = _dump_subagent_timeout_diagnostic( + child=child, + task_index=0, + timeout_seconds=300.0, + duration_seconds=300.0, + worker_thread=t, + goal="x", + ) + child.interrupt() + content = Path(path).read_text() + assert "" in content + + def test_returns_none_on_unwritable_logs_dir(self, tmp_path, monkeypatch): + # Point HERMES_HOME at an unwritable path so logs/ can't be created + # (simulates permission-denied). Helper must not raise. + from tools.delegate_tool import _dump_subagent_timeout_diagnostic + bogus = tmp_path / "does-not-exist" / ".hermes" + monkeypatch.setenv("HERMES_HOME", str(bogus)) + child = _StubChild() + + # Make the logs dir itself unwritable by creating it as a FILE + # so mkdir(exist_ok=True) → NotADirectoryError and we fall through. + bogus.parent.mkdir(parents=True, exist_ok=True) + bogus.mkdir() + (bogus / "logs").write_text("not a dir") + result = _dump_subagent_timeout_diagnostic( + child=child, + task_index=0, + timeout_seconds=300.0, + duration_seconds=300.0, + worker_thread=None, + goal="x", + ) + child.interrupt() + # Either None (mkdir failed) or a real path; must never raise. + # We assert no exception propagates — the return value is advisory. + assert result is None or Path(result).exists() + + +# ── _run_single_child timeout branch wiring ─────────────────────────── + +class TestRunSingleChildTimeoutDump: + """The timeout branch in _run_single_child must emit the diagnostic + dump when api_calls == 0, and must NOT emit it when api_calls > 0.""" + + def _invoke_with_short_timeout(self, child, monkeypatch): + """Run _run_single_child with a tiny timeout to force the timeout branch.""" + from tools import delegate_tool + # Force a 0.3s timeout so the test is fast + monkeypatch.setattr(delegate_tool, "_get_child_timeout", lambda: 0.3) + + parent = MagicMock() + parent._touch_activity = MagicMock() + parent._current_task_id = None + return delegate_tool._run_single_child( + task_index=0, + goal="test goal", + child=child, + parent_agent=parent, + ) + + def test_zero_api_calls_writes_dump_and_surfaces_path(self, hermes_home, monkeypatch): + child = _StubChild(api_call_count=0, hang_seconds=10.0) + result = self._invoke_with_short_timeout(child, monkeypatch) + + assert result["status"] == "timeout" + assert result["api_calls"] == 0 + assert result["diagnostic_path"] is not None + dump_path = Path(result["diagnostic_path"]) + assert dump_path.is_file() + assert dump_path.parent == hermes_home / "logs" + + # Error message surfaces the path and the "no API call" phrasing + assert "without making any API call" in result["error"] + assert "Diagnostic:" in result["error"] + assert str(dump_path) in result["error"] + + def test_nonzero_api_calls_skips_dump_and_uses_old_message(self, hermes_home, monkeypatch): + child = _StubChild(api_call_count=5, hang_seconds=10.0) + result = self._invoke_with_short_timeout(child, monkeypatch) + + assert result["status"] == "timeout" + assert result["api_calls"] == 5 + # No diagnostic file should be written for timeouts that made + # actual API calls — the old generic "stuck on slow call" message + # still applies. + assert result.get("diagnostic_path") is None + assert "stuck on a slow API call" in result["error"] + # And no subagent-timeout-* file should exist under logs/ + logs_dir = hermes_home / "logs" + if logs_dir.is_dir(): + dumps = list(logs_dir.glob("subagent-timeout-*.log")) + assert dumps == [] diff --git a/tools/delegate_tool.py b/tools/delegate_tool.py index e779e6f609..e3273c3173 100644 --- a/tools/delegate_tool.py +++ b/tools/delegate_tool.py @@ -1016,6 +1016,150 @@ def _build_child_agent( return child +def _dump_subagent_timeout_diagnostic( + *, + child: Any, + task_index: int, + timeout_seconds: float, + duration_seconds: float, + worker_thread: Optional[threading.Thread], + goal: str, +) -> Optional[str]: + """Write a structured diagnostic dump for a subagent that timed out + before making any API call. + + See issue #14726: users hit "subagent timed out after 300s with no response" + with zero API calls and no way to inspect what happened. This helper + writes a dedicated log under ``~/.hermes/logs/subagent--.log`` + capturing the child's config, system-prompt / tool-schema sizes, activity + tracker snapshot, and the worker thread's Python stack at timeout. + + Returns the absolute path to the diagnostic file, or None on failure. + """ + try: + from hermes_constants import get_hermes_home + import datetime as _dt + import sys as _sys + import traceback as _traceback + + hermes_home = get_hermes_home() + logs_dir = hermes_home / "logs" + try: + logs_dir.mkdir(parents=True, exist_ok=True) + except Exception: + return None + + subagent_id = getattr(child, "_subagent_id", None) or f"idx{task_index}" + ts = _dt.datetime.now().strftime("%Y%m%d_%H%M%S") + dump_path = logs_dir / f"subagent-timeout-{subagent_id}-{ts}.log" + + lines: List[str] = [] + def _w(line: str = "") -> None: + lines.append(line) + + _w(f"# Subagent timeout diagnostic — issue #14726") + _w(f"# Generated: {_dt.datetime.now().isoformat()}") + _w("") + _w("## Timeout") + _w(f" task_index: {task_index}") + _w(f" subagent_id: {subagent_id}") + _w(f" configured_timeout: {timeout_seconds}s") + _w(f" actual_duration: {duration_seconds:.2f}s") + _w("") + + _w("## Goal") + _goal_preview = (goal or "").strip() + if len(_goal_preview) > 1000: + _goal_preview = _goal_preview[:1000] + " ...[truncated]" + _w(_goal_preview or "(empty)") + _w("") + + _w("## Child config") + for attr in ( + "model", "provider", "api_mode", "base_url", "max_iterations", + "quiet_mode", "skip_memory", "skip_context_files", "platform", + "_delegate_role", "_delegate_depth", + ): + try: + val = getattr(child, attr, None) + # Redact api_key-shaped values defensively + if isinstance(val, str) and attr == "base_url": + pass + _w(f" {attr}: {val!r}") + except Exception: + _w(f" {attr}: ") + _w("") + + _w("## Toolsets") + enabled = getattr(child, "enabled_toolsets", None) + _w(f" enabled_toolsets: {enabled!r}") + tool_names = getattr(child, "valid_tool_names", None) + if tool_names: + _w(f" loaded tool count: {len(tool_names)}") + try: + _w(f" loaded tools: {sorted(list(tool_names))}") + except Exception: + pass + _w("") + + _w("## Prompt / schema sizes") + try: + sys_prompt = getattr(child, "ephemeral_system_prompt", None) \ + or getattr(child, "system_prompt", None) \ + or "" + _w(f" system_prompt_bytes: {len(sys_prompt.encode('utf-8')) if isinstance(sys_prompt, str) else 'n/a'}") + _w(f" system_prompt_chars: {len(sys_prompt) if isinstance(sys_prompt, str) else 'n/a'}") + except Exception as exc: + _w(f" system_prompt: ") + try: + tools_schema = getattr(child, "tools", None) + if tools_schema is not None: + _schema_json = json.dumps(tools_schema, default=str) + _w(f" tool_schema_count: {len(tools_schema)}") + _w(f" tool_schema_bytes: {len(_schema_json.encode('utf-8'))}") + except Exception as exc: + _w(f" tool_schema: ") + _w("") + + _w("## Activity summary") + try: + summary = child.get_activity_summary() + for k, v in summary.items(): + _w(f" {k}: {v!r}") + except Exception as exc: + _w(f" ") + _w("") + + _w("## Worker thread stack at timeout") + if worker_thread is not None and worker_thread.is_alive(): + frames = _sys._current_frames() + worker_frame = frames.get(worker_thread.ident) + if worker_frame is not None: + stack = _traceback.format_stack(worker_frame) + for frame_line in stack: + for sub in frame_line.rstrip().split("\n"): + _w(f" {sub}") + else: + _w(" ") + elif worker_thread is None: + _w(" ") + else: + _w(" ") + _w("") + + _w("## Notes") + _w(" This file is written ONLY when a subagent times out with 0 API calls.") + _w(" 0-API-call timeouts mean the child never reached its first LLM request.") + _w(" Common causes: oversized prompt rejected by provider, transport hang,") + _w(" credential resolution stuck. See issue #14726 for context.") + + dump_path.write_text("\n".join(lines), encoding="utf-8") + return str(dump_path) + except Exception as exc: + logger.warning("Subagent timeout diagnostic dump failed: %s", exc) + return None + + def _run_single_child( task_index: int, goal: str, @@ -1168,11 +1312,18 @@ def _run_single_child( # when the child's API call or tool-level HTTP request hangs. child_timeout = _get_child_timeout() _timeout_executor = ThreadPoolExecutor(max_workers=1) - _child_future = _timeout_executor.submit( - child.run_conversation, - user_message=goal, - task_id=child_task_id, - ) + # Capture the worker thread so the timeout diagnostic can dump its + # Python stack (see #14726 — 0-API-call hangs are opaque without it). + _worker_thread_holder: Dict[str, Optional[threading.Thread]] = {"t": None} + + def _run_with_thread_capture(): + _worker_thread_holder["t"] = threading.current_thread() + return child.run_conversation( + user_message=goal, + task_id=child_task_id, + ) + + _child_future = _timeout_executor.submit(_run_with_thread_capture) try: result = _child_future.result(timeout=child_timeout) except Exception as _timeout_exc: @@ -1194,6 +1345,32 @@ def _run_single_child( duration, ) + # When a subagent times out BEFORE making any API call, dump a + # diagnostic to help users (and us) see what the child was doing. + # See #14726 — without this, 0-API-call hangs are black boxes. + diagnostic_path: Optional[str] = None + child_api_calls = 0 + try: + _summary = child.get_activity_summary() + child_api_calls = int(_summary.get("api_call_count", 0) or 0) + except Exception: + pass + if is_timeout and child_api_calls == 0: + diagnostic_path = _dump_subagent_timeout_diagnostic( + child=child, + task_index=task_index, + timeout_seconds=float(child_timeout), + duration_seconds=float(duration), + worker_thread=_worker_thread_holder.get("t"), + goal=goal, + ) + if diagnostic_path: + logger.warning( + "Subagent %d 0-API-call timeout — diagnostic written to %s", + task_index, + diagnostic_path, + ) + if child_progress_cb: try: child_progress_cb( @@ -1210,22 +1387,35 @@ def _run_single_child( except Exception: pass + if is_timeout: + if child_api_calls == 0: + _err = ( + f"Subagent timed out after {child_timeout}s without " + f"making any API call — the child never reached its " + f"first LLM request (prompt construction, credential " + f"resolution, or transport may be stuck)." + ) + if diagnostic_path: + _err += f" Diagnostic: {diagnostic_path}" + else: + _err = ( + f"Subagent timed out after {child_timeout}s with " + f"{child_api_calls} API call(s) completed — likely " + f"stuck on a slow API call or unresponsive network request." + ) + else: + _err = str(_timeout_exc) + return { "task_index": task_index, "status": "timeout" if is_timeout else "error", "summary": None, - "error": ( - ( - f"Subagent timed out after {child_timeout}s with no response. " - "The child may be stuck on a slow API call or unresponsive network request." - ) - if is_timeout - else str(_timeout_exc) - ), + "error": _err, "exit_reason": "timeout" if is_timeout else "error", - "api_calls": 0, + "api_calls": child_api_calls, "duration_seconds": duration, "_child_role": getattr(child, "_delegate_role", None), + "diagnostic_path": diagnostic_path, } finally: # Shut down executor without waiting — if the child thread