feat(delegate): diagnostic dump when a subagent times out with 0 API calls (#15105)

When a subagent in delegate_task times out before making its first LLM
request, write a structured diagnostic file under
~/.hermes/logs/subagent-timeout-<sid>-<ts>.log capturing enough state
for the user (and us) to debug the hang. The old error message —
'Subagent timed out after Ns with no response. The child may be stuck
on a slow API call or unresponsive network request.' — gave no
observability for the 0-API-call case, which is the hardest to reason
about remotely.

The diagnostic captures:
  - timeout config vs actual duration
  - goal (truncated to 1000 chars)
  - child config: model, provider, api_mode, base_url, max_iterations,
    quiet_mode, platform, _delegate_role, _delegate_depth
  - enabled_toolsets + loaded tool names
  - system prompt byte/char count (catches oversized prompts that
    providers silently choke on)
  - tool schema count + byte size
  - child's get_activity_summary() snapshot
  - Python stack of the worker thread at the moment of timeout
    (reveals whether the hang is in credential resolution, transport,
    prompt construction, etc.)

Wiring:
  - _run_single_child captures the worker thread via a small wrapper
    around child.run_conversation so we can look up its stack at
    timeout.
  - After a FuturesTimeoutError, we pull child.get_activity_summary()
    to read api_call_count. If 0 AND it was a timeout (not a raise),
    _dump_subagent_timeout_diagnostic() is invoked.
  - The returned path is surfaced in the error string so the parent
    agent (and therefore the user / gateway) sees exactly where to look.
  - api_calls > 0 timeouts keep the old 'stuck on slow API call'
    phrasing since that's the correct diagnosis for those.

This does NOT change any behavior for successful subagent runs,
non-timeout errors, or subagents that made at least one API call
before hanging.

Tests: 7 cases (tests/tools/test_delegate_subagent_timeout_diagnostic.py)
  - output format + required sections + field values
  - long-goal truncation with [truncated] marker
  - missing / already-exited worker thread branches
  - unwritable HERMES_HOME/logs/ returns None without raising
  - _run_single_child wiring: 0 API calls → dump + diagnostic_path in error
  - _run_single_child wiring: N>0 API calls → no dump, old message

Refs: #14726
This commit is contained in:
Teknium
2026-04-24 04:58:32 -07:00
committed by GitHub
parent 3cb43df2cd
commit 7634c1386f
2 changed files with 490 additions and 14 deletions

View File

@@ -0,0 +1,286 @@
"""Regression tests for subagent timeout diagnostic dump (issue #14726).
When delegate_task's child subagent times out without having made any API
call, a structured diagnostic file is written under
``~/.hermes/logs/subagent-timeout-<sid>-<ts>.log``. This gives users a
concrete artifact to inspect (worker thread stack, system prompt size,
tool schema bytes, credential pool state, etc.) instead of the previous
opaque "subagent timed out" error.
These tests pin:
- the diagnostic writer's output format and content
- the timeout branch in _run_single_child only dumps when api_calls == 0
- the error message surfaces the diagnostic path
- api_calls > 0 timeouts do NOT write a dump (the old "stuck on slow API
call" explanation still applies)
"""
from __future__ import annotations
import os
import threading
import time
from pathlib import Path
from typing import Optional
from unittest.mock import MagicMock, patch
import pytest
@pytest.fixture
def hermes_home(tmp_path, monkeypatch):
home = tmp_path / ".hermes"
home.mkdir()
monkeypatch.setenv("HERMES_HOME", str(home))
return home
class _StubChild:
"""Minimal stand-in for an AIAgent subagent."""
def __init__(
self,
*,
api_call_count: int = 0,
hang_seconds: float = 5.0,
subagent_id: str = "sa-0-stubabc",
tool_schema=None,
):
self._subagent_id = subagent_id
self._delegate_depth = 1
self._delegate_role = "leaf"
self.model = "test/model"
self.provider = "testprov"
self.api_mode = "chat_completions"
self.base_url = "https://example.test/v1"
self.max_iterations = 30
self.quiet_mode = True
self.skip_memory = True
self.skip_context_files = True
self.platform = "cli"
self.ephemeral_system_prompt = "sys prompt"
self.enabled_toolsets = ["web", "terminal"]
self.valid_tool_names = {"web_search", "terminal"}
self.tools = tool_schema if tool_schema is not None else [
{"name": "web_search", "description": "search"},
{"name": "terminal", "description": "shell"},
]
self._api_call_count = api_call_count
self._hang = threading.Event()
self._hang_seconds = hang_seconds
def get_activity_summary(self):
return {
"api_call_count": self._api_call_count,
"max_iterations": self.max_iterations,
"current_tool": None,
"seconds_since_activity": 60,
}
def run_conversation(self, user_message, task_id=None):
self._hang.wait(self._hang_seconds)
return {"final_response": "", "completed": False, "api_calls": self._api_call_count}
def interrupt(self):
self._hang.set()
# ── _dump_subagent_timeout_diagnostic ──────────────────────────────────
class TestDumpSubagentTimeoutDiagnostic:
def test_writes_log_with_expected_sections(self, hermes_home):
from tools.delegate_tool import _dump_subagent_timeout_diagnostic
child = _StubChild(subagent_id="sa-7-abc123")
worker = threading.Thread(
target=lambda: child.run_conversation("test"),
daemon=True,
)
worker.start()
time.sleep(0.1)
try:
path = _dump_subagent_timeout_diagnostic(
child=child,
task_index=7,
timeout_seconds=300.0,
duration_seconds=300.01,
worker_thread=worker,
goal="Research something long",
)
finally:
child.interrupt()
worker.join(timeout=2.0)
assert path is not None
p = Path(path)
assert p.is_file()
# File lives under HERMES_HOME/logs/
assert p.parent == hermes_home / "logs"
assert p.name.startswith("subagent-timeout-sa-7-abc123-")
assert p.suffix == ".log"
content = p.read_text()
# Header references the issue for future grep-ability
assert "issue #14726" in content
# Timeout facts
assert "task_index: 7" in content
assert "subagent_id: sa-7-abc123" in content
assert "configured_timeout: 300.0s" in content
assert "actual_duration: 300.01s" in content
# Goal
assert "Research something long" in content
# Child config
assert "model: 'test/model'" in content
assert "provider: 'testprov'" in content
assert "base_url: 'https://example.test/v1'" in content
assert "max_iterations: 30" in content
# Toolsets
assert "enabled_toolsets: ['web', 'terminal']" in content
assert "loaded tool count: 2" in content
# Prompt / schema sizes
assert "system_prompt_bytes:" in content
assert "tool_schema_count: 2" in content
assert "tool_schema_bytes:" in content
# Activity summary
assert "api_call_count: 0" in content
# Worker stack
assert "Worker thread stack at timeout" in content
# The thread is parked inside _hang.wait → cond.wait → waiter.acquire
assert "acquire" in content or "wait" in content
def test_truncates_very_long_goal(self, hermes_home):
from tools.delegate_tool import _dump_subagent_timeout_diagnostic
child = _StubChild()
huge_goal = "x" * 5000
path = _dump_subagent_timeout_diagnostic(
child=child,
task_index=0,
timeout_seconds=300.0,
duration_seconds=300.0,
worker_thread=None,
goal=huge_goal,
)
child.interrupt()
content = Path(path).read_text()
assert "[truncated]" in content
# Goal section trimmed to 1000 chars + suffix
goal_block = content.split("## Goal", 1)[1].split("## Child config", 1)[0]
assert len(goal_block) < 1200
def test_missing_worker_thread_is_handled(self, hermes_home):
from tools.delegate_tool import _dump_subagent_timeout_diagnostic
child = _StubChild()
path = _dump_subagent_timeout_diagnostic(
child=child,
task_index=0,
timeout_seconds=300.0,
duration_seconds=300.0,
worker_thread=None,
goal="x",
)
child.interrupt()
content = Path(path).read_text()
assert "<no worker thread handle>" in content
def test_exited_worker_thread_is_handled(self, hermes_home):
from tools.delegate_tool import _dump_subagent_timeout_diagnostic
child = _StubChild()
# A thread that has already finished
t = threading.Thread(target=lambda: None)
t.start()
t.join()
assert not t.is_alive()
path = _dump_subagent_timeout_diagnostic(
child=child,
task_index=0,
timeout_seconds=300.0,
duration_seconds=300.0,
worker_thread=t,
goal="x",
)
child.interrupt()
content = Path(path).read_text()
assert "<worker thread already exited>" in content
def test_returns_none_on_unwritable_logs_dir(self, tmp_path, monkeypatch):
# Point HERMES_HOME at an unwritable path so logs/ can't be created
# (simulates permission-denied). Helper must not raise.
from tools.delegate_tool import _dump_subagent_timeout_diagnostic
bogus = tmp_path / "does-not-exist" / ".hermes"
monkeypatch.setenv("HERMES_HOME", str(bogus))
child = _StubChild()
# Make the logs dir itself unwritable by creating it as a FILE
# so mkdir(exist_ok=True) → NotADirectoryError and we fall through.
bogus.parent.mkdir(parents=True, exist_ok=True)
bogus.mkdir()
(bogus / "logs").write_text("not a dir")
result = _dump_subagent_timeout_diagnostic(
child=child,
task_index=0,
timeout_seconds=300.0,
duration_seconds=300.0,
worker_thread=None,
goal="x",
)
child.interrupt()
# Either None (mkdir failed) or a real path; must never raise.
# We assert no exception propagates — the return value is advisory.
assert result is None or Path(result).exists()
# ── _run_single_child timeout branch wiring ───────────────────────────
class TestRunSingleChildTimeoutDump:
"""The timeout branch in _run_single_child must emit the diagnostic
dump when api_calls == 0, and must NOT emit it when api_calls > 0."""
def _invoke_with_short_timeout(self, child, monkeypatch):
"""Run _run_single_child with a tiny timeout to force the timeout branch."""
from tools import delegate_tool
# Force a 0.3s timeout so the test is fast
monkeypatch.setattr(delegate_tool, "_get_child_timeout", lambda: 0.3)
parent = MagicMock()
parent._touch_activity = MagicMock()
parent._current_task_id = None
return delegate_tool._run_single_child(
task_index=0,
goal="test goal",
child=child,
parent_agent=parent,
)
def test_zero_api_calls_writes_dump_and_surfaces_path(self, hermes_home, monkeypatch):
child = _StubChild(api_call_count=0, hang_seconds=10.0)
result = self._invoke_with_short_timeout(child, monkeypatch)
assert result["status"] == "timeout"
assert result["api_calls"] == 0
assert result["diagnostic_path"] is not None
dump_path = Path(result["diagnostic_path"])
assert dump_path.is_file()
assert dump_path.parent == hermes_home / "logs"
# Error message surfaces the path and the "no API call" phrasing
assert "without making any API call" in result["error"]
assert "Diagnostic:" in result["error"]
assert str(dump_path) in result["error"]
def test_nonzero_api_calls_skips_dump_and_uses_old_message(self, hermes_home, monkeypatch):
child = _StubChild(api_call_count=5, hang_seconds=10.0)
result = self._invoke_with_short_timeout(child, monkeypatch)
assert result["status"] == "timeout"
assert result["api_calls"] == 5
# No diagnostic file should be written for timeouts that made
# actual API calls — the old generic "stuck on slow call" message
# still applies.
assert result.get("diagnostic_path") is None
assert "stuck on a slow API call" in result["error"]
# And no subagent-timeout-* file should exist under logs/
logs_dir = hermes_home / "logs"
if logs_dir.is_dir():
dumps = list(logs_dir.glob("subagent-timeout-*.log"))
assert dumps == []

View File

@@ -1016,6 +1016,150 @@ def _build_child_agent(
return child
def _dump_subagent_timeout_diagnostic(
*,
child: Any,
task_index: int,
timeout_seconds: float,
duration_seconds: float,
worker_thread: Optional[threading.Thread],
goal: str,
) -> Optional[str]:
"""Write a structured diagnostic dump for a subagent that timed out
before making any API call.
See issue #14726: users hit "subagent timed out after 300s with no response"
with zero API calls and no way to inspect what happened. This helper
writes a dedicated log under ``~/.hermes/logs/subagent-<sid>-<ts>.log``
capturing the child's config, system-prompt / tool-schema sizes, activity
tracker snapshot, and the worker thread's Python stack at timeout.
Returns the absolute path to the diagnostic file, or None on failure.
"""
try:
from hermes_constants import get_hermes_home
import datetime as _dt
import sys as _sys
import traceback as _traceback
hermes_home = get_hermes_home()
logs_dir = hermes_home / "logs"
try:
logs_dir.mkdir(parents=True, exist_ok=True)
except Exception:
return None
subagent_id = getattr(child, "_subagent_id", None) or f"idx{task_index}"
ts = _dt.datetime.now().strftime("%Y%m%d_%H%M%S")
dump_path = logs_dir / f"subagent-timeout-{subagent_id}-{ts}.log"
lines: List[str] = []
def _w(line: str = "") -> None:
lines.append(line)
_w(f"# Subagent timeout diagnostic — issue #14726")
_w(f"# Generated: {_dt.datetime.now().isoformat()}")
_w("")
_w("## Timeout")
_w(f" task_index: {task_index}")
_w(f" subagent_id: {subagent_id}")
_w(f" configured_timeout: {timeout_seconds}s")
_w(f" actual_duration: {duration_seconds:.2f}s")
_w("")
_w("## Goal")
_goal_preview = (goal or "").strip()
if len(_goal_preview) > 1000:
_goal_preview = _goal_preview[:1000] + " ...[truncated]"
_w(_goal_preview or "(empty)")
_w("")
_w("## Child config")
for attr in (
"model", "provider", "api_mode", "base_url", "max_iterations",
"quiet_mode", "skip_memory", "skip_context_files", "platform",
"_delegate_role", "_delegate_depth",
):
try:
val = getattr(child, attr, None)
# Redact api_key-shaped values defensively
if isinstance(val, str) and attr == "base_url":
pass
_w(f" {attr}: {val!r}")
except Exception:
_w(f" {attr}: <unreadable>")
_w("")
_w("## Toolsets")
enabled = getattr(child, "enabled_toolsets", None)
_w(f" enabled_toolsets: {enabled!r}")
tool_names = getattr(child, "valid_tool_names", None)
if tool_names:
_w(f" loaded tool count: {len(tool_names)}")
try:
_w(f" loaded tools: {sorted(list(tool_names))}")
except Exception:
pass
_w("")
_w("## Prompt / schema sizes")
try:
sys_prompt = getattr(child, "ephemeral_system_prompt", None) \
or getattr(child, "system_prompt", None) \
or ""
_w(f" system_prompt_bytes: {len(sys_prompt.encode('utf-8')) if isinstance(sys_prompt, str) else 'n/a'}")
_w(f" system_prompt_chars: {len(sys_prompt) if isinstance(sys_prompt, str) else 'n/a'}")
except Exception as exc:
_w(f" system_prompt: <error: {exc}>")
try:
tools_schema = getattr(child, "tools", None)
if tools_schema is not None:
_schema_json = json.dumps(tools_schema, default=str)
_w(f" tool_schema_count: {len(tools_schema)}")
_w(f" tool_schema_bytes: {len(_schema_json.encode('utf-8'))}")
except Exception as exc:
_w(f" tool_schema: <error: {exc}>")
_w("")
_w("## Activity summary")
try:
summary = child.get_activity_summary()
for k, v in summary.items():
_w(f" {k}: {v!r}")
except Exception as exc:
_w(f" <get_activity_summary failed: {exc}>")
_w("")
_w("## Worker thread stack at timeout")
if worker_thread is not None and worker_thread.is_alive():
frames = _sys._current_frames()
worker_frame = frames.get(worker_thread.ident)
if worker_frame is not None:
stack = _traceback.format_stack(worker_frame)
for frame_line in stack:
for sub in frame_line.rstrip().split("\n"):
_w(f" {sub}")
else:
_w(" <worker frame not available>")
elif worker_thread is None:
_w(" <no worker thread handle>")
else:
_w(" <worker thread already exited>")
_w("")
_w("## Notes")
_w(" This file is written ONLY when a subagent times out with 0 API calls.")
_w(" 0-API-call timeouts mean the child never reached its first LLM request.")
_w(" Common causes: oversized prompt rejected by provider, transport hang,")
_w(" credential resolution stuck. See issue #14726 for context.")
dump_path.write_text("\n".join(lines), encoding="utf-8")
return str(dump_path)
except Exception as exc:
logger.warning("Subagent timeout diagnostic dump failed: %s", exc)
return None
def _run_single_child(
task_index: int,
goal: str,
@@ -1168,11 +1312,18 @@ def _run_single_child(
# when the child's API call or tool-level HTTP request hangs.
child_timeout = _get_child_timeout()
_timeout_executor = ThreadPoolExecutor(max_workers=1)
_child_future = _timeout_executor.submit(
child.run_conversation,
user_message=goal,
task_id=child_task_id,
)
# Capture the worker thread so the timeout diagnostic can dump its
# Python stack (see #14726 — 0-API-call hangs are opaque without it).
_worker_thread_holder: Dict[str, Optional[threading.Thread]] = {"t": None}
def _run_with_thread_capture():
_worker_thread_holder["t"] = threading.current_thread()
return child.run_conversation(
user_message=goal,
task_id=child_task_id,
)
_child_future = _timeout_executor.submit(_run_with_thread_capture)
try:
result = _child_future.result(timeout=child_timeout)
except Exception as _timeout_exc:
@@ -1194,6 +1345,32 @@ def _run_single_child(
duration,
)
# When a subagent times out BEFORE making any API call, dump a
# diagnostic to help users (and us) see what the child was doing.
# See #14726 — without this, 0-API-call hangs are black boxes.
diagnostic_path: Optional[str] = None
child_api_calls = 0
try:
_summary = child.get_activity_summary()
child_api_calls = int(_summary.get("api_call_count", 0) or 0)
except Exception:
pass
if is_timeout and child_api_calls == 0:
diagnostic_path = _dump_subagent_timeout_diagnostic(
child=child,
task_index=task_index,
timeout_seconds=float(child_timeout),
duration_seconds=float(duration),
worker_thread=_worker_thread_holder.get("t"),
goal=goal,
)
if diagnostic_path:
logger.warning(
"Subagent %d 0-API-call timeout — diagnostic written to %s",
task_index,
diagnostic_path,
)
if child_progress_cb:
try:
child_progress_cb(
@@ -1210,22 +1387,35 @@ def _run_single_child(
except Exception:
pass
if is_timeout:
if child_api_calls == 0:
_err = (
f"Subagent timed out after {child_timeout}s without "
f"making any API call — the child never reached its "
f"first LLM request (prompt construction, credential "
f"resolution, or transport may be stuck)."
)
if diagnostic_path:
_err += f" Diagnostic: {diagnostic_path}"
else:
_err = (
f"Subagent timed out after {child_timeout}s with "
f"{child_api_calls} API call(s) completed — likely "
f"stuck on a slow API call or unresponsive network request."
)
else:
_err = str(_timeout_exc)
return {
"task_index": task_index,
"status": "timeout" if is_timeout else "error",
"summary": None,
"error": (
(
f"Subagent timed out after {child_timeout}s with no response. "
"The child may be stuck on a slow API call or unresponsive network request."
)
if is_timeout
else str(_timeout_exc)
),
"error": _err,
"exit_reason": "timeout" if is_timeout else "error",
"api_calls": 0,
"api_calls": child_api_calls,
"duration_seconds": duration,
"_child_role": getattr(child, "_delegate_role", None),
"diagnostic_path": diagnostic_path,
}
finally:
# Shut down executor without waiting — if the child thread