mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
fix(cron): reap orphaned MCP stdio subprocesses after each tick
MCP stdio servers are spawned via the SDK's stdio_client, which on Linux uses start_new_session=True (setsid). When a cron job is cancelled mid-way (timeout, agent finish, exception), the subprocess often escapes the SDK's teardown and survives as a session leader. Because setsid() detaches the child from the gateway's process group / cgroup tree, systemd does not reap it on service restart either — so every cron tick that touches an MCP tool leaks a dangling server process. Fix: * tools/mcp_tool.py — _run_stdio now wraps the whole stdio+session context in try/finally. On any exit path (clean, exception, cancellation), PIDs still alive are moved from the active _stdio_pids set into a new _orphan_stdio_pids set. Orphan detection is done via os.kill(pid, 0) — a cheap liveness probe that never signals the target. * tools/mcp_tool.py — _kill_orphaned_mcp_children gains an include_active=False flag. Default behaviour now only reaps the orphan set so concurrent sessions (other parallel cron jobs or live user chats) are never disrupted. The existing shutdown path passes include_active=True to keep the previous "kill everything" semantics after the MCP loop is stopped. * cron/scheduler.py — the cleanup hook is moved from run_job()'s finally (which would race with parallel siblings after #13021) into tick() after the ThreadPoolExecutor has joined every future. At that point there are no in-flight sessions from this tick, so sweeping the orphan set is always safe. Net effect: zero regression for healthy sessions, and orphan MCP servers no longer accumulate between gateway restarts. Made-with: Cursor
This commit is contained in:
@@ -81,37 +81,51 @@ class TestStdioPidTracking:
|
||||
|
||||
def test_kill_orphaned_noop_when_empty(self):
|
||||
"""_kill_orphaned_mcp_children does nothing when no PIDs tracked."""
|
||||
from tools.mcp_tool import _kill_orphaned_mcp_children, _stdio_pids, _lock
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_stdio_pids,
|
||||
_lock,
|
||||
)
|
||||
|
||||
with _lock:
|
||||
_stdio_pids.clear()
|
||||
_orphan_stdio_pids.clear()
|
||||
|
||||
# Should not raise
|
||||
_kill_orphaned_mcp_children()
|
||||
|
||||
def test_kill_orphaned_handles_dead_pids(self):
|
||||
"""_kill_orphaned_mcp_children gracefully handles already-dead PIDs."""
|
||||
from tools.mcp_tool import _kill_orphaned_mcp_children, _stdio_pids, _lock
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_lock,
|
||||
)
|
||||
|
||||
# Use a PID that definitely doesn't exist
|
||||
fake_pid = 999999999
|
||||
with _lock:
|
||||
_stdio_pids[fake_pid] = "test"
|
||||
_orphan_stdio_pids.add(fake_pid)
|
||||
|
||||
# Should not raise (ProcessLookupError is caught)
|
||||
_kill_orphaned_mcp_children()
|
||||
|
||||
with _lock:
|
||||
assert fake_pid not in _stdio_pids
|
||||
assert fake_pid not in _orphan_stdio_pids
|
||||
|
||||
def test_kill_orphaned_uses_sigkill_when_available(self, monkeypatch):
|
||||
"""SIGTERM-first then SIGKILL after 2s for orphan cleanup."""
|
||||
from tools.mcp_tool import _kill_orphaned_mcp_children, _stdio_pids, _lock
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_lock,
|
||||
)
|
||||
|
||||
fake_pid = 424242
|
||||
with _lock:
|
||||
_stdio_pids.clear()
|
||||
_stdio_pids[fake_pid] = "test"
|
||||
_orphan_stdio_pids.clear()
|
||||
_orphan_stdio_pids.add(fake_pid)
|
||||
|
||||
fake_sigkill = 9
|
||||
monkeypatch.setattr(signal, "SIGKILL", fake_sigkill, raising=False)
|
||||
@@ -128,16 +142,20 @@ class TestStdioPidTracking:
|
||||
mock_sleep.assert_called_once_with(2)
|
||||
|
||||
with _lock:
|
||||
assert fake_pid not in _stdio_pids
|
||||
assert fake_pid not in _orphan_stdio_pids
|
||||
|
||||
def test_kill_orphaned_falls_back_without_sigkill(self, monkeypatch):
|
||||
"""Without SIGKILL, SIGTERM is used for both phases."""
|
||||
from tools.mcp_tool import _kill_orphaned_mcp_children, _stdio_pids, _lock
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_lock,
|
||||
)
|
||||
|
||||
fake_pid = 434343
|
||||
with _lock:
|
||||
_stdio_pids.clear()
|
||||
_stdio_pids[fake_pid] = "test"
|
||||
_orphan_stdio_pids.clear()
|
||||
_orphan_stdio_pids.add(fake_pid)
|
||||
|
||||
monkeypatch.delattr(signal, "SIGKILL", raising=False)
|
||||
|
||||
@@ -150,7 +168,7 @@ class TestStdioPidTracking:
|
||||
assert mock_sleep.called
|
||||
|
||||
with _lock:
|
||||
assert fake_pid not in _stdio_pids
|
||||
assert fake_pid not in _orphan_stdio_pids
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user