fix(cron): reap orphaned MCP stdio subprocesses after each tick

MCP stdio servers are spawned via the SDK's stdio_client, which on
Linux uses start_new_session=True (setsid).  When a cron job is
cancelled mid-way (timeout, agent finish, exception), the subprocess
often escapes the SDK's teardown and survives as a session leader.
Because setsid() detaches the child from the gateway's process group
/ cgroup tree, systemd does not reap it on service restart either —
so every cron tick that touches an MCP tool leaks a dangling server
process.

Fix:

* tools/mcp_tool.py — _run_stdio now wraps the whole stdio+session
  context in try/finally.  On any exit path (clean, exception,
  cancellation), PIDs still alive are moved from the active
  _stdio_pids set into a new _orphan_stdio_pids set.  Orphan
  detection is done via os.kill(pid, 0) — a cheap liveness probe
  that never signals the target.

* tools/mcp_tool.py — _kill_orphaned_mcp_children gains an
  include_active flag (default False).  By default it now reaps only
  the orphan set, so concurrent sessions (other parallel cron jobs or
  live user chats) are never disrupted.  The existing shutdown path
  passes include_active=True to keep the previous "kill everything"
  semantics after the MCP loop is stopped.

* cron/scheduler.py — the cleanup hook is moved from run_job()'s
  finally (which would race with parallel siblings after #13021)
  into tick() after the ThreadPoolExecutor has joined every future.
  At that point there are no in-flight sessions from this tick, so
  sweeping the orphan set is always safe.

Net effect: no regressions for healthy sessions, and orphaned MCP
servers no longer accumulate between gateway restarts.

Made-with: Cursor
This commit is contained in:
Ivan Tonov
2026-04-20 13:46:18 +03:00
committed by Teknium
parent 5db6db891c
commit 930494d687
3 changed files with 108 additions and 40 deletions

View File

@@ -81,37 +81,51 @@ class TestStdioPidTracking:
def test_kill_orphaned_noop_when_empty(self):
"""_kill_orphaned_mcp_children does nothing when no PIDs tracked."""
from tools.mcp_tool import _kill_orphaned_mcp_children, _stdio_pids, _lock
from tools.mcp_tool import (
_kill_orphaned_mcp_children,
_orphan_stdio_pids,
_stdio_pids,
_lock,
)
with _lock:
_stdio_pids.clear()
_orphan_stdio_pids.clear()
# Should not raise
_kill_orphaned_mcp_children()
def test_kill_orphaned_handles_dead_pids(self):
"""_kill_orphaned_mcp_children gracefully handles already-dead PIDs."""
from tools.mcp_tool import _kill_orphaned_mcp_children, _stdio_pids, _lock
from tools.mcp_tool import (
_kill_orphaned_mcp_children,
_orphan_stdio_pids,
_lock,
)
# Use a PID that definitely doesn't exist
fake_pid = 999999999
with _lock:
_stdio_pids[fake_pid] = "test"
_orphan_stdio_pids.add(fake_pid)
# Should not raise (ProcessLookupError is caught)
_kill_orphaned_mcp_children()
with _lock:
assert fake_pid not in _stdio_pids
assert fake_pid not in _orphan_stdio_pids
def test_kill_orphaned_uses_sigkill_when_available(self, monkeypatch):
"""SIGTERM-first then SIGKILL after 2s for orphan cleanup."""
from tools.mcp_tool import _kill_orphaned_mcp_children, _stdio_pids, _lock
from tools.mcp_tool import (
_kill_orphaned_mcp_children,
_orphan_stdio_pids,
_lock,
)
fake_pid = 424242
with _lock:
_stdio_pids.clear()
_stdio_pids[fake_pid] = "test"
_orphan_stdio_pids.clear()
_orphan_stdio_pids.add(fake_pid)
fake_sigkill = 9
monkeypatch.setattr(signal, "SIGKILL", fake_sigkill, raising=False)
@@ -128,16 +142,20 @@ class TestStdioPidTracking:
mock_sleep.assert_called_once_with(2)
with _lock:
assert fake_pid not in _stdio_pids
assert fake_pid not in _orphan_stdio_pids
def test_kill_orphaned_falls_back_without_sigkill(self, monkeypatch):
"""Without SIGKILL, SIGTERM is used for both phases."""
from tools.mcp_tool import _kill_orphaned_mcp_children, _stdio_pids, _lock
from tools.mcp_tool import (
_kill_orphaned_mcp_children,
_orphan_stdio_pids,
_lock,
)
fake_pid = 434343
with _lock:
_stdio_pids.clear()
_stdio_pids[fake_pid] = "test"
_orphan_stdio_pids.clear()
_orphan_stdio_pids.add(fake_pid)
monkeypatch.delattr(signal, "SIGKILL", raising=False)
@@ -150,7 +168,7 @@ class TestStdioPidTracking:
assert mock_sleep.called
with _lock:
assert fake_pid not in _stdio_pids
assert fake_pid not in _orphan_stdio_pids
# ---------------------------------------------------------------------------