fix(cron): reap orphaned MCP stdio subprocesses after each tick

MCP stdio servers are spawned via the SDK's stdio_client, which on Linux uses start_new_session=True (setsid). When a cron job is cancelled midway (timeout, agent finish, exception), the subprocess often escapes the SDK's teardown and survives as a session leader. Because setsid() detaches the child from the gateway's process group / cgroup tree, systemd does not reap it on service restart either — so every cron tick that touches an MCP tool can leak a dangling server process.

Fix:

* tools/mcp_tool.py — _run_stdio now wraps the whole stdio+session context in try/finally. On any exit path (clean, exception, cancellation), the spawned PIDs are removed from the active _stdio_pids dict, and any that are still alive are added to a new _orphan_stdio_pids set. Orphan detection is done via os.kill(pid, 0) — a cheap liveness probe that never signals the target.

* tools/mcp_tool.py — _kill_orphaned_mcp_children gains an include_active=False flag. The default behaviour now reaps only the orphan set, so concurrent sessions (other parallel cron jobs or live user chats) are never disrupted. The existing shutdown path passes include_active=True to keep the previous "kill everything" semantics after the MCP loop is stopped.

* cron/scheduler.py — the cleanup hook is moved from run_job()'s finally (which would race with parallel siblings after #13021) into tick(), after the ThreadPoolExecutor has joined every future. At that point there are no in-flight sessions from this tick, so sweeping the orphan set is always safe.

Net effect: no regression for healthy sessions, and orphan MCP servers no longer accumulate between gateway restarts.

Made-with: Cursor
2026-04-28 06:51:16 +08:00 · 2026-04-20 13:46:18 +03:00
parent 5db6db891c
commit 930494d687
3 changed files with 108 additions and 40 deletions
--- a/tools/mcp_tool.py
+++ b/tools/mcp_tool.py
@@ -1044,33 +1044,51 @@ class MCPServerTask:

        # Snapshot child PIDs before spawning so we can track the new one.
        pids_before = _snapshot_child_pids()
+        new_pids: set = set()
        # Redirect subprocess stderr into a shared log file so MCP servers
        # (FastMCP banners, slack-mcp startup JSON, etc.) don't dump onto
        # the user's TTY and corrupt the TUI.  Preserves debuggability via
        # ~/.hermes/logs/mcp-stderr.log.
        _write_stderr_log_header(self.name)
        _errlog = _get_mcp_stderr_log()
-        async with stdio_client(server_params, errlog=_errlog) as (read_stream, write_stream):
-            # Capture the newly spawned subprocess PID for force-kill cleanup.
-            new_pids = _snapshot_child_pids() - pids_before
+        try:
+            async with stdio_client(server_params, errlog=_errlog) as (
+                read_stream,
+                write_stream,
+            ):
+                # Capture the newly spawned subprocess PID for force-kill cleanup.
+                new_pids = _snapshot_child_pids() - pids_before
+                if new_pids:
+                    with _lock:
+                        for _pid in new_pids:
+                            _stdio_pids[_pid] = self.name
+                async with ClientSession(
+                    read_stream, write_stream, **sampling_kwargs
+                ) as session:
+                    await session.initialize()
+                    self.session = session
+                    await self._discover_tools()
+                    self._ready.set()
+                    # stdio transport does not use OAuth, but we still honor
+                    # _reconnect_event (e.g. future manual /mcp refresh) for
+                    # consistency with _run_http.
+                    await self._wait_for_lifecycle_event()
+        finally:
+            # Runs on clean exit, exceptions, AND asyncio cancellation.
+            # If any of the spawned PIDs are still alive, the SDK's
+            # teardown failed (common when the task is cancelled mid-way
+            # on Linux, where setsid() children escape the parent cgroup).
+            # Mark them as orphans so the next cleanup sweep can reap them.
            if new_pids:
                with _lock:
                    for _pid in new_pids:
-                        _stdio_pids[_pid] = self.name
-            async with ClientSession(read_stream, write_stream, **sampling_kwargs) as session:
-                await session.initialize()
-                self.session = session
-                await self._discover_tools()
-                self._ready.set()
-                # stdio transport does not use OAuth, but we still honor
-                # _reconnect_event (e.g. future manual /mcp refresh) for
-                # consistency with _run_http.
-                await self._wait_for_lifecycle_event()
-        # Context exited cleanly — subprocess was terminated by the SDK.
-        if new_pids:
-            with _lock:
-                for _pid in new_pids:
-                    _stdio_pids.pop(_pid, None)
+                        _stdio_pids.pop(_pid, None)
+                    for pid in new_pids:
+                        try:
+                            os.kill(pid, 0)  # signal 0: probe liveness only
+                        except (ProcessLookupError, PermissionError, OSError):
+                            continue  # gone, or not signalable by us (EPERM) — nothing to reap
+                        _orphan_stdio_pids.add(pid)

    async def _run_http(self, config: dict):
        """Run the server using HTTP/StreamableHTTP transport."""
@@ -1718,6 +1736,13 @@ _lock = threading.Lock()
 # normal server shutdown.
 _stdio_pids: Dict[int, str] = {}  # pid -> server_name

+# PIDs that survived their session context exit (SDK teardown failed to
+# terminate them).  These are detected in _run_stdio's finally block and
+# can be cleaned up asynchronously by _kill_orphaned_mcp_children().
+# Separate from _stdio_pids so cleanup sweeps never race with active
+# sessions (e.g. concurrent cron jobs or live user chats).
+_orphan_stdio_pids: set = set()
+

 def _snapshot_child_pids() -> set:
    """Return a set of current child process PIDs.
@@ -2959,21 +2984,34 @@ def shutdown_mcp_servers():
    _stop_mcp_loop()


-def _kill_orphaned_mcp_children() -> None:
-    """Graceful shutdown of MCP stdio subprocesses that survived loop cleanup.
+def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
+    """Best-effort graceful shutdown of stdio MCP subprocesses to reap orphans.

-    Sends SIGTERM first, waits 2 seconds, then escalates to SIGKILL.
-    This prevents shared-resource collisions when multiple hermes processes
-    run on the same host (each has its own _stdio_pids dict).
+    Orphans are PIDs that survived their session context exit (SDK teardown
+    did not terminate the process — common on Linux when stdio children escape
+    the parent cgroup on cancellation). By default only entries in
+    ``_orphan_stdio_pids`` are reaped so concurrent cron jobs and live user
+    sessions are not disrupted.

-    Only kills PIDs tracked in ``_stdio_pids`` — never arbitrary children.
+    Sends SIGTERM, waits 2 seconds, then escalates to SIGKILL for any
+    survivors, avoiding shared-resource collisions when multiple hermes
+    processes run on the same host (each has its own ``_stdio_pids`` dict).
+
+    With ``include_active=True`` also kills every PID in ``_stdio_pids`` —
+    used only at final shutdown, after the MCP event loop has stopped and no
+    sessions can still be in flight.
    """
    import signal as _signal
    import time as _time

    with _lock:
-        pids = dict(_stdio_pids)
-        _stdio_pids.clear()
+        pids: Dict[int, str] = {}
+        for opid in _orphan_stdio_pids:
+            pids[opid] = "orphan"
+        _orphan_stdio_pids.clear()
+        if include_active:
+            pids.update(dict(_stdio_pids))
+            _stdio_pids.clear()

    # Fast path: no tracked stdio PIDs to reap. Skip the SIGTERM/sleep/SIGKILL
    # dance entirely — otherwise every MCP-free shutdown pays a 2s sleep tax.
@@ -3022,5 +3060,6 @@ def _stop_mcp_loop():
        except Exception:
            pass
        # After closing the loop, any stdio subprocesses that survived the
-        # graceful shutdown are now orphaned.  Force-kill them.
-        _kill_orphaned_mcp_children()
+        # graceful shutdown are now orphaned — include active PIDs too
+        # since the loop is gone and no session can still be in flight.
+        _kill_orphaned_mcp_children(include_active=True)