Files
hermes-agent/tests/tools/test_local_interrupt_cleanup.py
Stephen Schoettler f73364b1c4 fix(ci): stabilize main test suite regressions (#17660)
* fix: stabilize main test suite regressions

* test(agent): update MiniMax normalization expectation

* test: stabilize remaining CI assertions

* test: harden config helper monkeypatching

* test: harden CI-only assertions

* fix(agent): propagate fast streaming interrupts
2026-04-29 23:18:55 -07:00

195 lines
7.7 KiB
Python

"""Regression tests for _wait_for_process subprocess cleanup on exception exit.
When the poll loop exits via KeyboardInterrupt or SystemExit (SIGTERM via
cli.py signal handler, SIGINT on the main thread in non-interactive -q mode,
or explicit sys.exit from some caller), the child subprocess must be killed
before the exception propagates — otherwise the local backend's use of
os.setsid leaves an orphan with PPID=1.
The live repro that motivated this: hermes chat -q ... 'sleep 300', SIGTERM
to the python process, sleep 300 survived with PPID=1 for the full 300 s
because _wait_for_process never got to call _kill_process before python
died. See commit message for full context.
"""
import os
import signal
import subprocess
import threading
import time
from types import SimpleNamespace
import pytest
from tools.environments.local import LocalEnvironment
@pytest.fixture(autouse=True)
def _isolate_hermes_home(tmp_path, monkeypatch):
    """Give every test its own HERMES_HOME with a ``logs`` directory in it.

    The fixture is autouse, so each test in this module runs with the
    ``HERMES_HOME`` environment variable set to pytest's per-test temporary
    directory rather than whatever the developer's shell exported.
    """
    logs_dir = tmp_path / "logs"
    logs_dir.mkdir(exist_ok=True)
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
def _pgid_still_alive(pgid: int) -> bool:
    """Return True if any process in the given process group is still alive.

    The check sends signal 0 to the group, which performs the kernel's
    existence and permission checks without delivering a signal.

    Args:
        pgid: Process-group ID to probe.

    Returns:
        ``False`` when the probe raises ``ProcessLookupError`` (no such
        group); ``True`` when the probe succeeds or is refused with
        ``PermissionError``.
    """
    try:
        os.killpg(pgid, 0)  # signal 0 = existence check
    except ProcessLookupError:
        return False
    except PermissionError:
        # EPERM means the group exists but we are not allowed to signal it;
        # for an "is anything still there?" question that is still a yes.
        return True
    return True
def _process_group_snapshot(pgid: int) -> str:
    """Return a process-table snapshot for diagnostics.

    Runs ``ps -o pid,ppid,pgid,stat,cmd -g <pgid>`` and returns its stripped
    standard output. The result is only meant to be embedded in an assertion
    failure message, so this helper never raises because ``ps`` could not be
    launched: doing so would replace the real assertion failure with an
    unrelated error.

    Args:
        pgid: The numeric ID passed to ``ps -g``.

    Returns:
        The stripped ``ps`` output (possibly empty if ``ps`` printed nothing
        to stdout), or a ``<process snapshot unavailable: ...>`` placeholder
        when the ``ps`` executable could not be started.
    """
    try:
        completed = subprocess.run(
            ["ps", "-o", "pid,ppid,pgid,stat,cmd", "-g", str(pgid)],
            capture_output=True,
            text=True,
            check=False,  # a non-zero exit (e.g. nothing matched) is not an error here
        )
    except OSError as exc:
        # Typically FileNotFoundError on minimal images that ship without ps.
        return f"<process snapshot unavailable: {exc}>"
    return completed.stdout.strip()
def _wait_for_pgid_exit(pgid: int, timeout: float = 10.0) -> bool:
    """Poll until the process group is gone, tolerating slow xdist hosts.

    The group is probed every 0.1 s until ``timeout`` seconds have elapsed
    on the monotonic clock; one last probe after the deadline decides the
    result.

    Returns:
        ``True`` if the group was observed to be gone, ``False`` if it was
        still alive at the final probe.
    """
    give_up_at = time.monotonic() + timeout
    while True:
        if time.monotonic() >= give_up_at:
            break
        if not _pgid_still_alive(pgid):
            return True
        time.sleep(0.1)
    # Deadline passed: one final look so a group that vanished during the
    # last sleep is still reported as gone.
    return not _pgid_still_alive(pgid)
def test_kill_process_uses_cached_pgid_if_wrapper_already_exited(monkeypatch):
    """Cleanup must still signal the process group after the wrapper has exited.

    Once the shell wrapper is dead, ``os.getpgid(proc.pid)`` raises. Without
    a cached-pgid fallback the cleanup would degrade to ``proc.kill()``,
    which cannot reach grandchildren that are still running in the original
    process group. The test fakes exactly that situation and checks that the
    cached ``_hermes_pgid`` is the one that gets signalled.
    """
    recorded_signals = []

    def _getpgid_for_dead_wrapper(_pid):
        # The wrapper is gone, so a live pgid lookup cannot succeed.
        raise ProcessLookupError

    def _recording_killpg(pgid, sig):
        recorded_signals.append((pgid, sig))
        if sig == 0:
            # Existence probe: report the group as already gone.
            raise ProcessLookupError

    monkeypatch.setattr(os, "getpgid", _getpgid_for_dead_wrapper)
    monkeypatch.setattr(os, "killpg", _recording_killpg)

    # Stand-in for a Popen whose wrapper already exited (poll() -> 0) but
    # which still carries the pgid cached at spawn time.
    exited_wrapper = SimpleNamespace(
        pid=12345,
        _hermes_pgid=67890,
        poll=lambda: 0,
        kill=lambda: None,
    )
    # Bypass __init__: only _kill_process is under test.
    local_env = object.__new__(LocalEnvironment)

    local_env._kill_process(exited_wrapper)

    assert recorded_signals == [(67890, signal.SIGTERM), (67890, 0)]
def _find_descendant_pid(needle: str, timeout: float = 5.0) -> "int | None":
    """Poll until a descendant of this process has ``needle`` in its command line.

    Uses ``psutil`` when it is importable; otherwise parses
    ``ps -eo pid,ppid,pgid,cmd`` and follows the PPID chain so that only
    processes descending from the current interpreter are considered. The
    restriction matters under parallel test runs, where an unrelated process
    with the same command line may exist on the host.

    Args:
        needle: Substring to look for in the candidate's command line.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        The PID of the first matching descendant, or ``None`` if none was
        seen before the deadline.
    """
    try:
        import psutil  # optional — fall back to ps if absent
    except ImportError:
        psutil = None

    own_pid = os.getpid()
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if psutil is not None:
            for child in psutil.Process(own_pid).children(recursive=True):
                try:
                    if needle in " ".join(child.cmdline()):
                        return child.pid
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    # Process vanished or is off-limits; keep scanning.
                    continue
        else:
            listing = subprocess.run(
                ["ps", "-eo", "pid,ppid,pgid,cmd"],
                capture_output=True,
                text=True,
                check=False,
            )
            parent_of = {}
            candidates = []
            for row in listing.stdout.splitlines():
                fields = row.split(None, 3)
                if len(fields) < 4 or not (fields[0].isdigit() and fields[1].isdigit()):
                    continue  # header line or a row we cannot parse
                pid = int(fields[0])
                parent_of[pid] = int(fields[1])
                if needle in fields[3]:
                    candidates.append(pid)
            for pid in candidates:
                # Climb the parent chain; accept only if we are an ancestor.
                visited = set()
                ancestor = parent_of.get(pid)
                while ancestor is not None and ancestor not in visited:
                    if ancestor == own_pid:
                        return pid
                    visited.add(ancestor)
                    ancestor = parent_of.get(ancestor)
        time.sleep(0.1)
    return None


def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
    """When KeyboardInterrupt arrives mid-poll, the subprocess group must be
    killed before the exception is re-raised."""
    env = LocalEnvironment(cwd="/tmp")
    try:
        result_holder = {}

        # Drive execute() on a separate thread so we can interrupt it via a
        # thread-targeted exception without killing our test process.
        def worker():
            # Spawn a subprocess that will definitely be alive long enough
            # to observe the cleanup, via env.execute(...) — the normal path
            # that goes through _wait_for_process.
            try:
                result_holder["result"] = env.execute("sleep 30", timeout=60)
            except BaseException as e:  # noqa: BLE001 — we want to observe it
                result_holder["exception"] = type(e).__name__

        t = threading.Thread(target=worker, daemon=True)
        t.start()

        # Wait until the subprocess actually exists. Per the original
        # author's note, execute() performs a session-init spawn before the
        # real command, so we poll until a 'sleep 30' descendant is visible.
        target_pid = _find_descendant_pid("sleep 30", timeout=5.0)
        assert target_pid is not None, (
            "test setup: couldn't find 'sleep 30' subprocess after 5 s"
        )
        pgid = os.getpgid(target_pid)
        assert _pgid_still_alive(pgid), "sanity: subprocess should be alive"

        # Now inject a KeyboardInterrupt into the worker thread with
        # ctypes' PyThreadState_SetAsyncExc, which raises the exception
        # asynchronously in the thread identified by its ident.
        import ctypes

        # The C API needs the thread ident, not the Thread object.
        tid = t.ident
        assert tid is not None

        # Fire KeyboardInterrupt into the worker thread.
        ret = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            ctypes.c_ulong(tid), ctypes.py_object(KeyboardInterrupt),
        )
        assert ret == 1, f"SetAsyncExc returned {ret}, expected 1"

        # Give the worker a moment to: hit the exception at the next poll,
        # run the except-block cleanup (_kill_process), and exit.
        t.join(timeout=5.0)
        assert not t.is_alive(), "worker didn't exit within 5 s of the interrupt"

        # The critical assertion: the subprocess GROUP must be dead. Not
        # just the bash wrapper — the 'sleep 30' child too. Under xdist load,
        # process-group disappearance can lag briefly after the worker exits,
        # especially if the process is already dying or waiting to be reaped.
        assert _wait_for_pgid_exit(pgid), (
            f"subprocess group {pgid} is STILL ALIVE after worker received "
            f"KeyboardInterrupt — orphan bug regressed. This is the "
            f"sleep-300-survives-SIGTERM scenario from Physikal's Apr 2026 "
            f"report. See tools/environments/base.py _wait_for_process "
            f"except-block.\n{_process_group_snapshot(pgid)}"
        )

        # And the worker should have observed the KeyboardInterrupt (i.e.
        # it re-raised cleanly, not silently swallowed).
        assert result_holder.get("exception") == "KeyboardInterrupt", (
            f"worker result: {result_holder!r} — expected KeyboardInterrupt "
            f"propagation after cleanup"
        )
    finally:
        # Best-effort teardown: a cleanup failure must not mask the outcome
        # of the assertions above.
        try:
            env.cleanup()
        except Exception:
            pass