# Mirror of https://github.com/NousResearch/hermes-agent.git
# Synced 2026-04-28 06:51:16 +08:00
# 146 lines, 6.1 KiB, Python
"""Regression tests for _wait_for_process subprocess cleanup on exception exit.

When the poll loop exits via KeyboardInterrupt or SystemExit (SIGTERM via
cli.py signal handler, SIGINT on the main thread in non-interactive -q mode,
or explicit sys.exit from some caller), the child subprocess must be killed
before the exception propagates — otherwise the local backend's use of
os.setsid leaves an orphan with PPID=1.

The live repro that motivated this: hermes chat -q ... 'sleep 300', SIGTERM
to the python process, sleep 300 survived with PPID=1 for the full 300 s
because _wait_for_process never got to call _kill_process before python
died. See commit message for full context.
"""
|
||
|
|
import os
|
||
|
|
import signal
|
||
|
|
import subprocess
|
||
|
|
import threading
|
||
|
|
import time
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from tools.environments.local import LocalEnvironment
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture(autouse=True)
def _isolate_hermes_home(tmp_path, monkeypatch):
    """Redirect HERMES_HOME to the per-test temp directory for every test.

    Also makes sure a ``logs`` subdirectory exists inside that directory.
    """
    hermes_home = tmp_path
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    logs_dir = hermes_home / "logs"
    logs_dir.mkdir(exist_ok=True)
|
||
|
|
|
||
|
|
|
||
|
|
def _pgid_still_alive(pgid: int) -> bool:
    """Return True if any process in the given process group is still alive.

    The group is probed with signal 0, which runs the kernel's existence and
    permission checks without actually delivering a signal.

    Args:
        pgid: Process group ID to probe.

    Returns:
        False when no such process group exists; True otherwise, including
        when the group exists but the caller lacks permission to signal it.
    """
    try:
        os.killpg(pgid, 0)  # signal 0 = existence check
    except ProcessLookupError:
        # ESRCH: no process group with this ID exists any more.
        return False
    except PermissionError:
        # EPERM: the group exists but we are not allowed to signal it, so it
        # is alive; don't let the probe itself blow up.
        return True
    return True
|
||
|
|
|
||
|
|
|
||
|
|
def _find_pid_by_cmdline(needle):
    """Return the PID of a running process whose command line contains *needle*.

    Uses psutil when it is installed, looking only at descendants of the
    current process. Without psutil it falls back to scanning ``ps`` output
    for every process on the system. Returns None when nothing matches.
    """
    try:
        import psutil  # optional — fall back to ps if absent
    except ImportError:
        psutil = None

    if psutil is not None:
        for proc in psutil.Process(os.getpid()).children(recursive=True):
            try:
                if needle in " ".join(proc.cmdline()):
                    return proc.pid
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # The process vanished or is unreadable; keep scanning.
                continue
        return None

    # Fall back to ps. ``args`` is the POSIX name for the full command line;
    # the procps-only alias ``cmd`` makes ps fail on BSD/macOS.
    ps = subprocess.run(
        ["ps", "-eo", "pid,ppid,pgid,args"], capture_output=True, text=True,
    )
    for line in ps.stdout.splitlines():
        if needle in line and "grep" not in line:
            parts = line.split()
            if parts and parts[0].isdigit():
                return int(parts[0])
    return None


def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
    """When KeyboardInterrupt arrives mid-poll, the subprocess group must be
    killed before the exception is re-raised.

    The test runs ``env.execute("sleep 30")`` on a worker thread, locates the
    spawned process, injects KeyboardInterrupt into the worker thread, and
    then asserts that the worker exits, that the subprocess's process group
    is gone, and that the worker observed the KeyboardInterrupt.
    """
    import ctypes

    env = LocalEnvironment(cwd="/tmp")
    try:
        result_holder = {}

        # Drive execute() on a separate thread so we can interrupt it via a
        # thread-targeted exception without killing our test process.
        def worker():
            # Spawn a subprocess that will definitely be alive long enough
            # to observe the cleanup, via env.execute(...) — the normal path
            # that goes through _wait_for_process.
            try:
                result_holder["result"] = env.execute("sleep 30", timeout=60)
            except BaseException as e:  # noqa: BLE001 — we want to observe it
                result_holder["exception"] = type(e).__name__

        t = threading.Thread(target=worker, daemon=True)
        t.start()

        # Wait until the subprocess actually exists. LocalEnvironment.execute
        # does init_session() (one spawn) before the real command, so we need
        # to poll until a process running 'sleep 30' is visible.
        deadline = time.monotonic() + 5.0
        target_pid = None
        while time.monotonic() < deadline:
            target_pid = _find_pid_by_cmdline("sleep 30")
            if target_pid:
                break
            time.sleep(0.1)

        assert target_pid is not None, (
            "test setup: couldn't find 'sleep 30' subprocess after 5 s"
        )
        pgid = os.getpgid(target_pid)
        assert _pgid_still_alive(pgid), "sanity: subprocess should be alive"

        # Now inject a KeyboardInterrupt into the worker thread. We use
        # ctypes.PyThreadState_SetAsyncExc, which is how signal delivery to
        # non-main threads is simulated. It targets the thread ident, not
        # the Thread object.
        tid = t.ident
        assert tid is not None
        ret = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            ctypes.c_ulong(tid), ctypes.py_object(KeyboardInterrupt),
        )
        assert ret == 1, f"SetAsyncExc returned {ret}, expected 1"

        # Give the worker a moment to: hit the exception at the next poll,
        # run the except-block cleanup (_kill_process), and exit.
        t.join(timeout=5.0)
        assert not t.is_alive(), "worker didn't exit within 5 s of the interrupt"

        # The critical assertion: the subprocess GROUP must be dead. Not
        # just the bash wrapper — the 'sleep 30' child too.
        # Give the SIGTERM+1s wait+SIGKILL escalation a moment to complete.
        deadline = time.monotonic() + 3.0
        while time.monotonic() < deadline:
            if not _pgid_still_alive(pgid):
                break
            time.sleep(0.1)
        assert not _pgid_still_alive(pgid), (
            f"subprocess group {pgid} is STILL ALIVE after worker received "
            "KeyboardInterrupt — orphan bug regressed. This is the "
            "sleep-300-survives-SIGTERM scenario from Physikal's Apr 2026 "
            "report. See tools/environments/base.py _wait_for_process "
            "except-block."
        )
        # And the worker should have observed the KeyboardInterrupt (i.e.
        # it re-raised cleanly, not silently swallowed).
        assert result_holder.get("exception") == "KeyboardInterrupt", (
            f"worker result: {result_holder!r} — expected KeyboardInterrupt "
            "propagation after cleanup"
        )
    finally:
        # Best-effort teardown: a cleanup failure must not mask the real
        # assertion result of the test.
        try:
            env.cleanup()
        except Exception:
            pass
|