Compare commits

...

2 Commits

Author SHA1 Message Date
teknium1
f1e27d8138 fix(web_server): preserve action exit code after reaping zombie proc
Follow-up on the zombie-reap fix: once the Popen handle is reaped and
dropped from _ACTION_PROCS, migrate the exit code/pid into _ACTION_RESULTS
so subsequent /api/actions/{name}/status polls keep reporting the real
result instead of falling back to None. The dashboard polls repeatedly,
so without this the status flips from 'exited N' to 'unknown' on the
next poll.
2026-06-07 19:46:05 -07:00
liuhao1024
866d54a41c fix(web_server): reap finished action subprocesses to prevent zombie accumulation
get_action_status() calls proc.poll() to check if a dashboard action
has finished, but never calls proc.wait() afterward.  On POSIX systems
the kernel retains the process table entry until a blocking waitpid()
is issued, so every completed action remains as a zombie for the
lifetime of the web server.

After poll() returns a non-None exit code, call proc.wait(timeout=1)
to reap the child and remove the handle from _ACTION_PROCS.

Fixes #38032
2026-06-07 19:44:57 -07:00
2 changed files with 55 additions and 0 deletions

View File

@@ -1691,6 +1691,17 @@ async def get_action_status(name: str, lines: int = 200):
exit_code = proc.poll()
running = exit_code is None
pid = proc.pid
if not running:
# Reap the finished child to prevent zombie accumulation.
try:
proc.wait(timeout=1)
except Exception:
pass
_ACTION_PROCS.pop(name, None)
# Preserve the result so subsequent polls keep reporting the real
# exit code/pid instead of falling back to None once the handle
# is gone.
_ACTION_RESULTS[name] = {"exit_code": exit_code, "pid": pid}
return {
"name": name,

View File

@@ -823,6 +823,50 @@ class TestWebServerEndpoints:
assert resp.json() == {"ok": True, "pid": 12345, "name": "hermes-update"}
assert calls == [(["update"], "hermes-update")]
def test_finished_action_proc_is_reaped_and_removed(self):
    """An exited action process is waited on and dropped from _ACTION_PROCS.

    The status endpoint is polled twice. After the first poll the stub's
    ``wait()`` must have been called and its handle removed from
    ``_ACTION_PROCS``. The second poll, made once the handle is gone, must
    still report the same exit code and pid.
    """
    import hermes_cli.web_server as web_server

    wait_timeouts = []

    class FinishedProc:
        # Minimal process stand-in that has already exited with code 0.
        pid = 99999

        def poll(self):
            return 0

        def wait(self, timeout=None):
            # Record the call so the test can prove the proc was reaped.
            wait_timeouts.append(timeout)
            return 0

    name = "gateway-restart"

    def assert_reports_finished():
        # One status poll plus the checks shared by both polls.
        resp = self.client.get(f"/api/actions/{name}/status")
        assert resp.status_code == 200
        payload = resp.json()
        assert payload["running"] is False
        assert payload["exit_code"] == 0
        assert payload["pid"] == 99999

    web_server._ACTION_PROCS[name] = FinishedProc()
    # Start from a clean slate so a stale cached result cannot mask a failure.
    web_server._ACTION_RESULTS.pop(name, None)
    try:
        assert_reports_finished()

        # The proc should have been reaped and removed.
        assert wait_timeouts, "proc.wait() was not called"
        assert name not in web_server._ACTION_PROCS

        # With the handle gone, the real exit code/pid must still be
        # reported rather than None.
        assert_reports_finished()
    finally:
        # Leave the module-level registries as other tests expect them.
        web_server._ACTION_PROCS.pop(name, None)
        web_server._ACTION_RESULTS.pop(name, None)
def test_get_status_filters_unconfigured_gateway_platforms(self, monkeypatch):
import gateway.config as gateway_config
import hermes_cli.web_server as web_server