Compare commits

...

1 Commits

Author SHA1 Message Date
Teknium
57c8e5143c fix(cli): add missing subprocess.run() timeouts in gateway CLI
All 35 subprocess.run() calls in hermes_cli/gateway.py lacked timeout
parameters. If systemctl, launchctl, loginctl, wmic, or ps blocks,
hermes gateway start/stop/restart/status/install/uninstall hangs
indefinitely with no feedback.

Timeouts tiered by operation type:
- 10s: instant queries (is-active, status, list, ps, tail, journalctl)
- 30s: fast lifecycle (daemon-reload, enable, start, bootstrap, kickstart)
- 90s: graceful shutdown (stop, restart, bootout, kickstart -k) — exceeds
  our TimeoutStopSec=60 to avoid premature timeout during shutdown

Special handling: _is_service_running() and launchd_status() catch
TimeoutExpired and treat it as not-running/not-loaded, consistent with
how non-zero return codes are already handled.

Inspired by PR #3732 (dlkakbs) and issue #4057 (SHL0MS).
Reimplemented on current main which has significantly changed launchctl
handling (bootout/bootstrap/kickstart vs legacy load/unload/start/stop).
2026-04-05 22:41:13 -07:00
3 changed files with 75 additions and 54 deletions

View File

@@ -43,7 +43,7 @@ def find_gateway_pids() -> list:
# Windows: use wmic to search command lines # Windows: use wmic to search command lines
result = subprocess.run( result = subprocess.run(
["wmic", "process", "get", "ProcessId,CommandLine", "/FORMAT:LIST"], ["wmic", "process", "get", "ProcessId,CommandLine", "/FORMAT:LIST"],
capture_output=True, text=True capture_output=True, text=True, timeout=10
) )
# Parse WMIC LIST output: blocks of "CommandLine=...\nProcessId=...\n" # Parse WMIC LIST output: blocks of "CommandLine=...\nProcessId=...\n"
current_cmd = "" current_cmd = ""
@@ -65,7 +65,8 @@ def find_gateway_pids() -> list:
result = subprocess.run( result = subprocess.run(
["ps", "aux"], ["ps", "aux"],
capture_output=True, capture_output=True,
text=True text=True,
timeout=10,
) )
for line in result.stdout.split('\n'): for line in result.stdout.split('\n'):
# Skip grep and current process # Skip grep and current process
@@ -402,6 +403,7 @@ def get_systemd_linger_status() -> tuple[bool | None, str]:
capture_output=True, capture_output=True,
text=True, text=True,
check=False, check=False,
timeout=10,
) )
except Exception as e: except Exception as e:
return None, str(e) return None, str(e)
@@ -636,7 +638,7 @@ def refresh_systemd_unit_if_needed(system: bool = False) -> bool:
expected_user = _read_systemd_user_from_unit(unit_path) if system else None expected_user = _read_systemd_user_from_unit(unit_path) if system else None
unit_path.write_text(generate_systemd_unit(system=system, run_as_user=expected_user), encoding="utf-8") unit_path.write_text(generate_systemd_unit(system=system, run_as_user=expected_user), encoding="utf-8")
subprocess.run(_systemctl_cmd(system) + ["daemon-reload"], check=True) subprocess.run(_systemctl_cmd(system) + ["daemon-reload"], check=True, timeout=30)
print(f"↻ Updated gateway {_service_scope_label(system)} service definition to match the current Hermes install") print(f"↻ Updated gateway {_service_scope_label(system)} service definition to match the current Hermes install")
return True return True
@@ -687,6 +689,7 @@ def _ensure_linger_enabled() -> None:
capture_output=True, capture_output=True,
text=True, text=True,
check=False, check=False,
timeout=30,
) )
except Exception as e: except Exception as e:
_print_linger_enable_warning(username, str(e)) _print_linger_enable_warning(username, str(e))
@@ -717,7 +720,7 @@ def systemd_install(force: bool = False, system: bool = False, run_as_user: str
if not systemd_unit_is_current(system=system): if not systemd_unit_is_current(system=system):
print(f"↻ Repairing outdated {_service_scope_label(system)} systemd service at: {unit_path}") print(f"↻ Repairing outdated {_service_scope_label(system)} systemd service at: {unit_path}")
refresh_systemd_unit_if_needed(system=system) refresh_systemd_unit_if_needed(system=system)
subprocess.run(_systemctl_cmd(system) + ["enable", get_service_name()], check=True) subprocess.run(_systemctl_cmd(system) + ["enable", get_service_name()], check=True, timeout=30)
print(f"{_service_scope_label(system).capitalize()} service definition updated") print(f"{_service_scope_label(system).capitalize()} service definition updated")
return return
print(f"Service already installed at: {unit_path}") print(f"Service already installed at: {unit_path}")
@@ -728,8 +731,8 @@ def systemd_install(force: bool = False, system: bool = False, run_as_user: str
print(f"Installing {_service_scope_label(system)} systemd service to: {unit_path}") print(f"Installing {_service_scope_label(system)} systemd service to: {unit_path}")
unit_path.write_text(generate_systemd_unit(system=system, run_as_user=run_as_user), encoding="utf-8") unit_path.write_text(generate_systemd_unit(system=system, run_as_user=run_as_user), encoding="utf-8")
subprocess.run(_systemctl_cmd(system) + ["daemon-reload"], check=True) subprocess.run(_systemctl_cmd(system) + ["daemon-reload"], check=True, timeout=30)
subprocess.run(_systemctl_cmd(system) + ["enable", get_service_name()], check=True) subprocess.run(_systemctl_cmd(system) + ["enable", get_service_name()], check=True, timeout=30)
print() print()
print(f"{_service_scope_label(system).capitalize()} service installed and enabled!") print(f"{_service_scope_label(system).capitalize()} service installed and enabled!")
@@ -755,15 +758,15 @@ def systemd_uninstall(system: bool = False):
if system: if system:
_require_root_for_system_service("uninstall") _require_root_for_system_service("uninstall")
subprocess.run(_systemctl_cmd(system) + ["stop", get_service_name()], check=False) subprocess.run(_systemctl_cmd(system) + ["stop", get_service_name()], check=False, timeout=90)
subprocess.run(_systemctl_cmd(system) + ["disable", get_service_name()], check=False) subprocess.run(_systemctl_cmd(system) + ["disable", get_service_name()], check=False, timeout=30)
unit_path = get_systemd_unit_path(system=system) unit_path = get_systemd_unit_path(system=system)
if unit_path.exists(): if unit_path.exists():
unit_path.unlink() unit_path.unlink()
print(f"✓ Removed {unit_path}") print(f"✓ Removed {unit_path}")
subprocess.run(_systemctl_cmd(system) + ["daemon-reload"], check=True) subprocess.run(_systemctl_cmd(system) + ["daemon-reload"], check=True, timeout=30)
print(f"{_service_scope_label(system).capitalize()} service uninstalled") print(f"{_service_scope_label(system).capitalize()} service uninstalled")
@@ -772,7 +775,7 @@ def systemd_start(system: bool = False):
if system: if system:
_require_root_for_system_service("start") _require_root_for_system_service("start")
refresh_systemd_unit_if_needed(system=system) refresh_systemd_unit_if_needed(system=system)
subprocess.run(_systemctl_cmd(system) + ["start", get_service_name()], check=True) subprocess.run(_systemctl_cmd(system) + ["start", get_service_name()], check=True, timeout=30)
print(f"{_service_scope_label(system).capitalize()} service started") print(f"{_service_scope_label(system).capitalize()} service started")
@@ -781,7 +784,7 @@ def systemd_stop(system: bool = False):
system = _select_systemd_scope(system) system = _select_systemd_scope(system)
if system: if system:
_require_root_for_system_service("stop") _require_root_for_system_service("stop")
subprocess.run(_systemctl_cmd(system) + ["stop", get_service_name()], check=True) subprocess.run(_systemctl_cmd(system) + ["stop", get_service_name()], check=True, timeout=90)
print(f"{_service_scope_label(system).capitalize()} service stopped") print(f"{_service_scope_label(system).capitalize()} service stopped")
@@ -791,7 +794,7 @@ def systemd_restart(system: bool = False):
if system: if system:
_require_root_for_system_service("restart") _require_root_for_system_service("restart")
refresh_systemd_unit_if_needed(system=system) refresh_systemd_unit_if_needed(system=system)
subprocess.run(_systemctl_cmd(system) + ["restart", get_service_name()], check=True) subprocess.run(_systemctl_cmd(system) + ["restart", get_service_name()], check=True, timeout=90)
print(f"{_service_scope_label(system).capitalize()} service restarted") print(f"{_service_scope_label(system).capitalize()} service restarted")
@@ -818,12 +821,14 @@ def systemd_status(deep: bool = False, system: bool = False):
subprocess.run( subprocess.run(
_systemctl_cmd(system) + ["status", get_service_name(), "--no-pager"], _systemctl_cmd(system) + ["status", get_service_name(), "--no-pager"],
capture_output=False, capture_output=False,
timeout=10,
) )
result = subprocess.run( result = subprocess.run(
_systemctl_cmd(system) + ["is-active", get_service_name()], _systemctl_cmd(system) + ["is-active", get_service_name()],
capture_output=True, capture_output=True,
text=True, text=True,
timeout=10,
) )
status = result.stdout.strip() status = result.stdout.strip()
@@ -860,7 +865,7 @@ def systemd_status(deep: bool = False, system: bool = False):
if deep: if deep:
print() print()
print("Recent logs:") print("Recent logs:")
subprocess.run(_journalctl_cmd(system) + ["-u", get_service_name(), "-n", "20", "--no-pager"]) subprocess.run(_journalctl_cmd(system) + ["-u", get_service_name(), "-n", "20", "--no-pager"], timeout=10)
# ============================================================================= # =============================================================================
@@ -979,8 +984,8 @@ def refresh_launchd_plist_if_needed() -> bool:
plist_path.write_text(generate_launchd_plist(), encoding="utf-8") plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
label = get_launchd_label() label = get_launchd_label()
# Bootout/bootstrap so launchd picks up the new definition # Bootout/bootstrap so launchd picks up the new definition
subprocess.run(["launchctl", "bootout", f"{_launchd_domain()}/{label}"], check=False) subprocess.run(["launchctl", "bootout", f"{_launchd_domain()}/{label}"], check=False, timeout=90)
subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=False) subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=False, timeout=30)
print("↻ Updated gateway launchd service definition to match the current Hermes install") print("↻ Updated gateway launchd service definition to match the current Hermes install")
return True return True
@@ -1002,7 +1007,7 @@ def launchd_install(force: bool = False):
print(f"Installing launchd service to: {plist_path}") print(f"Installing launchd service to: {plist_path}")
plist_path.write_text(generate_launchd_plist()) plist_path.write_text(generate_launchd_plist())
subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True) subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True, timeout=30)
print() print()
print("✓ Service installed and loaded!") print("✓ Service installed and loaded!")
@@ -1015,7 +1020,7 @@ def launchd_install(force: bool = False):
def launchd_uninstall(): def launchd_uninstall():
plist_path = get_launchd_plist_path() plist_path = get_launchd_plist_path()
label = get_launchd_label() label = get_launchd_label()
subprocess.run(["launchctl", "bootout", f"{_launchd_domain()}/{label}"], check=False) subprocess.run(["launchctl", "bootout", f"{_launchd_domain()}/{label}"], check=False, timeout=90)
if plist_path.exists(): if plist_path.exists():
plist_path.unlink() plist_path.unlink()
@@ -1032,25 +1037,25 @@ def launchd_start():
print("↻ launchd plist missing; regenerating service definition") print("↻ launchd plist missing; regenerating service definition")
plist_path.parent.mkdir(parents=True, exist_ok=True) plist_path.parent.mkdir(parents=True, exist_ok=True)
plist_path.write_text(generate_launchd_plist(), encoding="utf-8") plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True) subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True, timeout=30)
subprocess.run(["launchctl", "kickstart", f"{_launchd_domain()}/{label}"], check=True) subprocess.run(["launchctl", "kickstart", f"{_launchd_domain()}/{label}"], check=True, timeout=30)
print("✓ Service started") print("✓ Service started")
return return
refresh_launchd_plist_if_needed() refresh_launchd_plist_if_needed()
try: try:
subprocess.run(["launchctl", "kickstart", f"{_launchd_domain()}/{label}"], check=True) subprocess.run(["launchctl", "kickstart", f"{_launchd_domain()}/{label}"], check=True, timeout=30)
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
if e.returncode != 3: if e.returncode != 3:
raise raise
print("↻ launchd job was unloaded; reloading service definition") print("↻ launchd job was unloaded; reloading service definition")
subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True) subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True, timeout=30)
subprocess.run(["launchctl", "kickstart", f"{_launchd_domain()}/{label}"], check=True) subprocess.run(["launchctl", "kickstart", f"{_launchd_domain()}/{label}"], check=True, timeout=30)
print("✓ Service started") print("✓ Service started")
def launchd_stop(): def launchd_stop():
label = get_launchd_label() label = get_launchd_label()
subprocess.run(["launchctl", "kill", "SIGTERM", f"{_launchd_domain()}/{label}"], check=True) subprocess.run(["launchctl", "kill", "SIGTERM", f"{_launchd_domain()}/{label}"], check=True, timeout=30)
print("✓ Service stopped") print("✓ Service stopped")
def _wait_for_gateway_exit(timeout: float = 10.0, force_after: float = 5.0): def _wait_for_gateway_exit(timeout: float = 10.0, force_after: float = 5.0):
@@ -1100,7 +1105,7 @@ def launchd_restart():
# A two-step stop/start from inside the gateway's own process tree # A two-step stop/start from inside the gateway's own process tree
# would kill the shell before the start command is reached. # would kill the shell before the start command is reached.
try: try:
subprocess.run(["launchctl", "kickstart", "-k", target], check=True) subprocess.run(["launchctl", "kickstart", "-k", target], check=True, timeout=90)
print("✓ Service restarted") print("✓ Service restarted")
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
if e.returncode != 3: if e.returncode != 3:
@@ -1108,18 +1113,25 @@ def launchd_restart():
# Job not loaded — bootstrap and start fresh # Job not loaded — bootstrap and start fresh
print("↻ launchd job was unloaded; reloading") print("↻ launchd job was unloaded; reloading")
plist_path = get_launchd_plist_path() plist_path = get_launchd_plist_path()
subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True) subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True, timeout=30)
subprocess.run(["launchctl", "kickstart", target], check=True) subprocess.run(["launchctl", "kickstart", target], check=True, timeout=30)
print("✓ Service restarted") print("✓ Service restarted")
def launchd_status(deep: bool = False): def launchd_status(deep: bool = False):
plist_path = get_launchd_plist_path() plist_path = get_launchd_plist_path()
label = get_launchd_label() label = get_launchd_label()
try:
result = subprocess.run( result = subprocess.run(
["launchctl", "list", label], ["launchctl", "list", label],
capture_output=True, capture_output=True,
text=True text=True,
timeout=10,
) )
loaded = result.returncode == 0
loaded_output = result.stdout
except subprocess.TimeoutExpired:
loaded = False
loaded_output = ""
print(f"Launchd plist: {plist_path}") print(f"Launchd plist: {plist_path}")
if launchd_plist_is_current(): if launchd_plist_is_current():
@@ -1128,9 +1140,9 @@ def launchd_status(deep: bool = False):
print("⚠ Service definition is stale relative to the current Hermes install") print("⚠ Service definition is stale relative to the current Hermes install")
print(" Run: hermes gateway start") print(" Run: hermes gateway start")
if result.returncode == 0: if loaded:
print("✓ Gateway service is loaded") print("✓ Gateway service is loaded")
print(result.stdout) print(loaded_output)
else: else:
print("✗ Gateway service is not loaded") print("✗ Gateway service is not loaded")
print(" Service definition exists locally but launchd has not loaded it.") print(" Service definition exists locally but launchd has not loaded it.")
@@ -1141,7 +1153,7 @@ def launchd_status(deep: bool = False):
if log_file.exists(): if log_file.exists():
print() print()
print("Recent logs:") print("Recent logs:")
subprocess.run(["tail", "-20", str(log_file)]) subprocess.run(["tail", "-20", str(log_file)], timeout=10)
# ============================================================================= # =============================================================================
@@ -1658,28 +1670,37 @@ def _is_service_running() -> bool:
system_unit_exists = get_systemd_unit_path(system=True).exists() system_unit_exists = get_systemd_unit_path(system=True).exists()
if user_unit_exists: if user_unit_exists:
try:
result = subprocess.run( result = subprocess.run(
_systemctl_cmd(False) + ["is-active", get_service_name()], _systemctl_cmd(False) + ["is-active", get_service_name()],
capture_output=True, text=True capture_output=True, text=True, timeout=10,
) )
if result.stdout.strip() == "active": if result.stdout.strip() == "active":
return True return True
except subprocess.TimeoutExpired:
pass
if system_unit_exists: if system_unit_exists:
try:
result = subprocess.run( result = subprocess.run(
_systemctl_cmd(True) + ["is-active", get_service_name()], _systemctl_cmd(True) + ["is-active", get_service_name()],
capture_output=True, text=True capture_output=True, text=True, timeout=10,
) )
if result.stdout.strip() == "active": if result.stdout.strip() == "active":
return True return True
except subprocess.TimeoutExpired:
pass
return False return False
elif is_macos() and get_launchd_plist_path().exists(): elif is_macos() and get_launchd_plist_path().exists():
try:
result = subprocess.run( result = subprocess.run(
["launchctl", "list", get_launchd_label()], ["launchctl", "list", get_launchd_label()],
capture_output=True, text=True capture_output=True, text=True, timeout=10,
) )
return result.returncode == 0 return result.returncode == 0
except subprocess.TimeoutExpired:
return False
# Check for manual processes # Check for manual processes
return len(find_gateway_pids()) > 0 return len(find_gateway_pids()) > 0

View File

@@ -40,7 +40,7 @@ def test_systemd_status_warns_when_linger_disabled(monkeypatch, tmp_path, capsys
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path) monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (False, "")) monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (False, ""))
def fake_run(cmd, capture_output=False, text=False, check=False): def fake_run(cmd, capture_output=False, text=False, check=False, **kwargs):
if cmd[:4] == ["systemctl", "--user", "status", gateway.get_service_name()]: if cmd[:4] == ["systemctl", "--user", "status", gateway.get_service_name()]:
return SimpleNamespace(returncode=0, stdout="", stderr="") return SimpleNamespace(returncode=0, stdout="", stderr="")
if cmd[:3] == ["systemctl", "--user", "is-active"]: if cmd[:3] == ["systemctl", "--user", "is-active"]:

View File

@@ -44,7 +44,7 @@ class TestEnsureLingerEnabled:
run_calls = [] run_calls = []
def fake_run(cmd, capture_output=False, text=False, check=False): def fake_run(cmd, capture_output=False, text=False, check=False, **kwargs):
run_calls.append((cmd, capture_output, text, check)) run_calls.append((cmd, capture_output, text, check))
return SimpleNamespace(returncode=0, stdout="", stderr="") return SimpleNamespace(returncode=0, stdout="", stderr="")