fix(setup+gateway): defer config write, PID-based gateway kill, scoped systemd service names (#1499)

fix(setup+gateway): defer config write, PID-based gateway kill, scoped systemd service names
This commit is contained in:
Teknium
2026-03-16 04:58:12 -07:00
committed by GitHub
10 changed files with 137 additions and 56 deletions

View File

@@ -119,14 +119,35 @@ def is_windows() -> bool:
# Service Configuration
# =============================================================================
SERVICE_NAME = "hermes-gateway"
_SERVICE_BASE = "hermes-gateway"
SERVICE_DESCRIPTION = "Hermes Agent Gateway - Messaging Platform Integration"
def get_service_name() -> str:
"""Derive a systemd service name scoped to this HERMES_HOME.
Default ``~/.hermes`` returns ``hermes-gateway`` (backward compatible).
Any other HERMES_HOME appends a short hash so multiple installations
can each have their own systemd service without conflicting.
"""
import hashlib
from pathlib import Path as _Path # local import to avoid monkeypatch interference
home = _Path(os.getenv("HERMES_HOME", _Path.home() / ".hermes")).resolve()
default = (_Path.home() / ".hermes").resolve()
if home == default:
return _SERVICE_BASE
suffix = hashlib.sha256(str(home).encode()).hexdigest()[:8]
return f"{_SERVICE_BASE}-{suffix}"
SERVICE_NAME = _SERVICE_BASE # backward-compat for external importers; prefer get_service_name()
def get_systemd_unit_path(system: bool = False) -> Path:
name = get_service_name()
if system:
return Path("/etc/systemd/system") / f"{SERVICE_NAME}.service"
return Path.home() / ".config" / "systemd" / "user" / f"{SERVICE_NAME}.service"
return Path("/etc/systemd/system") / f"{name}.service"
return Path.home() / ".config" / "systemd" / "user" / f"{name}.service"
def _systemctl_cmd(system: bool = False) -> list[str]:
@@ -362,6 +383,8 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
sane_path = f"{venv_bin}:{node_bin}:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
hermes_cli = shutil.which("hermes") or f"{python_path} -m hermes_cli.main"
hermes_home = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")).resolve())
if system:
username, group_name, home_dir = _system_service_identity(run_as_user)
return f"""[Unit]
@@ -380,6 +403,7 @@ Environment="USER={username}"
Environment="LOGNAME={username}"
Environment="PATH={sane_path}"
Environment="VIRTUAL_ENV={venv_dir}"
Environment="HERMES_HOME={hermes_home}"
Restart=on-failure
RestartSec=10
KillMode=mixed
@@ -403,6 +427,7 @@ ExecStop={hermes_cli} gateway stop
WorkingDirectory={working_dir}
Environment="PATH={sane_path}"
Environment="VIRTUAL_ENV={venv_dir}"
Environment="HERMES_HOME={hermes_home}"
Restart=on-failure
RestartSec=10
KillMode=mixed
@@ -455,7 +480,7 @@ def _print_linger_enable_warning(username: str, detail: str | None = None) -> No
print(f" sudo loginctl enable-linger {username}")
print()
print(" Then restart the gateway:")
print(f" systemctl --user restart {SERVICE_NAME}.service")
print(f" systemctl --user restart {get_service_name()}.service")
print()
@@ -526,7 +551,7 @@ def systemd_install(force: bool = False, system: bool = False, run_as_user: str
unit_path.write_text(generate_systemd_unit(system=system, run_as_user=run_as_user), encoding="utf-8")
subprocess.run(_systemctl_cmd(system) + ["daemon-reload"], check=True)
subprocess.run(_systemctl_cmd(system) + ["enable", SERVICE_NAME], check=True)
subprocess.run(_systemctl_cmd(system) + ["enable", get_service_name()], check=True)
print()
print(f"{_service_scope_label(system).capitalize()} service installed and enabled!")
@@ -534,7 +559,7 @@ def systemd_install(force: bool = False, system: bool = False, run_as_user: str
print("Next steps:")
print(f" {'sudo ' if system else ''}hermes gateway start{scope_flag} # Start the service")
print(f" {'sudo ' if system else ''}hermes gateway status{scope_flag} # Check status")
print(f" {'journalctl' if system else 'journalctl --user'} -u {SERVICE_NAME} -f # View logs")
print(f" {'journalctl' if system else 'journalctl --user'} -u {get_service_name()} -f # View logs")
print()
if system:
@@ -552,8 +577,8 @@ def systemd_uninstall(system: bool = False):
if system:
_require_root_for_system_service("uninstall")
subprocess.run(_systemctl_cmd(system) + ["stop", SERVICE_NAME], check=False)
subprocess.run(_systemctl_cmd(system) + ["disable", SERVICE_NAME], check=False)
subprocess.run(_systemctl_cmd(system) + ["stop", get_service_name()], check=False)
subprocess.run(_systemctl_cmd(system) + ["disable", get_service_name()], check=False)
unit_path = get_systemd_unit_path(system=system)
if unit_path.exists():
@@ -569,7 +594,7 @@ def systemd_start(system: bool = False):
if system:
_require_root_for_system_service("start")
refresh_systemd_unit_if_needed(system=system)
subprocess.run(_systemctl_cmd(system) + ["start", SERVICE_NAME], check=True)
subprocess.run(_systemctl_cmd(system) + ["start", get_service_name()], check=True)
print(f"{_service_scope_label(system).capitalize()} service started")
@@ -578,7 +603,7 @@ def systemd_stop(system: bool = False):
system = _select_systemd_scope(system)
if system:
_require_root_for_system_service("stop")
subprocess.run(_systemctl_cmd(system) + ["stop", SERVICE_NAME], check=True)
subprocess.run(_systemctl_cmd(system) + ["stop", get_service_name()], check=True)
print(f"{_service_scope_label(system).capitalize()} service stopped")
@@ -588,7 +613,7 @@ def systemd_restart(system: bool = False):
if system:
_require_root_for_system_service("restart")
refresh_systemd_unit_if_needed(system=system)
subprocess.run(_systemctl_cmd(system) + ["restart", SERVICE_NAME], check=True)
subprocess.run(_systemctl_cmd(system) + ["restart", get_service_name()], check=True)
print(f"{_service_scope_label(system).capitalize()} service restarted")
@@ -613,12 +638,12 @@ def systemd_status(deep: bool = False, system: bool = False):
print()
subprocess.run(
_systemctl_cmd(system) + ["status", SERVICE_NAME, "--no-pager"],
_systemctl_cmd(system) + ["status", get_service_name(), "--no-pager"],
capture_output=False,
)
result = subprocess.run(
_systemctl_cmd(system) + ["is-active", SERVICE_NAME],
_systemctl_cmd(system) + ["is-active", get_service_name()],
capture_output=True,
text=True,
)
@@ -657,7 +682,7 @@ def systemd_status(deep: bool = False, system: bool = False):
if deep:
print()
print("Recent logs:")
subprocess.run(_journalctl_cmd(system) + ["-u", SERVICE_NAME, "-n", "20", "--no-pager"])
subprocess.run(_journalctl_cmd(system) + ["-u", get_service_name(), "-n", "20", "--no-pager"])
# =============================================================================
@@ -1118,7 +1143,7 @@ def _is_service_running() -> bool:
if user_unit_exists:
result = subprocess.run(
_systemctl_cmd(False) + ["is-active", SERVICE_NAME],
_systemctl_cmd(False) + ["is-active", get_service_name()],
capture_output=True, text=True
)
if result.stdout.strip() == "active":
@@ -1126,7 +1151,7 @@ def _is_service_running() -> bool:
if system_unit_exists:
result = subprocess.run(
_systemctl_cmd(True) + ["is-active", SERVICE_NAME],
_systemctl_cmd(True) + ["is-active", get_service_name()],
capture_output=True, text=True
)
if result.stdout.strip() == "active":

View File

@@ -2301,26 +2301,60 @@ def cmd_update(args):
print()
print("✓ Update complete!")
# Auto-restart gateway if it's running as a systemd service
# Auto-restart gateway if it's running.
# Uses the PID file (scoped to HERMES_HOME) to find this
# installation's gateway — safe with multiple installations.
try:
check = subprocess.run(
["systemctl", "--user", "is-active", "hermes-gateway"],
capture_output=True, text=True, timeout=5,
)
if check.stdout.strip() == "active":
print()
print("→ Gateway service is running — restarting to pick up changes...")
restart = subprocess.run(
["systemctl", "--user", "restart", "hermes-gateway"],
capture_output=True, text=True, timeout=15,
from gateway.status import get_running_pid, remove_pid_file
from hermes_cli.gateway import get_service_name
import signal as _signal
_gw_service_name = get_service_name()
existing_pid = get_running_pid()
has_systemd_service = False
try:
check = subprocess.run(
["systemctl", "--user", "is-active", _gw_service_name],
capture_output=True, text=True, timeout=5,
)
if restart.returncode == 0:
print("✓ Gateway restarted.")
else:
print(f"⚠ Gateway restart failed: {restart.stderr.strip()}")
print(" Try manually: hermes gateway restart")
except (FileNotFoundError, subprocess.TimeoutExpired):
pass # No systemd (macOS, WSL1, etc.) — skip silently
has_systemd_service = check.stdout.strip() == "active"
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
if existing_pid or has_systemd_service:
print()
# Kill the PID-file-tracked process (may be manual or systemd)
if existing_pid:
try:
os.kill(existing_pid, _signal.SIGTERM)
print(f"→ Stopped gateway process (PID {existing_pid})")
except ProcessLookupError:
pass # Already gone
except PermissionError:
print(f"⚠ Permission denied killing gateway PID {existing_pid}")
remove_pid_file()
# Restart the systemd service (starts a fresh process)
if has_systemd_service:
import time as _time
_time.sleep(1) # Brief pause for port/socket release
print("→ Restarting gateway service...")
restart = subprocess.run(
["systemctl", "--user", "restart", _gw_service_name],
capture_output=True, text=True, timeout=15,
)
if restart.returncode == 0:
print("✓ Gateway restarted.")
else:
print(f"⚠ Gateway restart failed: {restart.stderr.strip()}")
print(" Try manually: hermes gateway restart")
elif existing_pid:
print(" Gateway was running manually (not as a service).")
print(" Restart it with: hermes gateway run")
except Exception as e:
logger.debug("Gateway restart during update failed: %s", e)
print()
print("Tip: You can now select a provider and model:")

View File

@@ -743,6 +743,7 @@ def setup_model_provider(config: dict):
selected_provider = (
None # "nous", "openai-codex", "openrouter", "custom", or None (keep)
)
selected_base_url = None # deferred until after model selection
nous_models = [] # populated if Nous login succeeds
if provider_idx == 0: # Nous Portal (OAuth)
@@ -1025,8 +1026,8 @@ def setup_model_provider(config: dict):
if existing_custom:
save_env_value("OPENAI_BASE_URL", "")
save_env_value("OPENAI_API_KEY", "")
_update_config_for_provider("zai", zai_base_url, default_model="glm-5")
_set_model_provider(config, "zai", zai_base_url)
selected_base_url = zai_base_url
elif provider_idx == 5: # Kimi / Moonshot
selected_provider = "kimi-coding"
@@ -1058,8 +1059,8 @@ def setup_model_provider(config: dict):
if existing_custom:
save_env_value("OPENAI_BASE_URL", "")
save_env_value("OPENAI_API_KEY", "")
_update_config_for_provider("kimi-coding", pconfig.inference_base_url, default_model="kimi-k2.5")
_set_model_provider(config, "kimi-coding", pconfig.inference_base_url)
selected_base_url = pconfig.inference_base_url
elif provider_idx == 6: # MiniMax
selected_provider = "minimax"
@@ -1091,8 +1092,8 @@ def setup_model_provider(config: dict):
if existing_custom:
save_env_value("OPENAI_BASE_URL", "")
save_env_value("OPENAI_API_KEY", "")
_update_config_for_provider("minimax", pconfig.inference_base_url, default_model="MiniMax-M2.5")
_set_model_provider(config, "minimax", pconfig.inference_base_url)
selected_base_url = pconfig.inference_base_url
elif provider_idx == 7: # MiniMax China
selected_provider = "minimax-cn"
@@ -1124,8 +1125,8 @@ def setup_model_provider(config: dict):
if existing_custom:
save_env_value("OPENAI_BASE_URL", "")
save_env_value("OPENAI_API_KEY", "")
_update_config_for_provider("minimax-cn", pconfig.inference_base_url, default_model="MiniMax-M2.5")
_set_model_provider(config, "minimax-cn", pconfig.inference_base_url)
selected_base_url = pconfig.inference_base_url
elif provider_idx == 8: # Anthropic
selected_provider = "anthropic"
@@ -1228,8 +1229,8 @@ def setup_model_provider(config: dict):
save_env_value("OPENAI_API_KEY", "")
# Don't save base_url for Anthropic — resolve_runtime_provider()
# always hardcodes it. Stale base_urls contaminate other providers.
_update_config_for_provider("anthropic", "", default_model="claude-opus-4-6")
_set_model_provider(config, "anthropic")
selected_base_url = ""
# else: provider_idx == 9 (Keep current) — only shown when a provider already exists
# Normalize "keep current" to an explicit provider so downstream logic
@@ -1459,6 +1460,12 @@ def setup_model_provider(config: dict):
)
print_success(f"Model set to: {_display}")
# Write provider+base_url to config.yaml only after model selection is complete.
# This prevents a race condition where the gateway picks up a new provider
# before the model name has been updated to match.
if selected_provider in ("zai", "kimi-coding", "minimax", "minimax-cn", "anthropic") and selected_base_url is not None:
_update_config_for_provider(selected_provider, selected_base_url)
save_config(config)

View File

@@ -275,8 +275,13 @@ def show_status(args):
print(color("◆ Gateway Service", Colors.CYAN, Colors.BOLD))
if sys.platform.startswith('linux'):
try:
from hermes_cli.gateway import get_service_name
_gw_svc = get_service_name()
except Exception:
_gw_svc = "hermes-gateway"
result = subprocess.run(
["systemctl", "--user", "is-active", "hermes-gateway"],
["systemctl", "--user", "is-active", _gw_svc],
capture_output=True,
text=True
)

View File

@@ -133,7 +133,13 @@ def uninstall_gateway_service():
if platform.system() != "Linux":
return False
service_file = Path.home() / ".config" / "systemd" / "user" / "hermes-gateway.service"
try:
from hermes_cli.gateway import get_service_name
svc_name = get_service_name()
except Exception:
svc_name = "hermes-gateway"
service_file = Path.home() / ".config" / "systemd" / "user" / f"{svc_name}.service"
if not service_file.exists():
return False
@@ -141,14 +147,14 @@ def uninstall_gateway_service():
try:
# Stop the service
subprocess.run(
["systemctl", "--user", "stop", "hermes-gateway"],
["systemctl", "--user", "stop", svc_name],
capture_output=True,
check=False
)
# Disable the service
subprocess.run(
["systemctl", "--user", "disable", "hermes-gateway"],
["systemctl", "--user", "disable", svc_name],
capture_output=True,
check=False
)

View File

@@ -39,7 +39,7 @@ def test_systemd_status_warns_when_linger_disabled(monkeypatch, tmp_path, capsys
monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (False, ""))
def fake_run(cmd, capture_output=False, text=False, check=False):
if cmd[:4] == ["systemctl", "--user", "status", gateway.SERVICE_NAME]:
if cmd[:4] == ["systemctl", "--user", "status", gateway.get_service_name()]:
return SimpleNamespace(returncode=0, stdout="", stderr="")
if cmd[:3] == ["systemctl", "--user", "is-active"]:
return SimpleNamespace(returncode=0, stdout="active\n", stderr="")
@@ -76,7 +76,7 @@ def test_systemd_install_checks_linger_status(monkeypatch, tmp_path, capsys):
assert unit_path.exists()
assert [cmd for cmd, _ in calls] == [
["systemctl", "--user", "daemon-reload"],
["systemctl", "--user", "enable", gateway.SERVICE_NAME],
["systemctl", "--user", "enable", gateway.get_service_name()],
]
assert helper_calls == [True]
assert "User service installed and enabled" in out
@@ -110,7 +110,7 @@ def test_systemd_install_system_scope_skips_linger_and_uses_systemctl(monkeypatc
assert unit_path.read_text(encoding="utf-8") == "scope=True user=alice\n"
assert [cmd for cmd, _ in calls] == [
["systemctl", "daemon-reload"],
["systemctl", "enable", gateway.SERVICE_NAME],
["systemctl", "enable", gateway.get_service_name()],
]
assert helper_calls == []
assert "Configured to run as: alice" not in out # generated test unit has no User= line

View File

@@ -114,7 +114,7 @@ def test_systemd_install_calls_linger_helper(monkeypatch, tmp_path, capsys):
assert unit_path.exists()
assert [cmd for cmd, _ in calls] == [
["systemctl", "--user", "daemon-reload"],
["systemctl", "--user", "enable", gateway.SERVICE_NAME],
["systemctl", "--user", "enable", gateway.get_service_name()],
]
assert helper_calls == [True]
assert "User service installed and enabled" in out

View File

@@ -26,7 +26,7 @@ class TestSystemdServiceRefresh:
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
assert calls[:2] == [
["systemctl", "--user", "daemon-reload"],
["systemctl", "--user", "start", gateway_cli.SERVICE_NAME],
["systemctl", "--user", "start", gateway_cli.get_service_name()],
]
def test_systemd_restart_refreshes_outdated_unit(self, tmp_path, monkeypatch):
@@ -49,7 +49,7 @@ class TestSystemdServiceRefresh:
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
assert calls[:2] == [
["systemctl", "--user", "daemon-reload"],
["systemctl", "--user", "restart", gateway_cli.SERVICE_NAME],
["systemctl", "--user", "restart", gateway_cli.get_service_name()],
]
@@ -92,9 +92,9 @@ class TestGatewayServiceDetection:
)
def fake_run(cmd, capture_output=True, text=True, **kwargs):
if cmd == ["systemctl", "--user", "is-active", gateway_cli.SERVICE_NAME]:
if cmd == ["systemctl", "--user", "is-active", gateway_cli.get_service_name()]:
return SimpleNamespace(returncode=0, stdout="inactive\n", stderr="")
if cmd == ["systemctl", "is-active", gateway_cli.SERVICE_NAME]:
if cmd == ["systemctl", "is-active", gateway_cli.get_service_name()]:
return SimpleNamespace(returncode=0, stdout="active\n", stderr="")
raise AssertionError(f"Unexpected command: {cmd}")

View File

@@ -34,7 +34,7 @@ All variables go in `~/.hermes/.env`. You can also set them with `hermes config
| `VOICE_TOOLS_OPENAI_KEY` | Preferred OpenAI key for OpenAI speech-to-text and text-to-speech providers |
| `HERMES_LOCAL_STT_COMMAND` | Optional local speech-to-text command template. Supports `{input_path}`, `{output_dir}`, `{language}`, and `{model}` placeholders |
| `HERMES_LOCAL_STT_LANGUAGE` | Default language passed to `HERMES_LOCAL_STT_COMMAND` or auto-detected local `whisper` CLI fallback (default: `en`) |
| `HERMES_HOME` | Override Hermes config directory (default: `~/.hermes`) |
| `HERMES_HOME` | Override Hermes config directory (default: `~/.hermes`). Also scopes the gateway PID file and systemd service name, so multiple installations can run concurrently |
## Provider Auth (OAuth)

View File

@@ -244,10 +244,10 @@ Background tasks on messaging platforms are fire-and-forget — you don't need t
```bash
hermes gateway install # Install as user service
systemctl --user start hermes-gateway
systemctl --user stop hermes-gateway
systemctl --user status hermes-gateway
journalctl --user -u hermes-gateway -f
hermes gateway start # Start the service
hermes gateway stop # Stop the service
hermes gateway status # Check status
journalctl --user -u hermes-gateway -f # View logs
# Enable lingering (keeps running after logout)
sudo loginctl enable-linger $USER
@@ -263,6 +263,10 @@ Use the user service on laptops and dev boxes. Use the system service on VPS or
Avoid keeping both the user and system gateway units installed at once unless you really mean to. Hermes will warn if it detects both because start/stop/status behavior gets ambiguous.
:::info Multiple installations
If you run multiple Hermes installations on the same machine (with different `HERMES_HOME` directories), each gets its own systemd service name. The default `~/.hermes` uses `hermes-gateway`; other installations use `hermes-gateway-<hash>`. The `hermes gateway` commands automatically target the correct service for your current `HERMES_HOME`.
:::
### macOS (launchd)
```bash