Compare commits

...

3 Commits

Author SHA1 Message Date
teknium1
8825ad20c1 fix(nix): drop --replace + default Restart=on-failure for gateway service
Extends the systemd/launchd --replace removal to the NixOS module: flip the
host-unit Restart default from always to on-failure, and drop --replace from the
container-mode (OCI) create command. Keeps every Hermes-generated service surface
consistent — none default to takeover mode under a service manager.
2026-05-29 19:37:22 -07:00
LeonSGP43
152207c0cc fix(gateway): avoid restart loops on replace 2026-05-29 19:35:18 -07:00
LeonSGP43
e643c79c2c fix(gateway): stop default replace in service runs 2026-05-29 19:35:14 -07:00
3 changed files with 41 additions and 14 deletions

View File

@@ -207,9 +207,11 @@ def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool:
SIGUSR1 is wired in gateway/run.py to ``request_restart(via_service=True)``
which drains in-flight agent runs (up to ``agent.restart_drain_timeout``
seconds), then exits with code 75. Both systemd (``Restart=always``
+ ``RestartForceExitStatus=75``) and launchd (``KeepAlive.SuccessfulExit
= false``) relaunch the process after the graceful exit.
seconds), then exits with code 75. Systemd units generated by Hermes use
``Restart=on-failure`` together with ``RestartForceExitStatus=75`` so the
service is relaunched after the graceful exit without reviving clean
``--replace`` takeovers. launchd still uses ``KeepAlive.SuccessfulExit =
false`` for the same relaunch behavior.
This is the drain-aware alternative to ``systemctl restart`` / ``SIGTERM``,
which SIGKILL in-flight agents after a short timeout.
@@ -565,7 +567,7 @@ def _gateway_run_args_for_profile(profile: str) -> list[str]:
args = [get_python_path(), "-m", "hermes_cli.main"]
if profile != "default":
args.extend(["--profile", profile])
args.extend(["gateway", "run", "--replace"])
args.extend(["gateway", "run"])
return args
@@ -2240,7 +2242,7 @@ StartLimitIntervalSec=0
Type=simple
User={username}
Group={group_name}
ExecStart={python_path} -m hermes_cli.main{f" {profile_arg}" if profile_arg else ""} gateway run --replace
ExecStart={python_path} -m hermes_cli.main{f" {profile_arg}" if profile_arg else ""} gateway run
WorkingDirectory={working_dir}
Environment="HOME={home_dir}"
Environment="USER={username}"
@@ -2248,7 +2250,7 @@ Environment="LOGNAME={username}"
Environment="PATH={sane_path}"
Environment="VIRTUAL_ENV={venv_dir}"
Environment="HERMES_HOME={hermes_home}"
Restart=always
Restart=on-failure
RestartSec=5
RestartMaxDelaySec=300
RestartSteps=5
@@ -2278,12 +2280,12 @@ StartLimitIntervalSec=0
[Service]
Type=simple
ExecStart={python_path} -m hermes_cli.main{f" {profile_arg}" if profile_arg else ""} gateway run --replace
ExecStart={python_path} -m hermes_cli.main{f" {profile_arg}" if profile_arg else ""} gateway run
WorkingDirectory={working_dir}
Environment="PATH={sane_path}"
Environment="VIRTUAL_ENV={venv_dir}"
Environment="HERMES_HOME={hermes_home}"
Restart=always
Restart=on-failure
RestartSec=5
RestartMaxDelaySec=300
RestartSteps=5
@@ -2875,7 +2877,6 @@ def generate_launchd_plist() -> str:
prog_args.extend([
"<string>gateway</string>",
"<string>run</string>",
"<string>--replace</string>",
])
prog_args_xml = "\n ".join(prog_args)
@@ -3270,7 +3271,7 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False):
print()
# Exit with code 1 if gateway fails to connect any platform,
# so systemd Restart=always will retry on transient errors
# so systemd Restart=on-failure will retry on transient errors
verbosity = None if quiet else verbose
# ── Exit-path diagnostics ────────────────────────────────────────────

View File

@@ -535,7 +535,7 @@
restart = mkOption {
type = types.str;
default = "always";
default = "on-failure";
description = "systemd Restart= policy.";
};
@@ -974,7 +974,7 @@
--env MESSAGING_CWD=${containerWorkDir} \
${lib.concatStringsSep " " cfg.container.extraOptions} \
${cfg.container.image} \
${containerDataDir}/current-package/bin/hermes gateway run --replace ${lib.concatStringsSep " " cfg.extraArgs}
${containerDataDir}/current-package/bin/hermes gateway run ${lib.concatStringsSep " " cfg.extraArgs}
echo "${containerIdentity}" > ${identityFile}
fi

View File

@@ -326,6 +326,8 @@ class TestGeneratedSystemdUnits:
assert "ExecStart=" in unit
assert "ExecStop=" not in unit
assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
assert "Restart=on-failure" in unit
assert "Restart=always" not in unit
assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
# TimeoutStopSec must exceed the default drain_timeout (60s) so
# systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
@@ -387,6 +389,8 @@ class TestGeneratedSystemdUnits:
assert "ExecStart=" in unit
assert "ExecStop=" not in unit
assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
assert "Restart=on-failure" in unit
assert "Restart=always" not in unit
assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
# TimeoutStopSec must exceed the default drain_timeout (60s) so
# systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
@@ -493,7 +497,10 @@ class TestLaunchdServiceRecovery:
label = gateway_cli.get_launchd_label()
domain = gateway_cli._launchd_domain()
assert "--replace" in plist_path.read_text(encoding="utf-8")
plist_text = plist_path.read_text(encoding="utf-8")
assert "<string>gateway</string>" in plist_text
assert "<string>run</string>" in plist_text
assert "--replace" not in plist_text
assert calls[:2] == [
["launchctl", "bootout", f"{domain}/{label}"],
["launchctl", "bootstrap", domain, str(plist_path)],
@@ -1616,7 +1623,8 @@ class TestProfileArg:
monkeypatch.setattr(gateway_cli, "get_hermes_home", lambda: profile_dir)
unit = gateway_cli.generate_systemd_unit(system=False)
assert "--profile mybot" in unit
assert "gateway run --replace" in unit
assert "gateway run" in unit
assert "--replace" not in unit
def test_launchd_plist_includes_profile(self, tmp_path, monkeypatch):
"""generate_launchd_plist should include --profile in ProgramArguments for named profiles."""
@@ -1628,6 +1636,24 @@ class TestProfileArg:
plist = gateway_cli.generate_launchd_plist()
assert "<string>--profile</string>" in plist
assert "<string>mybot</string>" in plist
assert "<string>--replace</string>" not in plist
def test_gateway_run_args_for_profile_omit_replace(self, monkeypatch):
    """Profile run args must end in ``gateway run`` with no ``--replace`` flag.

    Covers both the default profile (no ``--profile`` pair) and a named
    profile (``--profile <name>`` inserted before the subcommand).
    """
    # Pin the interpreter path so the expected argv is deterministic.
    monkeypatch.setattr(gateway_cli, "get_python_path", lambda: "/venv/bin/python")

    # Build both argv lists up front, then compare them.
    argv_by_profile = {
        name: gateway_cli._gateway_run_args_for_profile(name)
        for name in ("default", "mybot")
    }

    launcher = ["/venv/bin/python", "-m", "hermes_cli.main"]
    subcommand = ["gateway", "run"]

    assert argv_by_profile["default"] == launcher + subcommand
    assert argv_by_profile["mybot"] == [*launcher, "--profile", "mybot", *subcommand]
def test_launchd_plist_path_uses_real_user_home_not_profile_home(self, tmp_path, monkeypatch):
profile_dir = tmp_path / ".hermes" / "profiles" / "orcha"