From f98b5d00a49b01fb833deecace78656035bc6f6d Mon Sep 17 00:00:00 2001 From: Siddharth Balyan <52913345+alt-glitch@users.noreply.github.com> Date: Sat, 2 May 2026 08:51:30 +0530 Subject: [PATCH] fix: gateway systemd unit now retries indefinitely with backoff (#18639) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old defaults (StartLimitIntervalSec=600, StartLimitBurst=5, RestartSec=30) meant any network outage over ~5 minutes would permanently kill the gateway until manual intervention. Changes: - StartLimitIntervalSec=0 (never give up) - Restart=always (not just on-failure) - RestartSec=60 with RestartMaxDelaySec=300, RestartSteps=5 (exponential backoff, roughly 60 → 83 → 114 → 158 → 217 → 300s cap) - After=network-online.target + Wants= (both units now wait for actual connectivity, not just network.target) Power outage → internet down → internet back = auto-recovery. --- hermes_cli/gateway.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 50953319a4b..af40444922e 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -188,7 +188,7 @@ def _graceful_restart_via_sigusr1(pid: int, drain_timeout: float) -> bool: SIGUSR1 is wired in gateway/run.py to ``request_restart(via_service=True)`` which drains in-flight agent runs (up to ``agent.restart_drain_timeout`` - seconds), then exits with code 75. Both systemd (``Restart=on-failure`` + seconds), then exits with code 75. Both systemd (``Restart=always`` + ``RestartForceExitStatus=75``) and launchd (``KeepAlive.SuccessfulExit = false``) relaunch the process after the graceful exit. 
@@ -1655,8 +1655,7 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None) Description={SERVICE_DESCRIPTION} After=network-online.target Wants=network-online.target -StartLimitIntervalSec=600 -StartLimitBurst=5 +StartLimitIntervalSec=0 [Service] Type=simple @@ -1670,8 +1669,10 @@ Environment="LOGNAME={username}" Environment="PATH={sane_path}" Environment="VIRTUAL_ENV={venv_dir}" Environment="HERMES_HOME={hermes_home}" -Restart=on-failure -RestartSec=30 +Restart=always +RestartSec=60 +RestartMaxDelaySec=300 +RestartSteps=5 RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE} KillMode=mixed KillSignal=SIGTERM @@ -1691,9 +1692,9 @@ WantedBy=multi-user.target sane_path = ":".join(path_entries) return f"""[Unit] Description={SERVICE_DESCRIPTION} -After=network.target -StartLimitIntervalSec=600 -StartLimitBurst=5 +After=network-online.target +Wants=network-online.target +StartLimitIntervalSec=0 [Service] Type=simple @@ -1702,8 +1703,10 @@ WorkingDirectory={working_dir} Environment="PATH={sane_path}" Environment="VIRTUAL_ENV={venv_dir}" Environment="HERMES_HOME={hermes_home}" -Restart=on-failure -RestartSec=30 +Restart=always +RestartSec=60 +RestartMaxDelaySec=300 +RestartSteps=5 RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE} KillMode=mixed KillSignal=SIGTERM @@ -2451,7 +2454,7 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False): print() # Exit with code 1 if gateway fails to connect any platform, - # so systemd Restart=on-failure will retry on transient errors + # so systemd Restart=always will retry on transient errors verbosity = None if quiet else verbose try: success = asyncio.run(start_gateway(replace=replace, verbosity=verbosity))