Files
hermes-agent/tests/gateway/test_runner_startup_failures.py

342 lines
13 KiB
Python
Raw Normal View History

import pytest
from unittest.mock import AsyncMock
from gateway.config import GatewayConfig, Platform, PlatformConfig
from gateway.platforms.base import BasePlatformAdapter
from gateway.run import GatewayRunner
from gateway.status import read_runtime_status
class _RetryableFailureAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=True, token="***"), Platform.TELEGRAM)
async def connect(self) -> bool:
self._set_fatal_error(
"telegram_connect_error",
"Telegram startup failed: temporary DNS resolution failure.",
retryable=True,
)
return False
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
class _DisabledAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=False, token="***"), Platform.TELEGRAM)
async def connect(self) -> bool:
raise AssertionError("connect should not be called for disabled platforms")
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
class _SuccessfulAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=True, token="***"), Platform.DISCORD)
async def connect(self) -> bool:
return True
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
@pytest.mark.asyncio
async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _RetryableFailureAdapter())
ok = await runner.start()
assert ok is False
assert runner.should_exit_cleanly is False
state = read_runtime_status()
assert state["gateway_state"] == "startup_failed"
assert "temporary DNS resolution failure" in state["exit_reason"]
assert state["platforms"]["telegram"]["state"] == "retrying"
assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error"
@pytest.mark.asyncio
async def test_runner_allows_cron_only_mode_when_no_platforms_are_enabled(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
Platform.TELEGRAM: PlatformConfig(enabled=False, token="***")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
ok = await runner.start()
assert ok is True
assert runner.should_exit_cleanly is False
assert runner.adapters == {}
state = read_runtime_status()
assert state["gateway_state"] == "running"
@pytest.mark.asyncio
async def test_runner_records_connected_platform_state_on_success(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
Platform.DISCORD: PlatformConfig(enabled=True, token="***")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _SuccessfulAdapter())
monkeypatch.setattr(runner.hooks, "discover_and_load", lambda: None)
monkeypatch.setattr(runner.hooks, "emit", AsyncMock())
ok = await runner.start()
assert ok is True
state = read_runtime_status()
assert state["gateway_state"] == "running"
assert state["platforms"]["discord"]["state"] == "connected"
assert state["platforms"]["discord"]["error_code"] is None
assert state["platforms"]["discord"]["error_message"] is None
@pytest.mark.asyncio
async def test_start_gateway_verbosity_imports_redacting_formatter(monkeypatch, tmp_path):
"""Verbosity != None must not crash with NameError on RedactingFormatter (#8044)."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
class _CleanExitRunner:
def __init__(self, config):
self.config = config
self.should_exit_cleanly = True
self.exit_reason = None
self.adapters = {}
async def start(self):
return True
async def stop(self):
return None
monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)
from gateway.run import start_gateway
# verbosity=1 triggers the code path that uses RedactingFormatter.
# Before the fix this raised NameError.
ok = await start_gateway(config=GatewayConfig(), replace=False, verbosity=1)
assert ok is True
@pytest.mark.asyncio
async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
calls = []
class _CleanExitRunner:
def __init__(self, config):
self.config = config
self.should_exit_cleanly = True
self.exit_reason = None
self.adapters = {}
async def start(self):
return True
async def stop(self):
return None
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42)
monkeypatch.setattr("gateway.status.remove_pid_file", lambda: None)
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
monkeypatch.setattr("gateway.status.terminate_pid", lambda pid, force=False: calls.append((pid, force)))
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None)
monkeypatch.setattr("time.sleep", lambda _: None)
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)
from gateway.run import start_gateway
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
assert ok is True
assert calls == [(42, False), (42, True)]
fix(gateway): detect legacy hermes.service + mark --replace SIGTERM as planned (#11909) * fix(gateway): detect legacy hermes.service units from pre-rename installs Older Hermes installs used a different service name (hermes.service) before the rename to hermes-gateway.service. When both units remain installed, they fight over the same bot token — after PR #5646's signal-recovery change, this manifests as a 30-second SIGTERM flap loop between the two services. Detection is an explicit allowlist (no globbing) plus an ExecStart content check, so profile units (hermes-gateway-<profile>.service) and unrelated third-party services named 'hermes' are never matched. Wired into systemd_install, systemd_status, gateway_setup wizard, and the main hermes setup flow — anywhere we already warn about scope conflicts now also warns about legacy units. * feat(gateway): add migrate-legacy command + install-time removal prompt - New hermes_cli.gateway.remove_legacy_hermes_units() removes legacy unit files with stop → disable → unlink → daemon-reload. Handles user and system scopes separately; system scope returns path list when not running as root so the caller can tell the user to re-run with sudo. - New 'hermes gateway migrate-legacy' subcommand (with --dry-run and -y) routes to remove_legacy_hermes_units via gateway_command dispatch. - systemd_install now offers to remove legacy units BEFORE installing the new hermes-gateway.service, preventing the SIGTERM flap loop that hits users who still have pre-rename hermes.service around. Profile units (hermes-gateway-<profile>.service) remain untouched in all paths — the legacy allowlist is explicit (_LEGACY_SERVICE_NAMES) and the ExecStart content check further narrows matches. * fix(gateway): mark --replace SIGTERM as planned so target exits 0 PR #5646 made SIGTERM exit the gateway with code 1 so systemd's Restart=on-failure revives it after unexpected kills. But when a user has two gateway units fighting for the same bot token (e.g. legacy hermes.service + hermes-gateway.service from a pre-rename install), the --replace takeover itself becomes the 'unexpected' SIGTERM — the loser exits 1, systemd revives it 30s later, and the cycle flaps indefinitely. Before calling terminate_pid(), --replace now writes a short-lived marker file naming the target PID + start_time. The target's shutdown_signal_handler consumes the marker and, when it names this process, leaves _signal_initiated_shutdown=False so the final exit code stays 0. Staleness defences: - PID + start_time combo prevents PID reuse matching an old marker - Marker older than 60s is treated as stale and discarded - Marker is unlinked on first read even if it doesn't match this process - Replacer clears the marker post-loop + on permission-denied give-up
2026-04-17 19:27:58 -07:00
@pytest.mark.asyncio
async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
monkeypatch, tmp_path
):
"""--replace must write a takeover marker BEFORE sending SIGTERM.
The marker lets the target's shutdown handler identify the signal as a
planned takeover ( exit 0) rather than an unexpected kill ( exit 1).
Without the marker, PR #5646's signal-recovery path would revive the
target via systemd Restart=on-failure, starting a flap loop.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Record the ORDER of marker-write + terminate_pid calls
events: list[str] = []
marker_paths_seen: list = []
def record_write_marker(target_pid: int) -> bool:
events.append(f"write_marker(target_pid={target_pid})")
# Also check that the marker file actually exists after this call
marker_paths_seen.append(
(tmp_path / ".gateway-takeover.json").exists() is False # not yet
)
# Actually write the marker so we can verify cleanup later
from gateway.status import _get_takeover_marker_path, _write_json_file, _get_process_start_time
_write_json_file(_get_takeover_marker_path(), {
"target_pid": target_pid,
"target_start_time": 0,
"replacer_pid": 100,
"written_at": "2026-04-17T00:00:00+00:00",
})
return True
def record_terminate(pid, force=False):
events.append(f"terminate_pid(pid={pid}, force={force})")
class _CleanExitRunner:
def __init__(self, config):
self.config = config
self.should_exit_cleanly = True
self.exit_reason = None
self.adapters = {}
async def start(self):
return True
async def stop(self):
return None
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42)
monkeypatch.setattr("gateway.status.remove_pid_file", lambda: None)
monkeypatch.setattr("gateway.status.release_all_scoped_locks", lambda: 0)
monkeypatch.setattr("gateway.status.write_takeover_marker", record_write_marker)
monkeypatch.setattr("gateway.status.terminate_pid", record_terminate)
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
# Simulate old process exiting on first check so we don't loop into force-kill
monkeypatch.setattr(
"gateway.run.os.kill",
lambda pid, sig: (_ for _ in ()).throw(ProcessLookupError()),
)
monkeypatch.setattr("time.sleep", lambda _: None)
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
monkeypatch.setattr("gateway.run.GatewayRunner", _CleanExitRunner)
from gateway.run import start_gateway
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
assert ok is True
# Ordering: marker written BEFORE SIGTERM
assert events[0] == "write_marker(target_pid=42)"
assert any(e.startswith("terminate_pid(pid=42") for e in events[1:])
# Marker file cleanup: replacer cleans it after loop completes
assert not (tmp_path / ".gateway-takeover.json").exists()
@pytest.mark.asyncio
async def test_start_gateway_replace_clears_marker_on_permission_denied(
monkeypatch, tmp_path
):
"""If we fail to kill the existing PID (permission denied), clean up the
marker so it doesn't grief an unrelated future shutdown."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
def write_marker(target_pid: int) -> bool:
from gateway.status import _get_takeover_marker_path, _write_json_file
_write_json_file(_get_takeover_marker_path(), {
"target_pid": target_pid,
"target_start_time": 0,
"replacer_pid": 100,
"written_at": "2026-04-17T00:00:00+00:00",
})
return True
def raise_permission(pid, force=False):
raise PermissionError("simulated EPERM")
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42)
monkeypatch.setattr("gateway.status.write_takeover_marker", write_marker)
monkeypatch.setattr("gateway.status.terminate_pid", raise_permission)
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
from gateway.run import start_gateway
# Should return False due to permission error
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
assert ok is False
# Marker must NOT be left behind
assert not (tmp_path / ".gateway-takeover.json").exists()
def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("TERMINAL_ENV", "docker")
monkeypatch.setenv("TERMINAL_DOCKER_VOLUMES", '["/etc/localtime:/etc/localtime:ro"]')
config = GatewayConfig(
platforms={
Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")
},
sessions_dir=tmp_path / "sessions",
)
with caplog.at_level("WARNING"):
GatewayRunner(config)
assert any(
"host-visible output mount" in record.message
for record in caplog.records
)