2026-04-10 14:00:21 -07:00
|
|
|
import asyncio
|
|
|
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
|
|
|
|
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
|
|
|
|
from gateway.platforms.base import BasePlatformAdapter, MessageEvent, SendResult
|
|
|
|
|
from gateway.restart import DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
|
|
|
|
|
from gateway.run import GatewayRunner
|
|
|
|
|
from gateway.session import SessionSource
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RestartTestAdapter(BasePlatformAdapter):
    """Stub Telegram adapter used by the restart tests.

    The overrides below do no real work: they return fixed values and
    record each outbound message body in ``self.sent`` so tests can inspect
    what would have been delivered.
    """

    def __init__(self):
        stub_config = PlatformConfig(enabled=True, token="***")
        super().__init__(stub_config, Platform.TELEGRAM)
        # Message contents passed to send(), in call order.
        self.sent: list[str] = []

    async def connect(self):
        """Report a successful connection without doing anything."""
        return True

    async def disconnect(self):
        """No-op; always returns ``None``."""
        return None

    async def send(self, chat_id, content, reply_to=None, metadata=None):
        """Record ``content`` and return a successful ``SendResult``.

        ``chat_id``, ``reply_to`` and ``metadata`` are accepted but ignored.
        """
        self.sent.append(content)
        result = SendResult(success=True, message_id="1")
        return result

    async def send_typing(self, chat_id, metadata=None):
        """No-op; always returns ``None``."""
        return None

    async def get_chat_info(self, chat_id):
        """Return a chat-info dict containing only the chat id."""
        chat_info = {"id": chat_id}
        return chat_info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_restart_source(chat_id: str = "123456", chat_type: str = "dm") -> SessionSource:
    """Build a Telegram ``SessionSource`` for the fixed test user ``"u1"``.

    Args:
        chat_id: Chat identifier to place on the source.
        chat_type: Chat type to place on the source (defaults to ``"dm"``).

    Returns:
        A ``SessionSource`` on ``Platform.TELEGRAM`` with ``user_id="u1"``.
    """
    source_fields = {
        "platform": Platform.TELEGRAM,
        "chat_id": chat_id,
        "chat_type": chat_type,
        "user_id": "u1",
    }
    return SessionSource(**source_fields)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_restart_runner(
    adapter: BasePlatformAdapter | None = None,
) -> tuple[GatewayRunner, BasePlatformAdapter]:
    """Build a hand-wired ``GatewayRunner`` plus its Telegram adapter for tests.

    The runner is allocated without calling ``GatewayRunner.__init__``; every
    attribute the restart tests rely on is assigned explicitly below.
    Collaborators are replaced with mocks, while a fixed set of real
    ``GatewayRunner`` methods is bound onto the instance.

    Args:
        adapter: Adapter to register under ``Platform.TELEGRAM``. When
            ``None`` a fresh ``RestartTestAdapter`` is created. A supplied
            adapter is always used as-is, even if it evaluates falsy.

    Returns:
        ``(runner, adapter)`` where ``adapter`` is the instance stored in
        ``runner.adapters[Platform.TELEGRAM]``.
    """
    # object.__new__ skips GatewayRunner.__init__, so nothing is set up
    # implicitly; all state below is the complete visible fixture.
    runner = object.__new__(GatewayRunner)
    runner.config = GatewayConfig(
        platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")}
    )

    # --- lifecycle / bookkeeping state -----------------------------------
    runner._running = True
    runner._shutdown_event = asyncio.Event()
    runner._exit_reason = None
    runner._exit_code = None
    runner._running_agents = {}
    runner._running_agents_ts = {}
    runner._pending_messages = {}
    runner._pending_approvals = {}
    runner._pending_model_notes = {}
    runner._background_tasks = set()
    runner._draining = False
    runner._restart_requested = False
    runner._restart_task_started = False
    runner._restart_detached = False
    runner._restart_via_service = False
    runner._restart_drain_timeout = DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
    runner._stop_task = None
    runner._busy_input_mode = "interrupt"
    runner._update_prompt_pending = {}
    runner._voice_mode = {}
    runner._session_model_overrides = {}

    # --- stubbed-out behaviour -------------------------------------------
    runner._shutdown_all_gateway_honcho = lambda: None
    runner._update_runtime_status = MagicMock()

    # --- real GatewayRunner methods, bound explicitly onto the instance ---
    # _snapshot_running_agents and _notify_active_sessions_of_shutdown are
    # included so the shutdown-notification path runs the real code.
    real_method_names = (
        "_queue_or_replace_pending_event",
        "_session_key_for_source",
        "_handle_active_session_busy_message",
        "_handle_restart_command",
        "_status_action_label",
        "_status_action_gerund",
        "_queue_during_drain_enabled",
        "_running_agent_count",
        "_snapshot_running_agents",
        "_notify_active_sessions_of_shutdown",
        "_launch_detached_restart_command",
        "request_restart",
    )
    for method_name in real_method_names:
        class_attr = getattr(GatewayRunner, method_name)
        setattr(runner, method_name, class_attr.__get__(runner, GatewayRunner))

    # --- mocked collaborators --------------------------------------------
    runner._is_user_authorized = lambda _source: True
    runner.hooks = MagicMock()
    runner.hooks.emit = AsyncMock()
    runner.pairing_store = MagicMock()
    runner.session_store = MagicMock()
    runner.delivery_router = MagicMock()

    # Compare against None rather than relying on truthiness: an adapter the
    # caller passed in must never be silently replaced by the default.
    platform_adapter = adapter if adapter is not None else RestartTestAdapter()
    platform_adapter.set_message_handler(AsyncMock(return_value=None))
    platform_adapter.set_busy_session_handler(runner._handle_active_session_busy_message)
    runner.adapters = {Platform.TELEGRAM: platform_adapter}
    return runner, platform_adapter
|