diff --git a/gateway/run.py b/gateway/run.py index 847db36c93..f71fc22804 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -745,10 +745,22 @@ class GatewayRunner: logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.") await self.stop() elif not self.adapters and self._failed_platforms: - logger.warning( - "No connected messaging platforms remain, but %d platform(s) queued for reconnection", - len(self._failed_platforms), - ) + # All platforms are down and queued for background reconnection. + # If the error is retryable, exit with failure so systemd Restart=on-failure + # can restart the process. Otherwise stay alive and keep retrying in background. + if adapter.fatal_error_retryable: + self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors" + self._exit_with_failure = True + logger.error( + "All messaging platforms failed with retryable errors. " + "Shutting down gateway for service restart (systemd will retry)." + ) + await self.stop() + else: + logger.warning( + "No connected messaging platforms remain, but %d platform(s) queued for reconnection", + len(self._failed_platforms), + ) def _request_clean_exit(self, reason: str) -> None: self._exit_cleanly = True diff --git a/tests/gateway/test_platform_reconnect.py b/tests/gateway/test_platform_reconnect.py index 3073f2f5da..68dfd2044d 100644 --- a/tests/gateway/test_platform_reconnect.py +++ b/tests/gateway/test_platform_reconnect.py @@ -344,6 +344,7 @@ class TestRuntimeDisconnectQueuing: async def test_retryable_runtime_error_queued_for_reconnect(self): """Retryable runtime errors should add the platform to _failed_platforms.""" runner = _make_runner() + runner.stop = AsyncMock() adapter = StubAdapter(succeed=True) adapter._set_fatal_error("network_error", "DNS failure", retryable=True) @@ -371,8 +372,12 @@ class TestRuntimeDisconnectQueuing: assert Platform.TELEGRAM not in runner._failed_platforms @pytest.mark.asyncio - async def test_retryable_error_prevents_shutdown_when_queued(self): - """Gateway should not shut down if failed platforms are queued for reconnection.""" + async def test_retryable_error_exits_for_service_restart_when_all_down(self): + """Gateway should exit with failure when all platforms fail with retryable errors. + + This lets systemd Restart=on-failure restart the process, which is more + reliable than in-process background reconnection after exhausted retries. + """ runner = _make_runner() runner.stop = AsyncMock() @@ -382,7 +387,28 @@ class TestRuntimeDisconnectQueuing: await runner._handle_adapter_fatal_error(adapter) - # stop() should NOT have been called since we have platforms queued + # stop() SHOULD be called — gateway exits for systemd restart + runner.stop.assert_called_once() + assert runner._exit_with_failure is True + assert Platform.TELEGRAM in runner._failed_platforms + + @pytest.mark.asyncio + async def test_retryable_error_no_exit_when_other_adapters_still_connected(self): + """Gateway should NOT exit if some adapters are still connected.""" + runner = _make_runner() + runner.stop = AsyncMock() + + failing_adapter = StubAdapter(succeed=True) + failing_adapter._set_fatal_error("network_error", "DNS failure", retryable=True) + runner.adapters[Platform.TELEGRAM] = failing_adapter + + # Another adapter is still connected + healthy_adapter = StubAdapter(succeed=True) + runner.adapters[Platform.DISCORD] = healthy_adapter + + await runner._handle_adapter_fatal_error(failing_adapter) + + # stop() should NOT have been called — Discord is still up runner.stop.assert_not_called() assert Platform.TELEGRAM in runner._failed_platforms diff --git a/tests/gateway/test_runner_fatal_adapter.py b/tests/gateway/test_runner_fatal_adapter.py index 6eb2850598..13b9a7d99e 100644 --- a/tests/gateway/test_runner_fatal_adapter.py +++ b/tests/gateway/test_runner_fatal_adapter.py @@ -89,7 +89,8 @@ async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatc await runner._handle_adapter_fatal_error(adapter) - # Should NOT shut down — platform is queued for reconnection - runner.stop.assert_not_awaited() + # Should shut down with failure — systemd Restart=on-failure will restart + runner.stop.assert_awaited_once() + assert runner._exit_with_failure is True assert Platform.WHATSAPP in runner._failed_platforms assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0