From 3e5ca0bc13cfaf6422fdae96ed85ada22e2e3dac Mon Sep 17 00:00:00 2001 From: Teknium Date: Sun, 22 Mar 2026 09:18:24 -0700 Subject: [PATCH] fix(gateway): clear stale runtime error fields after platform recovery Cherry-picked from PR #2470 by @NaesayerX. write_runtime_status used None as 'don't change', making it impossible to clear stale error_code/error_message after platform recovery. Now uses a sentinel (_UNSET) so None means 'clear this field'. Passing error_code=None removes the key from the platform payload instead of leaving haunted error metadata in gateway_state.json. --- gateway/status.py | 38 ++++++++++++++++++++++++------------ tests/gateway/test_status.py | 28 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/gateway/status.py b/gateway/status.py index f5f5649b540..d58c1bd7664 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -22,6 +22,7 @@ from typing import Any, Optional _GATEWAY_KIND = "hermes-gateway" _RUNTIME_STATUS_FILE = "gateway_state.json" _LOCKS_DIRNAME = "gateway-locks" +_UNSET = object() def _get_pid_path() -> Path: @@ -185,14 +186,19 @@ def write_pid_file() -> None: def write_runtime_status( *, - gateway_state: Optional[str] = None, - exit_reason: Optional[str] = None, + gateway_state: Any = _UNSET, + exit_reason: Any = _UNSET, platform: Optional[str] = None, - platform_state: Optional[str] = None, - error_code: Optional[str] = None, - error_message: Optional[str] = None, + platform_state: Any = _UNSET, + error_code: Any = _UNSET, + error_message: Any = _UNSET, ) -> None: - """Persist gateway runtime health information for diagnostics/status.""" + """Persist gateway runtime health information for diagnostics/status. + + Parameters use an internal sentinel so callers can intentionally clear fields + by passing ``None``. This matters when a recovered platform transitions from + fatal -> connected/disconnected and stale error metadata must be removed. + """ path = _get_runtime_status_path() payload = _read_json_file(path) or _build_runtime_status_record() payload.setdefault("platforms", {}) @@ -201,19 +207,25 @@ def write_runtime_status( payload["start_time"] = _get_process_start_time(os.getpid()) payload["updated_at"] = _utc_now_iso() - if gateway_state is not None: + if gateway_state is not _UNSET: payload["gateway_state"] = gateway_state - if exit_reason is not None: + if exit_reason is not _UNSET: payload["exit_reason"] = exit_reason if platform is not None: platform_payload = payload["platforms"].get(platform, {}) - if platform_state is not None: + if platform_state is not _UNSET: platform_payload["state"] = platform_state - if error_code is not None: - platform_payload["error_code"] = error_code - if error_message is not None: - platform_payload["error_message"] = error_message + if error_code is not _UNSET: + if error_code is None: + platform_payload.pop("error_code", None) + else: + platform_payload["error_code"] = error_code + if error_message is not _UNSET: + if error_message is None: + platform_payload.pop("error_message", None) + else: + platform_payload["error_message"] = error_message platform_payload["updated_at"] = _utc_now_iso() payload["platforms"][platform] = platform_payload diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 510892b84ea..b070dba26f1 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -103,6 +103,34 @@ class TestGatewayRuntimeStatus: assert payload["platforms"]["telegram"]["error_code"] == "telegram_polling_conflict" assert payload["platforms"]["telegram"]["error_message"] == "another poller is active" + def test_write_runtime_status_clears_stale_platform_error_fields_on_recovery(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + + status.write_runtime_status( + gateway_state="startup_failed", + exit_reason="telegram conflict", + platform="telegram", + platform_state="fatal", + error_code="telegram_token_lock", + error_message="another local Hermes gateway is already using this Telegram bot token", + ) + + status.write_runtime_status( + gateway_state="running", + exit_reason=None, + platform="telegram", + platform_state="connected", + error_code=None, + error_message=None, + ) + + payload = status.read_runtime_status() + assert payload["gateway_state"] == "running" + assert payload["exit_reason"] is None + assert payload["platforms"]["telegram"]["state"] == "connected" + assert "error_code" not in payload["platforms"]["telegram"] + assert "error_message" not in payload["platforms"]["telegram"] + class TestScopedLocks: def test_acquire_scoped_lock_rejects_live_other_process(self, tmp_path, monkeypatch):