Compare commits

...

1 Commit

Author SHA1 Message Date
Teknium
3e5ca0bc13 fix(gateway): clear stale runtime error fields after platform recovery
Cherry-picked from PR #2470 by @NaesayerX.

write_runtime_status previously treated None as 'don't change', which made
it impossible to clear stale error_code/error_message after a platform
recovered. It now uses a sentinel (_UNSET) as the 'don't change' default,
so None means 'clear this field'. Passing error_code=None or
error_message=None removes the key from the platform payload instead of
leaving stale error metadata in gateway_state.json.
2026-03-22 09:18:24 -07:00
2 changed files with 53 additions and 13 deletions

View File

@@ -22,6 +22,7 @@ from typing import Any, Optional
_GATEWAY_KIND = "hermes-gateway"
_RUNTIME_STATUS_FILE = "gateway_state.json"
_LOCKS_DIRNAME = "gateway-locks"
_UNSET = object()
def _get_pid_path() -> Path:
@@ -185,14 +186,19 @@ def write_pid_file() -> None:
def write_runtime_status(
*,
gateway_state: Optional[str] = None,
exit_reason: Optional[str] = None,
gateway_state: Any = _UNSET,
exit_reason: Any = _UNSET,
platform: Optional[str] = None,
platform_state: Optional[str] = None,
error_code: Optional[str] = None,
error_message: Optional[str] = None,
platform_state: Any = _UNSET,
error_code: Any = _UNSET,
error_message: Any = _UNSET,
) -> None:
"""Persist gateway runtime health information for diagnostics/status."""
"""Persist gateway runtime health information for diagnostics/status.
Parameters use an internal sentinel so callers can intentionally clear fields
by passing ``None``. This matters when a recovered platform transitions from
fatal -> connected/disconnected and stale error metadata must be removed.
"""
path = _get_runtime_status_path()
payload = _read_json_file(path) or _build_runtime_status_record()
payload.setdefault("platforms", {})
@@ -201,19 +207,25 @@ def write_runtime_status(
payload["start_time"] = _get_process_start_time(os.getpid())
payload["updated_at"] = _utc_now_iso()
if gateway_state is not None:
if gateway_state is not _UNSET:
payload["gateway_state"] = gateway_state
if exit_reason is not None:
if exit_reason is not _UNSET:
payload["exit_reason"] = exit_reason
if platform is not None:
platform_payload = payload["platforms"].get(platform, {})
if platform_state is not None:
if platform_state is not _UNSET:
platform_payload["state"] = platform_state
if error_code is not None:
platform_payload["error_code"] = error_code
if error_message is not None:
platform_payload["error_message"] = error_message
if error_code is not _UNSET:
if error_code is None:
platform_payload.pop("error_code", None)
else:
platform_payload["error_code"] = error_code
if error_message is not _UNSET:
if error_message is None:
platform_payload.pop("error_message", None)
else:
platform_payload["error_message"] = error_message
platform_payload["updated_at"] = _utc_now_iso()
payload["platforms"][platform] = platform_payload

View File

@@ -103,6 +103,34 @@ class TestGatewayRuntimeStatus:
assert payload["platforms"]["telegram"]["error_code"] == "telegram_polling_conflict"
assert payload["platforms"]["telegram"]["error_message"] == "another poller is active"
def test_write_runtime_status_clears_stale_platform_error_fields_on_recovery(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
status.write_runtime_status(
gateway_state="startup_failed",
exit_reason="telegram conflict",
platform="telegram",
platform_state="fatal",
error_code="telegram_token_lock",
error_message="another local Hermes gateway is already using this Telegram bot token",
)
status.write_runtime_status(
gateway_state="running",
exit_reason=None,
platform="telegram",
platform_state="connected",
error_code=None,
error_message=None,
)
payload = status.read_runtime_status()
assert payload["gateway_state"] == "running"
assert payload["exit_reason"] is None
assert payload["platforms"]["telegram"]["state"] == "connected"
assert "error_code" not in payload["platforms"]["telegram"]
assert "error_message" not in payload["platforms"]["telegram"]
class TestScopedLocks:
def test_acquire_scoped_lock_rejects_live_other_process(self, tmp_path, monkeypatch):