mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-01 08:21:50 +08:00
fix(gateway): clear stale runtime error fields after platform recovery
Cherry-picked from PR #2470 by @NaesayerX. write_runtime_status used None as 'don't change', making it impossible to clear stale error_code/error_message after platform recovery. Now uses a sentinel (_UNSET) so None means 'clear this field'. Passing error_code=None removes the key from the platform payload instead of leaving haunted error metadata in gateway_state.json.
This commit is contained in:
@@ -22,6 +22,7 @@ from typing import Any, Optional
|
||||
_GATEWAY_KIND = "hermes-gateway"
|
||||
_RUNTIME_STATUS_FILE = "gateway_state.json"
|
||||
_LOCKS_DIRNAME = "gateway-locks"
|
||||
_UNSET = object()
|
||||
|
||||
|
||||
def _get_pid_path() -> Path:
|
||||
@@ -185,14 +186,19 @@ def write_pid_file() -> None:
|
||||
|
||||
def write_runtime_status(
|
||||
*,
|
||||
gateway_state: Optional[str] = None,
|
||||
exit_reason: Optional[str] = None,
|
||||
gateway_state: Any = _UNSET,
|
||||
exit_reason: Any = _UNSET,
|
||||
platform: Optional[str] = None,
|
||||
platform_state: Optional[str] = None,
|
||||
error_code: Optional[str] = None,
|
||||
error_message: Optional[str] = None,
|
||||
platform_state: Any = _UNSET,
|
||||
error_code: Any = _UNSET,
|
||||
error_message: Any = _UNSET,
|
||||
) -> None:
|
||||
"""Persist gateway runtime health information for diagnostics/status."""
|
||||
"""Persist gateway runtime health information for diagnostics/status.
|
||||
|
||||
Parameters use an internal sentinel so callers can intentionally clear fields
|
||||
by passing ``None``. This matters when a recovered platform transitions from
|
||||
fatal -> connected/disconnected and stale error metadata must be removed.
|
||||
"""
|
||||
path = _get_runtime_status_path()
|
||||
payload = _read_json_file(path) or _build_runtime_status_record()
|
||||
payload.setdefault("platforms", {})
|
||||
@@ -201,19 +207,25 @@ def write_runtime_status(
|
||||
payload["start_time"] = _get_process_start_time(os.getpid())
|
||||
payload["updated_at"] = _utc_now_iso()
|
||||
|
||||
if gateway_state is not None:
|
||||
if gateway_state is not _UNSET:
|
||||
payload["gateway_state"] = gateway_state
|
||||
if exit_reason is not None:
|
||||
if exit_reason is not _UNSET:
|
||||
payload["exit_reason"] = exit_reason
|
||||
|
||||
if platform is not None:
|
||||
platform_payload = payload["platforms"].get(platform, {})
|
||||
if platform_state is not None:
|
||||
if platform_state is not _UNSET:
|
||||
platform_payload["state"] = platform_state
|
||||
if error_code is not None:
|
||||
platform_payload["error_code"] = error_code
|
||||
if error_message is not None:
|
||||
platform_payload["error_message"] = error_message
|
||||
if error_code is not _UNSET:
|
||||
if error_code is None:
|
||||
platform_payload.pop("error_code", None)
|
||||
else:
|
||||
platform_payload["error_code"] = error_code
|
||||
if error_message is not _UNSET:
|
||||
if error_message is None:
|
||||
platform_payload.pop("error_message", None)
|
||||
else:
|
||||
platform_payload["error_message"] = error_message
|
||||
platform_payload["updated_at"] = _utc_now_iso()
|
||||
payload["platforms"][platform] = platform_payload
|
||||
|
||||
|
||||
@@ -103,6 +103,34 @@ class TestGatewayRuntimeStatus:
|
||||
assert payload["platforms"]["telegram"]["error_code"] == "telegram_polling_conflict"
|
||||
assert payload["platforms"]["telegram"]["error_message"] == "another poller is active"
|
||||
|
||||
def test_write_runtime_status_clears_stale_platform_error_fields_on_recovery(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
status.write_runtime_status(
|
||||
gateway_state="startup_failed",
|
||||
exit_reason="telegram conflict",
|
||||
platform="telegram",
|
||||
platform_state="fatal",
|
||||
error_code="telegram_token_lock",
|
||||
error_message="another local Hermes gateway is already using this Telegram bot token",
|
||||
)
|
||||
|
||||
status.write_runtime_status(
|
||||
gateway_state="running",
|
||||
exit_reason=None,
|
||||
platform="telegram",
|
||||
platform_state="connected",
|
||||
error_code=None,
|
||||
error_message=None,
|
||||
)
|
||||
|
||||
payload = status.read_runtime_status()
|
||||
assert payload["gateway_state"] == "running"
|
||||
assert payload["exit_reason"] is None
|
||||
assert payload["platforms"]["telegram"]["state"] == "connected"
|
||||
assert "error_code" not in payload["platforms"]["telegram"]
|
||||
assert "error_message" not in payload["platforms"]["telegram"]
|
||||
|
||||
|
||||
class TestScopedLocks:
|
||||
def test_acquire_scoped_lock_rejects_live_other_process(self, tmp_path, monkeypatch):
|
||||
|
||||
Reference in New Issue
Block a user