mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-05 10:17:17 +08:00
224 lines
8.0 KiB
Python
224 lines
8.0 KiB
Python
|
|
"""Tests for the gateway stale-code self-check (Issue #17648).
|
||
|
|
|
||
|
|
A gateway that survives ``hermes update`` keeps pre-update modules cached
in ``sys.modules``. Later imports of names added post-update (e.g.
``cfg_get`` from PR #17304) raise ImportError against the stale module
object. The self-check in ``GatewayRunner._detect_stale_code()`` detects
this by comparing boot-time sentinel-file mtimes against current ones,
and ``_trigger_stale_code_restart()`` triggers a graceful restart.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
from unittest.mock import MagicMock, patch
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from gateway.run import (
|
||
|
|
GatewayRunner,
|
||
|
|
_compute_repo_mtime,
|
||
|
|
_STALE_CODE_SENTINELS,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def _make_tmp_repo(tmp_path: Path) -> Path:
    """Populate *tmp_path* with every stale-code sentinel file and return it.

    Each relative path in ``_STALE_CODE_SENTINELS`` is created under
    *tmp_path* (parent directories included) with a one-line placeholder
    body, so the directory can stand in for a repo checkout.
    """
    for sentinel_rel in _STALE_CODE_SENTINELS:
        sentinel = tmp_path / sentinel_rel
        # Sentinels may live in nested packages; create the directories first.
        sentinel.parent.mkdir(parents=True, exist_ok=True)
        sentinel.write_text("# test sentinel\n")
    return tmp_path
|
||
|
|
|
||
|
|
|
||
|
|
def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float):
    """Return a GatewayRunner that skipped ``__init__`` entirely.

    Only the attributes used by the stale-code checks in this module are
    assigned; everything else on the instance is left untouched.

    Args:
        repo_root: Directory stored as ``_repo_root_for_staleness``.
        boot_mtime: Value stored as ``_boot_repo_mtime``.
        boot_wall: Value stored as ``_boot_wall_time``.
    """
    runner = object.__new__(GatewayRunner)
    stale_check_state = {
        "_repo_root_for_staleness": repo_root,
        "_boot_wall_time": boot_wall,
        "_boot_repo_mtime": boot_mtime,
        "_stale_code_notified": set(),
        "_stale_code_restart_triggered": False,
    }
    for attr_name, attr_value in stale_check_state.items():
        setattr(runner, attr_name, attr_value)
    return runner
|
||
|
|
|
||
|
|
|
||
|
|
def test_compute_repo_mtime_returns_newest(tmp_path):
    """The reported mtime is that of the most recently modified sentinel."""
    repo = _make_tmp_repo(tmp_path)

    # Age every sentinel to one shared timestamp 100s in the past.
    old_stamp = time.time() - 100
    for sentinel_rel in _STALE_CODE_SENTINELS:
        os.utime(repo / sentinel_rel, (old_stamp, old_stamp))

    # Bring a single sentinel forward to "now".
    fresh_stamp = time.time()
    os.utime(repo / "hermes_cli/config.py", (fresh_stamp, fresh_stamp))

    # Up to 1s of difference is tolerated for filesystem mtime resolution.
    drift = abs(_compute_repo_mtime(repo) - fresh_stamp)
    assert drift < 1.0
|
||
|
|
|
||
|
|
|
||
|
|
def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
    """With no sentinel files on disk the computed mtime is exactly 0.0."""
    # A fresh tmp_path contains none of the sentinel files.
    computed = _compute_repo_mtime(tmp_path)
    assert computed == 0.0
|
||
|
|
|
||
|
|
|
||
|
|
def test_compute_repo_mtime_partial_files_still_works(tmp_path):
    """When only one sentinel exists, its mtime is what gets reported."""
    package_dir = tmp_path / "hermes_cli"
    package_dir.mkdir()
    lone_sentinel = package_dir / "config.py"
    lone_sentinel.write_text("# partial\n")

    # Pin the only present sentinel to a known timestamp 50s in the past.
    stamp = time.time() - 50
    os.utime(lone_sentinel, (stamp, stamp))

    drift = abs(_compute_repo_mtime(tmp_path) - stamp)
    assert drift < 1.0
|
||
|
|
|
||
|
|
|
||
|
|
def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path):
    """A zeroed boot snapshot must read as "not stale" (no restart loop)."""
    runner = _make_runner(
        _make_tmp_repo(tmp_path),
        boot_mtime=0.0,
        boot_wall=0.0,
    )
    is_stale = runner._detect_stale_code()
    assert is_stale is False
|
||
|
|
|
||
|
|
|
||
|
|
def test_detect_stale_code_false_when_files_unchanged(tmp_path):
    """Sentinels whose mtime equals the boot snapshot are not stale."""
    repo = _make_tmp_repo(tmp_path)

    # Give every sentinel the same mtime, then use it as the boot snapshot.
    boot_stamp = time.time() - 100
    for sentinel_rel in _STALE_CODE_SENTINELS:
        os.utime(repo / sentinel_rel, (boot_stamp, boot_stamp))

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)
    is_stale = runner._detect_stale_code()
    assert is_stale is False
|
||
|
|
|
||
|
|
|
||
|
|
def test_detect_stale_code_true_after_update(tmp_path):
    """A sentinel modified well after the boot snapshot reads as stale."""
    repo = _make_tmp_repo(tmp_path)

    boot_stamp = time.time() - 100
    for sentinel_rel in _STALE_CODE_SENTINELS:
        os.utime(repo / sentinel_rel, (boot_stamp, boot_stamp))

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)

    # Stand-in for ``hermes update`` rewriting config.py: bump its mtime to now.
    updated_stamp = time.time()
    os.utime(repo / "hermes_cli/config.py", (updated_stamp, updated_stamp))

    is_stale = runner._detect_stale_code()
    assert is_stale is True
|
||
|
|
|
||
|
|
|
||
|
|
def test_detect_stale_code_ignores_subsecond_drift(tmp_path):
    """Small mtime drift is tolerated; a larger jump is flagged.

    A sentinel 1s newer than the boot snapshot falls inside the 2-second
    slack and is not stale, while one 5s newer is. The slack exists to
    avoid false positives on coarse-mtime filesystems.
    """
    repo = _make_tmp_repo(tmp_path)

    boot_stamp = time.time() - 100
    for sentinel_rel in _STALE_CODE_SENTINELS:
        os.utime(repo / sentinel_rel, (boot_stamp, boot_stamp))

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)
    config_file = repo / "hermes_cli/config.py"

    # (offset past boot snapshot in seconds, expected staleness), in order.
    scenarios = (
        (1.0, False),  # within the 2s slack
        (5.0, True),   # beyond the slack
    )
    for offset, expected in scenarios:
        touched = boot_stamp + offset
        os.utime(config_file, (touched, touched))
        assert runner._detect_stale_code() is expected
|
||
|
|
|
||
|
|
|
||
|
|
def test_trigger_stale_code_restart_is_idempotent(tmp_path):
    """Repeated _trigger_stale_code_restart calls request only one restart."""
    runner = _make_runner(_make_tmp_repo(tmp_path), boot_mtime=1.0, boot_wall=1.0)

    restart_requests = []

    def record_restart_request(*, detached=False, via_service=False):
        # Record each invocation so the test can count them.
        restart_requests.append((detached, via_service))
        return True

    runner.request_restart = record_restart_request

    for _ in range(3):
        runner._trigger_stale_code_restart()

    assert len(restart_requests) == 1
    assert runner._stale_code_restart_triggered is True
|
||
|
|
|
||
|
|
|
||
|
|
def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
    """A raising request_restart is swallowed and the trigger flag still set."""
    runner = _make_runner(_make_tmp_repo(tmp_path), boot_mtime=1.0, boot_wall=1.0)

    def failing_request_restart(*, detached=False, via_service=False):
        raise RuntimeError("no event loop")

    runner.request_restart = failing_request_restart

    # The call itself must not propagate the RuntimeError.
    runner._trigger_stale_code_restart()

    # The flag is set regardless, so later messages do not retry the restart.
    triggered = runner._stale_code_restart_triggered
    assert triggered is True
|
||
|
|
|
||
|
|
|
||
|
|
def test_detect_stale_code_handles_disappearing_repo_root(tmp_path):
    """Sentinels vanishing after boot must read as not stale (don't loop).

    The disappearing repo root is simulated by deleting every sentinel
    file; the directory itself is left in place.
    """
    repo = _make_tmp_repo(tmp_path)
    sentinel_files = [repo / sentinel_rel for sentinel_rel in _STALE_CODE_SENTINELS]

    boot_stamp = time.time() - 100
    for sentinel in sentinel_files:
        os.utime(sentinel, (boot_stamp, boot_stamp))

    runner = _make_runner(repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)

    # With every sentinel gone, _compute_repo_mtime returns 0.0.
    for sentinel in sentinel_files:
        sentinel.unlink(missing_ok=True)

    is_stale = runner._detect_stale_code()
    assert is_stale is False
|
||
|
|
|
||
|
|
|
||
|
|
def test_class_level_defaults_prevent_uninitialized_access():
    """A runner built via object.__new__ must not crash _detect_stale_code."""
    runner = object.__new__(GatewayRunner)

    # Only the repo root is assigned on the instance. The boot snapshot
    # attributes (_boot_wall_time / _boot_repo_mtime) are deliberately left
    # unset so that lookup falls through to the class-level defaults (0.0).
    runner._repo_root_for_staleness = Path(".")

    is_stale = runner._detect_stale_code()
    assert is_stale is False

    # _stale_code_restart_triggered likewise falls through to the class
    # default (False).
    triggered = runner._stale_code_restart_triggered
    assert triggered is False
|
||
|
|
|
||
|
|
|
||
|
|
def test_init_captures_boot_snapshot(monkeypatch, tmp_path):
    """Replay the stale-code init block and check the captured baseline.

    ``_compute_repo_mtime`` is patched on ``gateway.run`` with a recording
    stub, then the stale-check attribute assignments are performed by hand
    on a runner created without ``__init__``.

    NOTE(review): this mirrors the init block rather than calling
    ``GatewayRunner.__init__``, so it cannot detect ``__init__`` drifting
    away from these statements. ``tmp_path`` is requested but unused.
    """
    from gateway import run as run_mod

    recorded = {}

    def recording_compute(repo_root):
        # Remember the argument so the test can check which root was scanned.
        recorded["repo_root"] = repo_root
        return 1234567890.0

    monkeypatch.setattr(run_mod, "_compute_repo_mtime", recording_compute)

    # Skip the heavy parts of __init__; perform only the stale-check setup.
    runner = object.__new__(GatewayRunner)
    runner._boot_wall_time = time.time()
    module_path = Path(run_mod.__file__).resolve()
    runner._repo_root_for_staleness = module_path.parent.parent
    runner._boot_repo_mtime = run_mod._compute_repo_mtime(
        runner._repo_root_for_staleness
    )
    runner._stale_code_notified = set()
    runner._stale_code_restart_triggered = False

    assert runner._boot_repo_mtime == 1234567890.0
    assert recorded["repo_root"] == runner._repo_root_for_staleness
    assert runner._boot_wall_time > 0
|