tests/gateway/test_stale_code_self_check.py

"""Tests for the gateway stale-code self-check (Issue #17648).

A gateway that survives ``hermes update`` keeps pre-update modules cached
in ``sys.modules``.  Later imports of names added post-update (e.g.
``cfg_get`` from PR #17304) raise ImportError against the stale module
object.  The self-check in ``GatewayRunner._detect_stale_code()`` detects
this by comparing boot-time sentinel-file mtimes against current ones,
and ``_trigger_stale_code_restart()`` triggers a graceful restart.
"""

import os
import time
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from gateway.run import (
    GatewayRunner,
    _compute_repo_mtime,
    _STALE_CODE_SENTINELS,
)


def _make_tmp_repo(tmp_path: Path) -> Path:
    """Populate ``tmp_path`` with a placeholder file for every sentinel path.

    Each entry of ``_STALE_CODE_SENTINELS`` is treated as a path relative to
    ``tmp_path``; missing parent directories are created first.  The same
    ``tmp_path`` is returned so callers can use it as the fake repo root.
    """
    for sentinel_rel in _STALE_CODE_SENTINELS:
        sentinel_file = tmp_path / sentinel_rel
        sentinel_file.parent.mkdir(parents=True, exist_ok=True)
        sentinel_file.write_text("# test sentinel\n")
    return tmp_path


def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float):
    """Build a ``GatewayRunner`` that skips ``__init__`` entirely.

    Only the attributes used by the stale-code tests are populated: the
    repo root, the boot wall-clock time, the boot mtime snapshot, an empty
    notification set, and a cleared restart-triggered flag.
    """
    stale_check_state = {
        "_repo_root_for_staleness": repo_root,
        "_boot_wall_time": boot_wall,
        "_boot_repo_mtime": boot_mtime,
        "_stale_code_notified": set(),
        "_stale_code_restart_triggered": False,
    }
    # object.__new__ allocates the instance without running __init__.
    bare_runner = object.__new__(GatewayRunner)
    for attr_name, attr_value in stale_check_state.items():
        setattr(bare_runner, attr_name, attr_value)
    return bare_runner


def test_compute_repo_mtime_returns_newest(tmp_path):
    """The reported mtime tracks the most recently modified sentinel."""
    fake_repo = _make_tmp_repo(tmp_path)

    # Push every sentinel 100s into the past so exactly one can stand out.
    old_stamp = time.time() - 100
    for sentinel_rel in _STALE_CODE_SENTINELS:
        os.utime(fake_repo / sentinel_rel, (old_stamp, old_stamp))

    # Bring a single sentinel back to "now".
    fresh_stamp = time.time()
    os.utime(fake_repo / "hermes_cli/config.py", (fresh_stamp, fresh_stamp))

    # Allow up to 1s of difference for filesystem mtime resolution.
    drift = abs(_compute_repo_mtime(fake_repo) - fresh_stamp)
    assert drift < 1.0


def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
    """A directory with no sentinel files yields an mtime of exactly 0.0."""
    # tmp_path is freshly created and empty, so no sentinel can be found.
    empty_repo_mtime = _compute_repo_mtime(tmp_path)
    assert empty_repo_mtime == 0.0


def test_compute_repo_mtime_partial_files_still_works(tmp_path):
    """With only one sentinel on disk, that file's mtime is what comes back."""
    package_dir = tmp_path / "hermes_cli"
    package_dir.mkdir()

    lone_sentinel = package_dir / "config.py"
    lone_sentinel.write_text("# partial\n")

    # Pin the file to a known timestamp 50s in the past.
    pinned_stamp = time.time() - 50
    os.utime(lone_sentinel, (pinned_stamp, pinned_stamp))

    drift = abs(_compute_repo_mtime(tmp_path) - pinned_stamp)
    assert drift < 1.0


def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path):
    """A zeroed boot snapshot must never be reported as stale."""
    fake_repo = _make_tmp_repo(tmp_path)
    # boot_mtime == 0.0 models "no snapshot was taken at boot".
    unsnapshotted = _make_runner(fake_repo, boot_mtime=0.0, boot_wall=0.0)

    verdict = unsnapshotted._detect_stale_code()
    assert verdict is False


def test_detect_stale_code_false_when_files_unchanged(tmp_path):
    """Sentinels whose mtime equals the boot snapshot are not stale."""
    fake_repo = _make_tmp_repo(tmp_path)

    # Give every sentinel the exact timestamp used as the boot snapshot.
    boot_stamp = time.time() - 100
    for sentinel_rel in _STALE_CODE_SENTINELS:
        os.utime(fake_repo / sentinel_rel, (boot_stamp, boot_stamp))

    runner = _make_runner(fake_repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)
    verdict = runner._detect_stale_code()
    assert verdict is False


def test_detect_stale_code_true_after_update(tmp_path):
    """A sentinel modified well after the boot snapshot is flagged stale."""
    fake_repo = _make_tmp_repo(tmp_path)

    boot_stamp = time.time() - 100
    for sentinel_rel in _STALE_CODE_SENTINELS:
        os.utime(fake_repo / sentinel_rel, (boot_stamp, boot_stamp))

    runner = _make_runner(fake_repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)

    # Stand-in for an update rewriting config.py: its mtime jumps to "now",
    # roughly 100s after the boot snapshot.
    updated_stamp = time.time()
    os.utime(fake_repo / "hermes_cli/config.py", (updated_stamp, updated_stamp))

    verdict = runner._detect_stale_code()
    assert verdict is True


def test_detect_stale_code_ignores_subsecond_drift(tmp_path):
    """Small mtime drift is tolerated; a larger jump is reported as stale."""
    fake_repo = _make_tmp_repo(tmp_path)

    boot_stamp = time.time() - 100
    for sentinel_rel in _STALE_CODE_SENTINELS:
        os.utime(fake_repo / sentinel_rel, (boot_stamp, boot_stamp))

    runner = _make_runner(fake_repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)
    config_file = fake_repo / "hermes_cli/config.py"

    # (seconds past the boot snapshot, expected verdict):
    #   +1s sits inside the 2s slack -> not stale
    #   +5s is beyond it             -> stale
    scenarios = ((1.0, False), (5.0, True))
    for offset, expected in scenarios:
        shifted = boot_stamp + offset
        os.utime(config_file, (shifted, shifted))
        assert runner._detect_stale_code() is expected


def test_trigger_stale_code_restart_is_idempotent(tmp_path):
    """Repeated triggers result in exactly one restart request."""
    fake_repo = _make_tmp_repo(tmp_path)
    runner = _make_runner(fake_repo, boot_mtime=1.0, boot_wall=1.0)

    recorded_requests = []

    def fake_request_restart(*, detached=False, via_service=False):
        # Record each invocation so the test can count them afterwards.
        recorded_requests.append((detached, via_service))
        return True

    runner.request_restart = fake_request_restart

    # Fire the trigger three times back to back.
    for _ in range(3):
        runner._trigger_stale_code_restart()

    assert len(recorded_requests) == 1
    assert runner._stale_code_restart_triggered is True


def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
    """A raising ``request_restart`` is swallowed and the flag is still set."""
    fake_repo = _make_tmp_repo(tmp_path)
    runner = _make_runner(fake_repo, boot_mtime=1.0, boot_wall=1.0)

    def failing_request_restart(*, detached=False, via_service=False):
        raise RuntimeError("no event loop")

    runner.request_restart = failing_request_restart

    # The call itself is the first assertion: it must not propagate the error.
    runner._trigger_stale_code_restart()

    # The flag is expected to be set even though the request failed, so the
    # trigger is not retried on later calls.
    triggered = runner._stale_code_restart_triggered
    assert triggered is True


def test_detect_stale_code_handles_disappearing_repo_root(tmp_path):
    """Sentinels deleted after boot must not be reported as stale code."""
    fake_repo = _make_tmp_repo(tmp_path)

    boot_stamp = time.time() - 100
    for sentinel_rel in _STALE_CODE_SENTINELS:
        os.utime(fake_repo / sentinel_rel, (boot_stamp, boot_stamp))

    runner = _make_runner(fake_repo, boot_mtime=boot_stamp, boot_wall=boot_stamp)

    # Delete every sentinel so there is nothing left to stat.
    for sentinel_rel in _STALE_CODE_SENTINELS:
        doomed = fake_repo / sentinel_rel
        doomed.unlink(missing_ok=True)

    verdict = runner._detect_stale_code()
    assert verdict is False


def test_class_level_defaults_prevent_uninitialized_access():
    """A runner created via ``object.__new__`` can still run the stale check."""
    half_built = object.__new__(GatewayRunner)
    # Only the repo root is assigned on the instance; the boot snapshot
    # attributes and the restart flag are deliberately left unset so that
    # attribute lookup has to fall back to whatever the class provides.
    half_built._repo_root_for_staleness = Path(".")

    verdict = half_built._detect_stale_code()
    assert verdict is False

    restart_flag = half_built._stale_code_restart_triggered
    assert restart_flag is False


def test_init_captures_boot_snapshot(monkeypatch, tmp_path):
    """Replay the stale-check init block against a stubbed mtime function.

    ``GatewayRunner.__init__`` is not executed here.  The attribute
    assignments below are written out by hand on a bare instance, with
    ``_compute_repo_mtime`` replaced by a stub that records its argument
    and returns a fixed value.
    """
    from gateway import run as run_mod

    observed = {}

    def stub_compute(repo_root):
        # Remember which root was asked about and hand back a canned mtime.
        observed["repo_root"] = repo_root
        return 1234567890.0

    monkeypatch.setattr(run_mod, "_compute_repo_mtime", stub_compute)

    # Two directory levels above gateway/run.py.
    expected_root = Path(run_mod.__file__).resolve().parents[1]

    runner = object.__new__(GatewayRunner)
    runner._boot_wall_time = time.time()
    runner._repo_root_for_staleness = expected_root
    # Looked up through the module so the monkeypatched stub is the one called.
    runner._boot_repo_mtime = run_mod._compute_repo_mtime(
        runner._repo_root_for_staleness
    )
    runner._stale_code_notified = set()
    runner._stale_code_restart_triggered = False

    assert runner._boot_repo_mtime == 1234567890.0
    assert observed["repo_root"] == runner._repo_root_for_staleness
    assert runner._boot_wall_time > 0
fix(gateway): auto-restart when source files change out from under us (#17648) (#18409)

Long-running gateway processes that survive 'hermes update' keep pre-update modules cached in sys.modules. When new tool files on disk then try to 'from hermes_cli.config import cfg_get' (added in PR #17304), the import resolves against the stale module object and raises ImportError — hitting users on Matrix, Telegram, Feishu, and other platforms.

Two defenses:

1. Gateway self-check (gateway/run.py). On __init__, snapshot the newest mtime across sentinel source files (hermes_cli/config.py, run_agent.py, gateway/run.py, etc.). On every inbound message, re-read those mtimes; if any is newer than boot time + 2s slack, request a graceful restart via the normal drain path and return a one-line ack to the user. Idempotent, works regardless of how the update happened (hermes update, manual git pull, installer).

2. Post-restart survivor sweep ('hermes update'). After the existing restart loop, sleep 3s, rescan for gateway PIDs we already tried to kill, and SIGKILL any survivors. The detached profile watchers and systemd then relaunch with fresh code instead of waiting out the 120s watcher timeout.

Closes #17648.

2026-05-01 09:50:08 -07:00			`"""Tests for the gateway stale-code self-check (Issue #17648).`

			A gateway that survives ``hermes update`` keeps pre-update modules cached
			in ``sys.modules``. Later imports of names added post-update (e.g.
			``cfg_get`` from PR #17304) raise ImportError against the stale module
			object. The self-check in ``GatewayRunner._detect_stale_code()`` detects
			`this by comparing boot-time sentinel-file mtimes against current ones,`
			and ``_trigger_stale_code_restart()`` triggers a graceful restart.
			`"""`

			`import os`
			`import time`
			`from pathlib import Path`
			`from unittest.mock import MagicMock, patch`

			`import pytest`

			`from gateway.run import (`
			`GatewayRunner,`
			`_compute_repo_mtime,`
			`_STALE_CODE_SENTINELS,`
			`)`


			`def _make_tmp_repo(tmp_path: Path) -> Path:`
			`"""Create a fake repo with all stale-code sentinel files."""`
			`for rel in _STALE_CODE_SENTINELS:`
			`p = tmp_path / rel`
			`p.parent.mkdir(parents=True, exist_ok=True)`
			`p.write_text("# test sentinel\n")`
			`return tmp_path`


			`def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float):`
			`"""Bare GatewayRunner with just the stale-check attributes set."""`
			`runner = object.__new__(GatewayRunner)`
			`runner._repo_root_for_staleness = repo_root`
			`runner._boot_wall_time = boot_wall`
			`runner._boot_repo_mtime = boot_mtime`
			`runner._stale_code_notified = set()`
			`runner._stale_code_restart_triggered = False`
			`return runner`


			`def test_compute_repo_mtime_returns_newest(tmp_path):`
			`"""_compute_repo_mtime returns the newest mtime across sentinel files."""`
			`repo = _make_tmp_repo(tmp_path)`

			`# Stamp a baseline mtime across all sentinels`
			`baseline = time.time() - 100`
			`for rel in _STALE_CODE_SENTINELS:`
			`os.utime(repo / rel, (baseline, baseline))`

			`# Touch one file forward`
			`newer = time.time()`
			`os.utime(repo / "hermes_cli/config.py", (newer, newer))`

			`result = _compute_repo_mtime(repo)`
			`assert abs(result - newer) < 1.0 # within 1s (filesystem mtime resolution)`


			`def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):`
			`"""Missing sentinel files return 0.0 (treated as 'can't tell' upstream)."""`
			`# tmp_path has none of the sentinels`
			`assert _compute_repo_mtime(tmp_path) == 0.0`


			`def test_compute_repo_mtime_partial_files_still_works(tmp_path):`
			`"""Partial sentinel presence still returns newest of the readable ones."""`
			`(tmp_path / "hermes_cli").mkdir()`
			`target = tmp_path / "hermes_cli" / "config.py"`
			`target.write_text("# partial\n")`
			`target_mtime = time.time() - 50`
			`os.utime(target, (target_mtime, target_mtime))`

			`result = _compute_repo_mtime(tmp_path)`
			`assert abs(result - target_mtime) < 1.0`


			`def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path):`
			`"""No boot snapshot → can't tell → not stale (no restart loop)."""`
			`repo = _make_tmp_repo(tmp_path)`
			`runner = _make_runner(repo, boot_mtime=0.0, boot_wall=0.0)`
			`assert runner._detect_stale_code() is False`


			`def test_detect_stale_code_false_when_files_unchanged(tmp_path):`
			`"""Source files at boot mtime → not stale."""`
			`repo = _make_tmp_repo(tmp_path)`
			`# Freeze all sentinels to the same mtime`
			`baseline = time.time() - 100`
			`for rel in _STALE_CODE_SENTINELS:`
			`os.utime(repo / rel, (baseline, baseline))`

			`runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)`
			`assert runner._detect_stale_code() is False`


			`def test_detect_stale_code_true_after_update(tmp_path):`
			`"""Sentinel files newer than boot snapshot → stale."""`
			`repo = _make_tmp_repo(tmp_path)`
			`baseline = time.time() - 100`
			`for rel in _STALE_CODE_SENTINELS:`
			`os.utime(repo / rel, (baseline, baseline))`

			`runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)`

			`# Simulate hermes update touching config.py`
			`new_mtime = time.time()`
			`os.utime(repo / "hermes_cli/config.py", (new_mtime, new_mtime))`

			`assert runner._detect_stale_code() is True`


			`def test_detect_stale_code_ignores_subsecond_drift(tmp_path):`
			`"""2-second slack prevents false positives on coarse-mtime filesystems."""`
			`repo = _make_tmp_repo(tmp_path)`
			`baseline = time.time() - 100`
			`for rel in _STALE_CODE_SENTINELS:`
			`os.utime(repo / rel, (baseline, baseline))`

			`runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)`

			`# Touch config.py 1s newer — within the 2s slack → not stale`
			`os.utime(repo / "hermes_cli/config.py", (baseline + 1.0, baseline + 1.0))`
			`assert runner._detect_stale_code() is False`

			`# Touch 5s newer → stale`
			`os.utime(repo / "hermes_cli/config.py", (baseline + 5.0, baseline + 5.0))`
			`assert runner._detect_stale_code() is True`


			`def test_trigger_stale_code_restart_is_idempotent(tmp_path):`
			`"""Calling _trigger_stale_code_restart twice only requests restart once."""`
			`repo = _make_tmp_repo(tmp_path)`
			`runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0)`

			`calls = []`

			`def fake_request_restart(*, detached=False, via_service=False):`
			`calls.append((detached, via_service))`
			`return True`

			`runner.request_restart = fake_request_restart`

			`runner._trigger_stale_code_restart()`
			`runner._trigger_stale_code_restart()`
			`runner._trigger_stale_code_restart()`

			`assert len(calls) == 1`
			`assert runner._stale_code_restart_triggered is True`


			`def test_trigger_stale_code_restart_survives_request_failure(tmp_path):`
			`"""If request_restart raises, we swallow and mark as triggered anyway."""`
			`repo = _make_tmp_repo(tmp_path)`
			`runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0)`

			`def boom(*, detached=False, via_service=False):`
			`raise RuntimeError("no event loop")`

			`runner.request_restart = boom`

			`# Should not raise`
			`runner._trigger_stale_code_restart()`

			`# Marked triggered so we don't retry on every subsequent message`
			`assert runner._stale_code_restart_triggered is True`


			`def test_detect_stale_code_handles_disappearing_repo_root(tmp_path):`
			`"""If the repo root vanishes after boot, return False (don't loop)."""`
			`repo = _make_tmp_repo(tmp_path)`
			`baseline = time.time() - 100`
			`for rel in _STALE_CODE_SENTINELS:`
			`os.utime(repo / rel, (baseline, baseline))`

			`runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)`

			`# Remove all sentinel files — _compute_repo_mtime returns 0.0`
			`for rel in _STALE_CODE_SENTINELS:`
			`(repo / rel).unlink(missing_ok=True)`

			`assert runner._detect_stale_code() is False`


			`def test_class_level_defaults_prevent_uninitialized_access():`
			`"""Partial construction via object.__new__ must not crash _detect_stale_code."""`
			`runner = object.__new__(GatewayRunner)`
			`# Don't set any instance attrs — class-level defaults should kick in`
			`runner._repo_root_for_staleness = Path(".")`
			`# _boot_wall_time / _boot_repo_mtime fall through to class defaults (0.0)`
			`assert runner._detect_stale_code() is False`
			`# _stale_code_restart_triggered falls through to class default (False)`
			`assert runner._stale_code_restart_triggered is False`


			`def test_init_captures_boot_snapshot(monkeypatch, tmp_path):`
			`"""GatewayRunner.__init__ captures a usable stale-code baseline."""`
			`# Stub out the heavy parts of __init__ we don't need. We only want`
			`# to prove the stale-code snapshot is captured before anything else.`
			`from gateway import run as run_mod`

			`calls = {}`

			`def fake_compute(repo_root):`
			`calls["repo_root"] = repo_root`
			`return 1234567890.0`

			`monkeypatch.setattr(run_mod, "_compute_repo_mtime", fake_compute)`

			`# Build a runner without running the full __init__ — then manually`
			`# exercise the stale-check init block that __init__ contains.`
			`runner = object.__new__(GatewayRunner)`
			`runner._boot_wall_time = time.time()`
			`runner._repo_root_for_staleness = Path(run_mod.__file__).resolve().parent.parent`
			`runner._boot_repo_mtime = run_mod._compute_repo_mtime(runner._repo_root_for_staleness)`
			`runner._stale_code_notified = set()`
			`runner._stale_code_restart_triggered = False`

			`assert runner._boot_repo_mtime == 1234567890.0`
			`assert calls["repo_root"] == runner._repo_root_for_staleness`
			`assert runner._boot_wall_time > 0`