Compare commits

...

2 Commits

Author SHA1 Message Date
Teknium
f2332d4f13 revert(gateway): remove stale-code self-check and auto-restart
Removes the _detect_stale_code / _trigger_stale_code_restart mechanism
introduced in #17648 and iterated in #19740. On every incoming message
the gateway compared the boot-time git HEAD SHA to the current SHA on
disk, and if they differed it would reply with

    Gateway code was updated in the background --
    restarting this gateway so your next message runs
    on the new code. Please retry in a moment.

and then kick off a graceful restart. This is unwanted behaviour:
users who run a long-lived gateway and do their own ad-hoc git
operations on the checkout end up with their chat interrupted and
the current message dropped every time HEAD moves, with no way to
opt out.

If an operator really needs the old protection against stale
sys.modules after "hermes update", the SIGKILL-survivor sweep in
hermes update (hermes_cli/main.py, also tagged #17648) already
handles the supervisor-respawn case on its own.

Removed:
  gateway/run.py:
    - _STALE_CODE_SENTINELS, _GIT_SHA_CACHE_TTL_SECS
    - _read_git_head_sha(), _compute_repo_mtime() module helpers
    - class-level _boot_wall_time / _boot_repo_mtime / _boot_git_sha /
      _stale_code_restart_triggered defaults
    - __init__ boot-snapshot block (_boot_*, _cached_current_sha*,
      _repo_root_for_staleness, _stale_code_notified)
    - _current_git_sha_cached(), _detect_stale_code(),
      _trigger_stale_code_restart() methods
    - stale-code check + user-facing restart notice at the top of
      _handle_message()
  tests/gateway/test_stale_code_self_check.py (deleted, 412 lines)

No new logic added. Zero remaining references to any removed
symbol. Gateway test suite passes the same 4589 tests it passed
before; the 3 pre-existing unrelated failures (discord free-channel,
feishu bot admission, teams typing) are unchanged by this commit.
2026-05-05 03:44:10 -07:00
Siddharth Balyan
13a7cbcd64 fix(nix): refresh stale tui npmDepsHash + fix cache-blind detection (#20144)
The fix-lockfiles script used 'nix build .#tui.npmDeps' to detect stale
hashes. This always succeeds when the OLD derivation is cached in Cachix
or cache.nixos.org — even when the source package-lock.json has changed.

Fix: use prefetch-npm-deps to compute the hash directly from the lockfile
and compare against what's in the nix file. Falls back to nix build only
if prefetch-npm-deps fails.
2026-05-05 15:32:20 +05:30
4 changed files with 32 additions and 731 deletions

View File

@@ -93,153 +93,6 @@ def _telegramize_command_mentions(text: str, platform: Any) -> str:
_AUTO_CONTINUE_FRESHNESS_SECS_DEFAULT = 60 * 60
# --- Stale-code self-check ------------------------------------------------
# Long-running gateway processes that survive an ``hermes update`` keep the
# old ``hermes_cli.config`` (and friends) cached in ``sys.modules``. When
# the updated tool files on disk then try to ``from hermes_cli.config
# import cfg_get`` (added in PR #17304), the import resolves against the
# already-loaded stale module object and raises ``ImportError`` — see
# Issue #17648. Rather than papering over the import failure site-by-site
# in every tool file, detect the stale state centrally and auto-restart
# so the gateway reloads with fresh code.
#
# The signal we use is ``git rev-parse HEAD`` — the only thing ``hermes
# update`` moves that is NOT moved by agent-driven file edits. Earlier
# revisions of this check compared file mtimes across a sentinel set
# (run_agent.py, gateway/run.py, ...), but that produced false positives
# whenever the agent edited its own source files during a session:
# mtime jumps, stale-check fires, gateway restarts, user must retype.
# See the conversation at PR #<this> for the motivating incident.
#
# The legacy mtime sentinels are kept ONLY as a last-resort fallback for
# non-git installs (pip install from wheel, sparse clones with no .git
# dir). In those environments ``hermes update`` is not a supported path,
# so the check effectively no-ops — which is the safe behavior: better
# to ship one broken import than to restart on every agent-edit.
_STALE_CODE_SENTINELS: tuple[str, ...] = (
"hermes_cli/config.py",
"hermes_cli/__init__.py",
"run_agent.py",
"gateway/run.py",
"pyproject.toml",
)
# Cache git HEAD reads across consecutive messages so a chat burst doesn't
# spawn one subprocess per message. 5s is long enough to collapse a burst
# and short enough that the real post-update detection still fires within
# the user's perceived "next message" window.
_GIT_SHA_CACHE_TTL_SECS = 5.0
def _read_git_head_sha(repo_root: Path) -> Optional[str]:
"""Return the git HEAD SHA for ``repo_root``, or None if unavailable.
Reads ``.git/HEAD`` directly (and follows one level of ref) instead
of shelling out to ``git`` — cheaper, no subprocess tax, works on
gateway hosts that don't have a ``git`` binary on PATH. Returns
None for non-git installs (no ``.git`` dir) or any I/O error; callers
treat None as "can't tell" and skip the check.
Supports the three layouts we care about:
1. Main checkout: ``<repo>/.git/`` is a directory.
2. Git worktree: ``<repo>/.git`` is a file ``gitdir: <path>`` that
points at ``<main>/.git/worktrees/<name>/``. The worktree's
gitdir has HEAD + index but NOT refs/heads/ — those live in
the main checkout, and ``<worktree-gitdir>/commondir`` points
at the main ``.git``. We search both locations for refs.
3. Packed refs: ``refs/heads/<branch>`` is absent on disk but
listed in ``<main-git-dir>/packed-refs``.
"""
try:
git_dir = repo_root / ".git"
# Worktrees store ``.git`` as a file pointing at gitdir: <path>
if git_dir.is_file():
try:
content = git_dir.read_text().strip()
if content.startswith("gitdir:"):
git_dir = Path(content.split(":", 1)[1].strip())
if not git_dir.is_absolute():
git_dir = (repo_root / git_dir).resolve()
except OSError:
return None
if not git_dir.is_dir():
return None
# Figure out the "common" git dir — the one that owns shared refs.
# For a worktree, commondir points at it (relative path, resolve
# against git_dir). For a main checkout, common_dir == git_dir.
common_dir = git_dir
commondir_file = git_dir / "commondir"
if commondir_file.is_file():
try:
rel = commondir_file.read_text().strip()
candidate = (git_dir / rel).resolve() if rel else git_dir
if candidate.is_dir():
common_dir = candidate
except OSError:
pass
head_path = git_dir / "HEAD"
if not head_path.is_file():
return None
head_content = head_path.read_text().strip()
if head_content.startswith("ref:"):
# Symbolic ref — follow one level (e.g. ref: refs/heads/main).
# Worktree-local refs (bisect, rebase-merge state) live under
# git_dir; shared refs (refs/heads/*, refs/tags/*) live under
# common_dir. Try git_dir first, then common_dir.
ref_rel = head_content.split(":", 1)[1].strip()
for base in (git_dir, common_dir) if git_dir != common_dir else (git_dir,):
ref_path = base / ref_rel
if ref_path.is_file():
try:
sha = ref_path.read_text().strip()
except OSError:
continue
if sha:
return sha
# Packed refs fallback — always stored in the common dir.
packed = common_dir / "packed-refs"
if packed.is_file():
try:
for line in packed.read_text().splitlines():
line = line.strip()
if not line or line.startswith("#") or line.startswith("^"):
continue
parts = line.split(None, 1)
if len(parts) == 2 and parts[1] == ref_rel:
return parts[0] or None
except OSError:
return None
return None
# Detached HEAD — content is the SHA directly.
return head_content or None
except Exception:
return None
def _compute_repo_mtime(repo_root: Path) -> float:
"""Return the newest mtime across the stale-code sentinel files.
Legacy fallback used only for non-git installs (``.git`` missing).
Missing files are ignored (they may not exist on older checkouts).
Returns 0.0 if no sentinel file is readable — treat that as "can't
tell", which downstream callers interpret as "not stale" to avoid
false-positive restart loops.
"""
newest = 0.0
for rel in _STALE_CODE_SENTINELS:
try:
st = (repo_root / rel).stat()
except (OSError, FileNotFoundError):
continue
if st.st_mtime > newest:
newest = st.st_mtime
return newest
def _coerce_gateway_timestamp(value: Any) -> Optional[float]:
"""Best-effort conversion of stored gateway timestamps to epoch seconds.
@@ -1107,13 +960,6 @@ class GatewayRunner:
_stop_task: Optional[asyncio.Task] = None
_session_model_overrides: Dict[str, Dict[str, str]] = {}
_session_reasoning_overrides: Dict[str, Dict[str, Any]] = {}
# Stale-code self-check defaults (see _detect_stale_code()). Class-level
# so tests that construct GatewayRunner via ``object.__new__`` without
# running __init__ don't crash when _handle_message reads these.
_boot_wall_time: float = 0.0
_boot_repo_mtime: float = 0.0
_boot_git_sha: Optional[str] = None
_stale_code_restart_triggered: bool = False
def __init__(self, config: Optional[GatewayConfig] = None):
global _gateway_runner_ref
@@ -1122,30 +968,6 @@ class GatewayRunner:
self._warn_if_docker_media_delivery_is_risky()
_gateway_runner_ref = _weakref.ref(self)
# Boot-time snapshot used by the stale-code self-check. Captured
# before any work happens so post-update file writes are guaranteed
# to have newer mtimes. See _detect_stale_code() / Issue #17648.
try:
self._boot_wall_time: float = time.time()
self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
self._boot_git_sha: Optional[str] = _read_git_head_sha(
self._repo_root_for_staleness,
)
self._boot_repo_mtime: float = _compute_repo_mtime(
self._repo_root_for_staleness,
)
except Exception:
self._boot_wall_time = 0.0
self._repo_root_for_staleness = Path(".")
self._boot_git_sha = None
self._boot_repo_mtime = 0.0
self._stale_code_notified: set[str] = set()
self._stale_code_restart_triggered: bool = False
# Cached current-SHA read, refreshed at most every
# _GIT_SHA_CACHE_TTL_SECS so bursty chats don't hammer the filesystem.
self._cached_current_sha: Optional[str] = self._boot_git_sha
self._cached_current_sha_at: float = self._boot_wall_time
# Load ephemeral config from config.yaml / env vars.
# Both are injected at API-call time only and never persisted.
self._prefill_messages = self._load_prefill_messages()
@@ -2853,101 +2675,6 @@ class GatewayRunner:
task.add_done_callback(self._background_tasks.discard)
return True
def _current_git_sha_cached(self) -> Optional[str]:
"""Return the current HEAD SHA, cached for _GIT_SHA_CACHE_TTL_SECS.
A bursty chat (user mashes "hello?" three times) would otherwise
re-read ``.git/HEAD`` on every message. Caching collapses that
into a single read and still re-checks within the user's
perceived "next message" window.
"""
now = time.time()
if (
self._cached_current_sha is not None
and (now - self._cached_current_sha_at) < _GIT_SHA_CACHE_TTL_SECS
):
return self._cached_current_sha
try:
sha = _read_git_head_sha(self._repo_root_for_staleness)
except Exception:
sha = None
self._cached_current_sha = sha
self._cached_current_sha_at = now
return sha
def _detect_stale_code(self) -> bool:
"""Return True if the git HEAD moved since this process booted.
A gateway that survives ``hermes update`` (manual SIGTERM never
escalated, systemd restart race, detached-process respawn failed,
etc.) keeps pre-update modules cached in ``sys.modules``. Later
imports of names added post-update — e.g. ``cfg_get`` from PR
#17304 — raise ImportError against the stale module object (see
Issue #17648).
We compare the git HEAD SHA at boot to the current SHA on disk.
``hermes update`` always moves HEAD forward via ``git pull``;
agent file edits (the agent patching ``run_agent.py`` or
``gateway/run.py`` during a self-dev session) never move HEAD.
That makes SHA comparison free of the false-positive class that
the old mtime check suffered from — the agent can edit any file
without triggering a phantom restart.
Returns False when:
- the boot SHA is unavailable (non-git install, first call
during partial init, etc.); we can't tell and refuse to loop
- the current SHA matches the boot SHA
- reading the current SHA fails for any reason
"""
if not self._boot_wall_time:
return False
if not self._boot_git_sha:
# Non-git install. ``hermes update`` is git-based, so a
# non-git install can't experience the stale-modules class
# this check exists to catch. Return False — no check, no
# false positives. (If we ever ship a pip-install update
# path, we'd add a persistent update marker here and compare
# its timestamp to self._boot_wall_time.)
return False
try:
current = self._current_git_sha_cached()
except Exception:
return False
if not current:
return False
return current != self._boot_git_sha
def _trigger_stale_code_restart(self) -> None:
"""Idempotently kick off a graceful restart after stale-code detection.
Runs at most once per process. The restart request goes through
the normal drain path so in-flight agent turns finish before the
process exits; the service manager (systemd / launchd / detached
profile watcher) then respawns with fresh code. On manual
``hermes gateway run`` installs without a supervisor, the
process exits and the user must restart by hand — but they get a
user-visible message telling them so.
"""
if self._stale_code_restart_triggered:
return
self._stale_code_restart_triggered = True
current_sha = None
try:
current_sha = self._current_git_sha_cached()
except Exception:
pass
logger.warning(
"Stale-code self-check: git HEAD moved since gateway boot "
"(boot=%s, current=%s) — requesting graceful restart. "
"See Issue #17648.",
(self._boot_git_sha or "?")[:12],
(current_sha or "?")[:12],
)
try:
self.request_restart(detached=False, via_service=True)
except Exception as exc:
logger.error("Stale-code restart request failed: %s", exc)
async def start(self) -> bool:
"""
Start the gateway and all configured platform adapters.
@@ -4873,27 +4600,6 @@ class GatewayRunner:
"""
source = event.source
# Stale-code self-check (Issue #17648). A gateway that survives
# ``hermes update`` keeps old modules cached in sys.modules; the
# first inbound message is our earliest safe chance to detect
# this and restart gracefully before we dispatch to the agent
# and hit ImportError on freshly-added names (e.g. cfg_get).
# Idempotent — runs the real check at most once per message, and
# request_restart() no-ops after the first call.
try:
if self._detect_stale_code():
self._trigger_stale_code_restart()
# Acknowledge to the user so they don't see a silent
# drop; the gateway will be back up in a moment via the
# service manager / profile-watcher respawn.
return (
"⟳ Gateway code was updated in the background — "
"restarting this gateway so your next message runs "
"on the new code. Please retry in a moment."
)
except Exception as _stale_exc:
logger.debug("Stale-code self-check failed: %s", _stale_exc)
# Internal events (e.g. background-process completion notifications)
# are system-generated and must skip user authorization.
is_internal = bool(getattr(event, "internal", False))

View File

@@ -163,21 +163,22 @@
for entry in "''${ENTRIES[@]}"; do
IFS=":" read -r ATTR FOLDER NIX_FILE <<< "$entry"
echo "==> .#$ATTR ($FOLDER -> $NIX_FILE)"
# Compute the actual hash from the lockfile directly using
# prefetch-npm-deps. This avoids false "ok" from nix build when
# an old derivation is cached in a substituter (cachix/cache.nixos.org).
LOCK_FILE="$FOLDER/package-lock.json"
NEW_HASH=$(${pkgs.lib.getExe pkgs.prefetch-npm-deps} "$LOCK_FILE" 2>/dev/null)
if [ -z "$NEW_HASH" ]; then
echo " prefetch-npm-deps failed, falling back to nix build" >&2
OUTPUT=$(nix build ".#$ATTR.npmDeps" --no-link --print-build-logs 2>&1)
STATUS=$?
if [ "$STATUS" -eq 0 ]; then
echo " ok"
echo " ok (via nix build)"
continue
fi
NEW_HASH=$(echo "$OUTPUT" | awk '/got:/ {print $2; exit}')
if [ -z "$NEW_HASH" ]; then
# Magic-Nix-Cache occasionally returns HTTP 418 / cache-throttled
# mid-run; nix then prints "outputs not valid, so checking is
# not possible" without a `got:` line. That's an infrastructure
# blip, not a stale lockfile warn + skip rather than failing
# the lint. A real hash mismatch would still surface in the
# primary `.#$ATTR` build, which is a separate CI job.
if echo "$OUTPUT" | grep -qE "throttled|HTTP error 418|substituter .* is disabled|some outputs of .* are not valid"; then
echo " skipped (transient cache failure see primary nix build for real status)" >&2
echo "$OUTPUT" | tail -8 >&2
@@ -187,11 +188,17 @@
echo "$OUTPUT" | tail -40 >&2
exit 1
fi
fi
HASH_LINE=$(grep -n 'hash = "sha256-' "$NIX_FILE" | head -1 | cut -d: -f1)
OLD_HASH=$(grep -oE 'hash = "sha256-[^"]+"' "$NIX_FILE" | head -1 \
| sed -E 's/hash = "(.*)"/\1/')
LOCK_FILE="$FOLDER/package-lock.json"
if [ "$NEW_HASH" = "$OLD_HASH" ]; then
echo " ok"
continue
fi
HASH_LINE=$(grep -n 'hash = "sha256-' "$NIX_FILE" | head -1 | cut -d: -f1)
echo " stale: $NIX_FILE:$HASH_LINE $OLD_HASH -> $NEW_HASH"
STALE=1

View File

@@ -4,7 +4,7 @@ let
src = ../ui-tui;
npmDeps = pkgs.fetchNpmDeps {
inherit src;
hash = "sha256-a/HGI9OgVcTnZrMXA7xFMGnFoVxyHe95fulVz+WNYB0=";
hash = "sha256-MLcLhjTF6dgdvNBtJWzo8Nh19eNh/ZitD2b07nm61Tc=";
};
npm = hermesNpmLib.mkNpmPassthru { folder = "ui-tui"; attr = "tui"; pname = "hermes-tui"; };

View File

@@ -1,412 +0,0 @@
"""Tests for the gateway stale-code self-check (Issue #17648).
A gateway that survives ``hermes update`` keeps pre-update modules cached
in ``sys.modules``. Later imports of names added post-update (e.g.
``cfg_get`` from PR #17304) raise ImportError against the stale module
object.
The self-check compares the git HEAD SHA at boot to the current SHA on
disk. ``hermes update`` always moves HEAD forward via ``git pull``;
agent-driven file edits (Hermes editing ``run_agent.py`` / ``gateway/run.py``
during a self-dev session) never move HEAD — so the SHA signal is free of
the false-positive class that the earlier mtime-based check suffered from.
"""
import os
import time
from pathlib import Path
import pytest
from gateway.run import (
GatewayRunner,
_compute_repo_mtime,
_read_git_head_sha,
_STALE_CODE_SENTINELS,
_GIT_SHA_CACHE_TTL_SECS,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_tmp_repo(tmp_path: Path) -> Path:
"""Create a fake repo with all stale-code sentinel files."""
for rel in _STALE_CODE_SENTINELS:
p = tmp_path / rel
p.parent.mkdir(parents=True, exist_ok=True)
p.write_text("# test sentinel\n")
return tmp_path
def _make_git_repo(tmp_path: Path, sha: str = "a" * 40, branch: str = "main") -> Path:
"""Stamp a minimal .git directory so _read_git_head_sha can resolve a SHA.
We don't run real git — just lay down the files the reader walks
(.git/HEAD pointing at refs/heads/<branch>, refs/heads/<branch>
containing the SHA).
"""
git_dir = tmp_path / ".git"
git_dir.mkdir(parents=True, exist_ok=True)
(git_dir / "HEAD").write_text(f"ref: refs/heads/{branch}\n")
refs_dir = git_dir / "refs" / "heads"
refs_dir.mkdir(parents=True, exist_ok=True)
(refs_dir / branch).write_text(f"{sha}\n")
return tmp_path
def _set_head_sha(repo_root: Path, sha: str, branch: str = "main") -> None:
"""Rewrite the current branch ref to a new SHA (simulates git pull)."""
(repo_root / ".git" / "refs" / "heads" / branch).write_text(f"{sha}\n")
def _make_runner(
repo_root: Path,
*,
boot_sha: str | None,
boot_wall: float = None,
boot_mtime: float = 0.0,
):
"""Bare GatewayRunner with just the stale-check attributes set."""
if boot_wall is None:
boot_wall = time.time()
runner = object.__new__(GatewayRunner)
runner._repo_root_for_staleness = repo_root
runner._boot_wall_time = boot_wall
runner._boot_git_sha = boot_sha
runner._boot_repo_mtime = boot_mtime
runner._stale_code_notified = set()
runner._stale_code_restart_triggered = False
runner._cached_current_sha = boot_sha
runner._cached_current_sha_at = boot_wall
return runner
# ---------------------------------------------------------------------------
# _read_git_head_sha — raw SHA reader
# ---------------------------------------------------------------------------
def test_read_git_head_sha_branch_ref(tmp_path):
"""Resolves ref: refs/heads/<branch> → SHA from refs/heads/<branch>."""
sha = "b" * 40
_make_git_repo(tmp_path, sha=sha, branch="main")
assert _read_git_head_sha(tmp_path) == sha
def test_read_git_head_sha_detached_head(tmp_path):
"""Detached HEAD: .git/HEAD contains the SHA directly."""
sha = "c" * 40
git_dir = tmp_path / ".git"
git_dir.mkdir()
(git_dir / "HEAD").write_text(f"{sha}\n")
assert _read_git_head_sha(tmp_path) == sha
def test_read_git_head_sha_packed_refs(tmp_path):
"""Falls back to packed-refs when refs/heads/<branch> is missing."""
sha = "d" * 40
git_dir = tmp_path / ".git"
git_dir.mkdir()
(git_dir / "HEAD").write_text("ref: refs/heads/main\n")
# No refs/heads/main file — only packed-refs
(git_dir / "packed-refs").write_text(
f"# pack-refs with: peeled fully-peeled sorted\n"
f"{sha} refs/heads/main\n"
)
assert _read_git_head_sha(tmp_path) == sha
def test_read_git_head_sha_worktree_gitdir_file(tmp_path):
"""Worktree: .git is a file with `gitdir: <path>` pointing to the real git dir.
Real git worktrees store shared refs (refs/heads/*) in the main
checkout's .git/ and write a ``commondir`` pointer into the
worktree-gitdir. The reader must follow commondir to resolve the
branch ref — this is the layout Hermes dev sessions actually use.
"""
sha = "e" * 40
# Main repo layout
main_repo = tmp_path / "main-repo"
main_git = main_repo / ".git"
(main_git / "refs" / "heads").mkdir(parents=True)
(main_git / "HEAD").write_text("ref: refs/heads/main\n")
(main_git / "refs" / "heads" / "main").write_text("0" * 40 + "\n")
# Worktree lives in main-repo/.git/worktrees/<name>/
worktree_git_dir = main_git / "worktrees" / "feature"
worktree_git_dir.mkdir(parents=True)
(worktree_git_dir / "HEAD").write_text("ref: refs/heads/feature\n")
# commondir points back at the main .git (relative path, "../..")
(worktree_git_dir / "commondir").write_text("../..\n")
# Feature branch ref lives in the shared refs/heads
(main_git / "refs" / "heads" / "feature").write_text(f"{sha}\n")
# Worktree checkout with .git file pointing at worktree_git_dir
worktree = tmp_path / "wt"
worktree.mkdir()
(worktree / ".git").write_text(f"gitdir: {worktree_git_dir}\n")
assert _read_git_head_sha(worktree) == sha
def test_read_git_head_sha_worktree_packed_refs_in_common(tmp_path):
"""Worktree + packed-refs in common dir: fallback still resolves."""
sha = "f" * 40
main_repo = tmp_path / "main-repo"
main_git = main_repo / ".git"
main_git.mkdir(parents=True)
(main_git / "HEAD").write_text("ref: refs/heads/main\n")
# packed-refs in the common (main) .git
(main_git / "packed-refs").write_text(
f"# pack-refs with: peeled fully-peeled sorted\n"
f"{sha} refs/heads/feature\n"
)
worktree_git_dir = main_git / "worktrees" / "feature"
worktree_git_dir.mkdir(parents=True)
(worktree_git_dir / "HEAD").write_text("ref: refs/heads/feature\n")
(worktree_git_dir / "commondir").write_text("../..\n")
worktree = tmp_path / "wt"
worktree.mkdir()
(worktree / ".git").write_text(f"gitdir: {worktree_git_dir}\n")
assert _read_git_head_sha(worktree) == sha
def test_read_git_head_sha_no_git_returns_none(tmp_path):
"""No .git dir → None (non-git install, safely disables the check)."""
assert _read_git_head_sha(tmp_path) is None
def test_read_git_head_sha_malformed_head_returns_none(tmp_path):
"""Empty HEAD file → None (don't loop on corrupt repos)."""
git_dir = tmp_path / ".git"
git_dir.mkdir()
(git_dir / "HEAD").write_text("")
assert _read_git_head_sha(tmp_path) is None
# ---------------------------------------------------------------------------
# _detect_stale_code — the main regression guard
# ---------------------------------------------------------------------------
def test_detect_stale_code_false_when_sha_unchanged(tmp_path):
"""Boot SHA == current SHA → not stale (no restart)."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
# Force fresh read by expiring the cache
runner._cached_current_sha_at = 0.0
assert runner._detect_stale_code() is False
def test_detect_stale_code_true_after_git_pull(tmp_path):
"""Boot SHA != current SHA → stale (hermes update happened)."""
boot_sha = "a" * 40
_make_git_repo(tmp_path, sha=boot_sha)
runner = _make_runner(tmp_path, boot_sha=boot_sha)
# Simulate git pull moving HEAD forward
_set_head_sha(tmp_path, "b" * 40)
runner._cached_current_sha_at = 0.0 # expire cache
assert runner._detect_stale_code() is True
def test_detect_stale_code_ignores_agent_file_edits(tmp_path):
"""THE CORE REGRESSION: agent edits to source files do NOT trigger restart.
This is the motivating incident for the SHA-based check. Under the
previous mtime-based scheme, any ``patch`` / ``write_file`` call
against run_agent.py / gateway/run.py / hermes_cli/config.py would
flip the stale-check to True and force a gateway restart on the
next message — even though no update actually happened. SHA
comparison decouples the two: git HEAD only moves on ``git pull``,
never on file writes.
"""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
_make_tmp_repo(tmp_path) # lay down sentinel files too
runner = _make_runner(tmp_path, boot_sha=sha)
# Simulate the agent editing run_agent.py and gateway/run.py with
# mtimes far into the future — exactly the scenario that used to
# false-positive the old mtime check.
future = time.time() + 10_000
for rel in _STALE_CODE_SENTINELS:
p = tmp_path / rel
if p.is_file():
p.write_text("# agent just edited this\n")
os.utime(p, (future, future))
# HEAD SHA has NOT moved — check must stay False.
runner._cached_current_sha_at = 0.0 # expire cache
assert runner._detect_stale_code() is False
def test_detect_stale_code_false_for_non_git_install(tmp_path):
"""Non-git install (no .git dir) → check disabled, never fires."""
# No .git dir at all; runner's boot_sha is None
runner = _make_runner(tmp_path, boot_sha=None)
# Even if we pretended the current SHA differed, the check should
# short-circuit on boot_sha=None and return False.
assert runner._detect_stale_code() is False
def test_detect_stale_code_false_when_no_boot_wall_time(tmp_path):
"""No boot snapshot at all → can't tell → not stale (no restart loop)."""
runner = _make_runner(tmp_path, boot_sha="a" * 40, boot_wall=0.0)
assert runner._detect_stale_code() is False
def test_detect_stale_code_handles_disappearing_git_dir(tmp_path):
""".git vanishes mid-run → current_sha = None → not stale (don't loop)."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
# Nuke the git dir after boot
import shutil
shutil.rmtree(tmp_path / ".git")
runner._cached_current_sha_at = 0.0 # expire cache
assert runner._detect_stale_code() is False
# ---------------------------------------------------------------------------
# SHA cache
# ---------------------------------------------------------------------------
def test_current_sha_cache_collapses_bursts(tmp_path, monkeypatch):
"""Consecutive calls inside the TTL window reuse the cached SHA."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
read_calls = {"n": 0}
real_reader = _read_git_head_sha
def counting_reader(repo_root):
read_calls["n"] += 1
return real_reader(repo_root)
from gateway import run as run_mod
monkeypatch.setattr(run_mod, "_read_git_head_sha", counting_reader)
# Force cache expiry so the first call definitely reads
runner._cached_current_sha_at = 0.0
runner._current_git_sha_cached()
first_count = read_calls["n"]
# Immediate second/third calls should hit cache (no new read)
runner._current_git_sha_cached()
runner._current_git_sha_cached()
assert read_calls["n"] == first_count
def test_current_sha_cache_expires_after_ttl(tmp_path, monkeypatch):
"""After _GIT_SHA_CACHE_TTL_SECS elapses, a fresh read happens."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
read_calls = {"n": 0}
real_reader = _read_git_head_sha
def counting_reader(repo_root):
read_calls["n"] += 1
return real_reader(repo_root)
from gateway import run as run_mod
monkeypatch.setattr(run_mod, "_read_git_head_sha", counting_reader)
runner._cached_current_sha_at = 0.0
runner._current_git_sha_cached()
first = read_calls["n"]
# Age the cache past the TTL
runner._cached_current_sha_at = time.time() - (_GIT_SHA_CACHE_TTL_SECS + 1.0)
runner._current_git_sha_cached()
assert read_calls["n"] == first + 1
# ---------------------------------------------------------------------------
# _trigger_stale_code_restart — idempotency preserved
# ---------------------------------------------------------------------------
def test_trigger_stale_code_restart_is_idempotent(tmp_path):
"""Calling _trigger_stale_code_restart twice only requests restart once."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
calls = []
def fake_request_restart(*, detached=False, via_service=False):
calls.append((detached, via_service))
return True
runner.request_restart = fake_request_restart
runner._trigger_stale_code_restart()
runner._trigger_stale_code_restart()
runner._trigger_stale_code_restart()
assert len(calls) == 1
assert runner._stale_code_restart_triggered is True
def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
"""If request_restart raises, we swallow and mark as triggered anyway."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
def boom(*, detached=False, via_service=False):
raise RuntimeError("no event loop")
runner.request_restart = boom
# Should not raise
runner._trigger_stale_code_restart()
# Marked triggered so we don't retry on every subsequent message
assert runner._stale_code_restart_triggered is True
# ---------------------------------------------------------------------------
# Class-level defaults — tests that build bare runners via object.__new__
# ---------------------------------------------------------------------------
def test_class_level_defaults_prevent_uninitialized_access():
"""Partial construction via object.__new__ must not crash _detect_stale_code."""
runner = object.__new__(GatewayRunner)
# Don't set any instance attrs — class-level defaults should kick in
runner._repo_root_for_staleness = Path(".")
# _boot_wall_time / _boot_git_sha fall through to class defaults
# (0.0 and None respectively)
assert runner._detect_stale_code() is False
# _stale_code_restart_triggered falls through to class default (False)
assert runner._stale_code_restart_triggered is False
# ---------------------------------------------------------------------------
# Legacy mtime reader kept for compatibility — light sanity check only
# ---------------------------------------------------------------------------
def test_compute_repo_mtime_still_returns_newest(tmp_path):
"""_compute_repo_mtime remains available for any legacy callers."""
repo = _make_tmp_repo(tmp_path)
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
newer = time.time()
os.utime(repo / "hermes_cli/config.py", (newer, newer))
result = _compute_repo_mtime(repo)
assert abs(result - newer) < 1.0
def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
"""Legacy sanity: missing sentinels → 0.0."""
assert _compute_repo_mtime(tmp_path) == 0.0