Compare commits

...

2 Commits

Author SHA1 Message Date
Brooklyn Nicholson
350f4a6448 fix(update): use subprocess hand-off instead of os.execve (cross-platform)
The exec-based re-exec was sketchy (replaces the running interpreter
mid-update on the common CLI path that already works) and, worse, had to
disable itself on Windows because os.exec* there spawns a new PID and
breaks the desktop installer's exit-code wait — so it didn't fix the
"do it again" trap for Windows users at all.

Replace it with a plain subprocess hand-off: after a successful pull + dep
install, spawn `hermes update` again with HERMES_UPDATE_FINALIZE=1, inherit
stdio, and forward the child's exit code via sys.exit(). The parent PID
stays intact everywhere, so the hand-off now stays on for Windows too. This
is the same `[sys.executable, "-m", "hermes_cli.main", ...]` subprocess
pattern the updater already uses for the desktop build.

Renames: _should_reexec_after_pull -> _should_handoff_after_pull (drops the
Windows gate), _reexec_into_updated_code -> _handoff_update_to_refreshed_code
(returns child rc / None), HERMES_UPDATE_NO_REEXEC -> HERMES_UPDATE_NO_HANDOFF.
On spawn failure we still fall through and finish in-process.
2026-06-05 06:11:17 -05:00
Brooklyn Nicholson
a6c60a2902 fix(update): re-exec into pulled code so update finishes in one run
`hermes update` runs from the *old* install, but its post-pull steps
(dependency install, config migration, skills sync, gateway restart) ran
from the modules this process imported at startup — even though the new
source was already on disk after the git pull. Any bug fixed in the pulled
version that lived in an already-imported post-pull path still crashed the
first run, so users had to run `hermes update` a second time (the retry
loaded the fresh code and succeeded). The desktop installer drives the same
old `hermes update`, so it hit the same "do it again" trap.

After a successful pull AND dependency install (so new source and its deps
are present), re-exec into the refreshed code via POSIX `execve`, which
keeps the same PID/fds so the installer's stdout/stderr streaming and
exit-code wait are unaffected. The re-exec'd run sets
`HERMES_UPDATE_FINALIZE=1`, skips the fetch/pull/snapshot/backup work and
the re-exec itself, and runs only the post-pull finalize steps with new
code.

Gated off on the finalize pass (loop-breaker), on Windows (`os.exec*`
spawns a new PID and would break the installer's exit-code wait), under
pytest (never replace the test interpreter mid-suite), and via the
`HERMES_UPDATE_NO_REEXEC` escape hatch. On any `execve` failure we fall
through and finish in-process — the historical behavior.
2026-06-05 06:07:05 -05:00
2 changed files with 399 additions and 20 deletions

View File

@@ -10131,6 +10131,62 @@ def _cmd_update_pip(args):
print("✓ Update complete! Restart hermes to use the new version.")
def _should_handoff_after_pull(finalize_only: bool) -> bool:
"""Whether to hand the post-pull steps off to a fresh subprocess on new
code.
Returns False (finish in-process) when:
- this IS already the finalize subprocess (avoid an infinite loop),
- under pytest, so the test runner's interpreter never spawns a real
recursive update mid-suite,
- the ``HERMES_UPDATE_NO_HANDOFF`` escape hatch is set.
Cross-platform: unlike an ``os.exec*`` replacement (which on Windows
spawns a *new* PID and breaks the desktop installer's exit-code wait on
the original process), a child subprocess + exit-code forwarding keeps the
parent PID intact everywhere, so this stays on for Windows too.
"""
if finalize_only or os.environ.get("HERMES_UPDATE_FINALIZE") == "1":
return False
if os.environ.get("HERMES_UPDATE_NO_HANDOFF") == "1":
return False
if "pytest" in sys.modules:
return False
return True
def _handoff_update_to_refreshed_code():
"""Finish the update in a fresh subprocess running the just-pulled code.
Called after a successful git pull + dependency install (so the new source
AND its deps are on disk). Spawns ``hermes update`` again with
``HERMES_UPDATE_FINALIZE=1`` set; that child skips the fetch/pull/snapshot
work and this hand-off, and runs only the post-pull finalize steps with
new code. stdin/stdout/stderr are inherited, so interactive prompts and a
parent streaming our output both keep working.
Returns the child's exit code, or ``None`` if the subprocess could not be
launched at all — in which case the caller finishes the update in-process,
the historical behavior.
"""
try:
env = dict(os.environ)
env["HERMES_UPDATE_FINALIZE"] = "1"
cmd = [sys.executable, "-m", "hermes_cli.main", *sys.argv[1:]]
print("→ Finishing update with refreshed code...")
sys.stdout.flush()
sys.stderr.flush()
result = subprocess.run(cmd, cwd=PROJECT_ROOT, env=env)
return result.returncode
except Exception as exc: # pragma: no cover - spawn almost never fails
logger.warning(
"update: could not hand off to refreshed code (%s); "
"finishing in-process",
exc,
)
return None
def _cmd_update_impl(args, gateway_mode: bool):
"""Body of ``cmd_update`` — kept separate so the wrapper can always
restore stdio even on ``sys.exit``."""
@@ -10142,7 +10198,18 @@ def _cmd_update_impl(args, gateway_mode: bool):
)
assume_yes = bool(getattr(args, "yes", False))
print("⚕ Updating Hermes Agent...")
# Self-update hand-off marker. ``hermes update`` runs from the *old*
# install, so its post-pull steps (dep install, config migration, gateway
# restart) would otherwise execute stale in-memory code even though the new
# source is already on disk. After a successful pull we re-exec into the
# refreshed code with HERMES_UPDATE_FINALIZE=1; ``finalize_only`` is True on
# that second pass and makes us skip the fetch/pull work and the re-exec.
finalize_only = os.environ.get("HERMES_UPDATE_FINALIZE") == "1"
if finalize_only:
print("⚕ Finalizing update with refreshed code...")
else:
print("⚕ Updating Hermes Agent...")
print()
# On Windows, abort early if another hermes.exe is holding the venv shim
@@ -10158,8 +10225,10 @@ def _cmd_update_impl(args, gateway_mode: bool):
sys.exit(2)
# Pre-update backup — runs before any git/file mutation so users can
# always roll back to the exact state they had before this update.
_run_pre_update_backup(args)
# always roll back to the exact state they had before this update. Skipped
# on the finalize re-exec (the original pass already took it).
if not finalize_only:
_run_pre_update_backup(args)
# Try git-based update first, fall back to ZIP download on Windows
# when git file I/O is broken (antivirus, NTFS filter drivers, etc.)
@@ -10363,7 +10432,11 @@ def _cmd_update_impl(args, gateway_mode: bool):
)
commit_count = int(result.stdout.strip())
if commit_count == 0:
# On the finalize re-exec the pull already happened in the original
# pass, so origin is level with HEAD (count == 0). Don't take the
# "Already up to date" early return — fall through and run the
# post-pull steps (now with refreshed code).
if commit_count == 0 and not finalize_only:
_invalidate_update_cache()
# Even if origin is up to date, the fork may be behind upstream
@@ -10390,24 +10463,30 @@ def _cmd_update_impl(args, gateway_mode: bool):
print("✓ Already up to date!")
return
print(f"→ Found {commit_count} new commit(s)")
# Snapshot critical state (state.db, config, pairing JSONs, etc.)
# before pulling so a user can recover if something goes wrong.
# Issue #15733 reported missing pairing data after an update; even
# though `git pull` can't touch $HERMES_HOME, this is cheap
# belt-and-suspenders insurance and gives the user something to
# restore from via `/snapshot list` / `/snapshot restore <id>`.
# The "found N commits" notice and pre-update snapshot belong to the
# original pass only — the finalize re-exec sees count == 0 and the
# snapshot was already taken before the pull.
pre_update_snapshot_id = None
try:
from hermes_cli.backup import create_quick_snapshot
if not finalize_only:
print(f"→ Found {commit_count} new commit(s)")
pre_update_snapshot_id = create_quick_snapshot(label="pre-update", keep=1)
if pre_update_snapshot_id:
print(f" ✓ Pre-update snapshot: {pre_update_snapshot_id}")
except Exception as exc:
# Never let a snapshot failure block an update.
logger.debug("Pre-update snapshot failed: %s", exc)
# Snapshot critical state (state.db, config, pairing JSONs, etc.)
# before pulling so a user can recover if something goes wrong.
# Issue #15733 reported missing pairing data after an update; even
# though `git pull` can't touch $HERMES_HOME, this is cheap
# belt-and-suspenders insurance and gives the user something to
# restore from via `/snapshot list` / `/snapshot restore <id>`.
try:
from hermes_cli.backup import create_quick_snapshot
pre_update_snapshot_id = create_quick_snapshot(
label="pre-update", keep=1
)
if pre_update_snapshot_id:
print(f" ✓ Pre-update snapshot: {pre_update_snapshot_id}")
except Exception as exc:
# Never let a snapshot failure block an update.
logger.debug("Pre-update snapshot failed: %s", exc)
print("→ Pulling updates...")
update_succeeded = False
@@ -10579,6 +10658,26 @@ def _cmd_update_impl(args, gateway_mode: bool):
_refresh_active_lazy_features()
# Hand off the remaining post-pull work (node deps, web/desktop build,
# config migration, skills sync, gateway restart) to the freshly-pulled
# code. These are the most frequently-changed and historically most
# fragile update steps, yet they used to run from the modules this
# process imported at startup — so a bug fixed in the pulled version
# still crashed here, forcing users to run ``hermes update`` twice. The
# git pull AND dependency install are done, so the new source and its
# deps are on disk; finish in a fresh subprocess running new code and
# forward its exit code. On the finalize pass (or under pytest /
# opt-out) we skip the hand-off and finish in-process — see
# _should_handoff_after_pull.
if _should_handoff_after_pull(finalize_only):
handoff_rc = _handoff_update_to_refreshed_code()
if handoff_rc is not None:
# The child ran every remaining post-pull step on new code;
# forward its result and stop (cmd_update's finally still
# restores stdio on the way out).
sys.exit(handoff_rc)
# else: spawning the child failed — fall through and finish here.
_update_node_dependencies()
_build_web_ui(PROJECT_ROOT / "web")

View File

@@ -0,0 +1,280 @@
"""Tests for the post-pull subprocess hand-off in ``hermes update``.
``hermes update`` runs from the *old* install. Before this hand-off, the
post-pull steps (dep install, config migration, gateway restart) executed
stale in-memory code even though the new source was already on disk, so a bug
fixed in the pulled version still crashed the first run — users had to run
``hermes update`` a second time. After a successful pull + dep install we now
finish in a fresh subprocess running the refreshed code and forward its exit
code. Unlike an ``os.exec*`` replacement, a child subprocess keeps the parent
PID intact, so this works on Windows too.
These tests cover the gate (when we hand off vs. finish in-process), the
subprocess mechanics, and the finalize pass the child process takes.
"""
import sys
from types import SimpleNamespace
import pytest
from hermes_cli import config as hermes_config
from hermes_cli import main as hermes_main
# ---------------------------------------------------------------------------
# Managed-uv compatibility: make managed_uv helpers follow shutil.which mocking
# (mirrors the autouse fixture in test_update_autostash.py).
# ---------------------------------------------------------------------------
@pytest.fixture(autouse=True)
def _patch_managed_uv():
import shutil
from unittest.mock import patch
with patch("hermes_cli.managed_uv.resolve_uv", side_effect=lambda: shutil.which("uv")), \
patch("hermes_cli.managed_uv.ensure_uv", side_effect=lambda: shutil.which("uv")), \
patch("hermes_cli.managed_uv.update_managed_uv", side_effect=lambda: None):
yield
def _clear_handoff_env(monkeypatch):
monkeypatch.delenv("HERMES_UPDATE_FINALIZE", raising=False)
monkeypatch.delenv("HERMES_UPDATE_NO_HANDOFF", raising=False)
# ---------------------------------------------------------------------------
# _should_handoff_after_pull — the gate
# ---------------------------------------------------------------------------
def test_should_handoff_false_in_finalize_mode(monkeypatch):
_clear_handoff_env(monkeypatch)
assert hermes_main._should_handoff_after_pull(finalize_only=True) is False
def test_should_handoff_false_when_finalize_env_set(monkeypatch):
_clear_handoff_env(monkeypatch)
monkeypatch.setenv("HERMES_UPDATE_FINALIZE", "1")
assert hermes_main._should_handoff_after_pull(finalize_only=False) is False
def test_should_handoff_false_with_opt_out_env(monkeypatch):
_clear_handoff_env(monkeypatch)
monkeypatch.delitem(sys.modules, "pytest", raising=False)
monkeypatch.setenv("HERMES_UPDATE_NO_HANDOFF", "1")
assert hermes_main._should_handoff_after_pull(finalize_only=False) is False
def test_should_handoff_false_under_pytest(monkeypatch):
# Safety invariant: never spawn a real recursive update while the test
# suite is running. ``pytest`` is in sys.modules during the suite, so the
# gate must stay closed even with a clean env.
_clear_handoff_env(monkeypatch)
assert "pytest" in sys.modules
assert hermes_main._should_handoff_after_pull(finalize_only=False) is False
def test_should_handoff_true_when_allowed(monkeypatch):
_clear_handoff_env(monkeypatch)
monkeypatch.delitem(sys.modules, "pytest", raising=False)
assert hermes_main._should_handoff_after_pull(finalize_only=False) is True
def test_should_handoff_stays_on_for_windows(monkeypatch):
# The whole reason we use a subprocess instead of os.exec*: it keeps the
# parent PID intact, so the hand-off works on Windows too (the previous
# exec-based attempt had to disable itself there).
_clear_handoff_env(monkeypatch)
monkeypatch.setattr(sys, "platform", "win32")
monkeypatch.delitem(sys.modules, "pytest", raising=False)
assert hermes_main._should_handoff_after_pull(finalize_only=False) is True
# ---------------------------------------------------------------------------
# _handoff_update_to_refreshed_code — the subprocess mechanics
# ---------------------------------------------------------------------------
def test_handoff_runs_subprocess_with_finalize_env(monkeypatch):
captured = {}
def fake_run(cmd, **kwargs):
captured["cmd"] = list(cmd)
captured["env"] = dict(kwargs.get("env") or {})
return SimpleNamespace(returncode=0)
monkeypatch.setattr(hermes_main.subprocess, "run", fake_run)
monkeypatch.setattr(hermes_main.sys, "argv", ["hermes", "update", "--yes", "--gateway"])
rc = hermes_main._handoff_update_to_refreshed_code()
assert rc == 0
assert captured["cmd"][0] == sys.executable
assert captured["cmd"][1:3] == ["-m", "hermes_cli.main"]
# The original CLI args (minus argv[0]) are carried through verbatim.
assert captured["cmd"][3:] == ["update", "--yes", "--gateway"]
# The loop-breaker guard the child reads.
assert captured["env"]["HERMES_UPDATE_FINALIZE"] == "1"
def test_handoff_forwards_nonzero_exit_code(monkeypatch):
monkeypatch.setattr(
hermes_main.subprocess, "run", lambda *a, **k: SimpleNamespace(returncode=42)
)
monkeypatch.setattr(hermes_main.sys, "argv", ["hermes", "update"])
assert hermes_main._handoff_update_to_refreshed_code() == 42
def test_handoff_returns_none_when_spawn_fails(monkeypatch):
def boom(*_a, **_kw):
raise OSError("could not spawn")
monkeypatch.setattr(hermes_main.subprocess, "run", boom)
monkeypatch.setattr(hermes_main.sys, "argv", ["hermes", "update"])
# None signals the caller to finish in-process instead of bailing out.
assert hermes_main._handoff_update_to_refreshed_code() is None
# ---------------------------------------------------------------------------
# cmd_update integration — original pass hands off, finalize pass finishes
# ---------------------------------------------------------------------------
def _setup_update_mocks(monkeypatch, tmp_path):
(tmp_path / ".git").mkdir()
monkeypatch.setattr(hermes_main, "PROJECT_ROOT", tmp_path)
monkeypatch.setattr(hermes_main, "_stash_local_changes_if_needed", lambda *a, **kw: None)
monkeypatch.setattr(hermes_main, "_restore_stashed_changes", lambda *a, **kw: True)
monkeypatch.setattr(hermes_config, "get_missing_env_vars", lambda required_only=True: [])
monkeypatch.setattr(hermes_config, "get_missing_config_fields", lambda: [])
monkeypatch.setattr(hermes_config, "check_config_version", lambda: (5, 5))
monkeypatch.setattr(hermes_config, "migrate_config", lambda **kw: {"env_added": [], "config_added": []})
monkeypatch.setattr(hermes_main, "_refresh_active_lazy_features", lambda: None)
def _fake_git_run(commit_count):
recorded = []
def side_effect(cmd, **kwargs):
recorded.append(cmd)
joined = " ".join(str(c) for c in cmd)
if "fetch" in joined and "origin" in joined:
return SimpleNamespace(stdout="", stderr="", returncode=0)
if "rev-parse" in joined and "--abbrev-ref" in joined:
return SimpleNamespace(stdout="main\n", stderr="", returncode=0)
if "rev-list" in joined:
return SimpleNamespace(stdout=f"{commit_count}\n", stderr="", returncode=0)
if "--ff-only" in joined:
return SimpleNamespace(stdout="Already up to date.\n", stderr="", returncode=0)
return SimpleNamespace(returncode=0, stdout="", stderr="")
return side_effect, recorded
def test_original_pass_hands_off_and_forwards_exit_code(monkeypatch, tmp_path):
_clear_handoff_env(monkeypatch)
_setup_update_mocks(monkeypatch, tmp_path)
monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
# Force the gate open (pytest normally closes it) and stub the child run.
monkeypatch.setattr(hermes_main, "_should_handoff_after_pull", lambda finalize_only: True)
monkeypatch.setattr(hermes_main, "_handoff_update_to_refreshed_code", lambda: 0)
node_called = []
monkeypatch.setattr(
hermes_main, "_update_node_dependencies", lambda *a, **k: node_called.append(True)
)
side_effect, _recorded = _fake_git_run(commit_count="3")
monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
with pytest.raises(SystemExit) as exc:
hermes_main.cmd_update(SimpleNamespace())
assert exc.value.code == 0
# After a successful hand-off the parent must NOT run the remaining
# post-pull steps — the child already did them on new code.
assert node_called == [], "parent ran post-handoff steps after a successful hand-off"
def test_handoff_failure_falls_back_in_process(monkeypatch, tmp_path):
_clear_handoff_env(monkeypatch)
_setup_update_mocks(monkeypatch, tmp_path)
monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
monkeypatch.setattr(hermes_main, "_should_handoff_after_pull", lambda finalize_only: True)
# Child couldn't be spawned -> None -> parent finishes in-process.
monkeypatch.setattr(hermes_main, "_handoff_update_to_refreshed_code", lambda: None)
node_called = []
monkeypatch.setattr(
hermes_main, "_update_node_dependencies", lambda *a, **k: node_called.append(True)
)
monkeypatch.setattr(hermes_main, "_build_web_ui", lambda *a, **k: None)
side_effect, _recorded = _fake_git_run(commit_count="3")
monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
hermes_main.cmd_update(SimpleNamespace())
assert node_called == [True], "fallback should finish the post-pull steps in-process"
def test_finalize_pass_does_not_hand_off(monkeypatch, tmp_path):
# In finalize mode the gate is closed, so we must never spawn another
# update. Make the hand-off explode so the test fails loudly if reached.
_clear_handoff_env(monkeypatch)
monkeypatch.setenv("HERMES_UPDATE_FINALIZE", "1")
_setup_update_mocks(monkeypatch, tmp_path)
monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
def no_handoff(): # pragma: no cover - asserts it's not called
raise AssertionError("finalize pass must not hand off")
monkeypatch.setattr(hermes_main, "_handoff_update_to_refreshed_code", no_handoff)
side_effect, recorded = _fake_git_run(commit_count="0")
monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
hermes_main.cmd_update(SimpleNamespace())
# The whole point of the finalize pass: even with zero new commits (the
# pull already happened in the original pass) it does NOT take the "Already
# up to date" early return — it runs the post-pull dependency install.
install_cmds = [c for c in recorded if "pip" in c and "install" in c]
assert install_cmds, "finalize pass should run the dependency install, not early-return"
def test_finalize_pass_skips_pre_update_backup(monkeypatch, tmp_path):
_clear_handoff_env(monkeypatch)
monkeypatch.setenv("HERMES_UPDATE_FINALIZE", "1")
_setup_update_mocks(monkeypatch, tmp_path)
monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
backup_calls = []
monkeypatch.setattr(hermes_main, "_run_pre_update_backup", lambda args: backup_calls.append(args))
side_effect, _recorded = _fake_git_run(commit_count="0")
monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
hermes_main.cmd_update(SimpleNamespace())
assert backup_calls == [], "finalize pass must not retake the pre-update backup"
def test_original_pass_still_runs_pre_update_backup(monkeypatch, tmp_path):
# Sanity counter-check: a normal (non-finalize) run still takes the backup.
# Under pytest the gate is closed, so the run finishes in-process exactly
# as it always did.
_clear_handoff_env(monkeypatch)
_setup_update_mocks(monkeypatch, tmp_path)
monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
backup_calls = []
monkeypatch.setattr(hermes_main, "_run_pre_update_backup", lambda args: backup_calls.append(args))
side_effect, _recorded = _fake_git_run(commit_count="3")
monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
hermes_main.cmd_update(SimpleNamespace())
assert len(backup_calls) == 1