fix(update): use subprocess hand-off instead of os.execve (cross-platform)

The exec-based re-exec was sketchy (replaces the running interpreter mid-update on the common CLI path that already works) and, worse, had to disable itself on Windows because os.exec* there spawns a new PID and breaks the desktop installer's exit-code wait — so it didn't fix the "do it again" trap for Windows users at all. Replace it with a plain subprocess hand-off: after a successful pull + dep install, spawn `hermes update` again with HERMES_UPDATE_FINALIZE=1, inherit stdio, and forward the child's exit code via sys.exit(). The parent PID stays intact everywhere, so the hand-off now stays on for Windows too. This is the same `[sys.executable, "-m", "hermes_cli.main", ...]` subprocess pattern the updater already uses for the desktop build. Renames: _should_reexec_after_pull -> _should_handoff_after_pull (drops the Windows gate), _reexec_into_updated_code -> _handoff_update_to_refreshed_code (returns child rc / None), HERMES_UPDATE_NO_REEXEC -> HERMES_UPDATE_NO_HANDOFF. On spawn failure we still fall through and finish in-process.
fix(update): re-exec into pulled code so update finishes in one run
2026-06-10 12:18:44 +08:00 · 2026-06-05 06:11:17 -05:00 · 2026-06-05 06:07:05 -05:00
2 changed files with 399 additions and 20 deletions
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -10131,6 +10131,62 @@ def _cmd_update_pip(args):
    print("✓ Update complete! Restart hermes to use the new version.")


+def _should_handoff_after_pull(finalize_only: bool) -> bool:
+    """Whether to hand the post-pull steps off to a fresh subprocess on new
+    code.
+
+    Returns False (finish in-process) when:
+      - this IS already the finalize subprocess (avoid an infinite loop),
+      - under pytest, so the test runner's interpreter never spawns a real
+        recursive update mid-suite,
+      - the ``HERMES_UPDATE_NO_HANDOFF`` escape hatch is set.
+
+    Cross-platform: unlike an ``os.exec*`` replacement (which on Windows
+    spawns a *new* PID and breaks the desktop installer's exit-code wait on
+    the original process), a child subprocess + exit-code forwarding keeps the
+    parent PID intact everywhere, so this stays on for Windows too.
+    """
+    if finalize_only or os.environ.get("HERMES_UPDATE_FINALIZE") == "1":
+        return False
+    if os.environ.get("HERMES_UPDATE_NO_HANDOFF") == "1":
+        return False
+    if "pytest" in sys.modules:
+        return False
+    return True
+
+
+def _handoff_update_to_refreshed_code():
+    """Finish the update in a fresh subprocess running the just-pulled code.
+
+    Called after a successful git pull + dependency install (so the new source
+    AND its deps are on disk). Spawns ``hermes update`` again with
+    ``HERMES_UPDATE_FINALIZE=1`` set; that child skips the fetch/pull/snapshot
+    work and this hand-off, and runs only the post-pull finalize steps with
+    new code. stdin/stdout/stderr are inherited, so interactive prompts and a
+    parent streaming our output both keep working.
+
+    Returns the child's exit code, or ``None`` if the subprocess could not be
+    launched at all — in which case the caller finishes the update in-process,
+    the historical behavior.
+    """
+    try:
+        env = dict(os.environ)
+        env["HERMES_UPDATE_FINALIZE"] = "1"
+        cmd = [sys.executable, "-m", "hermes_cli.main", *sys.argv[1:]]
+        print("→ Finishing update with refreshed code...")
+        sys.stdout.flush()
+        sys.stderr.flush()
+        result = subprocess.run(cmd, cwd=PROJECT_ROOT, env=env)
+        return result.returncode
+    except Exception as exc:  # pragma: no cover - spawn almost never fails
+        logger.warning(
+            "update: could not hand off to refreshed code (%s); "
+            "finishing in-process",
+            exc,
+        )
+        return None
+
+
 def _cmd_update_impl(args, gateway_mode: bool):
    """Body of ``cmd_update`` — kept separate so the wrapper can always
    restore stdio even on ``sys.exit``."""
@@ -10142,7 +10198,18 @@ def _cmd_update_impl(args, gateway_mode: bool):
    )
    assume_yes = bool(getattr(args, "yes", False))

-    print("⚕ Updating Hermes Agent...")
+    # Self-update hand-off marker. ``hermes update`` runs from the *old*
+    # install, so its post-pull steps (dep install, config migration, gateway
+    # restart) would otherwise execute stale in-memory code even though the new
+    # source is already on disk. After a successful pull we re-exec into the
+    # refreshed code with HERMES_UPDATE_FINALIZE=1; ``finalize_only`` is True on
+    # that second pass and makes us skip the fetch/pull work and the re-exec.
+    finalize_only = os.environ.get("HERMES_UPDATE_FINALIZE") == "1"
+
+    if finalize_only:
+        print("⚕ Finalizing update with refreshed code...")
+    else:
+        print("⚕ Updating Hermes Agent...")
    print()

    # On Windows, abort early if another hermes.exe is holding the venv shim
@@ -10158,8 +10225,10 @@ def _cmd_update_impl(args, gateway_mode: bool):
                sys.exit(2)

    # Pre-update backup — runs before any git/file mutation so users can
-    # always roll back to the exact state they had before this update.
-    _run_pre_update_backup(args)
+    # always roll back to the exact state they had before this update. Skipped
+    # on the finalize re-exec (the original pass already took it).
+    if not finalize_only:
+        _run_pre_update_backup(args)

    # Try git-based update first, fall back to ZIP download on Windows
    # when git file I/O is broken (antivirus, NTFS filter drivers, etc.)
@@ -10363,7 +10432,11 @@ def _cmd_update_impl(args, gateway_mode: bool):
        )
        commit_count = int(result.stdout.strip())

-        if commit_count == 0:
+        # On the finalize re-exec the pull already happened in the original
+        # pass, so origin is level with HEAD (count == 0). Don't take the
+        # "Already up to date" early return — fall through and run the
+        # post-pull steps (now with refreshed code).
+        if commit_count == 0 and not finalize_only:
            _invalidate_update_cache()

            # Even if origin is up to date, the fork may be behind upstream
@@ -10390,24 +10463,30 @@ def _cmd_update_impl(args, gateway_mode: bool):
            print("✓ Already up to date!")
            return

-        print(f"→ Found {commit_count} new commit(s)")
-
-        # Snapshot critical state (state.db, config, pairing JSONs, etc.)
-        # before pulling so a user can recover if something goes wrong.
-        # Issue #15733 reported missing pairing data after an update; even
-        # though `git pull` can't touch $HERMES_HOME, this is cheap
-        # belt-and-suspenders insurance and gives the user something to
-        # restore from via `/snapshot list` / `/snapshot restore <id>`.
+        # The "found N commits" notice and pre-update snapshot belong to the
+        # original pass only — the finalize re-exec sees count == 0 and the
+        # snapshot was already taken before the pull.
        pre_update_snapshot_id = None
-        try:
-            from hermes_cli.backup import create_quick_snapshot
+        if not finalize_only:
+            print(f"→ Found {commit_count} new commit(s)")

-            pre_update_snapshot_id = create_quick_snapshot(label="pre-update", keep=1)
-            if pre_update_snapshot_id:
-                print(f"  ✓ Pre-update snapshot: {pre_update_snapshot_id}")
-        except Exception as exc:
-            # Never let a snapshot failure block an update.
-            logger.debug("Pre-update snapshot failed: %s", exc)
+            # Snapshot critical state (state.db, config, pairing JSONs, etc.)
+            # before pulling so a user can recover if something goes wrong.
+            # Issue #15733 reported missing pairing data after an update; even
+            # though `git pull` can't touch $HERMES_HOME, this is cheap
+            # belt-and-suspenders insurance and gives the user something to
+            # restore from via `/snapshot list` / `/snapshot restore <id>`.
+            try:
+                from hermes_cli.backup import create_quick_snapshot
+
+                pre_update_snapshot_id = create_quick_snapshot(
+                    label="pre-update", keep=1
+                )
+                if pre_update_snapshot_id:
+                    print(f"  ✓ Pre-update snapshot: {pre_update_snapshot_id}")
+            except Exception as exc:
+                # Never let a snapshot failure block an update.
+                logger.debug("Pre-update snapshot failed: %s", exc)

        print("→ Pulling updates...")
        update_succeeded = False
@@ -10579,6 +10658,26 @@ def _cmd_update_impl(args, gateway_mode: bool):

        _refresh_active_lazy_features()

+        # Hand off the remaining post-pull work (node deps, web/desktop build,
+        # config migration, skills sync, gateway restart) to the freshly-pulled
+        # code. These are the most frequently-changed and historically most
+        # fragile update steps, yet they used to run from the modules this
+        # process imported at startup — so a bug fixed in the pulled version
+        # still crashed here, forcing users to run ``hermes update`` twice. The
+        # git pull AND dependency install are done, so the new source and its
+        # deps are on disk; finish in a fresh subprocess running new code and
+        # forward its exit code. On the finalize pass (or under pytest /
+        # opt-out) we skip the hand-off and finish in-process — see
+        # _should_handoff_after_pull.
+        if _should_handoff_after_pull(finalize_only):
+            handoff_rc = _handoff_update_to_refreshed_code()
+            if handoff_rc is not None:
+                # The child ran every remaining post-pull step on new code;
+                # forward its result and stop (cmd_update's finally still
+                # restores stdio on the way out).
+                sys.exit(handoff_rc)
+            # else: spawning the child failed — fall through and finish here.
+
        _update_node_dependencies()
        _build_web_ui(PROJECT_ROOT / "web")

--- a/tests/hermes_cli/test_update_handoff.py
+++ b/tests/hermes_cli/test_update_handoff.py
@@ -0,0 +1,280 @@
+"""Tests for the post-pull subprocess hand-off in ``hermes update``.
+
+``hermes update`` runs from the *old* install. Before this hand-off, the
+post-pull steps (dep install, config migration, gateway restart) executed
+stale in-memory code even though the new source was already on disk, so a bug
+fixed in the pulled version still crashed the first run — users had to run
+``hermes update`` a second time. After a successful pull + dep install we now
+finish in a fresh subprocess running the refreshed code and forward its exit
+code. Unlike an ``os.exec*`` replacement, a child subprocess keeps the parent
+PID intact, so this works on Windows too.
+
+These tests cover the gate (when we hand off vs. finish in-process), the
+subprocess mechanics, and the finalize pass the child process takes.
+"""
+
+import sys
+from types import SimpleNamespace
+
+import pytest
+
+from hermes_cli import config as hermes_config
+from hermes_cli import main as hermes_main
+
+
+# ---------------------------------------------------------------------------
+# Managed-uv compatibility: make managed_uv helpers follow shutil.which mocking
+# (mirrors the autouse fixture in test_update_autostash.py).
+# ---------------------------------------------------------------------------
+@pytest.fixture(autouse=True)
+def _patch_managed_uv():
+    import shutil
+    from unittest.mock import patch
+
+    with patch("hermes_cli.managed_uv.resolve_uv", side_effect=lambda: shutil.which("uv")), \
+         patch("hermes_cli.managed_uv.ensure_uv", side_effect=lambda: shutil.which("uv")), \
+         patch("hermes_cli.managed_uv.update_managed_uv", side_effect=lambda: None):
+        yield
+
+
+def _clear_handoff_env(monkeypatch):
+    monkeypatch.delenv("HERMES_UPDATE_FINALIZE", raising=False)
+    monkeypatch.delenv("HERMES_UPDATE_NO_HANDOFF", raising=False)
+
+
+# ---------------------------------------------------------------------------
+# _should_handoff_after_pull — the gate
+# ---------------------------------------------------------------------------
+def test_should_handoff_false_in_finalize_mode(monkeypatch):
+    _clear_handoff_env(monkeypatch)
+    assert hermes_main._should_handoff_after_pull(finalize_only=True) is False
+
+
+def test_should_handoff_false_when_finalize_env_set(monkeypatch):
+    _clear_handoff_env(monkeypatch)
+    monkeypatch.setenv("HERMES_UPDATE_FINALIZE", "1")
+    assert hermes_main._should_handoff_after_pull(finalize_only=False) is False
+
+
+def test_should_handoff_false_with_opt_out_env(monkeypatch):
+    _clear_handoff_env(monkeypatch)
+    monkeypatch.delitem(sys.modules, "pytest", raising=False)
+    monkeypatch.setenv("HERMES_UPDATE_NO_HANDOFF", "1")
+    assert hermes_main._should_handoff_after_pull(finalize_only=False) is False
+
+
+def test_should_handoff_false_under_pytest(monkeypatch):
+    # Safety invariant: never spawn a real recursive update while the test
+    # suite is running. ``pytest`` is in sys.modules during the suite, so the
+    # gate must stay closed even with a clean env.
+    _clear_handoff_env(monkeypatch)
+    assert "pytest" in sys.modules
+    assert hermes_main._should_handoff_after_pull(finalize_only=False) is False
+
+
+def test_should_handoff_true_when_allowed(monkeypatch):
+    _clear_handoff_env(monkeypatch)
+    monkeypatch.delitem(sys.modules, "pytest", raising=False)
+    assert hermes_main._should_handoff_after_pull(finalize_only=False) is True
+
+
+def test_should_handoff_stays_on_for_windows(monkeypatch):
+    # The whole reason we use a subprocess instead of os.exec*: it keeps the
+    # parent PID intact, so the hand-off works on Windows too (the previous
+    # exec-based attempt had to disable itself there).
+    _clear_handoff_env(monkeypatch)
+    monkeypatch.setattr(sys, "platform", "win32")
+    monkeypatch.delitem(sys.modules, "pytest", raising=False)
+    assert hermes_main._should_handoff_after_pull(finalize_only=False) is True
+
+
+# ---------------------------------------------------------------------------
+# _handoff_update_to_refreshed_code — the subprocess mechanics
+# ---------------------------------------------------------------------------
+def test_handoff_runs_subprocess_with_finalize_env(monkeypatch):
+    captured = {}
+
+    def fake_run(cmd, **kwargs):
+        captured["cmd"] = list(cmd)
+        captured["env"] = dict(kwargs.get("env") or {})
+        return SimpleNamespace(returncode=0)
+
+    monkeypatch.setattr(hermes_main.subprocess, "run", fake_run)
+    monkeypatch.setattr(hermes_main.sys, "argv", ["hermes", "update", "--yes", "--gateway"])
+
+    rc = hermes_main._handoff_update_to_refreshed_code()
+
+    assert rc == 0
+    assert captured["cmd"][0] == sys.executable
+    assert captured["cmd"][1:3] == ["-m", "hermes_cli.main"]
+    # The original CLI args (minus argv[0]) are carried through verbatim.
+    assert captured["cmd"][3:] == ["update", "--yes", "--gateway"]
+    # The loop-breaker guard the child reads.
+    assert captured["env"]["HERMES_UPDATE_FINALIZE"] == "1"
+
+
+def test_handoff_forwards_nonzero_exit_code(monkeypatch):
+    monkeypatch.setattr(
+        hermes_main.subprocess, "run", lambda *a, **k: SimpleNamespace(returncode=42)
+    )
+    monkeypatch.setattr(hermes_main.sys, "argv", ["hermes", "update"])
+    assert hermes_main._handoff_update_to_refreshed_code() == 42
+
+
+def test_handoff_returns_none_when_spawn_fails(monkeypatch):
+    def boom(*_a, **_kw):
+        raise OSError("could not spawn")
+
+    monkeypatch.setattr(hermes_main.subprocess, "run", boom)
+    monkeypatch.setattr(hermes_main.sys, "argv", ["hermes", "update"])
+    # None signals the caller to finish in-process instead of bailing out.
+    assert hermes_main._handoff_update_to_refreshed_code() is None
+
+
+# ---------------------------------------------------------------------------
+# cmd_update integration — original pass hands off, finalize pass finishes
+# ---------------------------------------------------------------------------
+def _setup_update_mocks(monkeypatch, tmp_path):
+    (tmp_path / ".git").mkdir()
+    monkeypatch.setattr(hermes_main, "PROJECT_ROOT", tmp_path)
+    monkeypatch.setattr(hermes_main, "_stash_local_changes_if_needed", lambda *a, **kw: None)
+    monkeypatch.setattr(hermes_main, "_restore_stashed_changes", lambda *a, **kw: True)
+    monkeypatch.setattr(hermes_config, "get_missing_env_vars", lambda required_only=True: [])
+    monkeypatch.setattr(hermes_config, "get_missing_config_fields", lambda: [])
+    monkeypatch.setattr(hermes_config, "check_config_version", lambda: (5, 5))
+    monkeypatch.setattr(hermes_config, "migrate_config", lambda **kw: {"env_added": [], "config_added": []})
+    monkeypatch.setattr(hermes_main, "_refresh_active_lazy_features", lambda: None)
+
+
+def _fake_git_run(commit_count):
+    recorded = []
+
+    def side_effect(cmd, **kwargs):
+        recorded.append(cmd)
+        joined = " ".join(str(c) for c in cmd)
+        if "fetch" in joined and "origin" in joined:
+            return SimpleNamespace(stdout="", stderr="", returncode=0)
+        if "rev-parse" in joined and "--abbrev-ref" in joined:
+            return SimpleNamespace(stdout="main\n", stderr="", returncode=0)
+        if "rev-list" in joined:
+            return SimpleNamespace(stdout=f"{commit_count}\n", stderr="", returncode=0)
+        if "--ff-only" in joined:
+            return SimpleNamespace(stdout="Already up to date.\n", stderr="", returncode=0)
+        return SimpleNamespace(returncode=0, stdout="", stderr="")
+
+    return side_effect, recorded
+
+
+def test_original_pass_hands_off_and_forwards_exit_code(monkeypatch, tmp_path):
+    _clear_handoff_env(monkeypatch)
+    _setup_update_mocks(monkeypatch, tmp_path)
+    monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
+    monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
+
+    # Force the gate open (pytest normally closes it) and stub the child run.
+    monkeypatch.setattr(hermes_main, "_should_handoff_after_pull", lambda finalize_only: True)
+    monkeypatch.setattr(hermes_main, "_handoff_update_to_refreshed_code", lambda: 0)
+
+    node_called = []
+    monkeypatch.setattr(
+        hermes_main, "_update_node_dependencies", lambda *a, **k: node_called.append(True)
+    )
+
+    side_effect, _recorded = _fake_git_run(commit_count="3")
+    monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
+
+    with pytest.raises(SystemExit) as exc:
+        hermes_main.cmd_update(SimpleNamespace())
+
+    assert exc.value.code == 0
+    # After a successful hand-off the parent must NOT run the remaining
+    # post-pull steps — the child already did them on new code.
+    assert node_called == [], "parent ran post-handoff steps after a successful hand-off"
+
+
+def test_handoff_failure_falls_back_in_process(monkeypatch, tmp_path):
+    _clear_handoff_env(monkeypatch)
+    _setup_update_mocks(monkeypatch, tmp_path)
+    monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
+    monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
+
+    monkeypatch.setattr(hermes_main, "_should_handoff_after_pull", lambda finalize_only: True)
+    # Child couldn't be spawned -> None -> parent finishes in-process.
+    monkeypatch.setattr(hermes_main, "_handoff_update_to_refreshed_code", lambda: None)
+
+    node_called = []
+    monkeypatch.setattr(
+        hermes_main, "_update_node_dependencies", lambda *a, **k: node_called.append(True)
+    )
+    monkeypatch.setattr(hermes_main, "_build_web_ui", lambda *a, **k: None)
+
+    side_effect, _recorded = _fake_git_run(commit_count="3")
+    monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
+
+    hermes_main.cmd_update(SimpleNamespace())
+
+    assert node_called == [True], "fallback should finish the post-pull steps in-process"
+
+
+def test_finalize_pass_does_not_hand_off(monkeypatch, tmp_path):
+    # In finalize mode the gate is closed, so we must never spawn another
+    # update. Make the hand-off explode so the test fails loudly if reached.
+    _clear_handoff_env(monkeypatch)
+    monkeypatch.setenv("HERMES_UPDATE_FINALIZE", "1")
+    _setup_update_mocks(monkeypatch, tmp_path)
+    monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
+    monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
+
+    def no_handoff():  # pragma: no cover - asserts it's not called
+        raise AssertionError("finalize pass must not hand off")
+
+    monkeypatch.setattr(hermes_main, "_handoff_update_to_refreshed_code", no_handoff)
+
+    side_effect, recorded = _fake_git_run(commit_count="0")
+    monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
+
+    hermes_main.cmd_update(SimpleNamespace())
+
+    # The whole point of the finalize pass: even with zero new commits (the
+    # pull already happened in the original pass) it does NOT take the "Already
+    # up to date" early return — it runs the post-pull dependency install.
+    install_cmds = [c for c in recorded if "pip" in c and "install" in c]
+    assert install_cmds, "finalize pass should run the dependency install, not early-return"
+
+
+def test_finalize_pass_skips_pre_update_backup(monkeypatch, tmp_path):
+    _clear_handoff_env(monkeypatch)
+    monkeypatch.setenv("HERMES_UPDATE_FINALIZE", "1")
+    _setup_update_mocks(monkeypatch, tmp_path)
+    monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
+    monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
+
+    backup_calls = []
+    monkeypatch.setattr(hermes_main, "_run_pre_update_backup", lambda args: backup_calls.append(args))
+
+    side_effect, _recorded = _fake_git_run(commit_count="0")
+    monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
+
+    hermes_main.cmd_update(SimpleNamespace())
+
+    assert backup_calls == [], "finalize pass must not retake the pre-update backup"
+
+
+def test_original_pass_still_runs_pre_update_backup(monkeypatch, tmp_path):
+    # Sanity counter-check: a normal (non-finalize) run still takes the backup.
+    # Under pytest the gate is closed, so the run finishes in-process exactly
+    # as it always did.
+    _clear_handoff_env(monkeypatch)
+    _setup_update_mocks(monkeypatch, tmp_path)
+    monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
+    monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
+
+    backup_calls = []
+    monkeypatch.setattr(hermes_main, "_run_pre_update_backup", lambda args: backup_calls.append(args))
+
+    side_effect, _recorded = _fake_git_run(commit_count="3")
+    monkeypatch.setattr(hermes_main.subprocess, "run", side_effect)
+
+    hermes_main.cmd_update(SimpleNamespace())
+
+    assert len(backup_calls) == 1