feat(checkpoints): auto-prune orphan and stale shadow repos at startup (#16303)

Every working dir hermes ever touches gets its own shadow git repo under ~/.hermes/checkpoints/{sha256(abs_dir)[:16]}/. The per-repo _prune is a no-op (comment in CheckpointManager._prune says so), so abandoned repos from deleted/moved projects or one-off tmp dirs pile up forever. Field reports put the typical offender at 1000+ repos / ~12 GB on active contributor machines. Adds an opt-in startup sweep that mirrors the sessions.auto_prune pattern from #13861 / #16286: - tools/checkpoint_manager.py: new prune_checkpoints() and maybe_auto_prune_checkpoints() helpers. Deletes shadow repos that are orphan (HERMES_WORKDIR marker points to a path that no longer exists) or stale (newest in-repo mtime older than retention_days). Idempotent via a CHECKPOINT_BASE/.last_prune marker file so it only runs once per min_interval_hours regardless of how many hermes processes start up. - hermes_cli/config.py: new checkpoints.auto_prune / retention_days / delete_orphans / min_interval_hours knobs. Default auto_prune: false so users who rely on /rollback against long-ago sessions never lose data silently. - cli.py / gateway/run.py: startup hooks gated on checkpoints.auto_prune, called right next to the existing state.db maintenance block. - Docs updated with the new config knobs. - 11 regression tests: orphan/stale deletion, precedence, byte-freed tracking, non-shadow dir skip, interval gating, corrupt marker recovery. Refs #3015 (session-file disk growth was fixed in #16286; this covers the checkpoint side noted out-of-scope there).
2026-04-28 06:51:16 +08:00 · 2026-04-26 19:05:52 -07:00
parent ced8f44cd2
commit 478444c262
6 changed files with 458 additions and 0 deletions
--- a/tools/checkpoint_manager.py
+++ b/tools/checkpoint_manager.py
@@ -651,3 +651,204 @@ def format_checkpoint_list(checkpoints: List[Dict], directory: str) -> str:
    lines.append("  /rollback diff <N>        preview changes since checkpoint N")
    lines.append("  /rollback <N> <file>      restore a single file from checkpoint N")
    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Auto-maintenance (issue #3015 follow-up)
+# ---------------------------------------------------------------------------
+#
+# Every working directory the agent has ever touched gets its own shadow
+# repo under CHECKPOINT_BASE.  Per-repo ``_prune`` is a no-op (see comment
+# in CheckpointManager._prune), so abandoned repos (deleted projects,
+# one-off tmp dirs, long-stale work trees) accumulate forever.  Field
+# reports put the typical offender at 1000+ repos / ~12 GB on active
+# contributor machines.
+#
+# ``prune_checkpoints`` sweeps CHECKPOINT_BASE at startup, deleting shadow
+# repos that match either criterion:
+#   * orphan:  the ``HERMES_WORKDIR`` path no longer exists on disk
+#   * stale:   the repo's newest mtime is older than ``retention_days``
+#
+# ``maybe_auto_prune_checkpoints`` wraps it with an idempotency marker
+# (``CHECKPOINT_BASE/.last_prune``) so calling it on every CLI/gateway
+# startup is free after the first run of the day.  Opt-in via
+# ``checkpoints.auto_prune`` in config.yaml — default off so users who
+# rely on ``/rollback`` against long-ago sessions never lose data
+# silently.
+
+_PRUNE_MARKER_NAME = ".last_prune"
+
+
+def _read_workdir_marker(shadow_repo: Path) -> Optional[str]:
+    """Read ``HERMES_WORKDIR`` from a shadow repo, or None if missing/unreadable."""
+    try:
+        return (shadow_repo / "HERMES_WORKDIR").read_text(encoding="utf-8").strip()
+    except (OSError, UnicodeDecodeError):
+        return None
+
+
+def _shadow_repo_newest_mtime(shadow_repo: Path) -> float:
+    """Return newest mtime across the shadow repo (walks objects/refs/HEAD).
+
+    We walk instead of trusting the directory mtime because git's pack
+    operations can leave the top-level dir untouched while refs/objects
+    inside get updated.  Best-effort — returns 0.0 on any error.
+    """
+    newest = 0.0
+    try:
+        for p in shadow_repo.rglob("*"):
+            try:
+                m = p.stat().st_mtime
+                if m > newest:
+                    newest = m
+            except OSError:
+                continue
+    except OSError:
+        pass
+    return newest
+
+
+def prune_checkpoints(
+    retention_days: int = 7,
+    delete_orphans: bool = True,
+    checkpoint_base: Optional[Path] = None,
+) -> Dict[str, int]:
+    """Delete stale/orphan shadow repos under ``checkpoint_base``.
+
+    A shadow repo is deleted when either:
+
+    * ``delete_orphans=True`` and its ``HERMES_WORKDIR`` path no longer
+      exists on disk (the original project was deleted / moved); OR
+    * its newest in-repo mtime is older than ``retention_days`` days.
+
+    Returns a dict with counts ``{"scanned", "deleted_orphan",
+    "deleted_stale", "errors", "bytes_freed"}``.
+
+    Never raises — maintenance must never block interactive startup.
+    """
+    base = checkpoint_base or CHECKPOINT_BASE
+    result = {
+        "scanned": 0,
+        "deleted_orphan": 0,
+        "deleted_stale": 0,
+        "errors": 0,
+        "bytes_freed": 0,
+    }
+    if not base.exists():
+        return result
+
+    cutoff = 0.0
+    if retention_days > 0:
+        import time as _time
+        cutoff = _time.time() - retention_days * 86400
+
+    for child in base.iterdir():
+        if not child.is_dir():
+            continue
+        # Protect the marker file and anything that isn't a real shadow
+        # repo (no HEAD = not initialised, leave alone).
+        if not (child / "HEAD").exists():
+            continue
+        result["scanned"] += 1
+
+        reason: Optional[str] = None
+        if delete_orphans:
+            workdir = _read_workdir_marker(child)
+            if workdir is None or not Path(workdir).exists():
+                reason = "orphan"
+
+        if reason is None and retention_days > 0:
+            newest = _shadow_repo_newest_mtime(child)
+            if newest > 0 and newest < cutoff:
+                reason = "stale"
+
+        if reason is None:
+            continue
+
+        # Measure size before delete (best-effort)
+        try:
+            size = sum(p.stat().st_size for p in child.rglob("*") if p.is_file())
+        except OSError:
+            size = 0
+        try:
+            shutil.rmtree(child)
+            result["bytes_freed"] += size
+            if reason == "orphan":
+                result["deleted_orphan"] += 1
+            else:
+                result["deleted_stale"] += 1
+            logger.debug("Pruned %s checkpoint repo: %s (%d bytes)", reason, child.name, size)
+        except OSError as exc:
+            result["errors"] += 1
+            logger.warning("Failed to prune checkpoint repo %s: %s", child.name, exc)
+
+    return result
+
+
+def maybe_auto_prune_checkpoints(
+    retention_days: int = 7,
+    min_interval_hours: int = 24,
+    delete_orphans: bool = True,
+    checkpoint_base: Optional[Path] = None,
+) -> Dict[str, object]:
+    """Idempotent wrapper around ``prune_checkpoints`` for startup hooks.
+
+    Writes ``CHECKPOINT_BASE/.last_prune`` on completion so subsequent
+    calls within ``min_interval_hours`` short-circuit.  Designed to be
+    called once per CLI/gateway process startup; the marker keeps costs
+    bounded regardless of how many times hermes is invoked per day.
+
+    Returns ``{"skipped": bool, "result": prune_checkpoints-dict,
+    "error": optional str}``.
+    """
+    import time as _time
+    base = checkpoint_base or CHECKPOINT_BASE
+    out: Dict[str, object] = {"skipped": False}
+
+    try:
+        if not base.exists():
+            out["result"] = {
+                "scanned": 0, "deleted_orphan": 0, "deleted_stale": 0,
+                "errors": 0, "bytes_freed": 0,
+            }
+            return out
+
+        marker = base / _PRUNE_MARKER_NAME
+        now = _time.time()
+        if marker.exists():
+            try:
+                last_ts = float(marker.read_text(encoding="utf-8").strip())
+                if now - last_ts < min_interval_hours * 3600:
+                    out["skipped"] = True
+                    return out
+            except (OSError, ValueError):
+                pass  # corrupt marker — treat as no prior run
+
+        result = prune_checkpoints(
+            retention_days=retention_days,
+            delete_orphans=delete_orphans,
+            checkpoint_base=base,
+        )
+        out["result"] = result
+
+        try:
+            marker.write_text(str(now), encoding="utf-8")
+        except OSError as exc:
+            logger.debug("Could not write checkpoint prune marker: %s", exc)
+
+        total = result["deleted_orphan"] + result["deleted_stale"]
+        if total > 0:
+            logger.info(
+                "checkpoint auto-maintenance: pruned %d repo(s) "
+                "(%d orphan, %d stale), reclaimed %.1f MB",
+                total,
+                result["deleted_orphan"],
+                result["deleted_stale"],
+                result["bytes_freed"] / (1024 * 1024),
+            )
+    except Exception as exc:
+        logger.warning("checkpoint auto-maintenance failed: %s", exc)
+        out["error"] = str(exc)
+
+    return out
+