feat(checkpoints): auto-prune orphan and stale shadow repos at startup (#16303)

Every working dir hermes ever touches gets its own shadow git repo under
~/.hermes/checkpoints/{sha256(abs_dir)[:16]}/.  The per-repo _prune is a
no-op (comment in CheckpointManager._prune says so), so abandoned repos
from deleted/moved projects or one-off tmp dirs pile up forever.  Field
reports put the typical offender at 1000+ repos / ~12 GB on active
contributor machines.

Adds an opt-in startup sweep that mirrors the sessions.auto_prune
pattern from #13861 / #16286:

- tools/checkpoint_manager.py: new prune_checkpoints() and
  maybe_auto_prune_checkpoints() helpers.  Deletes shadow repos that
  are orphan (HERMES_WORKDIR marker points to a path that no longer
  exists) or stale (newest in-repo mtime older than retention_days).
  Idempotent via a CHECKPOINT_BASE/.last_prune marker file so it only
  runs once per min_interval_hours regardless of how many hermes
  processes start up.
- hermes_cli/config.py: new checkpoints.auto_prune /
  retention_days / delete_orphans / min_interval_hours knobs.
  Default auto_prune: false so users who rely on /rollback against
  long-ago sessions never lose data silently.
- cli.py / gateway/run.py: startup hooks gated on checkpoints.auto_prune,
  called right next to the existing state.db maintenance block.
- Docs updated with the new config knobs.
- 11 regression tests: orphan/stale deletion, precedence, byte-freed
  tracking, non-shadow dir skip, interval gating, corrupt marker
  recovery.

Refs #3015 (session-file disk growth was fixed in #16286; this covers
the checkpoint side noted out-of-scope there).
This commit is contained in:
Teknium
2026-04-26 19:05:52 -07:00
committed by GitHub
parent ced8f44cd2
commit 478444c262
6 changed files with 458 additions and 0 deletions

28
cli.py
View File

@@ -988,6 +988,29 @@ def _run_state_db_auto_maintenance(session_db) -> None:
logger.debug("state.db auto-maintenance skipped: %s", exc)
def _run_checkpoint_auto_maintenance() -> None:
"""Call ``checkpoint_manager.maybe_auto_prune_checkpoints`` using current config.
Reads the ``checkpoints:`` section from config.yaml via
:func:`hermes_cli.config.load_config`. Honours ``auto_prune`` /
``retention_days`` / ``delete_orphans`` / ``min_interval_hours``.
Never raises — maintenance must never block interactive startup.
"""
try:
from hermes_cli.config import load_config as _load_full_config
cfg = (_load_full_config().get("checkpoints") or {})
if not cfg.get("auto_prune", False):
return
from tools.checkpoint_manager import maybe_auto_prune_checkpoints
maybe_auto_prune_checkpoints(
retention_days=int(cfg.get("retention_days", 7)),
min_interval_hours=int(cfg.get("min_interval_hours", 24)),
delete_orphans=bool(cfg.get("delete_orphans", True)),
)
except Exception as exc:
logger.debug("checkpoint auto-maintenance skipped: %s", exc)
def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
"""Remove stale worktrees and orphaned branches on startup.
@@ -2054,6 +2077,11 @@ class HermesCLI:
# Never blocks startup on failure.
_run_state_db_auto_maintenance(self._session_db)
# Opportunistic shadow-repo cleanup — deletes orphan/stale
# checkpoint repos under ~/.hermes/checkpoints/. Opt-in via
# checkpoints.auto_prune, idempotent via .last_prune marker.
_run_checkpoint_auto_maintenance()
# Deferred title: stored in memory until the session is created in the DB
self._pending_title: Optional[str] = None