Compare commits

...

1 Commits

Author SHA1 Message Date
teknium1
059647d1c9 Port from google-gemini/gemini-cli#21541: back up corrupted config.yaml
When config.yaml fails to parse, load_config() silently falls back to
DEFAULT_CONFIG and leaves the broken file on disk. If the user then re-runs
the setup wizard or hermes config set (both rewrite config.yaml), their
broken-but-recoverable overrides are lost for good.

Adapts the policy-file recovery from gemini-cli#21541: on the first parse
warning for a given broken file, snapshot it to config.yaml.corrupt.<ts>.bak
(best-effort, symlink-guarded, size-deduped) and tell the user where it
landed. Unlike Gemini's version we deliberately do NOT reset config.yaml to a
clean state — hermes never silently mutates user config, and leaving it means
a hand-fixed file is re-read on the next load.

Tests: 3 new cases (backup created + content preserved + original untouched;
same-size backup dedup; symlink not copied). E2E verified with isolated
HERMES_HOME and a real tab-indented broken config.
2026-06-03 17:05:11 -07:00
2 changed files with 129 additions and 1 deletions

View File

@@ -17,11 +17,13 @@ import logging
import os
import platform
import re
import shutil
import stat
import subprocess
import sys
import tempfile
import threading
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
@@ -36,6 +38,60 @@ logger = logging.getLogger(__name__)
_CONFIG_PARSE_WARNED: set = set()
def _backup_corrupt_config(config_path: Path) -> Optional[Path]:
"""Preserve a corrupted ``config.yaml`` by copying it to a timestamped ``.bak``.
When the YAML can't be parsed, ``load_config()`` silently falls back to
``DEFAULT_CONFIG`` and the user's broken file stays on disk untouched.
That file is still the user's only copy of their intended overrides — if
they re-run the setup wizard or ``hermes config set`` (which rewrites
``config.yaml``), the broken-but-recoverable content is gone for good.
This snapshots the corrupted file to ``config.yaml.corrupt.<ts>.bak`` so
the user can diff/repair it. Unlike Gemini CLI's policy-file recovery
(which resets the live file to a clean state), we deliberately leave
``config.yaml`` in place: hermes never silently mutates the user's config,
and leaving it means a hand-fixed file is re-read on the next load. The
backup is best-effort — any failure (permissions, symlink, disk full) is
swallowed so config loading is never blocked by backup problems.
Returns the backup path on success, else ``None``. Symlinks are not
followed/copied (mirrors the Gemini #21541 lstat guard) to avoid
clobbering whatever a malicious/misconfigured symlink points at.
"""
try:
if config_path.is_symlink():
return None
st = config_path.stat()
if st.st_size == 0:
# Empty file isn't worth preserving and yaml.safe_load returns {}
# for it anyway (so it wouldn't reach here), but guard regardless.
return None
ts = time.strftime("%Y%m%d-%H%M%S")
backup_path = config_path.with_name(f"{config_path.name}.corrupt.{ts}.bak")
# Don't clobber an existing backup from the same second; if there's
# already a corrupt backup for this exact mtime, assume we've snapshotted
# this corruption already and skip (the dedup cache normally prevents a
# second call, but a process restart can clear it).
sibling_baks = list(
config_path.parent.glob(f"{config_path.name}.corrupt.*.bak")
)
for existing in sibling_baks:
try:
if existing.stat().st_size == st.st_size:
# Same size as the current broken file — likely the same
# corruption already preserved. Avoid backup churn.
return None
except OSError:
continue
if backup_path.exists():
return None
shutil.copy2(config_path, backup_path)
return backup_path
except Exception:
return None
def _warn_config_parse_failure(config_path: Path, exc: Exception) -> None:
"""Surface a config.yaml parse failure to user, log, and stderr.
@@ -48,7 +104,11 @@ def _warn_config_parse_failure(config_path: Path, exc: Exception) -> None:
Now: warn once per (path, mtime_ns, size) on stderr **and** in
``agent.log`` / ``errors.log`` at WARNING level so ``hermes logs``
surfaces it. Re-warns automatically if the file changes (different
mtime/size), so users editing the config see the next failure.
mtime/size), so users editing the config see the next failure. On the
first warning for a given broken file we also snapshot it to a
timestamped ``.bak`` (best-effort) so the user's recoverable content
survives any later rewrite of ``config.yaml`` by the setup wizard or
``hermes config set``.
"""
try:
st = config_path.stat()
@@ -59,12 +119,16 @@ def _warn_config_parse_failure(config_path: Path, exc: Exception) -> None:
return
_CONFIG_PARSE_WARNED.add(key)
backup_path = _backup_corrupt_config(config_path)
msg = (
f"Failed to parse {config_path}: {exc}. "
f"Falling back to default config — every user override "
f"(auxiliary providers, fallback chain, model settings) is being IGNORED. "
f"Fix the YAML and restart."
)
if backup_path is not None:
msg += f" A copy of the corrupted file was saved to {backup_path}."
logger.warning(msg)
try:
sys.stderr.write(f"⚠️ hermes config: {msg}\n")

View File

@@ -156,6 +156,70 @@ class TestLoadConfigParseFailure:
after_edit = capsys.readouterr().err
assert "hermes config:" in after_edit, "edited file should re-warn"
def test_corrupt_config_is_backed_up(self, tmp_path, capsys):
"""A broken config.yaml is snapshotted to a timestamped .bak so the
user's recoverable overrides survive a later wizard/config-set rewrite.
Ported from google-gemini/gemini-cli#21541 (policy-file TOML recovery),
adapted: we back up but deliberately do NOT reset config.yaml.
"""
from hermes_cli import config as cfg_mod
cfg_mod._CONFIG_PARSE_WARNED.clear()
with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}):
broken = "\tmodel: test/custom\nbroken indent:\n"
(tmp_path / "config.yaml").write_text(broken)
load_config()
err = capsys.readouterr().err
baks = list(tmp_path.glob("config.yaml.corrupt.*.bak"))
assert len(baks) == 1, f"expected one backup, got {baks}"
# Backup preserves the original broken content verbatim
assert baks[0].read_text() == broken
# Original config.yaml is left untouched (not reset to clean state)
assert (tmp_path / "config.yaml").read_text() == broken
# User is told where the backup landed
assert str(baks[0]) in err
def test_backup_skips_when_same_size_bak_exists(self, tmp_path, capsys):
"""Don't churn backups: if a corrupt backup of the same size already
exists (same corruption already preserved), skip making another."""
from hermes_cli import config as cfg_mod
cfg_mod._CONFIG_PARSE_WARNED.clear()
with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}):
broken = "\tbroken:\n"
cfg = tmp_path / "config.yaml"
cfg.write_text(broken)
# Pre-existing backup of identical size simulates an earlier snapshot.
(tmp_path / "config.yaml.corrupt.20260101-000000.bak").write_text(broken)
load_config()
baks = list(tmp_path.glob("config.yaml.corrupt.*.bak"))
assert len(baks) == 1, f"should not add a second same-size backup, got {baks}"
def test_corrupt_symlink_config_not_backed_up(self, tmp_path):
"""Symlinked config.yaml is not copied (mirrors Gemini #21541 lstat
guard) — avoids clobbering whatever the symlink points at."""
import sys as _sys
if _sys.platform == "win32":
pytest.skip("symlink creation requires privileges on Windows")
from hermes_cli import config as cfg_mod
cfg_mod._CONFIG_PARSE_WARNED.clear()
with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path)}):
real = tmp_path / "real_config.yaml"
real.write_text("\tbroken:\n")
link = tmp_path / "config.yaml"
link.symlink_to(real)
load_config()
assert not list(tmp_path.glob("config.yaml.corrupt.*.bak"))
class TestSaveAndLoadRoundtrip:
def test_roundtrip(self, tmp_path):