fix: sanitize .env before loading to prevent token duplication (#8908)

When .env files become corrupted (e.g. concatenated KEY=VALUE pairs on a single line due to concurrent writes or encoding issues), both python-dotenv and load_env() parsed the entire concatenated string as a single value. This caused bot tokens to appear duplicated up to 8×, triggering InvalidToken errors from the Telegram API.

Root cause: _sanitize_env_lines() — which correctly splits concatenated lines — was only called during save_env_value() writes, not during reads.

Fix:
- load_env() now calls _sanitize_env_lines() before parsing
- env_loader.load_hermes_dotenv() sanitizes the .env file on disk before python-dotenv reads it, so os.getenv() also returns clean values
- Added tests reproducing the exact corruption pattern from #8908

Closes #8908
2026-04-28 06:51:16 +08:00 · 2026-04-13 18:41:12 +08:00
parent e77f135ed8
commit e469f3f3db
3 changed files with 159 additions and 6 deletions
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -2384,7 +2384,13 @@ def save_config(config: Dict[str, Any]):


 def load_env() -> Dict[str, str]:
-    """Load environment variables from ~/.hermes/.env."""
+    """Load environment variables from ~/.hermes/.env.
+
+    Sanitizes lines before parsing so that corrupted files (e.g.
+    concatenated KEY=VALUE pairs on a single line) are handled
+    gracefully instead of producing mangled values such as duplicated
+    bot tokens.  See #8908.
+    """
    env_path = get_env_path()
    env_vars = {}
    
@@ -2393,11 +2399,15 @@ def load_env() -> Dict[str, str]:
        # fail on UTF-8 .env files. Use explicit UTF-8 only on Windows.
        open_kw = {"encoding": "utf-8", "errors": "replace"} if _IS_WINDOWS else {}
        with open(env_path, **open_kw) as f:
-            for line in f:
-                line = line.strip()
-                if line and not line.startswith('#') and '=' in line:
-                    key, _, value = line.partition('=')
-                    env_vars[key.strip()] = value.strip().strip('"\'')
+            raw_lines = f.readlines()
+        # Sanitize before parsing: split concatenated lines & drop stale
+        # placeholders so corrupted .env files don't produce invalid tokens.
+        lines = _sanitize_env_lines(raw_lines)
+        for line in lines:
+            line = line.strip()
+            if line and not line.startswith('#') and '=' in line:
+                key, _, value = line.partition('=')
+                env_vars[key.strip()] = value.strip().strip('"\'')
    
    return env_vars

--- a/hermes_cli/env_loader.py
+++ b/hermes_cli/env_loader.py
@@ -15,6 +15,51 @@ def _load_dotenv_with_fallback(path: Path, *, override: bool) -> None:
        load_dotenv(dotenv_path=path, override=override, encoding="latin-1")


+def _sanitize_env_file_if_needed(path: Path) -> None:
+    """Pre-sanitize a .env file before python-dotenv reads it.
+
+    python-dotenv does not handle corrupted lines where multiple
+    KEY=VALUE pairs are concatenated on a single line (missing newline).
+    This produces mangled values — e.g. a bot token duplicated 8×
+    (see #8908).
+
+    We delegate to ``hermes_cli.config._sanitize_env_lines`` which
+    already knows all valid Hermes env-var names and can split
+    concatenated lines correctly.
+    """
+    if not path.exists():
+        return
+    try:
+        from hermes_cli.config import _sanitize_env_lines
+    except ImportError:
+        return  # early bootstrap — config module not available yet
+
+    read_kw = {"encoding": "utf-8", "errors": "replace"}
+    try:
+        with open(path, **read_kw) as f:
+            original = f.readlines()
+        sanitized = _sanitize_env_lines(original)
+        if sanitized != original:
+            import tempfile
+            fd, tmp = tempfile.mkstemp(
+                dir=str(path.parent), suffix=".tmp", prefix=".env_"
+            )
+            try:
+                with os.fdopen(fd, "w", encoding="utf-8") as f:
+                    f.writelines(sanitized)
+                    f.flush()
+                    os.fsync(f.fileno())
+                os.replace(tmp, path)
+            except BaseException:
+                try:
+                    os.unlink(tmp)
+                except OSError:
+                    pass
+                raise
+    except Exception:
+        pass  # best-effort — don't block gateway startup
+
+
 def load_hermes_dotenv(
    *,
    hermes_home: str | os.PathLike | None = None,
@@ -34,6 +79,10 @@ def load_hermes_dotenv(
    user_env = home_path / ".env"
    project_env_path = Path(project_env) if project_env else None

+    # Fix corrupted .env files before python-dotenv parses them (#8908).
+    if user_env.exists():
+        _sanitize_env_file_if_needed(user_env)
+
    if user_env.exists():
        _load_dotenv_with_fallback(user_env, override=True)
        loaded.append(user_env)
--- a/tests/test_env_sanitize_on_load.py
+++ b/tests/test_env_sanitize_on_load.py
@@ -0,0 +1,94 @@
+"""Tests for .env sanitization during load to prevent token duplication (#8908)."""
+
+import os
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+
+def test_load_env_sanitizes_concatenated_lines():
+    """Verify load_env() splits concatenated KEY=VALUE pairs.
+
+    Reproduces the scenario from #8908 where a corrupted .env file
+    contained multiple tokens on a single line, causing the bot token
+    to be duplicated 8 times.
+    """
+    from hermes_cli.config import load_env
+
+    token = "8356550917:AAGGEkzg06Hrc3Hjb3Sa1jkGVDOdU_lYy2Q"
+    # Simulate concatenated line: TOKEN=xxx followed immediately by another key
+    corrupted = f"TELEGRAM_BOT_TOKEN={token}ANTHROPIC_API_KEY=sk-ant-test123\n"
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".env", delete=False, encoding="utf-8"
+    ) as f:
+        f.write(corrupted)
+        env_path = Path(f.name)
+
+    try:
+        with patch("hermes_cli.config.get_env_path", return_value=env_path):
+            result = load_env()
+        assert result.get("TELEGRAM_BOT_TOKEN") == token, (
+            f"Token should be exactly '{token}', got '{result.get('TELEGRAM_BOT_TOKEN')}'"
+        )
+        assert result.get("ANTHROPIC_API_KEY") == "sk-ant-test123"
+    finally:
+        env_path.unlink(missing_ok=True)
+
+
+def test_load_env_normal_file_unchanged():
+    """A well-formed .env file should be parsed identically."""
+    from hermes_cli.config import load_env
+
+    content = (
+        "TELEGRAM_BOT_TOKEN=mytoken123\n"
+        "ANTHROPIC_API_KEY=sk-ant-key\n"
+        "# comment\n"
+        "\n"
+        "OPENAI_API_KEY=sk-openai\n"
+    )
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".env", delete=False, encoding="utf-8"
+    ) as f:
+        f.write(content)
+        env_path = Path(f.name)
+
+    try:
+        with patch("hermes_cli.config.get_env_path", return_value=env_path):
+            result = load_env()
+        assert result["TELEGRAM_BOT_TOKEN"] == "mytoken123"
+        assert result["ANTHROPIC_API_KEY"] == "sk-ant-key"
+        assert result["OPENAI_API_KEY"] == "sk-openai"
+    finally:
+        env_path.unlink(missing_ok=True)
+
+
+def test_env_loader_sanitizes_before_dotenv():
+    """Verify env_loader._sanitize_env_file_if_needed fixes corrupted files."""
+    from hermes_cli.env_loader import _sanitize_env_file_if_needed
+
+    token = "8356550917:AAGGEkzg06Hrc3Hjb3Sa1jkGVDOdU_lYy2Q"
+    corrupted = f"TELEGRAM_BOT_TOKEN={token}ANTHROPIC_API_KEY=sk-ant-test\n"
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".env", delete=False, encoding="utf-8"
+    ) as f:
+        f.write(corrupted)
+        env_path = Path(f.name)
+
+    try:
+        _sanitize_env_file_if_needed(env_path)
+        with open(env_path, encoding="utf-8") as f:
+            lines = f.readlines()
+        # Should be split into two separate lines
+        assert len(lines) == 2, f"Expected 2 lines, got {len(lines)}: {lines}"
+        assert lines[0].startswith("TELEGRAM_BOT_TOKEN=")
+        assert lines[1].startswith("ANTHROPIC_API_KEY=")
+        # Token should not contain the second key
+        parsed_token = lines[0].strip().split("=", 1)[1]
+        assert parsed_token == token
+    finally:
+        env_path.unlink(missing_ok=True)