fix: sanitize .env before loading to prevent token duplication (#8908)

When .env files become corrupted (e.g. concatenated KEY=VALUE pairs on
a single line due to concurrent writes or encoding issues), both
python-dotenv and load_env() would parse the entire concatenated string
as a single value. This caused bot tokens to appear duplicated up to 8×,
triggering InvalidToken errors from the Telegram API.

Root cause: _sanitize_env_lines() — which correctly splits concatenated
lines — was only called during save_env_value() writes, not during reads.

Fix:
- load_env() now calls _sanitize_env_lines() before parsing
- env_loader.load_hermes_dotenv() sanitizes the .env file on disk
  before python-dotenv reads it, so os.getenv() also returns clean values
- Added tests reproducing the exact corruption pattern from #8908

Closes #8908
This commit is contained in:
Mil Wang (from Dev Box)
2026-04-13 18:41:12 +08:00
committed by Teknium
parent e77f135ed8
commit e469f3f3db
3 changed files with 159 additions and 6 deletions

View File

@@ -2384,7 +2384,13 @@ def save_config(config: Dict[str, Any]):
def load_env() -> Dict[str, str]:
"""Load environment variables from ~/.hermes/.env."""
"""Load environment variables from ~/.hermes/.env.
Sanitizes lines before parsing so that corrupted files (e.g.
concatenated KEY=VALUE pairs on a single line) are handled
gracefully instead of producing mangled values such as duplicated
bot tokens. See #8908.
"""
env_path = get_env_path()
env_vars = {}
@@ -2393,11 +2399,15 @@ def load_env() -> Dict[str, str]:
# fail on UTF-8 .env files. Use explicit UTF-8 only on Windows.
open_kw = {"encoding": "utf-8", "errors": "replace"} if _IS_WINDOWS else {}
with open(env_path, **open_kw) as f:
for line in f:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
key, _, value = line.partition('=')
env_vars[key.strip()] = value.strip().strip('"\'')
raw_lines = f.readlines()
# Sanitize before parsing: split concatenated lines & drop stale
# placeholders so corrupted .env files don't produce invalid tokens.
lines = _sanitize_env_lines(raw_lines)
for line in lines:
line = line.strip()
if line and not line.startswith('#') and '=' in line:
key, _, value = line.partition('=')
env_vars[key.strip()] = value.strip().strip('"\'')
return env_vars

View File

@@ -15,6 +15,51 @@ def _load_dotenv_with_fallback(path: Path, *, override: bool) -> None:
load_dotenv(dotenv_path=path, override=override, encoding="latin-1")
def _sanitize_env_file_if_needed(path: Path) -> None:
"""Pre-sanitize a .env file before python-dotenv reads it.
python-dotenv does not handle corrupted lines where multiple
KEY=VALUE pairs are concatenated on a single line (missing newline).
This produces mangled values — e.g. a bot token duplicated 8×
(see #8908).
We delegate to ``hermes_cli.config._sanitize_env_lines`` which
already knows all valid Hermes env-var names and can split
concatenated lines correctly.
"""
if not path.exists():
return
try:
from hermes_cli.config import _sanitize_env_lines
except ImportError:
return # early bootstrap — config module not available yet
read_kw = {"encoding": "utf-8", "errors": "replace"}
try:
with open(path, **read_kw) as f:
original = f.readlines()
sanitized = _sanitize_env_lines(original)
if sanitized != original:
import tempfile
fd, tmp = tempfile.mkstemp(
dir=str(path.parent), suffix=".tmp", prefix=".env_"
)
try:
with os.fdopen(fd, "w", encoding="utf-8") as f:
f.writelines(sanitized)
f.flush()
os.fsync(f.fileno())
os.replace(tmp, path)
except BaseException:
try:
os.unlink(tmp)
except OSError:
pass
raise
except Exception:
pass # best-effort — don't block gateway startup
def load_hermes_dotenv(
*,
hermes_home: str | os.PathLike | None = None,
@@ -34,6 +79,10 @@ def load_hermes_dotenv(
user_env = home_path / ".env"
project_env_path = Path(project_env) if project_env else None
# Fix corrupted .env files before python-dotenv parses them (#8908).
if user_env.exists():
_sanitize_env_file_if_needed(user_env)
if user_env.exists():
_load_dotenv_with_fallback(user_env, override=True)
loaded.append(user_env)

View File

@@ -0,0 +1,94 @@
"""Tests for .env sanitization during load to prevent token duplication (#8908)."""
import os
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
def test_load_env_sanitizes_concatenated_lines():
"""Verify load_env() splits concatenated KEY=VALUE pairs.
Reproduces the scenario from #8908 where a corrupted .env file
contained multiple tokens on a single line, causing the bot token
to be duplicated 8 times.
"""
from hermes_cli.config import load_env
token = "8356550917:AAGGEkzg06Hrc3Hjb3Sa1jkGVDOdU_lYy2Q"
# Simulate concatenated line: TOKEN=xxx followed immediately by another key
corrupted = f"TELEGRAM_BOT_TOKEN={token}ANTHROPIC_API_KEY=sk-ant-test123\n"
with tempfile.NamedTemporaryFile(
mode="w", suffix=".env", delete=False, encoding="utf-8"
) as f:
f.write(corrupted)
env_path = Path(f.name)
try:
with patch("hermes_cli.config.get_env_path", return_value=env_path):
result = load_env()
assert result.get("TELEGRAM_BOT_TOKEN") == token, (
f"Token should be exactly '{token}', got '{result.get('TELEGRAM_BOT_TOKEN')}'"
)
assert result.get("ANTHROPIC_API_KEY") == "sk-ant-test123"
finally:
env_path.unlink(missing_ok=True)
def test_load_env_normal_file_unchanged():
"""A well-formed .env file should be parsed identically."""
from hermes_cli.config import load_env
content = (
"TELEGRAM_BOT_TOKEN=mytoken123\n"
"ANTHROPIC_API_KEY=sk-ant-key\n"
"# comment\n"
"\n"
"OPENAI_API_KEY=sk-openai\n"
)
with tempfile.NamedTemporaryFile(
mode="w", suffix=".env", delete=False, encoding="utf-8"
) as f:
f.write(content)
env_path = Path(f.name)
try:
with patch("hermes_cli.config.get_env_path", return_value=env_path):
result = load_env()
assert result["TELEGRAM_BOT_TOKEN"] == "mytoken123"
assert result["ANTHROPIC_API_KEY"] == "sk-ant-key"
assert result["OPENAI_API_KEY"] == "sk-openai"
finally:
env_path.unlink(missing_ok=True)
def test_env_loader_sanitizes_before_dotenv():
"""Verify env_loader._sanitize_env_file_if_needed fixes corrupted files."""
from hermes_cli.env_loader import _sanitize_env_file_if_needed
token = "8356550917:AAGGEkzg06Hrc3Hjb3Sa1jkGVDOdU_lYy2Q"
corrupted = f"TELEGRAM_BOT_TOKEN={token}ANTHROPIC_API_KEY=sk-ant-test\n"
with tempfile.NamedTemporaryFile(
mode="w", suffix=".env", delete=False, encoding="utf-8"
) as f:
f.write(corrupted)
env_path = Path(f.name)
try:
_sanitize_env_file_if_needed(env_path)
with open(env_path, encoding="utf-8") as f:
lines = f.readlines()
# Should be split into two separate lines
assert len(lines) == 2, f"Expected 2 lines, got {len(lines)}: {lines}"
assert lines[0].startswith("TELEGRAM_BOT_TOKEN=")
assert lines[1].startswith("ANTHROPIC_API_KEY=")
# Token should not contain the second key
parsed_token = lines[0].strip().split("=", 1)[1]
assert parsed_token == token
finally:
env_path.unlink(missing_ok=True)