fix(env_loader): warn when non-ASCII stripped from credential env vars (#13300)

Load-time sanitizer silently removed non-ASCII codepoints from any
env var ending in _API_KEY / _TOKEN / _SECRET / _KEY, turning
copy-paste artifacts (Unicode lookalikes, ZWSP, NBSP) into opaque
provider-side API_KEY_INVALID errors.

Warn once per key to stderr with the offending codepoints (U+XXXX)
and guidance to re-copy from the provider dashboard.
This commit is contained in:
Teknium
2026-04-20 22:14:03 -07:00
committed by GitHub
parent 5125a78283
commit fdd0ecaf13
2 changed files with 102 additions and 3 deletions

View File

@@ -3,6 +3,7 @@
from __future__ import annotations
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
@@ -14,6 +15,26 @@ from dotenv import load_dotenv
# pure ASCII (they become HTTP header values).
_CREDENTIAL_SUFFIXES = ("_API_KEY", "_TOKEN", "_SECRET", "_KEY")
# Names we've already warned about during this process, so repeated
# load_hermes_dotenv() calls (user env + project env, gateway hot-reload,
# tests) don't spam the same warning multiple times.
_WARNED_KEYS: set[str] = set()
def _format_offending_chars(value: str, limit: int = 3) -> str:
"""Return a compact 'U+XXXX ('c'), ...' summary of non-ASCII codepoints."""
seen: list[str] = []
for ch in value:
if ord(ch) > 127:
label = f"U+{ord(ch):04X}"
if ch.isprintable():
label += f" ({ch!r})"
if label not in seen:
seen.append(label)
if len(seen) >= limit:
break
return ", ".join(seen)
def _sanitize_loaded_credentials() -> None:
"""Strip non-ASCII characters from credential env vars in os.environ.
@@ -21,14 +42,42 @@ def _sanitize_loaded_credentials() -> None:
Called after dotenv loads so the rest of the codebase never sees
non-ASCII API keys. Only touches env vars whose names end with
known credential suffixes (``_API_KEY``, ``_TOKEN``, etc.).
Emits a one-line warning to stderr when characters are stripped.
Silent stripping would mask copy-paste corruption (Unicode lookalike
glyphs from PDFs / rich-text editors, ZWSP from web pages) as opaque
provider-side "invalid API key" errors (see #6843).
"""
for key, value in list(os.environ.items()):
if not any(key.endswith(suffix) for suffix in _CREDENTIAL_SUFFIXES):
continue
try:
value.encode("ascii")
continue
except UnicodeEncodeError:
os.environ[key] = value.encode("ascii", errors="ignore").decode("ascii")
pass
cleaned = value.encode("ascii", errors="ignore").decode("ascii")
os.environ[key] = cleaned
if key in _WARNED_KEYS:
continue
_WARNED_KEYS.add(key)
stripped = len(value) - len(cleaned)
detail = _format_offending_chars(value) or "non-printable"
print(
f" Warning: {key} contained {stripped} non-ASCII character"
f"{'s' if stripped != 1 else ''} ({detail}) — stripped so the "
f"key can be sent as an HTTP header.",
file=sys.stderr,
)
print(
" This usually means the key was copy-pasted from a PDF, "
"rich-text editor, or web page that substituted lookalike\n"
" Unicode glyphs for ASCII letters. If authentication fails "
"(e.g. \"API key not valid\"), re-copy the key from the\n"
" provider's dashboard and run `hermes setup` (or edit the "
".env file in a plain-text editor).",
file=sys.stderr,
)
def _load_dotenv_with_fallback(path: Path, *, override: bool) -> None: