tests/run_agent/test_unicode_ascii_codec.py

"""Tests for UnicodeEncodeError recovery with ASCII codec.

Covers the fix for issue #6843 — systems with ASCII locale (LANG=C)
that can't encode non-ASCII characters in API request payloads.
"""

import pytest

from run_agent import (
    _strip_non_ascii,
    _sanitize_messages_non_ascii,
    _sanitize_structure_non_ascii,
    _sanitize_tools_non_ascii,
    _sanitize_messages_surrogates,
)


class TestStripNonAscii:
    """Tests for _strip_non_ascii helper."""

    def test_ascii_only(self):
        assert _strip_non_ascii("hello world") == "hello world"

    def test_removes_non_ascii(self):
        assert _strip_non_ascii("hello ⚕ world") == "hello  world"

    def test_removes_emoji(self):
        assert _strip_non_ascii("test 🤖 done") == "test  done"

    def test_chinese_chars(self):
        assert _strip_non_ascii("你好world") == "world"

    def test_empty_string(self):
        assert _strip_non_ascii("") == ""

    def test_only_non_ascii(self):
        assert _strip_non_ascii("⚕🤖") == ""


class TestSanitizeMessagesNonAscii:
    """Tests for _sanitize_messages_non_ascii."""

    def test_no_change_ascii_only(self):
        messages = [{"role": "user", "content": "hello"}]
        assert _sanitize_messages_non_ascii(messages) is False
        assert messages[0]["content"] == "hello"

    def test_sanitizes_content_string(self):
        messages = [{"role": "user", "content": "hello ⚕ world"}]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["content"] == "hello  world"

    def test_sanitizes_content_list(self):
        messages = [{
            "role": "user",
            "content": [{"type": "text", "text": "hello 🤖"}]
        }]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["content"][0]["text"] == "hello "

    def test_sanitizes_name_field(self):
        messages = [{"role": "tool", "name": "⚕tool", "content": "ok"}]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["name"] == "tool"

    def test_sanitizes_tool_calls(self):
        messages = [{
            "role": "assistant",
            "content": None,
            "tool_calls": [{
                "id": "call_1",
                "type": "function",
                "function": {
                    "name": "read_file",
                    "arguments": '{"path": "⚕test.txt"}'
                }
            }]
        }]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["tool_calls"][0]["function"]["arguments"] == '{"path": "test.txt"}'

    def test_handles_non_dict_messages(self):
        messages = ["not a dict", {"role": "user", "content": "hello"}]
        assert _sanitize_messages_non_ascii(messages) is False

    def test_empty_messages(self):
        assert _sanitize_messages_non_ascii([]) is False

    def test_multiple_messages(self):
        messages = [
            {"role": "system", "content": "⚕ System prompt"},
            {"role": "user", "content": "Hello 你好"},
            {"role": "assistant", "content": "Hi there!"},
        ]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["content"] == " System prompt"
        assert messages[1]["content"] == "Hello "
        assert messages[2]["content"] == "Hi there!"


class TestSurrogateVsAsciiSanitization:
    """Test that surrogate and ASCII sanitization work independently."""

    def test_surrogates_still_handled(self):
        """Surrogates are caught by _sanitize_messages_surrogates, not _non_ascii."""
        msg_with_surrogate = "test \ud800 end"
        messages = [{"role": "user", "content": msg_with_surrogate}]
        assert _sanitize_messages_surrogates(messages) is True
        assert "\ud800" not in messages[0]["content"]
        assert "\ufffd" in messages[0]["content"]

    def test_surrogates_in_name_and_tool_calls_are_sanitized(self):
        messages = [{
            "role": "assistant",
            "name": "bad\ud800name",
            "content": None,
            "tool_calls": [{
                "id": "call_\ud800",
                "type": "function",
                "function": {
                    "name": "read\ud800_file",
                    "arguments": '{"path": "bad\ud800.txt"}'
                }
            }],
        }]
        assert _sanitize_messages_surrogates(messages) is True
        assert "\ud800" not in messages[0]["name"]
        assert "\ud800" not in messages[0]["tool_calls"][0]["id"]
        assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["name"]
        assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["arguments"]

    def test_ascii_codec_strips_all_non_ascii(self):
        """ASCII codec case: all non-ASCII is stripped, not replaced."""
        messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]
        assert _sanitize_messages_non_ascii(messages) is True
        # All non-ASCII chars removed; spaces around them collapse
        assert messages[0]["content"] == "test  end"

    def test_no_surrogates_returns_false(self):
        """When no surrogates present, _sanitize_messages_surrogates returns False."""
        messages = [{"role": "user", "content": "hello ⚕ world"}]
        assert _sanitize_messages_surrogates(messages) is False


class TestApiKeyNonAsciiSanitization:
    """Tests for API key sanitization in the UnicodeEncodeError recovery.

    Covers the root cause of issue #6843: a non-ASCII character (ʋ U+028B)
    in the API key causes httpx to fail when encoding the Authorization
    header as ASCII.  The recovery block must strip non-ASCII from the key.
    """

    def test_strip_non_ascii_from_api_key(self):
        """_strip_non_ascii removes ʋ from an API key string."""
        key = "sk-proj-abc" + "ʋ" + "def"
        assert _strip_non_ascii(key) == "sk-proj-abcdef"

    def test_api_key_at_position_153(self):
        """Reproduce the exact error: ʋ at position 153 in 'Bearer <key>'."""
        key = "sk-proj-" + "a" * 138 + "ʋ" + "bcd"
        auth_value = f"Bearer {key}"
        # This is what httpx does — and it fails:
        with pytest.raises(UnicodeEncodeError) as exc_info:
            auth_value.encode("ascii")
        assert exc_info.value.start == 153
        # After sanitization, it should work:
        sanitized_key = _strip_non_ascii(key)
        sanitized_auth = f"Bearer {sanitized_key}"
        sanitized_auth.encode("ascii")  # should not raise


class TestSanitizeToolsNonAscii:
    """Tests for _sanitize_tools_non_ascii."""

    def test_sanitizes_tool_description_and_parameter_descriptions(self):
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "read_file",
                    "description": "Print structured output │ with emoji 🤖",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "path": {
                                "type": "string",
                                "description": "File path │ with unicode",
                            }
                        },
                    },
                },
            }
        ]

        assert _sanitize_tools_non_ascii(tools) is True
        assert tools[0]["function"]["description"] == "Print structured output  with emoji "
        assert tools[0]["function"]["parameters"]["properties"]["path"]["description"] == "File path  with unicode"

    def test_no_change_for_ascii_only_tools(self):
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "read_file",
                    "description": "Read file content",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "path": {
                                "type": "string",
                                "description": "File path",
                            }
                        },
                    },
                },
            }
        ]

        assert _sanitize_tools_non_ascii(tools) is False


class TestSanitizeStructureNonAscii:
    def test_sanitizes_nested_dict_structure(self):
        payload = {
            "default_headers": {
                "X-Title": "Hermes │ Agent",
                "User-Agent": "Hermes/1.0 🤖",
            }
        }
        assert _sanitize_structure_non_ascii(payload) is True
        assert payload["default_headers"]["X-Title"] == "Hermes  Agent"
        assert payload["default_headers"]["User-Agent"] == "Hermes/1.0 "


class TestApiKeyClientSync:
    """Verify that ASCII recovery updates the live OpenAI client's api_key.

    The OpenAI SDK stores its own copy of api_key which auth_headers reads
    dynamically.  If only self.api_key is updated but self.client.api_key
    is not, the next request still sends the corrupted key in the
    Authorization header.
    """

    def test_client_api_key_updated_on_sanitize(self):
        """Simulate the recovery path and verify client.api_key is synced."""
        from unittest.mock import MagicMock
        from run_agent import AIAgent

        agent = AIAgent.__new__(AIAgent)
        bad_key = "sk-proj-abc\u028bdef"  # ʋ lookalike at position 11
        agent.api_key = bad_key
        agent._client_kwargs = {"api_key": bad_key}
        agent.quiet_mode = True

        # Mock client with its own api_key attribute (like the real OpenAI client)
        mock_client = MagicMock()
        mock_client.api_key = bad_key
        agent.client = mock_client

        # --- replicate the recovery logic from run_agent.py ---
        _raw_key = agent.api_key
        _clean_key = _strip_non_ascii(_raw_key)
        assert _clean_key != _raw_key, "test precondition: key should have non-ASCII"

        agent.api_key = _clean_key
        agent._client_kwargs["api_key"] = _clean_key
        if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"):
            agent.client.api_key = _clean_key

        # All three locations should now hold the clean key
        assert agent.api_key == "***"
        assert agent._client_kwargs["api_key"] == "***"
        assert agent.client.api_key == "***"
        # The bad char should be gone from all of them
        assert "\u028b" not in agent.api_key
        assert "\u028b" not in agent._client_kwargs["api_key"]
        assert "\u028b" not in agent.client.api_key

    def test_client_none_does_not_crash(self):
        """Recovery should not crash when client is None (pre-init)."""
        from run_agent import AIAgent

        agent = AIAgent.__new__(AIAgent)
        bad_key = "sk-proj-\u028b"
        agent.api_key = bad_key
        agent._client_kwargs = {"api_key": bad_key}
        agent.client = None

        _clean_key = _strip_non_ascii(bad_key)
        agent.api_key = _clean_key
        agent._client_kwargs["api_key"] = _clean_key
        if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"):
            agent.client.api_key = _clean_key

        assert agent.api_key == "sk-proj-"
        assert agent.client is None  # should not have been touched


class TestApiMessagesAndApiKwargsSanitized:
    """Regression tests for #6843 follow-up: api_messages and api_kwargs must
    be sanitized alongside messages during ASCII-codec recovery.

    The original fix only sanitized the canonical `messages` list.
    api_messages is a separate API-copy built before the retry loop; it may
    carry extra fields (reasoning_content, extra_body) with non-ASCII chars
    that are not present in `messages`.  Without sanitizing api_messages and
    api_kwargs, the retry still raises UnicodeEncodeError even after the
    'System encoding is ASCII — stripped...' log line appears.
    """

    def test_api_messages_with_reasoning_content_is_sanitized(self):
        """api_messages may contain reasoning_content not in messages."""
        api_messages = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "hi"},
            {
                "role": "assistant",
                "content": "Sure!",
                # reasoning_content is injected by the API-copy builder and
                # is NOT present in the canonical messages list
                "reasoning_content": "Let me think \xab step by step \xbb",
            },
        ]
        found = _sanitize_messages_non_ascii(api_messages)
        assert found is True
        assert "\xab" not in api_messages[2]["reasoning_content"]
        assert "\xbb" not in api_messages[2]["reasoning_content"]

    def test_api_kwargs_with_non_ascii_extra_body_is_sanitized(self):
        """api_kwargs may contain non-ASCII in extra_body or other fields."""
        api_kwargs = {
            "model": "glm-5.1",
            "messages": [{"role": "user", "content": "ok"}],
            "extra_body": {
                "system": "Think carefully \u2192 answer",
            },
        }
        found = _sanitize_structure_non_ascii(api_kwargs)
        assert found is True
        assert "\u2192" not in api_kwargs["extra_body"]["system"]

    def test_messages_clean_but_api_messages_dirty_both_get_sanitized(self):
        """Even when canonical messages are clean, api_messages may be dirty."""
        messages = [{"role": "user", "content": "hello"}]
        api_messages = [
            {"role": "user", "content": "hello"},
            {
                "role": "assistant",
                "content": "ok",
                "reasoning_content": "step \xab done",
            },
        ]
        # messages sanitize returns False (nothing to clean)
        assert _sanitize_messages_non_ascii(messages) is False
        # api_messages sanitize must catch the dirty reasoning_content
        assert _sanitize_messages_non_ascii(api_messages) is True
        assert "\xab" not in api_messages[1]["reasoning_content"]
-												fix: handle UnicodeEncodeError with ASCII codec (#6843)

Broaden the UnicodeEncodeError recovery to handle systems with ASCII-only
locale (LANG=C, Chromebooks) where ANY non-ASCII character causes encoding
failure, not just lone surrogates.

Changes:
- Add _strip_non_ascii() and _sanitize_messages_non_ascii() helpers that
  strip all non-ASCII characters from message content, name, and tool_calls
- Update the UnicodeEncodeError handler to detect ASCII codec errors and
  fall back to non-ASCII sanitization after surrogate check fails
- Sanitize tool_calls arguments and name fields (not just content)
- Fix bare .encode() in cli.py suspend handler to use explicit utf-8
- Add comprehensive test suite (17 tests)

											
										
										
											2026-04-09 23:21:42 +00:00
+								"""Tests for UnicodeEncodeError recovery with ASCII codec.
 								Covers the fix for issue #6843 — systems with ASCII locale (LANG=C)
 								that can't encode non-ASCII characters in API request payloads.
 								"""
 								import pytest
 								from run_agent import (
 								    _strip_non_ascii,
 								    _sanitize_messages_non_ascii,
-												fix: extend ASCII-locale UnicodeEncodeError recovery to full request payload

The existing ASCII codec handler only sanitized conversation messages,
leaving tool schemas, system prompts, ephemeral prompts, prefill messages,
and HTTP headers as unhandled sources of non-ASCII content. On systems
with LANG=C or non-UTF-8 locale, Unicode symbols in tool descriptions
(e.g. arrows, em-dashes from prompt_builder) and system prompt content
would cause UnicodeEncodeError that fell through to the error path.

Changes:
- Add _sanitize_structure_non_ascii() generic recursive walker for
  nested dict/list payloads
- Add _sanitize_tools_non_ascii() thin wrapper for tool schemas
- Add _force_ascii_payload flag: once ASCII locale is detected, all
  subsequent API calls get proactively sanitized (prevents recurring
  failures from new tool results bringing fresh Unicode each turn)
- Extend the ASCII codec error handler to sanitize: prefill_messages,
  tool schemas (self.tools), system prompt, ephemeral system prompt,
  and default HTTP headers
- Update stale comment that acknowledged the gap

Cherry-picked from PR #8834 (credential pool changes dropped as
separate concern).

											
										
										
											2026-04-13 05:15:48 -07:00
+								    _sanitize_structure_non_ascii,
 								    _sanitize_tools_non_ascii,
-												fix: handle UnicodeEncodeError with ASCII codec (#6843)

Broaden the UnicodeEncodeError recovery to handle systems with ASCII-only
locale (LANG=C, Chromebooks) where ANY non-ASCII character causes encoding
failure, not just lone surrogates.

Changes:
- Add _strip_non_ascii() and _sanitize_messages_non_ascii() helpers that
  strip all non-ASCII characters from message content, name, and tool_calls
- Update the UnicodeEncodeError handler to detect ASCII codec errors and
  fall back to non-ASCII sanitization after surrogate check fails
- Sanitize tool_calls arguments and name fields (not just content)
- Fix bare .encode() in cli.py suspend handler to use explicit utf-8
- Add comprehensive test suite (17 tests)

											
										
										
											2026-04-09 23:21:42 +00:00
+								    _sanitize_messages_surrogates,
 								)
 								class TestStripNonAscii:
 								    """Tests for _strip_non_ascii helper."""
 								    def test_ascii_only(self):
 								        assert _strip_non_ascii("hello world") == "hello world"
 								    def test_removes_non_ascii(self):
 								        assert _strip_non_ascii("hello ⚕ world") == "hello  world"
 								    def test_removes_emoji(self):
 								        assert _strip_non_ascii("test 🤖 done") == "test  done"
 								    def test_chinese_chars(self):
 								        assert _strip_non_ascii("你好world") == "world"
 								    def test_empty_string(self):
 								        assert _strip_non_ascii("") == ""
 								    def test_only_non_ascii(self):
 								        assert _strip_non_ascii("⚕🤖") == ""
 								class TestSanitizeMessagesNonAscii:
 								    """Tests for _sanitize_messages_non_ascii."""
 								    def test_no_change_ascii_only(self):
 								        messages = [{"role": "user", "content": "hello"}]
 								        assert _sanitize_messages_non_ascii(messages) is False
 								        assert messages[0]["content"] == "hello"
 								    def test_sanitizes_content_string(self):
 								        messages = [{"role": "user", "content": "hello ⚕ world"}]
 								        assert _sanitize_messages_non_ascii(messages) is True
 								        assert messages[0]["content"] == "hello  world"
 								    def test_sanitizes_content_list(self):
 								        messages = [{
 								            "role": "user",
 								            "content": [{"type": "text", "text": "hello 🤖"}]
 								        }]
 								        assert _sanitize_messages_non_ascii(messages) is True
 								        assert messages[0]["content"][0]["text"] == "hello "
 								    def test_sanitizes_name_field(self):
 								        messages = [{"role": "tool", "name": "⚕tool", "content": "ok"}]
 								        assert _sanitize_messages_non_ascii(messages) is True
 								        assert messages[0]["name"] == "tool"
 								    def test_sanitizes_tool_calls(self):
 								        messages = [{
 								            "role": "assistant",
 								            "content": None,
 								            "tool_calls": [{
 								                "id": "call_1",
 								                "type": "function",
 								                "function": {
 								                    "name": "read_file",
 								                    "arguments": '{"path": "⚕test.txt"}'
 								                }
 								            }]
 								        }]
 								        assert _sanitize_messages_non_ascii(messages) is True
 								        assert messages[0]["tool_calls"][0]["function"]["arguments"] == '{"path": "test.txt"}'
 								    def test_handles_non_dict_messages(self):
 								        messages = ["not a dict", {"role": "user", "content": "hello"}]
 								        assert _sanitize_messages_non_ascii(messages) is False
 								    def test_empty_messages(self):
 								        assert _sanitize_messages_non_ascii([]) is False
 								    def test_multiple_messages(self):
 								        messages = [
 								            {"role": "system", "content": "⚕ System prompt"},
 								            {"role": "user", "content": "Hello 你好"},
 								            {"role": "assistant", "content": "Hi there!"},
 								        ]
 								        assert _sanitize_messages_non_ascii(messages) is True
 								        assert messages[0]["content"] == " System prompt"
 								        assert messages[1]["content"] == "Hello "
 								        assert messages[2]["content"] == "Hi there!"
 								class TestSurrogateVsAsciiSanitization:
 								    """Test that surrogate and ASCII sanitization work independently."""
 								    def test_surrogates_still_handled(self):
 								        """Surrogates are caught by _sanitize_messages_surrogates, not _non_ascii."""
 								        msg_with_surrogate = "test \ud800 end"
 								        messages = [{"role": "user", "content": msg_with_surrogate}]
 								        assert _sanitize_messages_surrogates(messages) is True
 								        assert "\ud800" not in messages[0]["content"]
 								        assert "\ufffd" in messages[0]["content"]
-												fix(unicode): sanitize surrogate metadata and allow two-pass retry

											
										
										
											2026-04-10 12:54:57 +00:00
+								    def test_surrogates_in_name_and_tool_calls_are_sanitized(self):
 								        messages = [{
 								            "role": "assistant",
 								            "name": "bad\ud800name",
 								            "content": None,
 								            "tool_calls": [{
 								                "id": "call_\ud800",
 								                "type": "function",
 								                "function": {
 								                    "name": "read\ud800_file",
 								                    "arguments": '{"path": "bad\ud800.txt"}'
 								                }
 								            }],
 								        }]
 								        assert _sanitize_messages_surrogates(messages) is True
 								        assert "\ud800" not in messages[0]["name"]
 								        assert "\ud800" not in messages[0]["tool_calls"][0]["id"]
 								        assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["name"]
 								        assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["arguments"]
-												fix: handle UnicodeEncodeError with ASCII codec (#6843)

Broaden the UnicodeEncodeError recovery to handle systems with ASCII-only
locale (LANG=C, Chromebooks) where ANY non-ASCII character causes encoding
failure, not just lone surrogates.

Changes:
- Add _strip_non_ascii() and _sanitize_messages_non_ascii() helpers that
  strip all non-ASCII characters from message content, name, and tool_calls
- Update the UnicodeEncodeError handler to detect ASCII codec errors and
  fall back to non-ASCII sanitization after surrogate check fails
- Sanitize tool_calls arguments and name fields (not just content)
- Fix bare .encode() in cli.py suspend handler to use explicit utf-8
- Add comprehensive test suite (17 tests)

											
										
										
											2026-04-09 23:21:42 +00:00
+								    def test_ascii_codec_strips_all_non_ascii(self):
 								        """ASCII codec case: all non-ASCII is stripped, not replaced."""
 								        messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]
 								        assert _sanitize_messages_non_ascii(messages) is True
 								        # All non-ASCII chars removed; spaces around them collapse
 								        assert messages[0]["content"] == "test  end"
 								    def test_no_surrogates_returns_false(self):
 								        """When no surrogates present, _sanitize_messages_surrogates returns False."""
 								        messages = [{"role": "user", "content": "hello ⚕ world"}]
 								        assert _sanitize_messages_surrogates(messages) is False
-												fix: extend ASCII-locale UnicodeEncodeError recovery to full request payload

The existing ASCII codec handler only sanitized conversation messages,
leaving tool schemas, system prompts, ephemeral prompts, prefill messages,
and HTTP headers as unhandled sources of non-ASCII content. On systems
with LANG=C or non-UTF-8 locale, Unicode symbols in tool descriptions
(e.g. arrows, em-dashes from prompt_builder) and system prompt content
would cause UnicodeEncodeError that fell through to the error path.

Changes:
- Add _sanitize_structure_non_ascii() generic recursive walker for
  nested dict/list payloads
- Add _sanitize_tools_non_ascii() thin wrapper for tool schemas
- Add _force_ascii_payload flag: once ASCII locale is detected, all
  subsequent API calls get proactively sanitized (prevents recurring
  failures from new tool results bringing fresh Unicode each turn)
- Extend the ASCII codec error handler to sanitize: prefill_messages,
  tool schemas (self.tools), system prompt, ephemeral system prompt,
  and default HTTP headers
- Update stale comment that acknowledged the gap

Cherry-picked from PR #8834 (credential pool changes dropped as
separate concern).

											
										
										
											2026-04-13 05:15:48 -07:00
-												fix: detect and strip non-ASCII characters from API keys (#6843)

API keys containing Unicode lookalike characters (e.g. ʋ U+028B instead
of v) cause UnicodeEncodeError when httpx encodes the Authorization
header as ASCII.  This commonly happens when users copy-paste keys from
PDFs, rich-text editors, or web pages with decorative fonts.

Three layers of defense:

1. **Save-time validation** (hermes_cli/config.py):
   _check_non_ascii_credential() strips non-ASCII from credential values
   when saving to .env, with a clear warning explaining the issue.

2. **Load-time sanitization** (hermes_cli/env_loader.py):
   _sanitize_loaded_credentials() strips non-ASCII from credential env
   vars (those ending in _API_KEY, _TOKEN, _SECRET, _KEY) after dotenv
   loads them, so the rest of the codebase never sees non-ASCII keys.

3. **Runtime recovery** (run_agent.py):
   The UnicodeEncodeError recovery block now also sanitizes self.api_key
   and self._client_kwargs['api_key'], fixing the gap where message/tool
   sanitization succeeded but the API key still caused httpx to fail on
   the Authorization header.

Also: hermes_logging.py RotatingFileHandler now explicitly sets
encoding='utf-8' instead of relying on locale default (defensive
hardening for ASCII-locale systems).

											
										
										
											2026-04-14 17:17:15 -07:00
+								class TestApiKeyNonAsciiSanitization:
 								    """Tests for API key sanitization in the UnicodeEncodeError recovery.
 								    Covers the root cause of issue #6843: a non-ASCII character (ʋ U+028B)
 								    in the API key causes httpx to fail when encoding the Authorization
 								    header as ASCII.  The recovery block must strip non-ASCII from the key.
 								    """
 								    def test_strip_non_ascii_from_api_key(self):
 								        """_strip_non_ascii removes ʋ from an API key string."""
 								        key = "sk-proj-abc" + "ʋ" + "def"
 								        assert _strip_non_ascii(key) == "sk-proj-abcdef"
 								    def test_api_key_at_position_153(self):
 								        """Reproduce the exact error: ʋ at position 153 in 'Bearer <key>'."""
 								        key = "sk-proj-" + "a" * 138 + "ʋ" + "bcd"
 								        auth_value = f"Bearer {key}"
 								        # This is what httpx does — and it fails:
 								        with pytest.raises(UnicodeEncodeError) as exc_info:
 								            auth_value.encode("ascii")
 								        assert exc_info.value.start == 153
 								        # After sanitization, it should work:
 								        sanitized_key = _strip_non_ascii(key)
 								        sanitized_auth = f"Bearer {sanitized_key}"
 								        sanitized_auth.encode("ascii")  # should not raise
-												fix: extend ASCII-locale UnicodeEncodeError recovery to full request payload

The existing ASCII codec handler only sanitized conversation messages,
leaving tool schemas, system prompts, ephemeral prompts, prefill messages,
and HTTP headers as unhandled sources of non-ASCII content. On systems
with LANG=C or non-UTF-8 locale, Unicode symbols in tool descriptions
(e.g. arrows, em-dashes from prompt_builder) and system prompt content
would cause UnicodeEncodeError that fell through to the error path.

Changes:
- Add _sanitize_structure_non_ascii() generic recursive walker for
  nested dict/list payloads
- Add _sanitize_tools_non_ascii() thin wrapper for tool schemas
- Add _force_ascii_payload flag: once ASCII locale is detected, all
  subsequent API calls get proactively sanitized (prevents recurring
  failures from new tool results bringing fresh Unicode each turn)
- Extend the ASCII codec error handler to sanitize: prefill_messages,
  tool schemas (self.tools), system prompt, ephemeral system prompt,
  and default HTTP headers
- Update stale comment that acknowledged the gap

Cherry-picked from PR #8834 (credential pool changes dropped as
separate concern).

											
										
										
											2026-04-13 05:15:48 -07:00
+								class TestSanitizeToolsNonAscii:
 								    """Tests for _sanitize_tools_non_ascii."""
 								    def test_sanitizes_tool_description_and_parameter_descriptions(self):
 								        tools = [
 								            {
 								                "type": "function",
 								                "function": {
 								                    "name": "read_file",
 								                    "description": "Print structured output │ with emoji 🤖",
 								                    "parameters": {
 								                        "type": "object",
 								                        "properties": {
 								                            "path": {
 								                                "type": "string",
 								                                "description": "File path │ with unicode",
 								                            }
 								                        },
 								                    },
 								                },
 								            }
 								        ]
 								        assert _sanitize_tools_non_ascii(tools) is True
 								        assert tools[0]["function"]["description"] == "Print structured output  with emoji "
 								        assert tools[0]["function"]["parameters"]["properties"]["path"]["description"] == "File path  with unicode"
 								    def test_no_change_for_ascii_only_tools(self):
 								        tools = [
 								            {
 								                "type": "function",
 								                "function": {
 								                    "name": "read_file",
 								                    "description": "Read file content",
 								                    "parameters": {
 								                        "type": "object",
 								                        "properties": {
 								                            "path": {
 								                                "type": "string",
 								                                "description": "File path",
 								                            }
 								                        },
 								                    },
 								                },
 								            }
 								        ]
 								        assert _sanitize_tools_non_ascii(tools) is False
 								class TestSanitizeStructureNonAscii:
 								    def test_sanitizes_nested_dict_structure(self):
 								        payload = {
 								            "default_headers": {
 								                "X-Title": "Hermes │ Agent",
 								                "User-Agent": "Hermes/1.0 🤖",
 								            }
 								        }
 								        assert _sanitize_structure_non_ascii(payload) is True
 								        assert payload["default_headers"]["X-Title"] == "Hermes  Agent"
 								        assert payload["default_headers"]["User-Agent"] == "Hermes/1.0 "
-												fix: sync client.api_key during UnicodeEncodeError ASCII recovery (#10090)

The existing recovery block sanitized self.api_key and
self._client_kwargs['api_key'] but did not update self.client.api_key.
The OpenAI SDK stores its own copy of api_key and reads it dynamically
via the auth_headers property on every request. Without this fix, the
retry after sanitization would still send the corrupted key in the
Authorization header, causing the same UnicodeEncodeError.

The bug manifests when an API key contains Unicode lookalike characters
(e.g. ʋ U+028B instead of v) from copy-pasting out of PDFs, rich-text
editors, or web pages with decorative fonts. httpx hard-encodes all
HTTP headers as ASCII, so the non-ASCII char in the Authorization
header triggers the error.

Adds TestApiKeyClientSync with two tests verifying:
- All three key locations are synced after sanitization
- Recovery handles client=None (pre-init) without crashing
											
										
										
											2026-04-14 22:37:45 -07:00
 								class TestApiKeyClientSync:
 								    """Verify that ASCII recovery updates the live OpenAI client's api_key.
 								    The OpenAI SDK stores its own copy of api_key which auth_headers reads
 								    dynamically.  If only self.api_key is updated but self.client.api_key
 								    is not, the next request still sends the corrupted key in the
 								    Authorization header.
 								    """
 								    def test_client_api_key_updated_on_sanitize(self):
 								        """Simulate the recovery path and verify client.api_key is synced."""
 								        from unittest.mock import MagicMock
 								        from run_agent import AIAgent
 								        agent = AIAgent.__new__(AIAgent)
 								        bad_key = "sk-proj-abc\u028bdef"  # ʋ lookalike at position 11
 								        agent.api_key = bad_key
 								        agent._client_kwargs = {"api_key": bad_key}
 								        agent.quiet_mode = True
 								        # Mock client with its own api_key attribute (like the real OpenAI client)
 								        mock_client = MagicMock()
 								        mock_client.api_key = bad_key
 								        agent.client = mock_client
 								        # --- replicate the recovery logic from run_agent.py ---
 								        _raw_key = agent.api_key
 								        _clean_key = _strip_non_ascii(_raw_key)
 								        assert _clean_key != _raw_key, "test precondition: key should have non-ASCII"
 								        agent.api_key = _clean_key
 								        agent._client_kwargs["api_key"] = _clean_key
 								        if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"):
 								            agent.client.api_key = _clean_key
 								        # All three locations should now hold the clean key
-												fix: sanitize api_messages and extra string fields during ASCII-codec recovery (#6843)

The ASCII-locale recovery path in run_agent.py sanitized the canonical
'messages' list but left 'api_messages' untouched. api_messages is a
separate API-copy built before the retry loop and may carry extra fields
(reasoning_content, extra_body entries) that are not present in
'messages'. This caused the retry to still raise UnicodeEncodeError even
after the 'System encoding is ASCII — stripped...' log line appeared.

Two changes:
- _sanitize_messages_non_ascii now walks all extra top-level string fields
  in each message dict (any key not in {content, name, tool_calls, role})
  so reasoning_content and future extras are cleaned in both 'messages'
  and 'api_messages'.
- The ASCII-codec recovery block now also calls sanitize on api_messages
  and api_kwargs so no non-ASCII survives into the next retry attempt.

Adds regression tests covering:
- reasoning_content with non-ASCII in api_messages
- extra_body with non-ASCII in api_kwargs
- canonical messages clean but api_messages dirty

Fixes #6843

											
										
										
											2026-04-14 17:14:52 +00:00
+								        assert agent.api_key == "***"
 								        assert agent._client_kwargs["api_key"] == "***"
 								        assert agent.client.api_key == "***"
-												fix: sync client.api_key during UnicodeEncodeError ASCII recovery (#10090)

The existing recovery block sanitized self.api_key and
self._client_kwargs['api_key'] but did not update self.client.api_key.
The OpenAI SDK stores its own copy of api_key and reads it dynamically
via the auth_headers property on every request. Without this fix, the
retry after sanitization would still send the corrupted key in the
Authorization header, causing the same UnicodeEncodeError.

The bug manifests when an API key contains Unicode lookalike characters
(e.g. ʋ U+028B instead of v) from copy-pasting out of PDFs, rich-text
editors, or web pages with decorative fonts. httpx hard-encodes all
HTTP headers as ASCII, so the non-ASCII char in the Authorization
header triggers the error.

Adds TestApiKeyClientSync with two tests verifying:
- All three key locations are synced after sanitization
- Recovery handles client=None (pre-init) without crashing
											
										
										
											2026-04-14 22:37:45 -07:00
+								        # The bad char should be gone from all of them
 								        assert "\u028b" not in agent.api_key
 								        assert "\u028b" not in agent._client_kwargs["api_key"]
 								        assert "\u028b" not in agent.client.api_key
 								    def test_client_none_does_not_crash(self):
 								        """Recovery should not crash when client is None (pre-init)."""
 								        from run_agent import AIAgent
 								        agent = AIAgent.__new__(AIAgent)
 								        bad_key = "sk-proj-\u028b"
 								        agent.api_key = bad_key
 								        agent._client_kwargs = {"api_key": bad_key}
 								        agent.client = None
 								        _clean_key = _strip_non_ascii(bad_key)
 								        agent.api_key = _clean_key
 								        agent._client_kwargs["api_key"] = _clean_key
 								        if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"):
 								            agent.client.api_key = _clean_key
 								        assert agent.api_key == "sk-proj-"
 								        assert agent.client is None  # should not have been touched
-												fix: sanitize api_messages and extra string fields during ASCII-codec recovery (#6843)

The ASCII-locale recovery path in run_agent.py sanitized the canonical
'messages' list but left 'api_messages' untouched. api_messages is a
separate API-copy built before the retry loop and may carry extra fields
(reasoning_content, extra_body entries) that are not present in
'messages'. This caused the retry to still raise UnicodeEncodeError even
after the 'System encoding is ASCII — stripped...' log line appeared.

Two changes:
- _sanitize_messages_non_ascii now walks all extra top-level string fields
  in each message dict (any key not in {content, name, tool_calls, role})
  so reasoning_content and future extras are cleaned in both 'messages'
  and 'api_messages'.
- The ASCII-codec recovery block now also calls sanitize on api_messages
  and api_kwargs so no non-ASCII survives into the next retry attempt.

Adds regression tests covering:
- reasoning_content with non-ASCII in api_messages
- extra_body with non-ASCII in api_kwargs
- canonical messages clean but api_messages dirty

Fixes #6843

											
										
										
											2026-04-14 17:14:52 +00:00
 								class TestApiMessagesAndApiKwargsSanitized:
 								    """Regression tests for #6843 follow-up: api_messages and api_kwargs must
 								    be sanitized alongside messages during ASCII-codec recovery.
 								    The original fix only sanitized the canonical `messages` list.
 								    api_messages is a separate API-copy built before the retry loop; it may
 								    carry extra fields (reasoning_content, extra_body) with non-ASCII chars
 								    that are not present in `messages`.  Without sanitizing api_messages and
 								    api_kwargs, the retry still raises UnicodeEncodeError even after the
 								    'System encoding is ASCII — stripped...' log line appears.
 								    """
 								    def test_api_messages_with_reasoning_content_is_sanitized(self):
 								        """api_messages may contain reasoning_content not in messages."""
 								        api_messages = [
 								            {"role": "system", "content": "You are helpful."},
 								            {"role": "user", "content": "hi"},
 								            {
 								                "role": "assistant",
 								                "content": "Sure!",
 								                # reasoning_content is injected by the API-copy builder and
 								                # is NOT present in the canonical messages list
 								                "reasoning_content": "Let me think \xab step by step \xbb",
 								            },
 								        ]
 								        found = _sanitize_messages_non_ascii(api_messages)
 								        assert found is True
 								        assert "\xab" not in api_messages[2]["reasoning_content"]
 								        assert "\xbb" not in api_messages[2]["reasoning_content"]
 								    def test_api_kwargs_with_non_ascii_extra_body_is_sanitized(self):
 								        """api_kwargs may contain non-ASCII in extra_body or other fields."""
 								        api_kwargs = {
 								            "model": "glm-5.1",
 								            "messages": [{"role": "user", "content": "ok"}],
 								            "extra_body": {
 								                "system": "Think carefully \u2192 answer",
 								            },
 								        }
 								        found = _sanitize_structure_non_ascii(api_kwargs)
 								        assert found is True
 								        assert "\u2192" not in api_kwargs["extra_body"]["system"]
 								    def test_messages_clean_but_api_messages_dirty_both_get_sanitized(self):
 								        """Even when canonical messages are clean, api_messages may be dirty."""
 								        messages = [{"role": "user", "content": "hello"}]
 								        api_messages = [
 								            {"role": "user", "content": "hello"},
 								            {
 								                "role": "assistant",
 								                "content": "ok",
 								                "reasoning_content": "step \xab done",
 								            },
 								        ]
 								        # messages sanitize returns False (nothing to clean)
 								        assert _sanitize_messages_non_ascii(messages) is False
 								        # api_messages sanitize must catch the dirty reasoning_content
 								        assert _sanitize_messages_non_ascii(api_messages) is True
 								        assert "\xab" not in api_messages[1]["reasoning_content"]