"""Tests for UnicodeEncodeError recovery with ASCII codec.

Covers the fix for issue #6843 — systems with ASCII locale (LANG=C)
that can't encode non-ASCII characters in API request payloads.
"""
|
|
|
|
|
|
|
|
|
|
|
|
import pytest

from run_agent import (
    _sanitize_messages_non_ascii,
    _sanitize_messages_surrogates,
    _sanitize_structure_non_ascii,
    _sanitize_tools_non_ascii,
    _strip_non_ascii,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestStripNonAscii:
    """Behavioral tests for the _strip_non_ascii helper."""

    def test_ascii_only(self):
        # Pure-ASCII input must pass through unchanged.
        assert _strip_non_ascii("hello world") == "hello world"

    def test_removes_non_ascii(self):
        # The symbol is dropped and the doubled space collapses with it.
        assert _strip_non_ascii("hello ⚕ world") == "hello world"

    def test_removes_emoji(self):
        assert _strip_non_ascii("test 🤖 done") == "test done"

    def test_chinese_chars(self):
        assert _strip_non_ascii("你好world") == "world"

    def test_empty_string(self):
        assert _strip_non_ascii("") == ""

    def test_only_non_ascii(self):
        # Input that is entirely non-ASCII reduces to the empty string.
        assert _strip_non_ascii("⚕🤖") == ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestSanitizeMessagesNonAscii:
    """Behavioral tests for _sanitize_messages_non_ascii."""

    def test_no_change_ascii_only(self):
        # Clean input: no mutation, and the helper reports nothing found.
        messages = [{"role": "user", "content": "hello"}]
        changed = _sanitize_messages_non_ascii(messages)
        assert changed is False
        assert messages[0]["content"] == "hello"

    def test_sanitizes_content_string(self):
        messages = [{"role": "user", "content": "hello ⚕ world"}]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["content"] == "hello world"

    def test_sanitizes_content_list(self):
        # Structured (list-of-parts) content must be walked too.
        part = {"type": "text", "text": "hello 🤖"}
        messages = [{"role": "user", "content": [part]}]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["content"][0]["text"] == "hello "

    def test_sanitizes_name_field(self):
        messages = [{"role": "tool", "name": "⚕tool", "content": "ok"}]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["name"] == "tool"

    def test_sanitizes_tool_calls(self):
        # Non-ASCII hiding inside tool-call argument JSON must be stripped.
        call = {
            "id": "call_1",
            "type": "function",
            "function": {
                "name": "read_file",
                "arguments": '{"path": "⚕test.txt"}',
            },
        }
        messages = [{"role": "assistant", "content": None, "tool_calls": [call]}]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["tool_calls"][0]["function"]["arguments"] == '{"path": "test.txt"}'

    def test_handles_non_dict_messages(self):
        # A stray non-dict entry is tolerated rather than crashing the walk.
        messages = ["not a dict", {"role": "user", "content": "hello"}]
        assert _sanitize_messages_non_ascii(messages) is False

    def test_empty_messages(self):
        assert _sanitize_messages_non_ascii([]) is False

    def test_multiple_messages(self):
        messages = [
            {"role": "system", "content": "⚕ System prompt"},
            {"role": "user", "content": "Hello 你好"},
            {"role": "assistant", "content": "Hi there!"},
        ]
        assert _sanitize_messages_non_ascii(messages) is True
        # Each message is cleaned independently; the pure-ASCII one is untouched.
        expected = [" System prompt", "Hello ", "Hi there!"]
        for message, want in zip(messages, expected):
            assert message["content"] == want
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestSurrogateVsAsciiSanitization:
    """Test that surrogate and ASCII sanitization work independently."""

    def test_surrogates_still_handled(self):
        """Surrogates are caught by _sanitize_messages_surrogates, not _non_ascii."""
        messages = [{"role": "user", "content": "test \ud800 end"}]
        assert _sanitize_messages_surrogates(messages) is True
        content = messages[0]["content"]
        # The lone surrogate is replaced (U+FFFD), not stripped.
        assert "\ud800" not in content
        assert "\ufffd" in content

    def test_surrogates_in_name_and_tool_calls_are_sanitized(self):
        tool_call = {
            "id": "call_\ud800",
            "type": "function",
            "function": {
                "name": "read\ud800_file",
                "arguments": '{"path": "bad\ud800.txt"}',
            },
        }
        messages = [
            {
                "role": "assistant",
                "name": "bad\ud800name",
                "content": None,
                "tool_calls": [tool_call],
            }
        ]
        assert _sanitize_messages_surrogates(messages) is True
        # Every field that can carry a surrogate must have been cleaned.
        assert "\ud800" not in messages[0]["name"]
        assert "\ud800" not in tool_call["id"]
        assert "\ud800" not in tool_call["function"]["name"]
        assert "\ud800" not in tool_call["function"]["arguments"]

    def test_ascii_codec_strips_all_non_ascii(self):
        """ASCII codec case: all non-ASCII is stripped, not replaced."""
        messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]
        assert _sanitize_messages_non_ascii(messages) is True
        # All non-ASCII chars removed; spaces around them collapse
        assert messages[0]["content"] == "test end"

    def test_no_surrogates_returns_false(self):
        """When no surrogates present, _sanitize_messages_surrogates returns False."""
        messages = [{"role": "user", "content": "hello ⚕ world"}]
        assert _sanitize_messages_surrogates(messages) is False
|
class TestApiKeyNonAsciiSanitization:
    """Tests for API key sanitization in the UnicodeEncodeError recovery.

    Covers the root cause of issue #6843: a non-ASCII character (ʋ U+028B)
    in the API key causes httpx to fail when encoding the Authorization
    header as ASCII. The recovery block must strip non-ASCII from the key.
    """

    def test_strip_non_ascii_from_api_key(self):
        """_strip_non_ascii removes ʋ from an API key string."""
        corrupted = "sk-proj-abc" + "ʋ" + "def"
        assert _strip_non_ascii(corrupted) == "sk-proj-abcdef"

    def test_api_key_at_position_153(self):
        """Reproduce the exact error: ʋ at position 153 in 'Bearer <key>'."""
        # "Bearer " (7) + "sk-proj-" (8) + 138 filler chars puts ʋ at index 153.
        key = "sk-proj-" + "a" * 138 + "ʋ" + "bcd"
        auth_value = f"Bearer {key}"

        # This is what httpx does — and it fails:
        with pytest.raises(UnicodeEncodeError) as exc_info:
            auth_value.encode("ascii")
        assert exc_info.value.start == 153

        # After sanitization, it should work:
        sanitized_auth = f"Bearer {_strip_non_ascii(key)}"
        sanitized_auth.encode("ascii")  # should not raise
|
|
|
|
class TestSanitizeToolsNonAscii:
    """Tests for _sanitize_tools_non_ascii."""

    @staticmethod
    def _make_tool(description, path_description):
        # Build a minimal OpenAI-style function-tool schema with the given
        # top-level and parameter descriptions.
        return {
            "type": "function",
            "function": {
                "name": "read_file",
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": path_description,
                        }
                    },
                },
            },
        }

    def test_sanitizes_tool_description_and_parameter_descriptions(self):
        tools = [
            self._make_tool(
                "Print structured output │ with emoji 🤖",
                "File path │ with unicode",
            )
        ]

        assert _sanitize_tools_non_ascii(tools) is True
        fn = tools[0]["function"]
        # Both the function description and the nested parameter
        # description must have the non-ASCII characters stripped.
        assert fn["description"] == "Print structured output with emoji "
        assert fn["parameters"]["properties"]["path"]["description"] == "File path with unicode"

    def test_no_change_for_ascii_only_tools(self):
        tools = [self._make_tool("Read file content", "File path")]

        assert _sanitize_tools_non_ascii(tools) is False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestSanitizeStructureNonAscii:
    """Tests for the generic _sanitize_structure_non_ascii walker."""

    def test_sanitizes_nested_dict_structure(self):
        # Mimics the client-kwargs payload: HTTP headers nested one level deep.
        headers = {
            "X-Title": "Hermes │ Agent",
            "User-Agent": "Hermes/1.0 🤖",
        }
        payload = {"default_headers": headers}
        assert _sanitize_structure_non_ascii(payload) is True
        assert payload["default_headers"]["X-Title"] == "Hermes Agent"
        assert payload["default_headers"]["User-Agent"] == "Hermes/1.0 "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestApiKeyClientSync:
    """Verify that ASCII recovery updates the live OpenAI client's api_key.

    The OpenAI SDK stores its own copy of api_key which auth_headers reads
    dynamically. If only self.api_key is updated but self.client.api_key
    is not, the next request still sends the corrupted key in the
    Authorization header.
    """

    def test_client_api_key_updated_on_sanitize(self):
        """Simulate the recovery path and verify client.api_key is synced."""
        from unittest.mock import MagicMock

        from run_agent import AIAgent

        bad_key = "sk-proj-abc\u028bdef"  # ʋ lookalike at position 11
        agent = AIAgent.__new__(AIAgent)
        agent.api_key = bad_key
        agent._client_kwargs = {"api_key": bad_key}
        agent.quiet_mode = True

        # Stand-in client that, like the real OpenAI client, holds its own
        # api_key attribute.
        agent.client = MagicMock()
        agent.client.api_key = bad_key

        # --- replicate the recovery logic from run_agent.py ---
        _raw_key = agent.api_key
        _clean_key = _strip_non_ascii(_raw_key)
        assert _clean_key != _raw_key, "test precondition: key should have non-ASCII"

        agent.api_key = _clean_key
        agent._client_kwargs["api_key"] = _clean_key
        if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"):
            agent.client.api_key = _clean_key

        # All three locations should now hold the clean key, with the bad
        # character gone from each of them.
        for held in (agent.api_key, agent._client_kwargs["api_key"], agent.client.api_key):
            assert held == "sk-proj-abcdef"
            assert "\u028b" not in held

    def test_client_none_does_not_crash(self):
        """Recovery should not crash when client is None (pre-init)."""
        from run_agent import AIAgent

        bad_key = "sk-proj-\u028b"
        agent = AIAgent.__new__(AIAgent)
        agent.api_key = bad_key
        agent._client_kwargs = {"api_key": bad_key}
        agent.client = None

        _clean_key = _strip_non_ascii(bad_key)
        agent.api_key = _clean_key
        agent._client_kwargs["api_key"] = _clean_key
        # The guard must short-circuit on a None client.
        if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"):
            agent.client.api_key = _clean_key

        assert agent.api_key == "sk-proj-"
        assert agent.client is None  # should not have been touched
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestApiMessagesAndApiKwargsSanitized:
    """Regression tests for #6843 follow-up: api_messages and api_kwargs must
    be sanitized alongside messages during ASCII-codec recovery.

    The original fix only sanitized the canonical `messages` list.
    api_messages is a separate API-copy built before the retry loop; it may
    carry extra fields (reasoning_content, extra_body) with non-ASCII chars
    that are not present in `messages`. Without sanitizing api_messages and
    api_kwargs, the retry still raises UnicodeEncodeError even after the
    'System encoding is ASCII — stripped...' log line appears.
    """

    def test_api_messages_with_reasoning_content_is_sanitized(self):
        """api_messages may contain reasoning_content not in messages."""
        assistant = {
            "role": "assistant",
            "content": "Sure!",
            # reasoning_content is injected by the API-copy builder and
            # is NOT present in the canonical messages list
            "reasoning_content": "Let me think \xab step by step \xbb",
        }
        api_messages = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "hi"},
            assistant,
        ]
        assert _sanitize_messages_non_ascii(api_messages) is True
        assert "\xab" not in assistant["reasoning_content"]
        assert "\xbb" not in assistant["reasoning_content"]

    def test_api_kwargs_with_non_ascii_extra_body_is_sanitized(self):
        """api_kwargs may contain non-ASCII in extra_body or other fields."""
        api_kwargs = {
            "model": "glm-5.1",
            "messages": [{"role": "user", "content": "ok"}],
            "extra_body": {
                "system": "Think carefully \u2192 answer",
            },
        }
        assert _sanitize_structure_non_ascii(api_kwargs) is True
        assert "\u2192" not in api_kwargs["extra_body"]["system"]

    def test_messages_clean_but_api_messages_dirty_both_get_sanitized(self):
        """Even when canonical messages are clean, api_messages may be dirty."""
        messages = [{"role": "user", "content": "hello"}]
        api_messages = [
            {"role": "user", "content": "hello"},
            {
                "role": "assistant",
                "content": "ok",
                "reasoning_content": "step \xab done",
            },
        ]
        # Canonical messages have nothing to clean...
        assert _sanitize_messages_non_ascii(messages) is False
        # ...but the dirty reasoning_content in the API copy must be caught.
        assert _sanitize_messages_non_ascii(api_messages) is True
        assert "\xab" not in api_messages[1]["reasoning_content"]

    def test_reasoning_field_in_canonical_messages_is_sanitized(self):
        """The canonical messages list stores reasoning as 'reasoning', not
        'reasoning_content'. The extra-fields loop must catch it."""
        assistant = {
            "role": "assistant",
            "content": "ok",
            "reasoning": "Let me think \xab carefully \xbb",
        }
        messages = [{"role": "user", "content": "hello"}, assistant]
        assert _sanitize_messages_non_ascii(messages) is True
        assert "\xab" not in assistant["reasoning"]
        assert "\xbb" not in assistant["reasoning"]
|