fix(codex): surface error code in Responses 'failed' status errors

When a Codex Responses turn ends with status=failed, the response carries the failure details under `response.error` as `{code, message, param, ...}`. The previous extractor pulled only `message`, so users seeing a rate-limit failure got a bare "Slow down" string indistinguishable from a generic stream truncation; an internal_error with empty message degraded to a dict dump ("{'code': 'internal_error', 'message': ''}"). Extract a `_format_responses_error()` helper that: - prefixes `code` when both code and message are present (e.g. 'rate_limit_exceeded: Slow down') - falls back to the bare `code` when message is empty - accepts both dict and attribute-style payloads (SDK and JSON-RPC paths) - preserves the prior status-only fallback when no error payload exists Apply the same helper at the sibling site in `codex_app_server_session.run_turn()` so codex-CLI subprocess turn failures get the same treatment. Tests: - 8 new unit tests for `_format_responses_error` covering both shapes, empty/missing fields, non-string fields, and the status-only fallback. - 2 regression tests on `_normalize_codex_response` for failed status with and without a code, asserting the exact RuntimeError message. - All 3603 tests in tests/agent/ pass. Adapted from anomalyco/opencode#28757.
2026-06-10 12:18:44 +08:00 · 2026-05-28 17:10:25 -07:00
3 changed files with 159 additions and 6 deletions
--- a/agent/codex_responses_adapter.py
+++ b/agent/codex_responses_adapter.py
@@ -980,6 +980,48 @@ def _extract_responses_reasoning_text(item: Any) -> str:
    return ""


+def _format_responses_error(error_obj: Any, response_status: str) -> str:
+    """Build a human-readable error string from a Responses ``response.error`` payload.
+
+    The OpenAI Responses API carries failure details under ``response.error``
+    on terminal ``response.failed`` events, in the shape
+    ``{"code": "rate_limit_exceeded", "message": "Slow down", "param": ...}``.
+    Earlier code only surfaced ``message``, which left users staring at bare
+    strings like ``"Slow down"`` while the failure mode (rate limit vs
+    context-length vs internal_error vs model-overloaded) was hidden in
+    ``code``. We now prefix ``code`` when both are present so consumers can
+    distinguish failure modes without parsing the bare message.
+
+    Falls back to ``code`` alone when ``message`` is empty, and to a stable
+    default referencing the response status when no error payload is
+    available at all. Adapted from anomalyco/opencode#28757.
+    """
+    # Pull code and message from either dict or attribute-style payloads.
+    code: Any = None
+    message: Any = None
+    if isinstance(error_obj, dict):
+        code = error_obj.get("code")
+        message = error_obj.get("message")
+    elif error_obj is not None:
+        code = getattr(error_obj, "code", None)
+        message = getattr(error_obj, "message", None)
+
+    code_str = str(code).strip() if isinstance(code, str) else (str(code).strip() if code else "")
+    message_str = str(message).strip() if isinstance(message, str) else (str(message).strip() if message else "")
+
+    if code_str and message_str:
+        return f"{code_str}: {message_str}"
+    if message_str:
+        return message_str
+    if code_str:
+        return code_str
+    if error_obj:
+        # Last-resort: stringify whatever the provider sent so it's at least
+        # visible in logs/UI rather than silently swallowed.
+        return str(error_obj)
+    return f"Responses API returned status '{response_status}'"
+
+
 # ---------------------------------------------------------------------------
 # Full response normalization
 # ---------------------------------------------------------------------------
@@ -1023,10 +1065,7 @@ def _normalize_codex_response(

    if response_status in {"failed", "cancelled"}:
        error_obj = getattr(response, "error", None)
-        if isinstance(error_obj, dict):
-            error_msg = error_obj.get("message") or str(error_obj)
-        else:
-            error_msg = str(error_obj) if error_obj else f"Responses API returned status '{response_status}'"
+        error_msg = _format_responses_error(error_obj, response_status)
        raise RuntimeError(error_msg)

    content_parts: List[str] = []
--- a/agent/transports/codex_app_server_session.py
+++ b/agent/transports/codex_app_server_session.py
@@ -31,6 +31,7 @@ import time
 from dataclasses import dataclass, field
 from typing import Any, Callable, Optional

+from agent.codex_responses_adapter import _format_responses_error
 from agent.redact import redact_sensitive_text
 from agent.transports.codex_app_server import (
    CodexAppServerClient,
@@ -581,7 +582,7 @@ class CodexAppServerSession:
                        (note.get("params") or {}).get("turn") or {}
                    ).get("error")
                    if err_obj:
-                        err_msg = err_obj.get("message") or str(err_obj)
+                        err_msg = _format_responses_error(err_obj, str(turn_status))
                        # If the turn failed for an auth/refresh reason,
                        # rewrite the error into a re-auth hint AND mark
                        # the session for retirement.
--- a/tests/agent/test_codex_responses_adapter.py
+++ b/tests/agent/test_codex_responses_adapter.py
@@ -1,6 +1,11 @@
 from types import SimpleNamespace

-from agent.codex_responses_adapter import _normalize_codex_response
+import pytest
+
+from agent.codex_responses_adapter import (
+    _format_responses_error,
+    _normalize_codex_response,
+)


 def test_normalize_codex_response_drops_transient_rs_tmp_reasoning_items():
@@ -61,3 +66,111 @@ def test_normalize_codex_response_treats_summary_only_reasoning_as_incomplete():
    assert assistant_message.content == ""
    assert assistant_message.reasoning == "still thinking"
    assert assistant_message.codex_reasoning_items is None
+
+
+# ---------------------------------------------------------------------------
+# _format_responses_error — adapted from anomalyco/opencode#28757.
+# Provider failures should surface BOTH the code (rate_limit_exceeded /
+# context_length_exceeded / internal_error / server_error) and the message,
+# so consumers can tell rate limits apart from context-length failures and
+# both apart from generic stream drops.
+# ---------------------------------------------------------------------------
+
+
+def test_format_responses_error_combines_code_and_message():
+    err = {"code": "rate_limit_exceeded", "message": "Slow down"}
+    assert _format_responses_error(err, "failed") == "rate_limit_exceeded: Slow down"
+
+
+def test_format_responses_error_message_only():
+    err = {"message": "Upstream model unavailable"}
+    assert _format_responses_error(err, "failed") == "Upstream model unavailable"
+
+
+def test_format_responses_error_code_only_when_message_empty():
+    # Some providers/proxies emit a code with an empty message body. We
+    # used to fall back to ``str(error_obj)`` — a dict dump — which leaked
+    # ``{'code': 'internal_error', 'message': ''}`` into chat output. Now
+    # the bare code is surfaced, which is the meaningful field.
+    err = {"code": "internal_error", "message": ""}
+    assert _format_responses_error(err, "failed") == "internal_error"
+
+
+def test_format_responses_error_code_only_when_message_missing():
+    err = {"code": "server_error"}
+    assert _format_responses_error(err, "failed") == "server_error"
+
+
+def test_format_responses_error_attribute_style_payload():
+    # SDK objects expose ``code``/``message`` as attributes rather than dict
+    # keys. The helper must accept both shapes since the Responses SDK
+    # returns SimpleNamespace-style objects on ``response.failed``.
+    err = SimpleNamespace(code="context_length_exceeded", message="too long")
+    assert _format_responses_error(err, "failed") == "context_length_exceeded: too long"
+
+
+def test_format_responses_error_falls_back_to_status_when_empty():
+    assert (
+        _format_responses_error(None, "failed")
+        == "Responses API returned status 'failed'"
+    )
+    assert (
+        _format_responses_error(None, "cancelled")
+        == "Responses API returned status 'cancelled'"
+    )
+
+
+def test_format_responses_error_stringifies_opaque_payload():
+    # Last-resort: a provider sent something that isn't a dict and has no
+    # code/message attributes. Surface its repr rather than swallow it
+    # silently — at least it's visible in logs.
+    assert _format_responses_error("opaque sentinel", "failed") == "opaque sentinel"
+
+
+def test_format_responses_error_ignores_non_string_code_message():
+    # Defensive: a malformed gateway could send numbers/objects in these
+    # fields. We don't want to crash; we want a best-effort string.
+    err = {"code": 500, "message": None}
+    assert _format_responses_error(err, "failed") == "500"
+
+
+def test_normalize_codex_response_failed_includes_code_in_error():
+    """Regression: response_status == 'failed' should surface the error
+    code, not just the message. Used to leak a bare 'Slow down' string
+    that was indistinguishable from a generic stream truncation."""
+    # ``output`` non-empty so we don't trip the "no output items" guard
+    # before reaching the failed-status branch. Real failed responses
+    # often DO carry a partial message item alongside the error.
+    response = SimpleNamespace(
+        status="failed",
+        output=[
+            SimpleNamespace(
+                type="message",
+                role="assistant",
+                status="incomplete",
+                content=[SimpleNamespace(type="output_text", text="partial")],
+            ),
+        ],
+        error={"code": "rate_limit_exceeded", "message": "Slow down"},
+    )
+    with pytest.raises(RuntimeError, match=r"^rate_limit_exceeded: Slow down$"):
+        _normalize_codex_response(response)
+
+
+def test_normalize_codex_response_failed_with_message_only():
+    """Backwards-compat: a failed response with only a message field
+    (no code) should still surface that message verbatim."""
+    response = SimpleNamespace(
+        status="failed",
+        output=[
+            SimpleNamespace(
+                type="message",
+                role="assistant",
+                status="incomplete",
+                content=[SimpleNamespace(type="output_text", text="partial")],
+            ),
+        ],
+        error={"message": "model error"},
+    )
+    with pytest.raises(RuntimeError, match=r"^model error$"):
+        _normalize_codex_response(response)