fix(agent): detect thinking-budget exhaustion on truncation, skip useless retries (#3444)

When finish_reason='length' and the response contains only reasoning (think blocks or empty content), the model exhausted its output token budget on thinking with nothing left for the actual response. Previously, this fell into either: - chat_completions: 3 useless continuation retries (model hits same limit) - anthropic/codex: generic 'Response truncated' error with rollback Now: detect the think-only + length condition early and return immediately with a targeted error message: 'Model used all output tokens on reasoning with none left for the response. Try lowering reasoning effort or increasing max_tokens.' This saves 2 wasted API calls on the chat_completions path and gives users actionable guidance instead of a cryptic error. The existing think-only retry logic (finish_reason='stop') is unchanged — that's a genuine model glitch where retrying can help.
2026-04-28 06:51:16 +08:00 · 2026-03-27 15:29:30 -07:00
parent 658692799d
commit 8fdfc4b00c
2 changed files with 107 additions and 13 deletions
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -1372,19 +1372,11 @@ class TestRunConversation:
        assert result["final_response"] == "Recovered after compression"
        assert result["completed"] is True

-    @pytest.mark.parametrize(
-        ("first_content", "second_content", "expected_final"),
-        [
-            ("Part 1 ", "Part 2", "Part 1 Part 2"),
-            ("<think>internal reasoning</think>", "Recovered final answer", "Recovered final answer"),
-        ],
-    )
-    def test_length_finish_reason_requests_continuation(
-        self, agent, first_content, second_content, expected_final
-    ):
+    def test_length_finish_reason_requests_continuation(self, agent):
+        """Normal truncation (partial real content) triggers continuation."""
        self._setup_agent(agent)
-        first = _mock_response(content=first_content, finish_reason="length")
-        second = _mock_response(content=second_content, finish_reason="stop")
+        first = _mock_response(content="Part 1 ", finish_reason="length")
+        second = _mock_response(content="Part 2", finish_reason="stop")
        agent.client.chat.completions.create.side_effect = [first, second]

        with (
@@ -1396,12 +1388,58 @@ class TestRunConversation:

        assert result["completed"] is True
        assert result["api_calls"] == 2
-        assert result["final_response"] == expected_final
+        assert result["final_response"] == "Part 1 Part 2"

        second_call_messages = agent.client.chat.completions.create.call_args_list[1].kwargs["messages"]
        assert second_call_messages[-1]["role"] == "user"
        assert "truncated by the output length limit" in second_call_messages[-1]["content"]

+    def test_length_thinking_exhausted_skips_continuation(self, agent):
+        """When finish_reason='length' but content is only thinking, skip retries."""
+        self._setup_agent(agent)
+        resp = _mock_response(
+            content="<think>internal reasoning</think>",
+            finish_reason="length",
+        )
+        agent.client.chat.completions.create.return_value = resp
+
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        # Should return immediately — no continuation, only 1 API call
+        assert result["completed"] is False
+        assert result["api_calls"] == 1
+        assert "reasoning" in result["error"].lower()
+        assert "output tokens" in result["error"].lower()
+        # Should have a user-friendly response (not None)
+        assert result["final_response"] is not None
+        assert "Thinking Budget Exhausted" in result["final_response"]
+        assert "/thinkon" in result["final_response"]
+
+    def test_length_empty_content_detected_as_thinking_exhausted(self, agent):
+        """When finish_reason='length' and content is None/empty, detect exhaustion."""
+        self._setup_agent(agent)
+        resp = _mock_response(content=None, finish_reason="length")
+        agent.client.chat.completions.create.return_value = resp
+
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        assert result["completed"] is False
+        assert result["api_calls"] == 1
+        assert "reasoning" in result["error"].lower()
+        # User-friendly message is returned
+        assert result["final_response"] is not None
+        assert "Thinking Budget Exhausted" in result["final_response"]
+

 class TestRetryExhaustion:
    """Regression: retry_count > max_retries was dead code (off-by-one).