From 099dfca6dbb2ab26380340a274ab42728b9fa756 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 25 Mar 2026 12:05:37 -0700 Subject: [PATCH] fix: GLM reasoning-only and max-length handling (#3010) - Add 'prompt exceeds max length' to context overflow detection for Z.AI/GLM 400 errors - Extract inline reasoning blocks from assistant content as fallback when no structured reasoning fields are present - Guard inline extraction so structured API reasoning takes priority - Update test for reasoning-only response salvage behavior Cherry-picked from PR #2993 by kshitijk4poor. Added priority guard to fix test_structured_reasoning_takes_priority failure. Co-authored-by: kshitijk4poor --- run_agent.py | 19 +++++++++++++++ tests/test_run_agent.py | 54 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/run_agent.py b/run_agent.py index 60be561292..fb03ee5c4f 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1326,6 +1326,24 @@ class AIAgent: summary = detail.get('summary') or detail.get('content') or detail.get('text') if summary and summary not in reasoning_parts: reasoning_parts.append(summary) + + # Some providers embed reasoning directly inside assistant content + # instead of returning structured reasoning fields. Only fall back + # to inline extraction when no structured reasoning was found. + content = getattr(assistant_message, "content", None) + if not reasoning_parts and isinstance(content, str) and content: + inline_patterns = ( + r"(.*?)", + r"(.*?)", + r"(.*?)", + r"(.*?)", + ) + for pattern in inline_patterns: + flags = re.DOTALL | re.IGNORECASE + for block in re.findall(pattern, content, flags=flags): + cleaned = block.strip() + if cleaned and cleaned not in reasoning_parts: + reasoning_parts.append(cleaned) # Combine all reasoning parts if reasoning_parts: @@ -6392,6 +6410,7 @@ class AIAgent: 'exceeds the limit', 'context window', 'request entity too large', # OpenRouter/Nous 413 safety net 'prompt is too long', # Anthropic: "prompt is too long: N tokens > M maximum" + 'prompt exceeds max length', # Z.AI / GLM: generic 400 overflow wording ]) # Fallback heuristic: Anthropic sometimes returns a generic diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py index 81e16b7027..3dd9a134b3 100644 --- a/tests/test_run_agent.py +++ b/tests/test_run_agent.py @@ -267,6 +267,21 @@ class TestExtractReasoning: result = agent._extract_reasoning(msg) assert result == "same text" + @pytest.mark.parametrize( + ("content", "expected"), + [ + ("thinking hard", "thinking hard"), + ("step by step", "step by step"), + ( + "scratch analysis", + "scratch analysis", + ), + ], + ) + def test_inline_reasoning_blocks_fallback(self, agent, content, expected): + msg = _mock_assistant_msg(content=content) + assert agent._extract_reasoning(msg) == expected + class TestCleanSessionContent: def test_none_passthrough(self): @@ -1202,8 +1217,8 @@ class TestRunConversation: assert result["completed"] is True assert result["api_calls"] == 2 - def test_empty_content_retry_and_fallback(self, agent): - """Empty content (only think block) retries, then falls back to partial.""" + def test_empty_content_retry_uses_inline_reasoning_as_response(self, agent): + """Reasoning-only payloads should recover the inline reasoning text.""" self._setup_agent(agent) empty_resp = _mock_response( content="internal reasoning", @@ -1221,9 +1236,8 @@ class TestRunConversation: patch.object(agent, "_cleanup_task_resources"), ): result = agent.run_conversation("answer me") - # After 3 retries with no real content, should return partial - assert result["completed"] is False - assert result.get("partial") is True + assert result["completed"] is True + assert result["final_response"] == "internal reasoning" def test_nous_401_refreshes_after_remint_and_retries(self, agent): self._setup_agent(agent) @@ -1296,6 +1310,36 @@ class TestRunConversation: assert result["final_response"] == "All done" assert result["completed"] is True + def test_glm_prompt_exceeds_max_length_triggers_compression(self, agent): + """GLM/Z.AI uses 'Prompt exceeds max length' for context overflow.""" + self._setup_agent(agent) + err_400 = Exception( + "Error code: 400 - {'error': {'code': '1261', 'message': 'Prompt exceeds max length'}}" + ) + err_400.status_code = 400 + ok_resp = _mock_response(content="Recovered after compression", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [err_400, ok_resp] + prefill = [ + {"role": "user", "content": "previous question"}, + {"role": "assistant", "content": "previous answer"}, + ] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + mock_compress.return_value = ( + [{"role": "user", "content": "hello"}], + "compressed system prompt", + ) + result = agent.run_conversation("hello", conversation_history=prefill) + + mock_compress.assert_called_once() + assert result["final_response"] == "Recovered after compression" + assert result["completed"] is True + @pytest.mark.parametrize( ("first_content", "second_content", "expected_final"), [