Compare commits

...

1 Commit

Author SHA1 Message Date
Teknium
589d14c11e feat: thinking-only prefill continuation for structured reasoning responses
When the model produces structured reasoning (via API fields like .reasoning,
.reasoning_content, .reasoning_details) but no visible text content, append
the assistant message as prefill and continue the loop. The model sees its own
reasoning context on the next turn and produces the text portion.

Inspired by clawdbot's 'incomplete-text' recovery pattern. Up to 2 prefill
attempts before falling through to the existing '(empty)' terminal.

Key design decisions:
- Only triggers for structured reasoning (API fields), NOT inline <think> tags
- Prefill messages are popped on success to maintain strict role alternation
- _thinking_prefill marker stripped from all API message building paths
- Works across all providers: OpenAI (continuation), Anthropic (native prefill)

Verified with E2E tests: simulated thinking-only → real OpenRouter continuation
produces correct content. Also confirmed Qwen models consistently produce
structured-reasoning-only responses under token pressure.
2026-04-07 13:18:23 -07:00
2 changed files with 94 additions and 14 deletions

View File

@@ -5823,6 +5823,7 @@ class AIAgent:
api_msg.pop("reasoning", None) api_msg.pop("reasoning", None)
api_msg.pop("finish_reason", None) api_msg.pop("finish_reason", None)
api_msg.pop("_flush_sentinel", None) api_msg.pop("_flush_sentinel", None)
api_msg.pop("_thinking_prefill", None)
if _needs_sanitize: if _needs_sanitize:
self._sanitize_tool_calls_for_strict_api(api_msg) self._sanitize_tool_calls_for_strict_api(api_msg)
api_messages.append(api_msg) api_messages.append(api_msg)
@@ -6746,7 +6747,7 @@ class AIAgent:
api_messages = [] api_messages = []
for msg in messages: for msg in messages:
api_msg = msg.copy() api_msg = msg.copy()
for internal_field in ("reasoning", "finish_reason"): for internal_field in ("reasoning", "finish_reason", "_thinking_prefill"):
api_msg.pop(internal_field, None) api_msg.pop(internal_field, None)
if _needs_sanitize: if _needs_sanitize:
self._sanitize_tool_calls_for_strict_api(api_msg) self._sanitize_tool_calls_for_strict_api(api_msg)
@@ -6938,6 +6939,7 @@ class AIAgent:
self._empty_content_retries = 0 self._empty_content_retries = 0
self._incomplete_scratchpad_retries = 0 self._incomplete_scratchpad_retries = 0
self._codex_incomplete_retries = 0 self._codex_incomplete_retries = 0
self._thinking_prefill_retries = 0
self._last_content_with_tools = None self._last_content_with_tools = None
self._mute_post_response = False self._mute_post_response = False
self._surrogate_sanitized = False self._surrogate_sanitized = False
@@ -7283,6 +7285,8 @@ class AIAgent:
# Remove finish_reason - not accepted by strict APIs (e.g. Mistral) # Remove finish_reason - not accepted by strict APIs (e.g. Mistral)
if "finish_reason" in api_msg: if "finish_reason" in api_msg:
api_msg.pop("finish_reason") api_msg.pop("finish_reason")
# Strip internal thinking-prefill marker
api_msg.pop("_thinking_prefill", None)
# Strip Codex Responses API fields (call_id, response_item_id) for # Strip Codex Responses API fields (call_id, response_item_id) for
# strict providers like Mistral, Fireworks, etc. that reject unknown fields. # strict providers like Mistral, Fireworks, etc. that reject unknown fields.
# Uses new dicts so the internal messages list retains the fields # Uses new dicts so the internal messages list retains the fields
@@ -8817,6 +8821,15 @@ class AIAgent:
if clean: if clean:
self._vprint(f" ┊ 💬 {clean}") self._vprint(f" ┊ 💬 {clean}")
# Pop thinking-only prefill message(s) before appending
# (tool-call path — same rationale as the final-response path).
while (
messages
and isinstance(messages[-1], dict)
and messages[-1].get("_thinking_prefill")
):
messages.pop()
messages.append(assistant_msg) messages.append(assistant_msg)
# Close any open streaming display (response box, reasoning # Close any open streaming display (response box, reasoning
@@ -8930,11 +8943,36 @@ class AIAgent:
self._response_was_previewed = True self._response_was_previewed = True
break break
# Reasoning-only response: the model produced thinking # ── Thinking-only prefill continuation ──────────
# but no visible content. This is a valid response — # The model produced structured reasoning (via API
# keep reasoning in its own field and set content to # fields) but no visible text content. Rather than
# "(empty)" so every provider accepts the message. # giving up, append the assistant message as-is and
# No retries needed. # continue — the model will see its own reasoning
# on the next turn and produce the text portion.
# Inspired by clawdbot's "incomplete-text" recovery.
_has_structured = bool(
getattr(assistant_message, "reasoning", None)
or getattr(assistant_message, "reasoning_content", None)
or getattr(assistant_message, "reasoning_details", None)
)
if _has_structured and self._thinking_prefill_retries < 2:
self._thinking_prefill_retries += 1
self._vprint(
f"{self.log_prefix}↻ Thinking-only response — "
f"prefilling to continue "
f"({self._thinking_prefill_retries}/2)"
)
interim_msg = self._build_assistant_message(
assistant_message, "incomplete"
)
interim_msg["_thinking_prefill"] = True
messages.append(interim_msg)
self._session_messages = messages
self._save_session_log(messages)
continue
# Exhausted prefill attempts or no structured
# reasoning — fall through to "(empty)" terminal.
reasoning_text = self._extract_reasoning(assistant_message) reasoning_text = self._extract_reasoning(assistant_message)
assistant_msg = self._build_assistant_message(assistant_message, finish_reason) assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
assistant_msg["content"] = "(empty)" assistant_msg["content"] = "(empty)"
@@ -8953,6 +8991,7 @@ class AIAgent:
if hasattr(self, '_empty_content_retries'): if hasattr(self, '_empty_content_retries'):
self._empty_content_retries = 0 self._empty_content_retries = 0
self._last_empty_content_signature = None self._last_empty_content_signature = None
self._thinking_prefill_retries = 0
if ( if (
self.api_mode == "codex_responses" self.api_mode == "codex_responses"
@@ -8992,6 +9031,17 @@ class AIAgent:
final_msg = self._build_assistant_message(assistant_message, finish_reason) final_msg = self._build_assistant_message(assistant_message, finish_reason)
# Pop thinking-only prefill message(s) before appending
# the final response. This avoids consecutive assistant
# messages which break strict-alternation providers
# (Anthropic Messages API) and keeps history clean.
while (
messages
and isinstance(messages[-1], dict)
and messages[-1].get("_thinking_prefill")
):
messages.pop()
messages.append(final_msg) messages.append(final_msg)
if not self.quiet_mode: if not self.quiet_mode:

View File

@@ -1547,7 +1547,7 @@ class TestRunConversation:
assert any(m.get("reasoning") for m in assistant_msgs) assert any(m.get("reasoning") for m in assistant_msgs)
def test_reasoning_only_local_resumed_no_compression_triggered(self, agent): def test_reasoning_only_local_resumed_no_compression_triggered(self, agent):
"""Reasoning-only responses no longer trigger compression — accepted immediately.""" """Reasoning-only responses no longer trigger compression — prefill then accepted."""
self._setup_agent(agent) self._setup_agent(agent)
agent.base_url = "http://127.0.0.1:1234/v1" agent.base_url = "http://127.0.0.1:1234/v1"
agent.compression_enabled = True agent.compression_enabled = True
@@ -1561,8 +1561,9 @@ class TestRunConversation:
{"role": "assistant", "content": "old answer"}, {"role": "assistant", "content": "old answer"},
] ]
# 3 responses: original + 2 prefill continuations (structured reasoning triggers prefill)
with ( with (
patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp]), patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp, empty_resp, empty_resp]),
patch.object(agent, "_compress_context") as mock_compress, patch.object(agent, "_compress_context") as mock_compress,
patch.object(agent, "_persist_session"), patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"), patch.object(agent, "_save_trajectory"),
@@ -1573,17 +1574,18 @@ class TestRunConversation:
mock_compress.assert_not_called() # no compression triggered mock_compress.assert_not_called() # no compression triggered
assert result["completed"] is True assert result["completed"] is True
assert result["final_response"] == "(empty)" assert result["final_response"] == "(empty)"
assert result["api_calls"] == 1 assert result["api_calls"] == 3 # 1 original + 2 prefill continuations
def test_reasoning_only_response_accepted_without_retry(self, agent): def test_reasoning_only_response_prefill_then_empty(self, agent):
"""Reasoning-only response should be accepted with (empty) content, no retries.""" """Structured reasoning-only triggers prefill continuation (up to 2), then falls through to (empty)."""
self._setup_agent(agent) self._setup_agent(agent)
empty_resp = _mock_response( empty_resp = _mock_response(
content=None, content=None,
finish_reason="stop", finish_reason="stop",
reasoning_content="structured reasoning answer", reasoning_content="structured reasoning answer",
) )
agent.client.chat.completions.create.side_effect = [empty_resp] # 3 responses: original + 2 prefill continuations, all reasoning-only
agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp, empty_resp]
with ( with (
patch.object(agent, "_persist_session"), patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"), patch.object(agent, "_save_trajectory"),
@@ -1592,7 +1594,35 @@ class TestRunConversation:
result = agent.run_conversation("answer me") result = agent.run_conversation("answer me")
assert result["completed"] is True assert result["completed"] is True
assert result["final_response"] == "(empty)" assert result["final_response"] == "(empty)"
assert result["api_calls"] == 1 # no retries assert result["api_calls"] == 3 # 1 original + 2 prefill continuations
def test_reasoning_only_prefill_succeeds_on_continuation(self, agent):
"""When prefill continuation produces content, it becomes the final response."""
self._setup_agent(agent)
empty_resp = _mock_response(
content=None,
finish_reason="stop",
reasoning_content="structured reasoning answer",
)
content_resp = _mock_response(
content="Here is the actual answer.",
finish_reason="stop",
)
agent.client.chat.completions.create.side_effect = [empty_resp, content_resp]
with (
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
patch.object(agent, "_cleanup_task_resources"),
):
result = agent.run_conversation("answer me")
assert result["completed"] is True
assert result["final_response"] == "Here is the actual answer."
assert result["api_calls"] == 2 # 1 original + 1 prefill continuation
# Prefill message should be cleaned up — no consecutive assistant messages
roles = [m.get("role") for m in result["messages"]]
for i in range(len(roles) - 1):
if roles[i] == "assistant" and roles[i + 1] == "assistant":
raise AssertionError("Consecutive assistant messages found in history")
def test_truly_empty_response_accepted_without_retry(self, agent): def test_truly_empty_response_accepted_without_retry(self, agent):
"""Truly empty response (no content, no reasoning) should still complete with (empty).""" """Truly empty response (no content, no reasoning) should still complete with (empty)."""