mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-30 16:01:49 +08:00
Compare commits
1 Commits
opencode-p
...
hermes/her
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4da939a714 |
56
run_agent.py
56
run_agent.py
@@ -6340,6 +6340,62 @@ class AIAgent:
|
||||
if finish_reason == "length":
|
||||
self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True)
|
||||
|
||||
# ── Detect thinking-budget exhaustion ──────────────
|
||||
# When the model spends ALL output tokens on reasoning
|
||||
# and has none left for the response, continuation
|
||||
# retries are pointless. Detect this early and give a
|
||||
# targeted error instead of wasting 3 API calls.
|
||||
_trunc_content = None
|
||||
if self.api_mode == "chat_completions":
|
||||
_trunc_msg = response.choices[0].message if (hasattr(response, "choices") and response.choices) else None
|
||||
_trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None
|
||||
elif self.api_mode == "anthropic_messages":
|
||||
# Anthropic response.content is a list of blocks
|
||||
_text_parts = []
|
||||
for _blk in getattr(response, "content", []):
|
||||
if getattr(_blk, "type", None) == "text":
|
||||
_text_parts.append(getattr(_blk, "text", ""))
|
||||
_trunc_content = "\n".join(_text_parts) if _text_parts else None
|
||||
|
||||
_thinking_exhausted = (
|
||||
_trunc_content is not None
|
||||
and not self._has_content_after_think_block(_trunc_content)
|
||||
) or _trunc_content is None
|
||||
|
||||
if _thinking_exhausted:
|
||||
_exhaust_error = (
|
||||
"Model used all output tokens on reasoning with none left "
|
||||
"for the response. Try lowering reasoning effort or "
|
||||
"increasing max_tokens."
|
||||
)
|
||||
self._vprint(
|
||||
f"{self.log_prefix}💭 Reasoning exhausted the output token budget — "
|
||||
f"no visible response was produced.",
|
||||
force=True,
|
||||
)
|
||||
# Return a user-friendly message as the response so
|
||||
# CLI (response box) and gateway (chat message) both
|
||||
# display it naturally instead of a suppressed error.
|
||||
_exhaust_response = (
|
||||
"⚠️ **Thinking Budget Exhausted**\n\n"
|
||||
"The model used all its output tokens on reasoning "
|
||||
"and had none left for the actual response.\n\n"
|
||||
"To fix this:\n"
|
||||
"→ Lower reasoning effort: `/thinkon low` or `/thinkon minimal`\n"
|
||||
"→ Increase the output token limit: "
|
||||
"set `model.max_tokens` in config.yaml"
|
||||
)
|
||||
self._cleanup_task_resources(effective_task_id)
|
||||
self._persist_session(messages, conversation_history)
|
||||
return {
|
||||
"final_response": _exhaust_response,
|
||||
"messages": messages,
|
||||
"api_calls": api_call_count,
|
||||
"completed": False,
|
||||
"partial": True,
|
||||
"error": _exhaust_error,
|
||||
}
|
||||
|
||||
if self.api_mode == "chat_completions":
|
||||
assistant_message = response.choices[0].message
|
||||
if not assistant_message.tool_calls:
|
||||
|
||||
@@ -1372,19 +1372,11 @@ class TestRunConversation:
|
||||
assert result["final_response"] == "Recovered after compression"
|
||||
assert result["completed"] is True
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("first_content", "second_content", "expected_final"),
|
||||
[
|
||||
("Part 1 ", "Part 2", "Part 1 Part 2"),
|
||||
("<think>internal reasoning</think>", "Recovered final answer", "Recovered final answer"),
|
||||
],
|
||||
)
|
||||
def test_length_finish_reason_requests_continuation(
|
||||
self, agent, first_content, second_content, expected_final
|
||||
):
|
||||
def test_length_finish_reason_requests_continuation(self, agent):
|
||||
"""Normal truncation (partial real content) triggers continuation."""
|
||||
self._setup_agent(agent)
|
||||
first = _mock_response(content=first_content, finish_reason="length")
|
||||
second = _mock_response(content=second_content, finish_reason="stop")
|
||||
first = _mock_response(content="Part 1 ", finish_reason="length")
|
||||
second = _mock_response(content="Part 2", finish_reason="stop")
|
||||
agent.client.chat.completions.create.side_effect = [first, second]
|
||||
|
||||
with (
|
||||
@@ -1396,12 +1388,58 @@ class TestRunConversation:
|
||||
|
||||
assert result["completed"] is True
|
||||
assert result["api_calls"] == 2
|
||||
assert result["final_response"] == expected_final
|
||||
assert result["final_response"] == "Part 1 Part 2"
|
||||
|
||||
second_call_messages = agent.client.chat.completions.create.call_args_list[1].kwargs["messages"]
|
||||
assert second_call_messages[-1]["role"] == "user"
|
||||
assert "truncated by the output length limit" in second_call_messages[-1]["content"]
|
||||
|
||||
def test_length_thinking_exhausted_skips_continuation(self, agent):
    """A 'length'-truncated reply holding only a think block is not retried.

    The mocked completion carries nothing but a ``<think>…</think>`` block
    with ``finish_reason="length"``. The run is expected to stop after that
    single API call and report the exhaustion both as an error string and
    as a displayable ``final_response``.
    """
    self._setup_agent(agent)
    agent.client.chat.completions.create.return_value = _mock_response(
        content="<think>internal reasoning</think>",
        finish_reason="length",
    )

    # Replace the persistence / trajectory / cleanup hooks with mocks for
    # the duration of the run.
    with (
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    # One API call only: no continuation attempts were made.
    assert outcome["completed"] is False
    assert outcome["api_calls"] == 1

    error_text = outcome["error"].lower()
    assert "reasoning" in error_text
    assert "output tokens" in error_text

    # The caller still receives a readable message rather than None.
    reply = outcome["final_response"]
    assert reply is not None
    assert "Thinking Budget Exhausted" in reply
    assert "/thinkon" in reply
|
||||
|
||||
def test_length_empty_content_detected_as_thinking_exhausted(self, agent):
    """A 'length'-truncated reply with ``content=None`` counts as exhaustion.

    The mocked completion has no content at all and
    ``finish_reason="length"``. The run is expected to end after one API
    call with an error mentioning reasoning and a non-empty
    ``final_response``.
    """
    self._setup_agent(agent)
    agent.client.chat.completions.create.return_value = _mock_response(
        content=None,
        finish_reason="length",
    )

    # Replace the persistence / trajectory / cleanup hooks with mocks for
    # the duration of the run.
    with (
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        outcome = agent.run_conversation("hello")

    assert outcome["completed"] is False
    assert outcome["api_calls"] == 1
    assert "reasoning" in outcome["error"].lower()

    # A user-facing message is returned instead of None.
    reply = outcome["final_response"]
    assert reply is not None
    assert "Thinking Budget Exhausted" in reply
|
||||
|
||||
|
||||
class TestRetryExhaustion:
|
||||
"""Regression: retry_count > max_retries was dead code (off-by-one).
|
||||
|
||||
Reference in New Issue
Block a user