mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
fix: wire _ephemeral_max_output_tokens into chat_completions and add NVIDIA NIM default
Based on #12152 by @LVT382009. Three fixes to run_agent.py: 1. _ephemeral_max_output_tokens consumption in chat_completions path: The error-recovery ephemeral override was only consumed in the anthropic_messages branch of _build_api_kwargs. All chat_completions providers (OpenRouter, NVIDIA NIM, Qwen, Alibaba, custom, etc.) silently ignored it. Now consumed at highest priority, matching the anthropic pattern. 2. NVIDIA NIM max_tokens default (16384): NVIDIA NIM falls back to a very low internal default when max_tokens is omitted, causing models like GLM-4.7 to truncate immediately (thinking tokens exhaust the budget before the response starts). 3. Progressive length-continuation boost: When finish_reason='length' triggers a continuation retry, the output budget now grows progressively (2x base on retry 1, 3x on retry 2, capped at 32768) via _ephemeral_max_output_tokens. Previously the retry loop just re-sent the same token limit on all 3 attempts.
This commit is contained in:
20
run_agent.py
20
run_agent.py
@@ -7061,8 +7061,20 @@ class AIAgent:
|
||||
if self.tools:
|
||||
api_kwargs["tools"] = self.tools
|
||||
|
||||
if self.max_tokens is not None:
|
||||
# ── max_tokens for chat_completions ──────────────────────────────
|
||||
# Priority: ephemeral override (error recovery / length-continuation
|
||||
# boost) > user-configured max_tokens > provider-specific defaults.
|
||||
_ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None)
|
||||
if _ephemeral_out is not None:
|
||||
self._ephemeral_max_output_tokens = None # consume immediately
|
||||
api_kwargs.update(self._max_tokens_param(_ephemeral_out))
|
||||
elif self.max_tokens is not None:
|
||||
api_kwargs.update(self._max_tokens_param(self.max_tokens))
|
||||
elif "integrate.api.nvidia.com" in self._base_url_lower:
|
||||
# NVIDIA NIM defaults to a very low max_tokens when omitted,
|
||||
# causing models like GLM-4.7 to truncate immediately (thinking
|
||||
# tokens alone exhaust the budget). 16384 provides adequate room.
|
||||
api_kwargs.update(self._max_tokens_param(16384))
|
||||
elif self._is_qwen_portal():
|
||||
# Qwen Portal defaults to a very low max_tokens when omitted.
|
||||
# Reasoning models (qwen3-coder-plus) exhaust that budget on
|
||||
@@ -10804,6 +10816,12 @@ class AIAgent:
|
||||
continue
|
||||
|
||||
if restart_with_length_continuation:
|
||||
# Progressively boost the output token budget on each retry.
|
||||
# Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
|
||||
# Applies to all providers via _ephemeral_max_output_tokens.
|
||||
_boost_base = self.max_tokens if self.max_tokens else 4096
|
||||
_boost = _boost_base * (length_continue_retries + 1)
|
||||
self._ephemeral_max_output_tokens = min(_boost, 32768)
|
||||
continue
|
||||
|
||||
# Guard: if all retries exhausted without a successful response
|
||||
|
||||
Reference in New Issue
Block a user