From f7af90e2daf2e2a11262ff3152bb3f08ff13ca37 Mon Sep 17 00:00:00 2001 From: LVT382009 Date: Sat, 18 Apr 2026 22:49:30 +0530 Subject: [PATCH] fix: wire _ephemeral_max_output_tokens into chat_completions and add NVIDIA NIM default Based on #12152 by @LVT382009. Three fixes to run_agent.py: 1. _ephemeral_max_output_tokens consumption in chat_completions path: The error-recovery ephemeral override was only consumed in the anthropic_messages branch of _build_api_kwargs. All chat_completions providers (OpenRouter, NVIDIA NIM, Qwen, Alibaba, custom, etc.) silently ignored it. Now consumed at highest priority, matching the anthropic pattern. 2. NVIDIA NIM max_tokens default (16384): NVIDIA NIM falls back to a very low internal default when max_tokens is omitted, causing models like GLM-4.7 to truncate immediately (thinking tokens exhaust the budget before the response starts). 3. Progressive length-continuation boost: When finish_reason='length' triggers a continuation retry, the output budget now grows progressively (2x base on retry 1, 3x on retry 2, capped at 32768) via _ephemeral_max_output_tokens. Previously the retry loop just re-sent the same token limit on all 3 attempts. --- run_agent.py | 20 +++++++++++++++++++- scripts/release.py | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/run_agent.py b/run_agent.py index e88096a603..a0f4db5485 100644 --- a/run_agent.py +++ b/run_agent.py @@ -7061,8 +7061,20 @@ class AIAgent: if self.tools: api_kwargs["tools"] = self.tools - if self.max_tokens is not None: + # ── max_tokens for chat_completions ────────────────────────────── + # Priority: ephemeral override (error recovery / length-continuation + # boost) > user-configured max_tokens > provider-specific defaults. 
+ _ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None) + if _ephemeral_out is not None: + self._ephemeral_max_output_tokens = None # consume immediately + api_kwargs.update(self._max_tokens_param(_ephemeral_out)) + elif self.max_tokens is not None: api_kwargs.update(self._max_tokens_param(self.max_tokens)) + elif "integrate.api.nvidia.com" in self._base_url_lower: + # NVIDIA NIM defaults to a very low max_tokens when omitted, + # causing models like GLM-4.7 to truncate immediately (thinking + # tokens alone exhaust the budget). 16384 provides adequate room. + api_kwargs.update(self._max_tokens_param(16384)) elif self._is_qwen_portal(): # Qwen Portal defaults to a very low max_tokens when omitted. # Reasoning models (qwen3-coder-plus) exhaust that budget on @@ -10804,6 +10816,12 @@ class AIAgent: continue if restart_with_length_continuation: + # Progressively boost the output token budget on each retry. + # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768. + # Applies to all providers via _ephemeral_max_output_tokens. + _boost_base = self.max_tokens if self.max_tokens else 4096 + _boost = _boost_base * (length_continue_retries + 1) + self._ephemeral_max_output_tokens = min(_boost, 32768) continue # Guard: if all retries exhausted without a successful response diff --git a/scripts/release.py b/scripts/release.py index 88ddb2f434..94ebef5d34 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -267,6 +267,7 @@ AUTHOR_MAP = { "aviralarora002@gmail.com": "AviArora02-commits", "junminliu@gmail.com": "JimLiu", "jarvischer@gmail.com": "maxchernin", + "levantam.98.2324@gmail.com": "LVT382009", }