From f7af90e2daf2e2a11262ff3152bb3f08ff13ca37 Mon Sep 17 00:00:00 2001 From: LVT382009 Date: Sat, 18 Apr 2026 22:49:30 +0530 Subject: [PATCH] fix: wire _ephemeral_max_output_tokens into chat_completions and add NVIDIA NIM default Based on #12152 by @LVT382009. Three fixes to run_agent.py: 1. _ephemeral_max_output_tokens consumption in chat_completions path: The error-recovery ephemeral override was only consumed in the anthropic_messages branch of _build_api_kwargs. All chat_completions providers (OpenRouter, NVIDIA NIM, Qwen, Alibaba, custom, etc.) silently ignored it. Now consumed at highest priority, matching the anthropic pattern. 2. NVIDIA NIM max_tokens default (16384): NVIDIA NIM falls back to a very low internal default when max_tokens is omitted, causing models like GLM-4.7 to truncate immediately (thinking tokens exhaust the budget before the response starts). 3. Progressive length-continuation boost: When finish_reason='length' triggers a continuation retry, the output budget now grows progressively (2x base on retry 1, 3x on retry 2, capped at 32768) via _ephemeral_max_output_tokens. Previously the retry loop just re-sent the same token limit on all 3 attempts. --- run_agent.py | 20 +++++++++++++++++++- scripts/release.py | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/run_agent.py b/run_agent.py index e88096a603..a0f4db5485 100644 --- a/run_agent.py +++ b/run_agent.py @@ -7061,8 +7061,20 @@ class AIAgent: if self.tools: api_kwargs["tools"] = self.tools - if self.max_tokens is not None: + # ── max_tokens for chat_completions ────────────────────────────── + # Priority: ephemeral override (error recovery / length-continuation + # boost) > user-configured max_tokens > provider-specific defaults. 
+ _ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None) + if _ephemeral_out is not None: + self._ephemeral_max_output_tokens = None # consume immediately + api_kwargs.update(self._max_tokens_param(_ephemeral_out)) + elif self.max_tokens is not None: api_kwargs.update(self._max_tokens_param(self.max_tokens)) + elif "integrate.api.nvidia.com" in self._base_url_lower: + # NVIDIA NIM defaults to a very low max_tokens when omitted, + # causing models like GLM-4.7 to truncate immediately (thinking + # tokens alone exhaust the budget). 16384 provides adequate room. + api_kwargs.update(self._max_tokens_param(16384)) elif self._is_qwen_portal(): # Qwen Portal defaults to a very low max_tokens when omitted. # Reasoning models (qwen3-coder-plus) exhaust that budget on @@ -10804,6 +10816,12 @@ class AIAgent: continue if restart_with_length_continuation: + # Progressively boost the output token budget on each retry. + # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768. + # Applies to all providers via _ephemeral_max_output_tokens. + _boost_base = self.max_tokens if self.max_tokens else 4096 + _boost = _boost_base * (length_continue_retries + 1) + self._ephemeral_max_output_tokens = min(_boost, 32768) continue # Guard: if all retries exhausted without a successful response diff --git a/scripts/release.py b/scripts/release.py index 88ddb2f434..94ebef5d34 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -267,6 +267,7 @@ AUTHOR_MAP = { "aviralarora002@gmail.com": "AviArora02-commits", "junminliu@gmail.com": "JimLiu", "jarvischer@gmail.com": "maxchernin", + "levantam.98.2324@gmail.com": "LVT382009", }