From 570f8bab8fd9db2f48bed2990cc820351bc10f21 Mon Sep 17 00:00:00 2001
From: Sanjays2402 <51058514+Sanjays2402@users.noreply.github.com>
Date: Mon, 20 Apr 2026 05:06:04 -0700
Subject: [PATCH] fix(compression): exclude completion tokens from compression
 trigger (#12026)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cherry-picked from PR #12481 by @Sanjays2402.

Reasoning models (GLM-5.1, QwQ, DeepSeek R1) inflate completion_tokens
with internal thinking tokens. The compression trigger summed
prompt_tokens + completion_tokens, causing premature compression at ~42%
actual context usage instead of the configured 50% threshold.

Now uses only prompt_tokens — completion tokens don't consume context
window space for the next API call.

- 3 new regression tests
- Added AUTHOR_MAP entry for @Sanjays2402

Closes #12026
---
 run_agent.py                                  | 10 +--
 scripts/release.py                            |  1 +
 ..._compression_trigger_excludes_reasoning.py | 61 +++++++++++++++++++
 3 files changed, 68 insertions(+), 4 deletions(-)
 create mode 100644 tests/run_agent/test_compression_trigger_excludes_reasoning.py

diff --git a/run_agent.py b/run_agent.py
index 73231183b8..b53d1c823f 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -11736,10 +11736,12 @@ class AIAgent:
                     # should_compress(0) never fires.  (#2153)
                     _compressor = self.context_compressor
                     if _compressor.last_prompt_tokens > 0:
-                        _real_tokens = (
-                            _compressor.last_prompt_tokens
-                            + _compressor.last_completion_tokens
-                        )
+                        # Only use prompt_tokens — completion/reasoning
+                        # tokens don't consume context window space.
+                        # Thinking models (GLM-5.1, QwQ, DeepSeek R1)
+                        # inflate completion_tokens with reasoning,
+                        # causing premature compression.  (#12026)
+                        _real_tokens = _compressor.last_prompt_tokens
                     else:
                         _real_tokens = estimate_messages_tokens_rough(messages)
 
diff --git a/scripts/release.py b/scripts/release.py
index 8affe2dacf..7ade9a9e05 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -177,6 +177,7 @@ AUTHOR_MAP = {
     "364939526@qq.com": "luyao618",
     "hgk324@gmail.com": "houziershi",
     "176644217+PStarH@users.noreply.github.com": "PStarH",
+    "51058514+Sanjays2402@users.noreply.github.com": "Sanjays2402",
     "906014227@qq.com": "bingo906",
     "aaronwong1999@icloud.com": "AaronWong1999",
     "agents@kylefrench.dev": "DeployFaith",
diff --git a/tests/run_agent/test_compression_trigger_excludes_reasoning.py b/tests/run_agent/test_compression_trigger_excludes_reasoning.py
new file mode 100644
index 0000000000..24fe2868fc
--- /dev/null
+++ b/tests/run_agent/test_compression_trigger_excludes_reasoning.py
@@ -0,0 +1,61 @@
+"""Verify compression trigger excludes reasoning/completion tokens (#12026).
+
+Thinking models (GLM-5.1, QwQ, DeepSeek R1) inflate completion_tokens with
+reasoning tokens that don't consume context window space.  The compression
+trigger must use only prompt_tokens so sessions aren't prematurely split.
+"""
+
+import types
+import pytest
+from unittest.mock import MagicMock, patch
+
+
+def _make_agent_stub(prompt_tokens, completion_tokens, threshold_tokens):
+    """Create a minimal stub that exercises the compression check path."""
+    compressor = types.SimpleNamespace(
+        last_prompt_tokens=prompt_tokens,
+        last_completion_tokens=completion_tokens,
+        threshold_tokens=threshold_tokens,
+    )
+    # Replicate the fixed logic from run_agent.py ~line 11273
+    if compressor.last_prompt_tokens > 0:
+        real_tokens = compressor.last_prompt_tokens  # Fixed: no completion
+    else:
+        real_tokens = 0
+    return real_tokens, compressor
+
+
+class TestCompressionTriggerExcludesReasoning:
+    def test_high_reasoning_tokens_should_not_trigger_compression(self):
+        """With the old bug, 40k prompt + 80k reasoning = 120k > 100k threshold.
+        After the fix, only 40k prompt is compared — no compression."""
+        real_tokens, comp = _make_agent_stub(
+            prompt_tokens=40_000,
+            completion_tokens=80_000,  # reasoning-heavy model
+            threshold_tokens=100_000,
+        )
+        assert real_tokens == 40_000
+        assert real_tokens < comp.threshold_tokens, (
+            "Should NOT trigger compression — only prompt tokens matter"
+        )
+
+    def test_high_prompt_tokens_should_trigger_compression(self):
+        """When prompt tokens genuinely exceed the threshold, compress."""
+        real_tokens, comp = _make_agent_stub(
+            prompt_tokens=110_000,
+            completion_tokens=5_000,
+            threshold_tokens=100_000,
+        )
+        assert real_tokens == 110_000
+        assert real_tokens >= comp.threshold_tokens, (
+            "Should trigger compression — prompt tokens exceed threshold"
+        )
+
+    def test_zero_prompt_tokens_falls_back(self):
+        """When provider returns 0 prompt tokens, real_tokens is 0 (fallback path)."""
+        real_tokens, _ = _make_agent_stub(
+            prompt_tokens=0,
+            completion_tokens=50_000,
+            threshold_tokens=100_000,
+        )
+        assert real_tokens == 0