Port from cline/cline#10578: enable cache_control for OpenRouter Qwen/DeepSeek

OpenRouter's prompt-caching docs list a set of non-Claude models (Alibaba Qwen family, DeepSeek V3.2) that only cache when requests carry explicit cache_control breakpoints. Without markers these models serve 0% cache reads across turns — re-billing the full prompt every call.

Cline verified empirically on qwen/qwen3.6-plus with a 5-turn repeated-prefix harness: 0% hit rate without markers vs 99.28% post-warmup hit rate with markers (18,242 cached tokens written on turn 1, read on turns 2-5).

Changes:
- _OPENROUTER_EXPLICIT_CACHE_CONTROL_MODEL_IDS: new frozenset with the six documented explicit-cache model ids.
- _anthropic_prompt_cache_policy: add a branch that turns on envelope-layout caching when the request hits OpenRouter and the model id is in the allowlist.
- Tests: 10 new cases covering each allowlisted id, case insensitivity, non-OpenRouter gateways (must stay off), and non-allowlisted Qwen/DeepSeek slugs (must stay off). Updated comment on the existing qwen/qwen3-coder test to reflect allowlist semantics.

OpenAI- and Google-family models on OpenRouter are intentionally omitted — OpenRouter handles their caching automatically and sending cache_control is ignored at best, rejected at worst.
2026-06-10 04:08:28 +08:00 · 2026-05-06 17:04:15 -07:00
2 changed files with 151 additions and 2 deletions
--- a/run_agent.py
+++ b/run_agent.py
@@ -336,6 +336,22 @@ _PARALLEL_SAFE_TOOLS = frozenset({
 # File tools can run concurrently when they target independent paths.
 _PATH_SCOPED_TOOLS = frozenset({"read_file", "write_file", "patch"})

+# OpenRouter model IDs that require explicit ``cache_control`` breakpoints for
+# prompt caching on their upstream provider. OpenRouter's prompt-caching docs
+# list Alibaba's Qwen family and DeepSeek V3.2 as explicit-cache models —
+# without breakpoints they return 0% cache reads and re-bill the full prompt
+# on every turn.  Claude models are handled separately (``is_claude`` branch).
+# OpenAI and Google models are intentionally excluded — OpenRouter manages
+# their caching automatically.  Ported from cline/cline#10578.
+_OPENROUTER_EXPLICIT_CACHE_CONTROL_MODEL_IDS = frozenset({
+    "deepseek/deepseek-v3.2",
+    "qwen/qwen-plus",
+    "qwen/qwen3-max",
+    "qwen/qwen3.6-plus",
+    "qwen/qwen3-coder-plus",
+    "qwen/qwen3-coder-flash",
+})
+
 # Maximum number of concurrent worker threads for parallel tool execution.
 _MAX_TOOL_WORKERS = 8

@@ -2966,6 +2982,20 @@ class AIAgent:
            return True, True
        if is_openrouter and is_claude:
            return True, False
+        if is_openrouter and model_lower in _OPENROUTER_EXPLICIT_CACHE_CONTROL_MODEL_IDS:
+            # OpenRouter's prompt-caching docs list a set of non-Claude models
+            # (Alibaba Qwen-family, DeepSeek V3.2) that only cache when the
+            # request carries explicit ``cache_control`` breakpoints — otherwise
+            # the upstream provider serves 0% cache hits, re-billing the full
+            # prompt on every turn.  Ported from cline/cline#10578 which
+            # verified empirically: qwen/qwen3.6-plus went from 0% cache reads
+            # across 5 turns to a 99.28% hit rate post-warmup after adding
+            # breakpoints.  OpenAI- and Google-family models are intentionally
+            # omitted: OpenRouter handles their caching automatically and
+            # sending ``cache_control`` is ignored at best, rejected at worst.
+            # Envelope layout (native_anthropic=False) — OpenRouter's wire
+            # format is OpenAI chat.completions.
+            return True, False
        if is_anthropic_wire and is_claude:
            # Third-party Anthropic-compatible gateway.
            return True, True
--- a/tests/run_agent/test_anthropic_prompt_cache_policy.py
+++ b/tests/run_agent/test_anthropic_prompt_cache_policy.py
@@ -247,8 +247,12 @@ class TestQwenAlibabaFamily:
        assert agent._anthropic_prompt_cache_policy() == (False, False)

    def test_qwen_on_openrouter_not_affected(self):
-        # Qwen via OpenRouter falls through — OpenRouter has its own
-        # upstream caching arrangement for Qwen (provider-dependent).
+        # Qwen via OpenRouter falls through for models NOT in the explicit
+        # cache-control allowlist — the bare ``qwen/qwen3-coder`` slug (as
+        # opposed to ``qwen3-coder-plus`` / ``qwen3-coder-flash``) is served
+        # on OpenRouter without the explicit-cache requirement. Models that
+        # DO require ``cache_control`` on OpenRouter are covered separately
+        # by TestOpenRouterExplicitCacheControl.
        agent = _make_agent(
            provider="openrouter",
            base_url="https://openrouter.ai/api/v1",
@@ -258,6 +262,121 @@ class TestQwenAlibabaFamily:
        assert agent._anthropic_prompt_cache_policy() == (False, False)


+class TestOpenRouterExplicitCacheControl:
+    """OpenRouter models that need explicit ``cache_control`` breakpoints.
+
+    OpenRouter's prompt-caching docs list a set of non-Claude models
+    (Alibaba Qwen-family, DeepSeek V3.2) as explicit-cache models.
+    Without breakpoints these serve 0% cache reads across turns —
+    re-billing the full prompt every call.  Ported from
+    cline/cline#10578 which verified empirically: a 5-turn harness
+    on ``qwen/qwen3.6-plus`` went from 0% cache hits to a 99.28%
+    post-warmup hit rate after adding breakpoints.
+
+    Envelope layout (``native_anthropic=False``) — OpenRouter speaks
+    OpenAI ``chat.completions`` wire format.
+    """
+
+    def test_qwen_plus(self):
+        agent = _make_agent(
+            provider="openrouter",
+            base_url="https://openrouter.ai/api/v1",
+            api_mode="chat_completions",
+            model="qwen/qwen-plus",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (True, False)
+
+    def test_qwen3_max(self):
+        agent = _make_agent(
+            provider="openrouter",
+            base_url="https://openrouter.ai/api/v1",
+            api_mode="chat_completions",
+            model="qwen/qwen3-max",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (True, False)
+
+    def test_qwen3_6_plus(self):
+        agent = _make_agent(
+            provider="openrouter",
+            base_url="https://openrouter.ai/api/v1",
+            api_mode="chat_completions",
+            model="qwen/qwen3.6-plus",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (True, False)
+
+    def test_qwen3_coder_plus(self):
+        agent = _make_agent(
+            provider="openrouter",
+            base_url="https://openrouter.ai/api/v1",
+            api_mode="chat_completions",
+            model="qwen/qwen3-coder-plus",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (True, False)
+
+    def test_qwen3_coder_flash(self):
+        agent = _make_agent(
+            provider="openrouter",
+            base_url="https://openrouter.ai/api/v1",
+            api_mode="chat_completions",
+            model="qwen/qwen3-coder-flash",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (True, False)
+
+    def test_deepseek_v3_2(self):
+        agent = _make_agent(
+            provider="openrouter",
+            base_url="https://openrouter.ai/api/v1",
+            api_mode="chat_completions",
+            model="deepseek/deepseek-v3.2",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (True, False)
+
+    def test_uppercased_model_id_still_matches(self):
+        # ``_anthropic_prompt_cache_policy`` lowercases the model; explicit
+        # allowlist entries are all lowercase. A mixed-case variant of the
+        # same slug must resolve identically.
+        agent = _make_agent(
+            provider="openrouter",
+            base_url="https://openrouter.ai/api/v1",
+            api_mode="chat_completions",
+            model="Qwen/Qwen3.6-Plus",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (True, False)
+
+    def test_allowlist_only_applies_on_openrouter(self):
+        # The same model id served via a non-OpenRouter OpenAI-wire gateway
+        # must NOT get cache_control — we have no evidence that third-party
+        # proxies for these models honour the marker.
+        agent = _make_agent(
+            provider="custom",
+            base_url="https://api.fireworks.ai/inference/v1",
+            api_mode="chat_completions",
+            model="qwen/qwen3.6-plus",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (False, False)
+
+    def test_non_allowlisted_qwen_on_openrouter_stays_off(self):
+        # Qwen models NOT in OpenRouter's documented explicit-cache list stay
+        # off — sending unrequested cache_control fields risks breaking
+        # strict upstreams that don't silently ignore them.
+        agent = _make_agent(
+            provider="openrouter",
+            base_url="https://openrouter.ai/api/v1",
+            api_mode="chat_completions",
+            model="qwen/qwen2.5-72b-instruct",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (False, False)
+
+    def test_non_allowlisted_deepseek_on_openrouter_stays_off(self):
+        agent = _make_agent(
+            provider="openrouter",
+            base_url="https://openrouter.ai/api/v1",
+            api_mode="chat_completions",
+            model="deepseek/deepseek-chat",
+        )
+        assert agent._anthropic_prompt_cache_policy() == (False, False)
+
+
 class TestExplicitOverrides:
    """Policy accepts keyword overrides for switch_model / fallback activation."""