Compare commits

...

1 Commits

Author SHA1 Message Date
Teknium
eb1896ef20 Port from cline/cline#10578: enable cache_control for OpenRouter Qwen/DeepSeek
OpenRouter's prompt-caching docs list a set of non-Claude models
(Alibaba Qwen family, DeepSeek V3.2) that only cache when requests
carry explicit cache_control breakpoints. Without markers these
models serve 0% cache reads across turns — re-billing the full
prompt every call.

Cline verified empirically on qwen/qwen3.6-plus with a 5-turn
repeated-prefix harness: 0% hit rate without markers vs 99.28%
post-warmup hit rate with markers (18,242 cached tokens written
on turn 1, read on turns 2-5).

Changes:
- _OPENROUTER_EXPLICIT_CACHE_CONTROL_MODEL_IDS: new frozenset
  with the six documented explicit-cache model ids.
- _anthropic_prompt_cache_policy: add a branch that turns on
  envelope-layout caching when the request hits OpenRouter and
  the model id is in the allowlist.
- Tests: 10 new cases covering each allowlisted id, case
  insensitivity, non-OpenRouter gateways (must stay off), and
  non-allowlisted Qwen/DeepSeek slugs (must stay off). Updated
  comment on the existing qwen/qwen3-coder test to reflect
  allowlist semantics.

OpenAI- and Google-family models on OpenRouter are intentionally
omitted — OpenRouter handles their caching automatically and
sending cache_control is ignored at best, rejected at worst.
2026-05-06 17:04:15 -07:00
2 changed files with 151 additions and 2 deletions

View File

@@ -336,6 +336,22 @@ _PARALLEL_SAFE_TOOLS = frozenset({
# File tools may execute in parallel as long as each call targets an
# independent path.
_PATH_SCOPED_TOOLS = frozenset(("read_file", "write_file", "patch"))

# Allowlist of OpenRouter model slugs whose upstream provider only caches a
# prompt when the request carries explicit ``cache_control`` breakpoints.
# OpenRouter's prompt-caching docs name Alibaba's Qwen family and DeepSeek
# V3.2 as explicit-cache models; without breakpoints they report 0% cache
# reads and the full prompt is re-billed on every turn.
#
# Not listed on purpose:
#   * Claude models — handled by the dedicated ``is_claude`` branch.
#   * OpenAI and Google models — OpenRouter manages their caching
#     automatically.
#
# Entries are lowercase. Ported from cline/cline#10578.
_OPENROUTER_EXPLICIT_CACHE_CONTROL_MODEL_IDS = frozenset((
    # Alibaba Qwen family
    "qwen/qwen-plus",
    "qwen/qwen3-max",
    "qwen/qwen3.6-plus",
    "qwen/qwen3-coder-plus",
    "qwen/qwen3-coder-flash",
    # DeepSeek
    "deepseek/deepseek-v3.2",
))

# Upper bound on the number of concurrent worker threads used for parallel
# tool execution.
_MAX_TOOL_WORKERS = 8
@@ -2966,6 +2982,20 @@ class AIAgent:
return True, True
if is_openrouter and is_claude:
return True, False
if is_openrouter and model_lower in _OPENROUTER_EXPLICIT_CACHE_CONTROL_MODEL_IDS:
# OpenRouter's prompt-caching docs list a set of non-Claude models
# (Alibaba Qwen-family, DeepSeek V3.2) that only cache when the
# request carries explicit ``cache_control`` breakpoints — otherwise
# the upstream provider serves 0% cache hits, re-billing the full
# prompt on every turn. Ported from cline/cline#10578 which
# verified empirically: qwen/qwen3.6-plus went from 0% cache reads
# across 5 turns to a 99.28% hit rate post-warmup after adding
# breakpoints. OpenAI- and Google-family models are intentionally
# omitted: OpenRouter handles their caching automatically and
# sending ``cache_control`` is ignored at best, rejected at worst.
# Envelope layout (native_anthropic=False) — OpenRouter's wire
# format is OpenAI chat.completions.
return True, False
if is_anthropic_wire and is_claude:
# Third-party Anthropic-compatible gateway.
return True, True

View File

@@ -247,8 +247,12 @@ class TestQwenAlibabaFamily:
assert agent._anthropic_prompt_cache_policy() == (False, False)
def test_qwen_on_openrouter_not_affected(self):
# Qwen via OpenRouter falls through — OpenRouter has its own
# upstream caching arrangement for Qwen (provider-dependent).
# Qwen via OpenRouter falls through for models NOT in the explicit
# cache-control allowlist — the bare ``qwen/qwen3-coder`` slug (as
# opposed to ``qwen3-coder-plus`` / ``qwen3-coder-flash``) is served
# on OpenRouter without the explicit-cache requirement. Models that
# DO require ``cache_control`` on OpenRouter are covered separately
# by TestOpenRouterExplicitCacheControl.
agent = _make_agent(
provider="openrouter",
base_url="https://openrouter.ai/api/v1",
@@ -258,6 +262,121 @@ class TestQwenAlibabaFamily:
assert agent._anthropic_prompt_cache_policy() == (False, False)
class TestOpenRouterExplicitCacheControl:
    """Cache policy for OpenRouter models that need explicit ``cache_control``.

    OpenRouter's prompt-caching docs list a set of non-Claude models
    (Alibaba Qwen family, DeepSeek V3.2) as explicit-cache models: without
    ``cache_control`` breakpoints they serve 0% cache reads across turns and
    re-bill the full prompt on every call. Ported from cline/cline#10578,
    which measured ``qwen/qwen3.6-plus`` on a 5-turn harness going from 0%
    cache hits to a 99.28% post-warmup hit rate once breakpoints were added.

    The expected policy for allowlisted models is ``(True, False)``:
    caching on, envelope layout (``native_anthropic=False``), because
    OpenRouter speaks the OpenAI ``chat.completions`` wire format.
    """

    _OPENROUTER_URL = "https://openrouter.ai/api/v1"

    def _policy(self, model, provider="openrouter", base_url=_OPENROUTER_URL):
        """Build a chat-completions agent for *model* and return its policy."""
        agent = _make_agent(
            provider=provider,
            base_url=base_url,
            api_mode="chat_completions",
            model=model,
        )
        return agent._anthropic_prompt_cache_policy()

    # --- every allowlisted slug turns caching on (envelope layout) ---------

    def test_qwen_plus(self):
        assert self._policy("qwen/qwen-plus") == (True, False)

    def test_qwen3_max(self):
        assert self._policy("qwen/qwen3-max") == (True, False)

    def test_qwen3_6_plus(self):
        assert self._policy("qwen/qwen3.6-plus") == (True, False)

    def test_qwen3_coder_plus(self):
        assert self._policy("qwen/qwen3-coder-plus") == (True, False)

    def test_qwen3_coder_flash(self):
        assert self._policy("qwen/qwen3-coder-flash") == (True, False)

    def test_deepseek_v3_2(self):
        assert self._policy("deepseek/deepseek-v3.2") == (True, False)

    # --- matching rules ------------------------------------------------------

    def test_uppercased_model_id_still_matches(self):
        # The policy lowercases the model id before the allowlist lookup and
        # every allowlist entry is lowercase, so a mixed-case spelling of an
        # allowlisted slug has to resolve to the same result.
        assert self._policy("Qwen/Qwen3.6-Plus") == (True, False)

    def test_allowlist_only_applies_on_openrouter(self):
        # An allowlisted id served through a different OpenAI-wire gateway
        # must NOT receive cache_control: there is no evidence that
        # third-party proxies for these models honour the marker.
        policy = self._policy(
            "qwen/qwen3.6-plus",
            provider="custom",
            base_url="https://api.fireworks.ai/inference/v1",
        )
        assert policy == (False, False)

    # --- slugs outside the allowlist stay off --------------------------------

    def test_non_allowlisted_qwen_on_openrouter_stays_off(self):
        # Qwen slugs that are absent from the explicit-cache allowlist stay
        # off: unknown cache_control fields risk breaking strict upstreams
        # that do not silently ignore them.
        assert self._policy("qwen/qwen2.5-72b-instruct") == (False, False)

    def test_non_allowlisted_deepseek_on_openrouter_stays_off(self):
        assert self._policy("deepseek/deepseek-chat") == (False, False)
class TestExplicitOverrides:
"""Policy accepts keyword overrides for switch_model / fallback activation."""