Compare commits

...

2 Commits

Author SHA1 Message Date
teknium1
2bd57dd39d fix(caching): honor prompt_caching.enabled across model switch + fallback
@janrenz's PR #35862 added prompt_caching.enabled=false at init only. But
_anthropic_prompt_cache_policy re-derives _use_prompt_caching on every /model
switch (agent_runtime_helpers) and fallback-model swap (chat_completion_helpers),
which re-enabled markers and re-broke the strict proxy the toggle was meant to fix.

Move the kill switch into anthropic_prompt_cache_policy so it returns (False, False)
on every path. Drop the now-redundant init-time override (kept @janrenz's isinstance
hardening on the cache_ttl read). Add policy-level tests + docs for the toggle.

Follow-up to salvaged PR #35862.

(cherry picked from commit 36f9f50145)
2026-07-01 00:27:32 -07:00
Jan Renz
3985ecad02 fix: allow disabling prompt caching
(cherry picked from commit c1c1a12fe6)
2026-07-01 00:27:32 -07:00
8 changed files with 138 additions and 5 deletions

View File

@@ -521,7 +521,9 @@ def init_agent(
from hermes_cli.config import load_config as _load_pc_cfg
_pc_cfg = _load_pc_cfg().get("prompt_caching", {}) or {}
_ttl = _pc_cfg.get("cache_ttl", "5m")
# prompt_caching.enabled=false is honored in _anthropic_prompt_cache_policy
# (applied above and on every re-derivation), so no override is needed here.
_ttl = _pc_cfg.get("cache_ttl", "5m") if isinstance(_pc_cfg, dict) else "5m"
if _ttl in {"5m", "1h"}:
agent._cache_ttl = _ttl
except Exception:

View File

@@ -1443,6 +1443,21 @@ def anthropic_prompt_cache_policy(
eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "")
eff_model = (model if model is not None else agent.model) or ""
# Global kill switch: prompt_caching.enabled=false disables cache_control
# markers on every path (init, /model switch, fallback re-derivation).
# Escape hatch for strict Anthropic-compatible proxies that inject their
# own markers server-side — stacking ours on top exceeds Anthropic's
# 4-breakpoint limit and 400s. Gating here (not just at init) keeps the
# switch honored after a model switch or fallback re-evaluates the policy.
try:
from hermes_cli.config import load_config as _load_pc_cfg
_pc_cfg = _load_pc_cfg().get("prompt_caching", {}) or {}
if isinstance(_pc_cfg, dict) and _pc_cfg.get("enabled") is False:
return False, False
except Exception:
pass
model_lower = eff_model.lower()
provider_lower = eff_provider.lower()
is_claude = "claude" in model_lower

View File

@@ -1391,8 +1391,11 @@ DEFAULT_CONFIG = {
},
# Anthropic prompt caching (Claude via OpenRouter or native Anthropic API).
# cache_ttl must be "5m" or "1h" (Anthropic-supported tiers); other values are ignored.
# Set enabled: false as an escape hatch for strict providers that reject
# cache_control markers; cache_ttl must be "5m" or "1h" (Anthropic-supported
# tiers), other values are ignored.
"prompt_caching": {
"enabled": True,
"cache_ttl": "5m",
},

View File

@@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
# Auto-extracted from noreply emails + manual overrides
AUTHOR_MAP = {
"janrenz@Mac.fritz.box": "janrenz", # PR #35862 salvage (prompt_caching.enabled escape hatch for strict providers)
"syahidfrd@gmail.com": "syahidfrd", # PR #17059 salvage (tag unverified senders in Slack thread context to mitigate indirect prompt injection)
"22971845+H2KFORGIVEN@users.noreply.github.com": "H2KFORGIVEN", # PR #22523 salvage (turn-pair preservation: never orphan the last user ask at head_end during compaction)
"5823452+sgabel@users.noreply.github.com": "sgabel", # PR #13139 salvage (redact secrets in user-facing approval prompts)

View File

@@ -8,7 +8,7 @@ the native layout on OpenRouter) surfaces loudly.
from __future__ import annotations
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch
from run_agent import AIAgent
@@ -326,7 +326,91 @@ class TestExplicitOverrides:
assert (should, native) == (True, False)
# ─────────────────────────────────────────────────────────────────────
# prompt_caching.enabled=false global kill switch
# ─────────────────────────────────────────────────────────────────────
class TestPromptCachingDisabledKillSwitch:
    """Policy-level coverage for the ``prompt_caching.enabled`` toggle.

    With ``enabled`` set to ``False`` the policy is expected to answer
    ``(False, False)`` for every endpoint class exercised here, and also
    when it is re-evaluated with explicit overrides (the shape used after a
    /model switch or fallback). This is the per-setup escape hatch for a
    strict Anthropic-compatible proxy that adds its own markers server-side;
    it is intentionally a toggle rather than a blanket strip, so gateways
    that behave well keep caching when ``enabled`` is ``True``.
    """

    def _disabled_cfg(self):
        """Build a patcher that makes ``load_config`` report caching as off."""
        disabled = {"prompt_caching": {"enabled": False}}
        return patch("hermes_cli.config.load_config", return_value=disabled)

    def test_disables_native_anthropic(self):
        """Native Anthropic endpoint yields no caching when disabled."""
        endpoint = dict(
            provider="anthropic",
            base_url="https://api.anthropic.com",
            api_mode="anthropic_messages",
            model="claude-sonnet-4-6",
        )
        agent = _make_agent(**endpoint)
        with self._disabled_cfg():
            policy = agent._anthropic_prompt_cache_policy()
        assert policy == (False, False)

    def test_disables_openrouter_claude(self):
        """Claude routed through OpenRouter yields no caching when disabled."""
        endpoint = dict(
            provider="openrouter",
            base_url="https://openrouter.ai/api/v1",
            api_mode="chat_completions",
            model="anthropic/claude-sonnet-4.6",
        )
        agent = _make_agent(**endpoint)
        with self._disabled_cfg():
            policy = agent._anthropic_prompt_cache_policy()
        assert policy == (False, False)

    def test_disables_third_party_anthropic_gateway(self):
        """Third-party Anthropic-style gateway yields no caching when disabled."""
        # The reported failure case: an llm.echo.tech-style LiteLLM proxy.
        endpoint = dict(
            provider="anthropic",
            base_url="https://llm.echo.tech",
            api_mode="anthropic_messages",
            model="claude-sonnet-4-6",
        )
        agent = _make_agent(**endpoint)
        with self._disabled_cfg():
            policy = agent._anthropic_prompt_cache_policy()
        assert policy == (False, False)

    def test_survives_model_switch_re_derivation(self):
        """Re-evaluating the policy with override arguments stays disabled."""
        # The agent starts on native Anthropic ...
        agent = _make_agent(
            provider="anthropic",
            base_url="https://api.anthropic.com",
            api_mode="anthropic_messages",
            model="claude-opus-4.6",
        )
        # ... and the policy is then re-derived for a proxy target, as a
        # /model switch would do. The disable must still hold.
        switched_to = dict(
            provider="anthropic",
            base_url="https://llm.echo.tech",
            api_mode="anthropic_messages",
            model="claude-sonnet-4-6",
        )
        with self._disabled_cfg():
            policy = agent._anthropic_prompt_cache_policy(**switched_to)
        assert policy == (False, False)

    def test_enabled_true_keeps_third_party_caching_on(self):
        """With ``enabled: True`` the third-party gateway keeps caching."""
        # Well-behaved third-party gateways, which a blanket strip would
        # break, must continue to cache by default.
        agent = _make_agent(
            provider="anthropic",
            base_url="https://llm.echo.tech",
            api_mode="anthropic_messages",
            model="claude-sonnet-4-6",
        )
        enabled = {"prompt_caching": {"enabled": True}}
        with patch("hermes_cli.config.load_config", return_value=enabled):
            policy = agent._anthropic_prompt_cache_policy()
        assert policy == (True, True)
# ─────────────────────────────────────────────────────────────────────
# Long-lived prefix cache policy (cross-session 1h tier)
# ─────────────────────────────────────────────────────────────────────

View File

@@ -986,6 +986,30 @@ class TestInit:
)
assert a._cache_ttl == "5m"
def test_prompt_caching_enabled_false_disables_cache_markers(self):
    """A freshly built agent has both cache flags off when ``enabled`` is False.

    ``prompt_caching.enabled=false`` is the escape hatch for strict
    providers; this checks it at construction time on a native Anthropic
    endpoint.
    """
    disabled_config = {"prompt_caching": {"enabled": False}}
    agent_kwargs = dict(
        api_key="test-key-1234567890",
        provider="anthropic",
        model="claude-sonnet-4-6",
        base_url="https://api.anthropic.com/v1/",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
    )
    with (
        patch("run_agent.get_tool_definitions", return_value=[]),
        patch("run_agent.check_toolset_requirements", return_value={}),
        patch("agent.anthropic_adapter._anthropic_sdk"),
        patch("hermes_cli.config.load_config", return_value=disabled_config),
    ):
        agent = AIAgent(**agent_kwargs)
        # Checked while the patches are still active.
        assert agent.api_mode == "anthropic_messages"
        assert agent._use_prompt_caching is False
        assert agent._use_native_cache_layout is False
def test_valid_tool_names_populated(self):
"""valid_tool_names should contain names from loaded tools."""
tools = _make_tool_defs("web_search", "terminal")

View File

@@ -362,6 +362,7 @@ Prompt caching is automatically enabled when:
```yaml
# config.yaml — TTL is configurable (must be "5m" or "1h")
prompt_caching:
enabled: true # set false to stop sending cache_control markers (strict-proxy escape hatch)
cache_ttl: "5m"
```

View File

@@ -915,17 +915,20 @@ For Claude on **native Anthropic**, **OpenRouter**, and **Nous Portal**, Hermes
The Qwen Cloud (Alibaba DashScope) upstream caps cache TTL at 5 minutes, so Hermes uses the 5-minute breakpoint TTL there instead. Other Claude-via-third-party paths (AWS Bedrock, Azure Foundry) fall back to the provider's own caching defaults. xAI Grok uses a separate session-pinned conversation-id mechanism — see [xAI prompt caching](/integrations/providers#xai-grok--responses-api--prompt-caching).
No knob exists to disable this — caching is always-on and saves money even on single-turn conversations because the system prompt alone is a meaningful fraction of the input token count.
Caching is on by default and saves money even on single-turn conversations because the system prompt alone is a meaningful fraction of the input token count. It can be turned off entirely with the `enabled` knob below when a strict provider rejects `cache_control` markers.
The one explicit knob is the cache TTL tier Hermes requests on Anthropic-style breakpoints:
The explicit knobs are whether caching runs at all and the cache TTL tier Hermes requests on Anthropic-style breakpoints:
```yaml
prompt_caching:
enabled: true # set false to stop sending cache_control markers entirely
cache_ttl: "5m" # "5m" or "1h" (Anthropic-supported tiers); other values are ignored
```
`cache_ttl` selects the breakpoint TTL Hermes attaches for Claude via the native Anthropic API, OpenRouter, and Nous Portal. Only the two Anthropic-supported tiers (`"5m"`, `"1h"`) are honored — any other value is ignored. Providers with their own caps (e.g. Qwen Cloud, which maxes at 5 minutes) still clamp to what the upstream allows.
`enabled` defaults to `true`. Set it to `false` as an escape hatch for strict Anthropic-compatible proxies that inject their own `cache_control` markers server-side — stacking those on top of Hermes' breakpoints can exceed Anthropic's 4-breakpoint limit and return HTTP 400 `"A maximum of 4 blocks with cache_control may be provided"`. With `enabled: false`, Hermes sends requests without any client-side `cache_control` markers, leaving the proxy to manage its own.
## Auxiliary Models
Hermes uses "auxiliary" models for side tasks like image analysis, web page summarization, browser screenshot analysis, session-title generation, and context compression. By default (`auxiliary.*.provider: "auto"`), Hermes routes every auxiliary task to your **main chat model** — the same provider/model you picked in `hermes model`. You don't need to configure anything to get started, but be aware that on expensive reasoning models (Opus, MiniMax M2.7, etc.) auxiliary tasks add meaningful cost. If you want cheap-and-fast side tasks regardless of your main model, set `auxiliary.<task>.provider` and `auxiliary.<task>.model` explicitly (for example, Gemini Flash on OpenRouter for vision and web extraction).