mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-02 16:26:34 +08:00
Compare commits
2 Commits
main
...
feat/promp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2bd57dd39d | ||
|
|
3985ecad02 |
@@ -521,7 +521,9 @@ def init_agent(
|
||||
from hermes_cli.config import load_config as _load_pc_cfg
|
||||
|
||||
_pc_cfg = _load_pc_cfg().get("prompt_caching", {}) or {}
|
||||
_ttl = _pc_cfg.get("cache_ttl", "5m")
|
||||
# prompt_caching.enabled=false is honored in _anthropic_prompt_cache_policy
|
||||
# (applied above and on every re-derivation), so no override is needed here.
|
||||
_ttl = _pc_cfg.get("cache_ttl", "5m") if isinstance(_pc_cfg, dict) else "5m"
|
||||
if _ttl in {"5m", "1h"}:
|
||||
agent._cache_ttl = _ttl
|
||||
except Exception:
|
||||
|
||||
@@ -1443,6 +1443,21 @@ def anthropic_prompt_cache_policy(
|
||||
eff_api_mode = api_mode if api_mode is not None else (agent.api_mode or "")
|
||||
eff_model = (model if model is not None else agent.model) or ""
|
||||
|
||||
# Global kill switch: prompt_caching.enabled=false disables cache_control
|
||||
# markers on every path (init, /model switch, fallback re-derivation).
|
||||
# Escape hatch for strict Anthropic-compatible proxies that inject their
|
||||
# own markers server-side — stacking ours on top exceeds Anthropic's
|
||||
# 4-breakpoint limit and 400s. Gating here (not just at init) keeps the
|
||||
# switch honored after a model switch or fallback re-evaluates the policy.
|
||||
try:
|
||||
from hermes_cli.config import load_config as _load_pc_cfg
|
||||
|
||||
_pc_cfg = _load_pc_cfg().get("prompt_caching", {}) or {}
|
||||
if isinstance(_pc_cfg, dict) and _pc_cfg.get("enabled") is False:
|
||||
return False, False
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
model_lower = eff_model.lower()
|
||||
provider_lower = eff_provider.lower()
|
||||
is_claude = "claude" in model_lower
|
||||
|
||||
@@ -1391,8 +1391,11 @@ DEFAULT_CONFIG = {
|
||||
},
|
||||
|
||||
# Anthropic prompt caching (Claude via OpenRouter or native Anthropic API).
|
||||
# cache_ttl must be "5m" or "1h" (Anthropic-supported tiers); other values are ignored.
|
||||
# Set enabled: false as an escape hatch for strict providers that reject
|
||||
# cache_control markers; cache_ttl must be "5m" or "1h" (Anthropic-supported
|
||||
# tiers), other values are ignored.
|
||||
"prompt_caching": {
|
||||
"enabled": True,
|
||||
"cache_ttl": "5m",
|
||||
},
|
||||
|
||||
|
||||
@@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
|
||||
|
||||
# Auto-extracted from noreply emails + manual overrides
|
||||
AUTHOR_MAP = {
|
||||
"janrenz@Mac.fritz.box": "janrenz", # PR #35862 salvage (prompt_caching.enabled escape hatch for strict providers)
|
||||
"syahidfrd@gmail.com": "syahidfrd", # PR #17059 salvage (tag unverified senders in Slack thread context to mitigate indirect prompt injection)
|
||||
"22971845+H2KFORGIVEN@users.noreply.github.com": "H2KFORGIVEN", # PR #22523 salvage (turn-pair preservation: never orphan the last user ask at head_end during compaction)
|
||||
"5823452+sgabel@users.noreply.github.com": "sgabel", # PR #13139 salvage (redact secrets in user-facing approval prompts)
|
||||
|
||||
@@ -8,7 +8,7 @@ the native layout on OpenRouter) surfaces loudly.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from run_agent import AIAgent
|
||||
|
||||
@@ -326,7 +326,91 @@ class TestExplicitOverrides:
|
||||
assert (should, native) == (True, False)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
# prompt_caching.enabled=false global kill switch
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestPromptCachingDisabledKillSwitch:
    """Checks for the ``prompt_caching.enabled`` global switch.

    With ``enabled`` set to ``False`` in the loaded config, the cache policy
    is expected to answer ``(False, False)`` for native Anthropic, Claude on
    OpenRouter, and a third-party Anthropic-compatible gateway, including
    when the endpoint is passed explicitly to the policy call (as a /model
    switch or fallback re-derivation would do). With ``enabled`` set to
    ``True`` the third-party gateway must keep caching on, so the switch
    stays a per-setup opt-out rather than a blanket strip.
    """

    def _disabled_cfg(self):
        # Patch the config loader so every policy evaluation inside the
        # context sees prompt_caching.enabled == False.
        kill_switch_config = {"prompt_caching": {"enabled": False}}
        return patch(
            "hermes_cli.config.load_config",
            return_value=kill_switch_config,
        )

    def test_disables_native_anthropic(self):
        endpoint = dict(
            provider="anthropic",
            base_url="https://api.anthropic.com",
            api_mode="anthropic_messages",
            model="claude-sonnet-4-6",
        )
        subject = _make_agent(**endpoint)
        with self._disabled_cfg():
            decision = subject._anthropic_prompt_cache_policy()
            assert decision == (False, False)

    def test_disables_openrouter_claude(self):
        endpoint = dict(
            provider="openrouter",
            base_url="https://openrouter.ai/api/v1",
            api_mode="chat_completions",
            model="anthropic/claude-sonnet-4.6",
        )
        subject = _make_agent(**endpoint)
        with self._disabled_cfg():
            decision = subject._anthropic_prompt_cache_policy()
            assert decision == (False, False)

    def test_disables_third_party_anthropic_gateway(self):
        # The reported failure case: an llm.echo.tech-style LiteLLM proxy.
        endpoint = dict(
            provider="anthropic",
            base_url="https://llm.echo.tech",
            api_mode="anthropic_messages",
            model="claude-sonnet-4-6",
        )
        subject = _make_agent(**endpoint)
        with self._disabled_cfg():
            decision = subject._anthropic_prompt_cache_policy()
            assert decision == (False, False)

    def test_survives_model_switch_re_derivation(self):
        # The agent starts on native Anthropic; the policy is then
        # re-evaluated for a proxy endpoint, and the disable must still hold.
        subject = _make_agent(
            provider="anthropic",
            base_url="https://api.anthropic.com",
            api_mode="anthropic_messages",
            model="claude-opus-4.6",
        )
        switched_endpoint = dict(
            provider="anthropic",
            base_url="https://llm.echo.tech",
            api_mode="anthropic_messages",
            model="claude-sonnet-4-6",
        )
        with self._disabled_cfg():
            decision = subject._anthropic_prompt_cache_policy(**switched_endpoint)
            assert decision == (False, False)

    def test_enabled_true_keeps_third_party_caching_on(self):
        # Control case: well-behaved third-party gateways that a blanket
        # strip would break must keep caching when the switch is on.
        subject = _make_agent(
            provider="anthropic",
            base_url="https://llm.echo.tech",
            api_mode="anthropic_messages",
            model="claude-sonnet-4-6",
        )
        enabled_config = {"prompt_caching": {"enabled": True}}
        with patch("hermes_cli.config.load_config", return_value=enabled_config):
            decision = subject._anthropic_prompt_cache_policy()
            assert decision == (True, True)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
# Long-lived prefix cache policy (cross-session 1h tier)
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
@@ -986,6 +986,30 @@ class TestInit:
|
||||
)
|
||||
assert a._cache_ttl == "5m"
|
||||
|
||||
def test_prompt_caching_enabled_false_disables_cache_markers(self):
    """With prompt_caching.enabled=false, a native-Anthropic agent starts with both caching flags off."""
    kill_switch_config = {"prompt_caching": {"enabled": False}}
    with (
        patch("run_agent.get_tool_definitions", return_value=[]),
        patch("run_agent.check_toolset_requirements", return_value={}),
        patch("agent.anthropic_adapter._anthropic_sdk"),
        patch("hermes_cli.config.load_config", return_value=kill_switch_config),
    ):
        built = AIAgent(
            api_key="test-key-1234567890",
            provider="anthropic",
            model="claude-sonnet-4-6",
            base_url="https://api.anthropic.com/v1/",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
        )
        # The endpoint is still treated as Anthropic-native; only the
        # caching flags are expected to be switched off.
        assert built.api_mode == "anthropic_messages"
        assert built._use_prompt_caching is False
        assert built._use_native_cache_layout is False
|
||||
|
||||
def test_valid_tool_names_populated(self):
|
||||
"""valid_tool_names should contain names from loaded tools."""
|
||||
tools = _make_tool_defs("web_search", "terminal")
|
||||
|
||||
@@ -362,6 +362,7 @@ Prompt caching is automatically enabled when:
|
||||
```yaml
|
||||
# config.yaml — TTL is configurable (must be "5m" or "1h")
|
||||
prompt_caching:
|
||||
enabled: true # set false to stop sending cache_control markers (strict-proxy escape hatch)
|
||||
cache_ttl: "5m"
|
||||
```
|
||||
|
||||
|
||||
@@ -915,17 +915,20 @@ For Claude on **native Anthropic**, **OpenRouter**, and **Nous Portal**, Hermes
|
||||
|
||||
The Qwen Cloud (Alibaba DashScope) upstream caps cache TTL at 5 minutes, so Hermes uses the 5-minute breakpoint TTL there instead. Other Claude-via-third-party paths (AWS Bedrock, Azure Foundry) fall back to the provider's own caching defaults. xAI Grok uses a separate session-pinned conversation-id mechanism — see [xAI prompt caching](/integrations/providers#xai-grok--responses-api--prompt-caching).
|
||||
|
||||
No knob exists to disable this — caching is always-on and saves money even on single-turn conversations because the system prompt alone is a meaningful fraction of the input token count.
|
||||
Caching is on by default and saves money even on single-turn conversations because the system prompt alone is a meaningful fraction of the input token count. It can be turned off entirely with the `enabled` knob below when a strict provider rejects `cache_control` markers.
|
||||
|
||||
The one explicit knob is the cache TTL tier Hermes requests on Anthropic-style breakpoints:
|
||||
There are two explicit knobs: whether caching runs at all (`enabled`), and which cache TTL tier (`cache_ttl`) Hermes requests on Anthropic-style breakpoints:
|
||||
|
||||
```yaml
|
||||
prompt_caching:
|
||||
enabled: true # set false to stop sending cache_control markers entirely
|
||||
cache_ttl: "5m" # "5m" or "1h" (Anthropic-supported tiers); other values are ignored
|
||||
```
|
||||
|
||||
`cache_ttl` selects the breakpoint TTL Hermes attaches for Claude via the native Anthropic API, OpenRouter, and Nous Portal. Only the two Anthropic-supported tiers (`"5m"`, `"1h"`) are honored — any other value is ignored. Providers with their own caps (e.g. Qwen Cloud, which maxes at 5 minutes) still clamp to what the upstream allows.
|
||||
|
||||
`enabled` defaults to `true`. Set it to `false` as an escape hatch for strict Anthropic-compatible proxies that inject their own `cache_control` markers server-side — stacking those on top of Hermes' breakpoints can exceed Anthropic's 4-breakpoint limit and return HTTP 400 `"A maximum of 4 blocks with cache_control may be provided"`. With caching disabled, Hermes sends requests without client-side markers and leaves cache management to the proxy.
|
||||
|
||||
## Auxiliary Models
|
||||
|
||||
Hermes uses "auxiliary" models for side tasks like image analysis, web page summarization, browser screenshot analysis, session-title generation, and context compression. By default (`auxiliary.*.provider: "auto"`), Hermes routes every auxiliary task to your **main chat model** — the same provider/model you picked in `hermes model`. You don't need to configure anything to get started, but be aware that on expensive reasoning models (Opus, MiniMax M2.7, etc.) auxiliary tasks add meaningful cost. If you want cheap-and-fast side tasks regardless of your main model, set `auxiliary.<task>.provider` and `auxiliary.<task>.model` explicitly (for example, Gemini Flash on OpenRouter for vision and web extraction).
|
||||
|
||||
Reference in New Issue
Block a user