Compare commits

...

1 Commits

Author SHA1 Message Date
Brooklyn Nicholson
cc66b666e5 fix: inject plugin context after cache markers to preserve Anthropic prompt cache prefix stability 2026-04-04 18:04:55 -05:00
2 changed files with 51 additions and 4 deletions

View File

@@ -6649,8 +6649,8 @@ class AIAgent:
# Plugin hook: pre_llm_call
# Fired once per turn before the tool-calling loop. Plugins can
# return a dict with a ``context`` key whose value is a string
# that will be appended to the ephemeral system prompt for every # that will be injected at request time for every API call in
# API call in this turn (not persisted to session DB or cache). # this turn (not persisted to session DB or cached prefix).
_plugin_turn_context = ""
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
@@ -6796,8 +6796,11 @@ class AIAgent:
effective_system = active_system_prompt or ""
if self.ephemeral_system_prompt:
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
# Plugin context from pre_llm_call hooks — ephemeral, not cached. # Plugin context from pre_llm_call hooks.
if _plugin_turn_context: # For non-cached providers/requests we can append directly.
# For Anthropic prompt-cached requests we inject it later as an
# uncached system suffix block so the cache key stays stable.
if _plugin_turn_context and not self._use_prompt_caching:
effective_system = (effective_system + "\n\n" + _plugin_turn_context).strip()
if effective_system:
api_messages = [{"role": "system", "content": effective_system}] + api_messages
@@ -6816,6 +6819,16 @@ class AIAgent:
if self._use_prompt_caching:
api_messages = apply_anthropic_cache_control(api_messages, cache_ttl=self._cache_ttl, native_anthropic=(self.api_mode == 'anthropic_messages'))
# Append plugin context AFTER cache markers so the system-level
# cache key stays stable even when plugin output varies per turn.
if _plugin_turn_context and api_messages and api_messages[0].get("role") == "system":
_sys = api_messages[0].get("content", "")
_blocks = list(_sys) if isinstance(_sys, list) else [{"type": "text", "text": _sys}] if isinstance(_sys, str) else []
_blocks.append({"type": "text", "text": _plugin_turn_context})
api_messages[0]["content"] = _blocks
elif _plugin_turn_context:
api_messages.insert(0, {"role": "system", "content": _plugin_turn_context})
# Safety net: strip orphaned tool results / add stubs for missing # Safety net: strip orphaned tool results / add stubs for missing
# results before sending to the API. Runs unconditionally — not # results before sending to the API. Runs unconditionally — not
# gated on context_compressor — so orphans from session loading or # gated on context_compressor — so orphans from session loading or

View File

@@ -1573,6 +1573,40 @@ class TestRunConversation:
assert "Local/custom backend returned reasoning-only output" in result["error"]
assert "wrong /v1 endpoint" in result["error"]
def test_plugin_context_is_uncached_system_suffix_when_prompt_caching_enabled(self, agent):
self._setup_agent(agent)
agent._use_prompt_caching = True
captured = {}
def _fake_api_call(api_kwargs):
captured["kwargs"] = api_kwargs
return _mock_response(content="ok", finish_reason="stop")
with (
patch(
"hermes_cli.plugins.invoke_hook",
return_value=[{"context": "plugin-turn-context"}],
),
patch.object(agent, "_interruptible_api_call", side_effect=_fake_api_call),
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
patch.object(agent, "_cleanup_task_resources"),
):
result = agent.run_conversation("hello")
assert result["completed"] is True
assert result["final_response"] == "ok"
messages = captured["kwargs"]["messages"]
assert messages[0]["role"] == "system"
system_blocks = messages[0]["content"]
assert isinstance(system_blocks, list)
assert system_blocks[0]["text"] == "You are helpful."
assert system_blocks[0]["cache_control"]["type"] == "ephemeral"
assert system_blocks[-1]["text"] == "plugin-turn-context"
assert "cache_control" not in system_blocks[-1]
def test_nous_401_refreshes_after_remint_and_retries(self, agent):
self._setup_agent(agent)
agent.provider = "nous"