Compare commits

...

1 Commits

Author SHA1 Message Date
Brooklyn Nicholson
cc66b666e5 fix: inject plugin context after cache markers to preserve Anthropic prompt cache prefix stability 2026-04-04 18:04:55 -05:00
2 changed files with 51 additions and 4 deletions

View File

@@ -6649,8 +6649,8 @@ class AIAgent:
# Plugin hook: pre_llm_call
# Fired once per turn before the tool-calling loop. Plugins can
# return a dict with a ``context`` key whose value is a string
# that will be appended to the ephemeral system prompt for every # that will be injected at request time for every API call in
# API call in this turn (not persisted to session DB or cache). # this turn (not persisted to session DB or cached prefix).
_plugin_turn_context = ""
try:
from hermes_cli.plugins import invoke_hook as _invoke_hook
@@ -6796,8 +6796,11 @@ class AIAgent:
effective_system = active_system_prompt or ""
if self.ephemeral_system_prompt:
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
# Plugin context from pre_llm_call hooks — ephemeral, not cached. # Plugin context from pre_llm_call hooks.
if _plugin_turn_context: # For non-cached providers/requests we can append directly.
# For Anthropic prompt-cached requests we inject it later as an
# uncached system suffix block so the cache key stays stable.
if _plugin_turn_context and not self._use_prompt_caching:
effective_system = (effective_system + "\n\n" + _plugin_turn_context).strip()
if effective_system:
api_messages = [{"role": "system", "content": effective_system}] + api_messages
@@ -6816,6 +6819,16 @@ class AIAgent:
if self._use_prompt_caching:
api_messages = apply_anthropic_cache_control(api_messages, cache_ttl=self._cache_ttl, native_anthropic=(self.api_mode == 'anthropic_messages'))
# Append plugin context AFTER cache markers so the system-level
# cache key stays stable even when plugin output varies per turn.
if _plugin_turn_context and api_messages and api_messages[0].get("role") == "system":
_sys = api_messages[0].get("content", "")
_blocks = list(_sys) if isinstance(_sys, list) else [{"type": "text", "text": _sys}] if isinstance(_sys, str) else []
_blocks.append({"type": "text", "text": _plugin_turn_context})
api_messages[0]["content"] = _blocks
elif _plugin_turn_context:
api_messages.insert(0, {"role": "system", "content": _plugin_turn_context})
# Safety net: strip orphaned tool results / add stubs for missing # Safety net: strip orphaned tool results / add stubs for missing
# results before sending to the API. Runs unconditionally — not # results before sending to the API. Runs unconditionally — not
# gated on context_compressor — so orphans from session loading or # gated on context_compressor — so orphans from session loading or

View File

@@ -1573,6 +1573,40 @@ class TestRunConversation:
assert "Local/custom backend returned reasoning-only output" in result["error"]
assert "wrong /v1 endpoint" in result["error"]
def test_plugin_context_is_uncached_system_suffix_when_prompt_caching_enabled(self, agent):
self._setup_agent(agent)
agent._use_prompt_caching = True
captured = {}
def _fake_api_call(api_kwargs):
captured["kwargs"] = api_kwargs
return _mock_response(content="ok", finish_reason="stop")
with (
patch(
"hermes_cli.plugins.invoke_hook",
return_value=[{"context": "plugin-turn-context"}],
),
patch.object(agent, "_interruptible_api_call", side_effect=_fake_api_call),
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
patch.object(agent, "_cleanup_task_resources"),
):
result = agent.run_conversation("hello")
assert result["completed"] is True
assert result["final_response"] == "ok"
messages = captured["kwargs"]["messages"]
assert messages[0]["role"] == "system"
system_blocks = messages[0]["content"]
assert isinstance(system_blocks, list)
assert system_blocks[0]["text"] == "You are helpful."
assert system_blocks[0]["cache_control"]["type"] == "ephemeral"
assert system_blocks[-1]["text"] == "plugin-turn-context"
assert "cache_control" not in system_blocks[-1]
def test_nous_401_refreshes_after_remint_and_retries(self, agent):
self._setup_agent(agent)
agent.provider = "nous"