mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 23:11:37 +08:00
Compare commits
1 Commit
opencode-p
...
fix/honcho
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
525cccb7fb |
45
run_agent.py
45
run_agent.py
@@ -202,6 +202,32 @@ _NEVER_PARALLEL_TOOLS = frozenset({"clarify"})
|
|||||||
_MAX_TOOL_WORKERS = 8
|
_MAX_TOOL_WORKERS = 8
|
||||||
|
|
||||||
|
|
||||||
|
def _inject_honcho_turn_context(content, turn_context: str):
|
||||||
|
"""Append Honcho recall to the current-turn user message without mutating history.
|
||||||
|
|
||||||
|
The returned content is sent to the API for this turn only. Keeping Honcho
|
||||||
|
recall out of the system prompt preserves the stable cache prefix while
|
||||||
|
still giving the model continuity context.
|
||||||
|
"""
|
||||||
|
if not turn_context:
|
||||||
|
return content
|
||||||
|
|
||||||
|
note = (
|
||||||
|
"[System note: The following Honcho memory was retrieved from prior "
|
||||||
|
"sessions. It is continuity context for this turn only, not new user "
|
||||||
|
"input.]\n\n"
|
||||||
|
f"{turn_context}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if isinstance(content, list):
|
||||||
|
return list(content) + [{"type": "text", "text": note}]
|
||||||
|
|
||||||
|
text = "" if content is None else str(content)
|
||||||
|
if not text.strip():
|
||||||
|
return note
|
||||||
|
return f"{text}\n\n{note}"
|
||||||
|
|
||||||
|
|
||||||
class AIAgent:
|
class AIAgent:
|
||||||
"""
|
"""
|
||||||
AI Agent with tool calling capabilities.
|
AI Agent with tool calling capabilities.
|
||||||
@@ -3909,10 +3935,11 @@ class AIAgent:
|
|||||||
|
|
||||||
# Honcho prefetch consumption:
|
# Honcho prefetch consumption:
|
||||||
# - First turn: bake into cached system prompt (stable for the session).
|
# - First turn: bake into cached system prompt (stable for the session).
|
||||||
# - Later turns: inject as ephemeral system context for this API call only.
|
# - Later turns: attach recall to the current-turn user message at
|
||||||
|
# API-call time only (never persisted to history / session DB).
|
||||||
#
|
#
|
||||||
# This keeps the persisted/cached prompt stable while still allowing
|
# This keeps the system-prefix cache stable while still allowing turn N
|
||||||
# turn N to consume background prefetch results from turn N-1.
|
# to consume background prefetch results from turn N-1.
|
||||||
self._honcho_context = ""
|
self._honcho_context = ""
|
||||||
self._honcho_turn_context = ""
|
self._honcho_turn_context = ""
|
||||||
_recall_mode = (self._honcho_config.recall_mode if self._honcho_config else "hybrid")
|
_recall_mode = (self._honcho_config.recall_mode if self._honcho_config else "hybrid")
|
||||||
@@ -3930,6 +3957,7 @@ class AIAgent:
|
|||||||
# Add user message
|
# Add user message
|
||||||
user_msg = {"role": "user", "content": user_message}
|
user_msg = {"role": "user", "content": user_message}
|
||||||
messages.append(user_msg)
|
messages.append(user_msg)
|
||||||
|
current_turn_user_idx = len(messages) - 1
|
||||||
|
|
||||||
if not self.quiet_mode:
|
if not self.quiet_mode:
|
||||||
print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")
|
print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")
|
||||||
@@ -4079,9 +4107,14 @@ class AIAgent:
|
|||||||
# However, providers like Moonshot AI require a separate 'reasoning_content' field
|
# However, providers like Moonshot AI require a separate 'reasoning_content' field
|
||||||
# on assistant messages with tool_calls. We handle both cases here.
|
# on assistant messages with tool_calls. We handle both cases here.
|
||||||
api_messages = []
|
api_messages = []
|
||||||
for msg in messages:
|
for idx, msg in enumerate(messages):
|
||||||
api_msg = msg.copy()
|
api_msg = msg.copy()
|
||||||
|
|
||||||
|
if idx == current_turn_user_idx and msg.get("role") == "user" and self._honcho_turn_context:
|
||||||
|
api_msg["content"] = _inject_honcho_turn_context(
|
||||||
|
api_msg.get("content", ""), self._honcho_turn_context
|
||||||
|
)
|
||||||
|
|
||||||
# For ALL assistant messages, pass reasoning back to the API
|
# For ALL assistant messages, pass reasoning back to the API
|
||||||
# This ensures multi-turn reasoning context is preserved
|
# This ensures multi-turn reasoning context is preserved
|
||||||
if msg.get("role") == "assistant":
|
if msg.get("role") == "assistant":
|
||||||
@@ -4109,11 +4142,11 @@ class AIAgent:
|
|||||||
|
|
||||||
# Build the final system message: cached prompt + ephemeral system prompt.
|
# Build the final system message: cached prompt + ephemeral system prompt.
|
||||||
# Ephemeral additions are API-call-time only (not persisted to session DB).
|
# Ephemeral additions are API-call-time only (not persisted to session DB).
|
||||||
|
# Honcho later-turn recall is intentionally kept OUT of the system prompt
|
||||||
|
# so the stable cache prefix remains unchanged.
|
||||||
effective_system = active_system_prompt or ""
|
effective_system = active_system_prompt or ""
|
||||||
if self.ephemeral_system_prompt:
|
if self.ephemeral_system_prompt:
|
||||||
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
|
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
|
||||||
if self._honcho_turn_context:
|
|
||||||
effective_system = (effective_system + "\n\n" + self._honcho_turn_context).strip()
|
|
||||||
if effective_system:
|
if effective_system:
|
||||||
api_messages = [{"role": "system", "content": effective_system}] + api_messages
|
api_messages = [{"role": "system", "content": effective_system}] + api_messages
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ from unittest.mock import MagicMock, patch
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from honcho_integration.client import HonchoClientConfig
|
from honcho_integration.client import HonchoClientConfig
|
||||||
from run_agent import AIAgent
|
from run_agent import AIAgent, _inject_honcho_turn_context
|
||||||
from agent.prompt_builder import DEFAULT_AGENT_IDENTITY
|
from agent.prompt_builder import DEFAULT_AGENT_IDENTITY
|
||||||
|
|
||||||
|
|
||||||
@@ -1441,6 +1441,53 @@ class TestSystemPromptStability:
|
|||||||
should_prefetch = bool(conversation_history) and recall_mode != "tools"
|
should_prefetch = bool(conversation_history) and recall_mode != "tools"
|
||||||
assert should_prefetch is True
|
assert should_prefetch is True
|
||||||
|
|
||||||
|
def test_inject_honcho_turn_context_appends_system_note(self):
    """The helper keeps the original text and appends the labelled Honcho recall note."""
    injected = _inject_honcho_turn_context("hello", "## Honcho Memory\nprior context")
    # Original user text must survive the injection.
    assert "hello" in injected
    # The system-note label and the recalled memory must both be present.
    assert "Honcho memory was retrieved from prior sessions" in injected
    assert "## Honcho Memory" in injected
|
||||||
|
|
||||||
|
def test_honcho_continuing_session_keeps_turn_context_out_of_system_prompt(self, agent):
    """On a continuing session, Honcho recall is attached to the current-turn
    user message sent to the API and is kept OUT of the system prompt, so the
    cached system prefix stays stable.

    Uses the project `agent` fixture; `_mock_response` is a project test helper.
    """
    # Capture the exact kwargs the agent sends to the (mocked) API call.
    captured = {}

    def _fake_api_call(api_kwargs):
        captured.update(api_kwargs)
        return _mock_response(content="done", finish_reason="stop")

    # Minimal Honcho wiring: a truthy client, a session key, and a config
    # whose recall_mode enables prefetch on continuing sessions.
    agent._honcho = object()
    agent._honcho_session_key = "session-1"
    agent._honcho_config = SimpleNamespace(
        ai_peer="hermes",
        memory_mode="hybrid",
        write_frequency="async",
        recall_mode="hybrid",
    )
    agent._use_prompt_caching = False
    # Non-empty history marks this as a continuing (not first) turn.
    conversation_history = [
        {"role": "user", "content": "hello"},
        {"role": "assistant", "content": "hi there"},
    ]

    # Patch out prefetch (returns canned recall), persistence, cleanup, and
    # the API call so only message assembly is exercised.
    with (
        patch.object(agent, "_honcho_prefetch", return_value="## Honcho Memory\nprior context"),
        patch.object(agent, "_queue_honcho_prefetch"),
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
        patch.object(agent, "_interruptible_api_call", side_effect=_fake_api_call),
    ):
        result = agent.run_conversation("what were we doing?", conversation_history=conversation_history)

    assert result["completed"] is True
    api_messages = captured["messages"]
    # System prompt leads the API messages and must NOT contain the recall.
    assert api_messages[0]["role"] == "system"
    assert "prior context" not in api_messages[0]["content"]
    # The current-turn user message carries both the user's text and the
    # injected recall note.
    current_user = api_messages[-1]
    assert current_user["role"] == "user"
    assert "what were we doing?" in current_user["content"]
    assert "prior context" in current_user["content"]
    assert "Honcho memory was retrieved from prior sessions" in current_user["content"]
|
||||||
|
|
||||||
def test_honcho_prefetch_runs_on_first_turn(self):
|
def test_honcho_prefetch_runs_on_first_turn(self):
|
||||||
"""Honcho prefetch should run when conversation_history is empty."""
|
"""Honcho prefetch should run when conversation_history is empty."""
|
||||||
conversation_history = []
|
conversation_history = []
|
||||||
|
|||||||
Reference in New Issue
Block a user