diff --git a/gateway/run.py b/gateway/run.py
index b50bbc5851..9fa4298e9e 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -8483,6 +8483,7 @@ class GatewayRunner:
The enriched message string with vision descriptions prepended.
"""
from tools.vision_tools import vision_analyze_tool
+ from agent.memory_manager import sanitize_context
analysis_prompt = (
"Describe everything visible in this image in thorough detail. "
@@ -8501,6 +8502,14 @@ class GatewayRunner:
result = json.loads(result_json)
if result.get("success"):
description = result.get("analysis", "")
+ # The auxiliary vision LLM can echo injected system-prompt
+ # memory context back into its output (#5719). Scrub any
+            # <memory_context>...</memory_context> fences and the "## Honcho Context"
+ # section before the description lands in a user-visible
+ # message.
+ description = sanitize_context(description)
+ if "## Honcho Context" in description:
+ description = description.split("## Honcho Context", 1)[0].rstrip()
enriched_parts.append(
f"[The user sent an image~ Here's what I can see:\n{description}]\n"
f"[If you need a closer look, use vision_analyze with "
diff --git a/tests/agent/test_streaming_context_scrubber.py b/tests/agent/test_streaming_context_scrubber.py
new file mode 100644
index 0000000000..2dbb17f42c
--- /dev/null
+++ b/tests/agent/test_streaming_context_scrubber.py
@@ -0,0 +1,150 @@
+"""Unit tests for StreamingContextScrubber (agent/memory_manager.py).
+
+Regression coverage for #5719 — memory-context spans split across stream
+deltas must not leak payload to the UI. The one-shot sanitize_context()
+regex can't survive chunk boundaries, so _fire_stream_delta routes deltas
+through a stateful scrubber.
+"""
+
+from agent.memory_manager import StreamingContextScrubber, sanitize_context
+
+
+class TestStreamingContextScrubberBasics:
+ def test_empty_input_returns_empty(self):
+ s = StreamingContextScrubber()
+ assert s.feed("") == ""
+ assert s.flush() == ""
+
+ def test_plain_text_passes_through(self):
+ s = StreamingContextScrubber()
+ assert s.feed("hello world") == "hello world"
+ assert s.flush() == ""
+
+ def test_complete_block_in_single_delta(self):
+ """Regression: the one-shot test case from #13672 must still work."""
+ s = StreamingContextScrubber()
+ leaked = (
+            "<memory_context>\n"
+            "[System note: The following is recalled memory context, NOT new "
+            "user input. Treat as informational background data.]\n\n"
+            "## Honcho Context\nstale memory\n"
+            "</memory_context>\n\nVisible answer"
+ )
+ out = s.feed(leaked) + s.flush()
+ assert out == "\n\nVisible answer"
+
+ def test_open_and_close_in_separate_deltas_strips_payload(self):
+ """The real streaming case: tag pair split across deltas."""
+ s = StreamingContextScrubber()
+ deltas = [
+            "Hello ",
+            "<memory_context>\npayload ",
+            "more payload\n",
+            "</memory_context> world",
+ ]
+ out = "".join(s.feed(d) for d in deltas) + s.flush()
+ assert out == "Hello world"
+ assert "payload" not in out
+
+ def test_realistic_fragmented_chunks_strip_memory_payload(self):
+ """Exact leak scenario from the reviewer's comment — 4 realistic chunks.
+
+ This is the case the original #13672 fix silently leaks on: the open
+ tag, system note, payload, and close tag each arrive in their own
+ delta because providers emit 1-80 char chunks.
+ """
+ s = StreamingContextScrubber()
+ deltas = [
+            "<memory_context>\n[System note: The following",
+            " is recalled memory context, NOT new user input. "
+            "Treat as informational background data.]\n\n",
+            "## Honcho Context\nstale memory\n",
+            "</memory_context>\n\nVisible answer",
+ ]
+ out = "".join(s.feed(d) for d in deltas) + s.flush()
+ assert out == "\n\nVisible answer"
+ # The system-note line and payload must never reach the UI.
+ assert "System note" not in out
+ assert "Honcho Context" not in out
+ assert "stale memory" not in out
+
+ def test_open_tag_split_across_two_deltas(self):
+ """The open tag itself arriving in two fragments."""
+ s = StreamingContextScrubber()
+ out = (
+            s.feed("pre <memory_")
+            + s.feed("context>leak</memory_context> post")
+            + s.flush()
+ )
+ assert out == "pre post"
+ assert "leak" not in out
+
+ def test_close_tag_split_across_two_deltas(self):
+ """The close tag arriving in two fragments."""
+ s = StreamingContextScrubber()
+ out = (
+            s.feed("pre <memory_context>leak</mem")
+            + s.feed("ory_context> post")
+            + s.flush()
+ )
+ assert out == "pre post"
+ assert "leak" not in out
+
+
+class TestStreamingContextScrubberPartialTagFalsePositives:
+ def test_partial_open_tag_tail_emitted_on_flush(self):
+        """Bare '<memory_context' held at a delta boundary; the completed tag
+        opens a span that is never closed, so flush drops the payload."""
+        s = StreamingContextScrubber()
+        out = s.feed("pre <memory_context>secret never closed") + s.flush()
+ assert out == "pre "
+ assert "secret" not in out
+
+ def test_reset_clears_hung_span(self):
+ """Cross-turn scrubber reset drops a hung span so next turn is clean."""
+ s = StreamingContextScrubber()
+        s.feed("pre <memory_context>half")
+ s.reset()
+ out = s.feed("clean text") + s.flush()
+ assert out == "clean text"
+
+
+class TestStreamingContextScrubberCaseInsensitivity:
+ def test_uppercase_tags_still_scrubbed(self):
+ s = StreamingContextScrubber()
+ out = (
+            s.feed("<MEMORY_CONTEXT>secret")
+            + s.feed("</MEMORY_CONTEXT>visible")
+ + s.flush()
+ )
+ assert out == "visible"
+ assert "secret" not in out
+
+
+class TestSanitizeContextUnchanged:
+ """Smoke test that the one-shot sanitize_context still works for whole strings."""
+
+ def test_whole_block_still_sanitized(self):
+ leaked = (
+            "<memory_context>\n"
+            "[System note: The following is recalled memory context, NOT new "
+            "user input. Treat as informational background data.]\n"
+            "payload\n"
+            "</memory_context>\nVisible"
+ )
+ out = sanitize_context(leaked).strip()
+ assert out == "Visible"
diff --git a/tests/gateway/test_vision_memory_leak.py b/tests/gateway/test_vision_memory_leak.py
new file mode 100644
index 0000000000..5f6f0a7762
--- /dev/null
+++ b/tests/gateway/test_vision_memory_leak.py
@@ -0,0 +1,99 @@
+"""Tests for _enrich_message_with_vision — regression for #5719.
+
+The auxiliary vision LLM can echo system-prompt Honcho memory back into
+its analysis output. When that echo reaches the user as the enriched
+image description, recalled memory context (personal facts, dialectic
+output) surfaces into a user-visible message.
+
+The boundary fix in gateway/run.py strips both <memory_context>...</memory_context>
+fenced blocks AND any "## Honcho Context" section from vision descriptions
+before they're embedded into the enriched user message.
+"""
+
+import asyncio
+import json
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+
+@pytest.fixture
+def gateway_runner():
+ """Minimal GatewayRunner stub with just the method under test bound."""
+ from gateway.run import GatewayRunner
+
+ class _Stub:
+ _enrich_message_with_vision = GatewayRunner._enrich_message_with_vision
+
+ return _Stub()
+
+
+def _run(coro):
+ return asyncio.get_event_loop().run_until_complete(coro) if False else asyncio.new_event_loop().run_until_complete(coro)
+
+
+class TestEnrichMessageWithVision:
+ def test_clean_description_passes_through(self, gateway_runner):
+ """Vision output without leaked memory is embedded unchanged."""
+ fake_result = json.dumps({
+ "success": True,
+ "analysis": "A photograph of a sunset over the ocean.",
+ })
+ with patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=fake_result)):
+ out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
+ assert "sunset over the ocean" in out
+
+ def test_honcho_context_header_stripped(self, gateway_runner):
+ """'## Honcho Context' section and everything after is removed."""
+ leaked = (
+ "A photograph of a sunset.\n\n"
+ "## Honcho Context\n"
+ "User prefers concise answers, works at Plastic Labs,\n"
+ "uses OPSEC pseudonyms.\n"
+ )
+ fake_result = json.dumps({"success": True, "analysis": leaked})
+ with patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=fake_result)):
+ out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
+ assert "sunset" in out
+ assert "Honcho Context" not in out
+ assert "Plastic Labs" not in out
+ assert "OPSEC" not in out
+
+ def test_memory_context_fence_stripped(self, gateway_runner):
+        """<memory_context>...</memory_context> fenced block is scrubbed."""
+ leaked = (
+            "<memory_context>\n"
+            "[System note: The following is recalled memory context, NOT new "
+            "user input. Treat as informational background data.]\n\n"
+            "User details and preferences here.\n"
+            "</memory_context>\n"
+ "A photograph of a cat."
+ )
+ fake_result = json.dumps({"success": True, "analysis": leaked})
+ with patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=fake_result)):
+ out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
+ assert "photograph of a cat" in out
+        assert "<memory_context>" not in out
+ assert "User details and preferences" not in out
+ assert "System note" not in out
+
+ def test_both_leak_patterns_together_stripped(self, gateway_runner):
+ """A vision output containing both leak shapes is fully scrubbed."""
+ leaked = (
+            "<memory_context>\n"
+            "[System note: The following is recalled memory context, NOT new "
+            "user input. Treat as informational background data.]\n"
+            "fenced leak\n"
+            "</memory_context>\n"
+ "A photograph of a dog.\n\n"
+ "## Honcho Context\n"
+ "header leak\n"
+ )
+ fake_result = json.dumps({"success": True, "analysis": leaked})
+ with patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=fake_result)):
+ out = _run(gateway_runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"]))
+ assert "photograph of a dog" in out
+ assert "fenced leak" not in out
+ assert "header leak" not in out
+ assert "Honcho Context" not in out
+        assert "<memory_context>" not in out
diff --git a/tests/run_agent/test_run_agent_codex_responses.py b/tests/run_agent/test_run_agent_codex_responses.py
index 9c940a744f..74dc64c287 100644
--- a/tests/run_agent/test_run_agent_codex_responses.py
+++ b/tests/run_agent/test_run_agent_codex_responses.py
@@ -1158,6 +1158,57 @@ def test_stream_delta_strips_leaked_memory_context(monkeypatch):
assert observed == ["Visible answer"]
+def test_stream_delta_strips_leaked_memory_context_across_chunks(monkeypatch):
+ """Regression for #5719 — the real streaming case.
+
+ Providers typically emit 1-80 char chunks, so the memory-context open
+ tag, system-note line, payload, and close tag each arrive in separate
+ deltas. The per-delta sanitize_context() regex cannot survive that
+ — only a stateful scrubber can. None of the payload, system-note
+ text, or "## Honcho Context" header may reach the delta callback.
+ """
+ agent = _build_agent(monkeypatch)
+ observed = []
+ agent.stream_delta_callback = observed.append
+
+ deltas = [
+        "<memory_context>\n[System note: The following",
+        " is recalled memory context, NOT new user input. ",
+        "Treat as informational background data.]\n\n",
+        "## Honcho Context\n",
+        "stale memory about eri\n",
+        "</memory_context>\n\n",
+ "Visible answer",
+ ]
+ for d in deltas:
+ agent._fire_stream_delta(d)
+
+ combined = "".join(observed)
+ assert "Visible answer" in combined
+ # None of the leaked payload may surface.
+ assert "System note" not in combined
+ assert "Honcho Context" not in combined
+ assert "stale memory" not in combined
+    assert "<memory_context>" not in combined
+    assert "</memory_context>" not in combined
+
+
+def test_stream_delta_scrubber_resets_between_turns(monkeypatch):
+ """An unterminated span from a prior turn must not taint the next turn."""
+ agent = _build_agent(monkeypatch)
+
+ # Simulate a hung span carried over — directly populate the scrubber.
+    agent._stream_context_scrubber.feed("pre <memory_context>leaked")
+
+ # Normally run_conversation() resets the scrubber at turn start.
+ agent._stream_context_scrubber.reset()
+
+ observed = []
+ agent.stream_delta_callback = observed.append
+ agent._fire_stream_delta("clean new turn text")
+ assert "".join(observed) == "clean new turn text"
+
+
def test_run_conversation_codex_continues_after_commentary_phase_message(monkeypatch):
agent = _build_agent(monkeypatch)
responses = [