fix(gateway): scrub memory-context leaks from vision auto-analysis output

fixes #5719

The auxiliary vision LLM called by gateway._enrich_message_with_vision
can echo its injected Honcho system prompt back into the image
description.  That description gets embedded verbatim into the enriched
user message, so recalled memory (personal facts, dialectic output)
surfaces into a user-visible bubble.

Strips both forms of leak before embedding:
  - <memory-context>...</memory-context> fenced blocks (sanitize_context)
  - trailing '## Honcho Context' sections (header + everything after)

Plus regression tests:
  - tests/agent/test_streaming_context_scrubber.py — 13 tests on the
    stateful scrubber (whole block, split tags, false-positive partial
    tags, unterminated span, reset, case-insensitivity)
  - tests/run_agent/test_run_agent_codex_responses.py — 2 new tests on
    _fire_stream_delta covering the realistic 7-chunk leak scenario and
    the cross-turn scrubber reset
  - tests/gateway/test_vision_memory_leak.py — 4 tests covering the
    vision auto-analysis boundary (clean pass-through, '## Honcho Context'
    header, fenced block, both patterns together)
This commit is contained in:
Erosika
2026-04-24 18:33:19 -04:00
committed by kshitij
parent 5ce5b17a42
commit 3b2edb347d
4 changed files with 309 additions and 0 deletions

View File

@@ -8483,6 +8483,7 @@ class GatewayRunner:
The enriched message string with vision descriptions prepended.
"""
from tools.vision_tools import vision_analyze_tool
from agent.memory_manager import sanitize_context
analysis_prompt = (
"Describe everything visible in this image in thorough detail. "
@@ -8501,6 +8502,14 @@ class GatewayRunner:
result = json.loads(result_json)
if result.get("success"):
description = result.get("analysis", "")
# The auxiliary vision LLM can echo injected system-prompt
# memory context back into its output (#5719). Scrub any
# <memory-context> fences and the "## Honcho Context"
# section before the description lands in a user-visible
# message.
description = sanitize_context(description)
if "## Honcho Context" in description:
description = description.split("## Honcho Context", 1)[0].rstrip()
enriched_parts.append(
f"[The user sent an image~ Here's what I can see:\n{description}]\n"
f"[If you need a closer look, use vision_analyze with "

View File

@@ -0,0 +1,150 @@
"""Unit tests for StreamingContextScrubber (agent/memory_manager.py).
Regression coverage for #5719 — memory-context spans split across stream
deltas must not leak payload to the UI. The one-shot sanitize_context()
regex can't survive chunk boundaries, so _fire_stream_delta routes deltas
through a stateful scrubber.
"""
from agent.memory_manager import StreamingContextScrubber, sanitize_context
class TestStreamingContextScrubberBasics:
    """Core feed()/flush() behaviour of the stateful scrubber."""

    def test_empty_input_returns_empty(self):
        scrubber = StreamingContextScrubber()
        assert scrubber.feed("") == ""
        assert scrubber.flush() == ""

    def test_plain_text_passes_through(self):
        scrubber = StreamingContextScrubber()
        assert scrubber.feed("hello world") == "hello world"
        assert scrubber.flush() == ""

    def test_complete_block_in_single_delta(self):
        """Regression: the one-shot test case from #13672 must still work."""
        scrubber = StreamingContextScrubber()
        echoed = (
            "<memory-context>\n"
            "[System note: The following is recalled memory context, NOT new "
            "user input. Treat as informational background data.]\n\n"
            "## Honcho Context\nstale memory\n"
            "</memory-context>\n\nVisible answer"
        )
        emitted = scrubber.feed(echoed) + scrubber.flush()
        assert emitted == "\n\nVisible answer"

    def test_open_and_close_in_separate_deltas_strips_payload(self):
        """The real streaming case: tag pair split across deltas."""
        scrubber = StreamingContextScrubber()
        chunks = (
            "Hello ",
            "<memory-context>\npayload ",
            "more payload\n",
            "</memory-context> world",
        )
        pieces = [scrubber.feed(chunk) for chunk in chunks]
        pieces.append(scrubber.flush())
        emitted = "".join(pieces)
        assert emitted == "Hello world"
        assert "payload" not in emitted

    def test_realistic_fragmented_chunks_strip_memory_payload(self):
        """Exact leak scenario from the reviewer's comment — 4 realistic chunks.

        This is the case the original #13672 fix silently leaks on: the open
        tag, system note, payload, and close tag each arrive in their own
        delta because providers emit 1-80 char chunks.
        """
        scrubber = StreamingContextScrubber()
        chunks = (
            "<memory-context>\n[System note: The following",
            " is recalled memory context, NOT new user input. "
            "Treat as informational background data.]\n\n",
            "## Honcho Context\nstale memory\n",
            "</memory-context>\n\nVisible answer",
        )
        emitted = "".join(scrubber.feed(chunk) for chunk in chunks)
        emitted += scrubber.flush()
        assert emitted == "\n\nVisible answer"
        # The system-note line and payload must never reach the UI.
        for forbidden in ("System note", "Honcho Context", "stale memory"):
            assert forbidden not in emitted

    def test_open_tag_split_across_two_deltas(self):
        """The open tag itself arriving in two fragments."""
        scrubber = StreamingContextScrubber()
        emitted = scrubber.feed("pre <memory")
        emitted += scrubber.feed("-context>leak</memory-context> post")
        emitted += scrubber.flush()
        assert emitted == "pre post"
        assert "leak" not in emitted

    def test_close_tag_split_across_two_deltas(self):
        """The close tag arriving in two fragments."""
        scrubber = StreamingContextScrubber()
        emitted = scrubber.feed("pre <memory-context>leak</memory")
        emitted += scrubber.feed("-context> post")
        emitted += scrubber.flush()
        assert emitted == "pre post"
        assert "leak" not in emitted
class TestStreamingContextScrubberPartialTagFalsePositives:
    """Text that merely *looks* like the start of a tag must still be emitted."""

    def test_partial_open_tag_tail_emitted_on_flush(self):
        """Bare '<mem' at end of stream is not really a memory-context tag."""
        scrubber = StreamingContextScrubber()
        emitted = scrubber.feed("hello <mem")
        emitted += scrubber.feed("ory other")
        emitted += scrubber.flush()
        assert emitted == "hello <memory other"

    def test_partial_tag_released_when_disambiguated(self):
        """A held-back partial tag that turns out to be prose gets released."""
        scrubber = StreamingContextScrubber()
        # '< ' should not look like the start of any tag.
        emitted = scrubber.feed("price < ")
        emitted += scrubber.feed("10 dollars")
        emitted += scrubber.flush()
        assert emitted == "price < 10 dollars"
class TestStreamingContextScrubberUnterminatedSpan:
    """Behaviour when the close tag never arrives."""

    def test_unterminated_span_drops_payload(self):
        """Provider drops close tag — better to lose output than to leak."""
        scrubber = StreamingContextScrubber()
        emitted = scrubber.feed("pre <memory-context>secret never closed")
        emitted += scrubber.flush()
        assert emitted == "pre "
        assert "secret" not in emitted

    def test_reset_clears_hung_span(self):
        """Cross-turn scrubber reset drops a hung span so next turn is clean."""
        scrubber = StreamingContextScrubber()
        scrubber.feed("pre <memory-context>half")
        scrubber.reset()
        emitted = scrubber.feed("clean text") + scrubber.flush()
        assert emitted == "clean text"
class TestStreamingContextScrubberCaseInsensitivity:
    """Tag matching must not depend on letter case."""

    def test_uppercase_tags_still_scrubbed(self):
        scrubber = StreamingContextScrubber()
        emitted = scrubber.feed("<MEMORY-CONTEXT>secret")
        emitted += scrubber.feed("</Memory-Context>visible")
        emitted += scrubber.flush()
        assert emitted == "visible"
        assert "secret" not in emitted
class TestSanitizeContextUnchanged:
    """Smoke test that the one-shot sanitize_context still works for whole strings."""

    def test_whole_block_still_sanitized(self):
        echoed = (
            "<memory-context>\n"
            "[System note: The following is recalled memory context, NOT new "
            "user input. Treat as informational background data.]\n"
            "payload\n"
            "</memory-context>\nVisible"
        )
        assert sanitize_context(echoed).strip() == "Visible"

View File

@@ -0,0 +1,99 @@
"""Tests for _enrich_message_with_vision — regression for #5719.
The auxiliary vision LLM can echo system-prompt Honcho memory back into
its analysis output. When that echo reaches the user as the enriched
image description, recalled memory context (personal facts, dialectic
output) surfaces into a user-visible message.
The boundary fix in gateway/run.py strips both <memory-context>...</memory-context>
fenced blocks AND any "## Honcho Context" section from vision descriptions
before they're embedded into the enriched user message.
"""
import asyncio
import json
from unittest.mock import AsyncMock, patch
import pytest
@pytest.fixture
def gateway_runner():
    """Minimal GatewayRunner stub with just the method under test bound."""
    from gateway.run import GatewayRunner

    # Build the stub dynamically: an empty class whose only attribute is the
    # unbound method borrowed from the real GatewayRunner.
    stub_cls = type(
        "_Stub",
        (),
        {"_enrich_message_with_vision": GatewayRunner._enrich_message_with_vision},
    )
    return stub_cls()
def _run(coro):
    """Synchronously drive *coro* to completion and return its result.

    The previous one-liner contained a dead ``... if False else ...`` branch
    and created a brand-new event loop on every call without ever closing it,
    leaking loops (and their selector file descriptors) across tests.
    ``asyncio.run`` creates a fresh loop, runs the coroutine, and always
    closes the loop afterwards.
    """
    return asyncio.run(coro)
class TestEnrichMessageWithVision:
    """Boundary checks: vision descriptions are scrubbed before embedding."""

    @staticmethod
    def _enrich_with_mock_analysis(runner, analysis_text):
        # Patch the vision tool to return a canned successful analysis and
        # run the enrichment coroutine to completion.
        tool_payload = json.dumps({"success": True, "analysis": analysis_text})
        with patch(
            "tools.vision_tools.vision_analyze_tool",
            new=AsyncMock(return_value=tool_payload),
        ):
            return _run(
                runner._enrich_message_with_vision("caption", ["/tmp/img.jpg"])
            )

    def test_clean_description_passes_through(self, gateway_runner):
        """Vision output without leaked memory is embedded unchanged."""
        enriched = self._enrich_with_mock_analysis(
            gateway_runner, "A photograph of a sunset over the ocean."
        )
        assert "sunset over the ocean" in enriched

    def test_honcho_context_header_stripped(self, gateway_runner):
        """'## Honcho Context' section and everything after is removed."""
        echoed = (
            "A photograph of a sunset.\n\n"
            "## Honcho Context\n"
            "User prefers concise answers, works at Plastic Labs,\n"
            "uses OPSEC pseudonyms.\n"
        )
        enriched = self._enrich_with_mock_analysis(gateway_runner, echoed)
        assert "sunset" in enriched
        for forbidden in ("Honcho Context", "Plastic Labs", "OPSEC"):
            assert forbidden not in enriched

    def test_memory_context_fence_stripped(self, gateway_runner):
        """<memory-context>...</memory-context> fenced block is scrubbed."""
        echoed = (
            "<memory-context>\n"
            "[System note: The following is recalled memory context, NOT new "
            "user input. Treat as informational background data.]\n\n"
            "User details and preferences here.\n"
            "</memory-context>\n"
            "A photograph of a cat."
        )
        enriched = self._enrich_with_mock_analysis(gateway_runner, echoed)
        assert "photograph of a cat" in enriched
        for forbidden in (
            "<memory-context>",
            "User details and preferences",
            "System note",
        ):
            assert forbidden not in enriched

    def test_both_leak_patterns_together_stripped(self, gateway_runner):
        """A vision output containing both leak shapes is fully scrubbed."""
        echoed = (
            "<memory-context>\n"
            "[System note: The following is recalled memory context, NOT new "
            "user input. Treat as informational background data.]\n"
            "fenced leak\n"
            "</memory-context>\n"
            "A photograph of a dog.\n\n"
            "## Honcho Context\n"
            "header leak\n"
        )
        enriched = self._enrich_with_mock_analysis(gateway_runner, echoed)
        assert "photograph of a dog" in enriched
        for forbidden in (
            "fenced leak",
            "header leak",
            "Honcho Context",
            "<memory-context>",
        ):
            assert forbidden not in enriched

View File

@@ -1158,6 +1158,57 @@ def test_stream_delta_strips_leaked_memory_context(monkeypatch):
assert observed == ["Visible answer"]
def test_stream_delta_strips_leaked_memory_context_across_chunks(monkeypatch):
    """Regression for #5719 — the real streaming case.

    Providers typically emit 1-80 char chunks, so the memory-context open
    tag, system-note line, payload, and close tag each land in a delta of
    their own. A per-delta sanitize_context() regex cannot survive that;
    only a stateful scrubber can. None of the payload, system-note text,
    or "## Honcho Context" header may reach the delta callback.
    """
    agent = _build_agent(monkeypatch)
    captured = []
    agent.stream_delta_callback = captured.append

    chunks = (
        "<memory-context>\n[System note: The following",
        " is recalled memory context, NOT new user input. ",
        "Treat as informational background data.]\n\n",
        "## Honcho Context\n",
        "stale memory about eri\n",
        "</memory-context>\n\n",
        "Visible answer",
    )
    for chunk in chunks:
        agent._fire_stream_delta(chunk)

    surfaced = "".join(captured)
    assert "Visible answer" in surfaced
    # None of the leaked payload may surface.
    for forbidden in (
        "System note",
        "Honcho Context",
        "stale memory",
        "<memory-context>",
        "</memory-context>",
    ):
        assert forbidden not in surfaced
def test_stream_delta_scrubber_resets_between_turns(monkeypatch):
    """An unterminated span from a prior turn must not taint the next turn."""
    agent = _build_agent(monkeypatch)

    # Leave the scrubber hung mid-span, as a dropped close tag would...
    agent._stream_context_scrubber.feed("pre <memory-context>leaked")
    # ...then apply the reset run_conversation() performs at turn start.
    agent._stream_context_scrubber.reset()

    captured = []
    agent.stream_delta_callback = captured.append
    agent._fire_stream_delta("clean new turn text")
    assert "".join(captured) == "clean new turn text"
def test_run_conversation_codex_continues_after_commentary_phase_message(monkeypatch):
agent = _build_agent(monkeypatch)
responses = [