tests/run_agent/test_vision_aware_preprocessing.py

"""Tests for the vision-aware image preprocessing in run_agent.py.

Covers:

* ``_prepare_anthropic_messages_for_api`` — passes image parts through
  unchanged when the active model reports ``supports_vision=True`` (the
  adapter handles them natively), and falls back to text-description
  replacement when the model lacks vision.

* ``_prepare_messages_for_non_vision_model`` — the mirror method for the
  chat.completions / codex_responses paths. Same contract.
"""

from __future__ import annotations

from unittest.mock import MagicMock, patch

import pytest

from run_agent import AIAgent


def _make_agent() -> AIAgent:
    """Build a bare-bones AIAgent instance without running __init__.

    Avoids the heavy provider/credential setup for these pure-method tests.
    """
    agent = object.__new__(AIAgent)
    agent.provider = "anthropic"
    agent.model = "claude-sonnet-4"
    agent._anthropic_image_fallback_cache = {}
    return agent


IMG_PARTS_USER_MSG = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What's in this image?"},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},
    ],
}

PLAIN_USER_MSG = {"role": "user", "content": "hello, no images here"}


# ─── _prepare_anthropic_messages_for_api ─────────────────────────────────────


class TestPrepareAnthropicMessages:
    def test_no_images_passes_through(self):
        agent = _make_agent()
        msgs = [PLAIN_USER_MSG]
        out = agent._prepare_anthropic_messages_for_api(msgs)
        assert out is msgs  # unchanged reference

    def test_vision_capable_passes_images_through(self):
        """The Anthropic adapter handles image_url/input_image natively."""
        agent = _make_agent()
        with patch.object(agent, "_model_supports_vision", return_value=True):
            out = agent._prepare_anthropic_messages_for_api([IMG_PARTS_USER_MSG])
        # Passes through unchanged — image_url parts still present.
        assert out[0]["content"][1]["type"] == "image_url"

    def test_non_vision_replaces_images_with_text(self):
        agent = _make_agent()
        with patch.object(agent, "_model_supports_vision", return_value=False), \
             patch.object(
                 agent,
                 "_describe_image_for_anthropic_fallback",
                 return_value="[Image description: a cat]",
             ):
            out = agent._prepare_anthropic_messages_for_api([IMG_PARTS_USER_MSG])
        # Content collapsed to a string containing the description + user text.
        content = out[0]["content"]
        assert isinstance(content, str)
        assert "[Image description: a cat]" in content
        assert "What's in this image?" in content
        # No more image parts.
        assert "image_url" not in content


# ─── _prepare_messages_for_non_vision_model ──────────────────────────────────


class TestPrepareMessagesForNonVision:
    def test_no_images_passes_through(self):
        agent = _make_agent()
        msgs = [PLAIN_USER_MSG]
        out = agent._prepare_messages_for_non_vision_model(msgs)
        assert out is msgs

    def test_vision_capable_passes_through(self):
        """For vision-capable models on chat.completions path, provider handles pixels."""
        agent = _make_agent()
        agent.provider = "openrouter"
        agent.model = "anthropic/claude-sonnet-4"
        with patch.object(agent, "_model_supports_vision", return_value=True):
            out = agent._prepare_messages_for_non_vision_model([IMG_PARTS_USER_MSG])
        assert out[0]["content"][1]["type"] == "image_url"

    def test_non_vision_strips_images(self):
        agent = _make_agent()
        agent.provider = "openrouter"
        agent.model = "qwen/qwen3-235b-a22b"
        with patch.object(agent, "_model_supports_vision", return_value=False), \
             patch.object(
                 agent,
                 "_describe_image_for_anthropic_fallback",
                 return_value="[Image description: a dog]",
             ):
            out = agent._prepare_messages_for_non_vision_model([IMG_PARTS_USER_MSG])
        content = out[0]["content"]
        assert isinstance(content, str)
        assert "[Image description: a dog]" in content
        assert "image_url" not in content

    def test_multiple_messages_with_mixed_content(self):
        agent = _make_agent()
        agent.model = "qwen/qwen3-235b"
        msgs = [
            {"role": "user", "content": "first turn"},
            {"role": "assistant", "content": "ack"},
            IMG_PARTS_USER_MSG,
        ]
        with patch.object(agent, "_model_supports_vision", return_value=False), \
             patch.object(
                 agent,
                 "_describe_image_for_anthropic_fallback",
                 return_value="[Image: thing]",
             ):
            out = agent._prepare_messages_for_non_vision_model(msgs)
        # First two messages unchanged (no images), third stripped.
        assert out[0]["content"] == "first turn"
        assert out[1]["content"] == "ack"
        assert isinstance(out[2]["content"], str)
        assert "[Image: thing]" in out[2]["content"]


# ─── _model_supports_vision ──────────────────────────────────────────────────


class TestModelSupportsVision:
    def test_missing_provider_or_model_returns_false(self):
        agent = _make_agent()
        agent.provider = ""
        agent.model = "claude-sonnet-4"
        assert agent._model_supports_vision() is False
        agent.provider = "anthropic"
        agent.model = ""
        assert agent._model_supports_vision() is False

    def test_uses_get_model_capabilities(self):
        agent = _make_agent()
        fake_caps = MagicMock()
        fake_caps.supports_vision = True
        with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps):
            assert agent._model_supports_vision() is True
        fake_caps.supports_vision = False
        with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps):
            assert agent._model_supports_vision() is False

    def test_none_caps_returns_false(self):
        agent = _make_agent()
        with patch("agent.models_dev.get_model_capabilities", return_value=None):
            assert agent._model_supports_vision() is False

    def test_exception_returns_false(self):
        agent = _make_agent()
        with patch("agent.models_dev.get_model_capabilities", side_effect=RuntimeError("boom")):
            assert agent._model_supports_vision() is False
feat(image-input): native multimodal routing based on model vision capability (#16506) * feat(image-input): native multimodal routing based on model vision capability Attach user-sent images as OpenAI-style content parts on the user turn when the active model supports native vision, so vision-capable models see real pixels instead of a lossy text description from vision_analyze. Routing decision (agent/image_routing.py::decide_image_input_mode): agent.image_input_mode = auto \| native \| text (default: auto) In auto mode: - If auxiliary.vision.provider/model is explicitly configured, keep the text pipeline (user paid for a dedicated vision backend). - Else if models.dev reports supports_vision=True for the active provider/model, attach natively. - Else fall back to text (current behaviour). Call sites updated: gateway/run.py (all messaging platforms), tui_gateway (dashboard/Ink), cli.py (interactive /attach + drag-drop). run_agent.py changes: - _prepare_anthropic_messages_for_api now passes image parts through unchanged when the model supports vision — the Anthropic adapter translates them to native image blocks. Previous behaviour (vision_analyze → text) only runs for non-vision Anthropic models. - New _prepare_messages_for_non_vision_model mirrors the same contract for chat.completions and codex_responses paths, so non-vision models on any provider get text-fallback instead of failing at the provider. - New _model_supports_vision() helper reads models.dev caps. vision_analyze description rewritten: positions it as a tool for images NOT already visible in the conversation (URLs, tool output, deeper inspection). Prevents the model from redundantly calling it on images already attached natively. Config default: agent.image_input_mode = auto. Tests: 35 new (test_image_routing.py + test_vision_aware_preprocessing.py), all existing tests that reference _prepare_anthropic_messages_for_api still pass (198 targeted + new tests green). * feat(image-input): size-cap + resize oversized images, charge image tokens in compressor Two follow-ups that make the native image routing safer for long / heavy sessions: 1) Oversize handling in build_native_content_parts: - 20 MB ceiling per image (matches vision_tools._MAX_BASE64_BYTES, the most restrictive provider — Gemini inline data). - Delegates to vision_tools._resize_image_for_vision (Pillow-based, already battle-tested) to downscale to 5 MB first-try. - If Pillow is missing or resize still overshoots, the image is dropped and reported back in skipped[]; caller falls back to text enrichment for that image. 2) Image-token accounting in context_compressor: - New _IMAGE_TOKEN_ESTIMATE = 1600 (matches Claude Code's constant; within the realistic range for Anthropic/GPT-4o/Gemini billing). - _content_length_for_budget() helper: sums text-part lengths and charges _IMAGE_CHAR_EQUIVALENT (1600 * 4 chars) per image/image_url/ input_image part. Base64 payload inside image_url is NOT counted as chars — dimensions don't matter, only image-presence. - Both tail-cut sites (_prune_old_tool_results L527 and _find_tail_cut_by_tokens L1126) now call the helper so multi-image conversations don't slip past compression budget. Tests: 9 new in test_image_routing.py (oversize triggers resize, resize-fails-returns-None, oversize-skipped-reported), 11 new in test_compressor_image_tokens.py (flat charge per image, multiple images, Responses-API / Anthropic-native / OpenAI-chat shapes, no-inflation on raw base64, bounds-check on the constant, integration test that an image-heavy tail actually gets trimmed). * fix(image-input): replace blanket 20MB ceiling with empirically-verified per-provider limits The previous commit imposed a hardcoded 20 MB base64 ceiling on all providers, triggering auto-resize on anything larger. This was wrong in both directions: * Too loose for Anthropic — actual limit is 5 MB (returns HTTP 400 'image exceeds 5 MB maximum' above that). * Too strict for OpenAI / Codex / OpenRouter — accept 49 MB+ without complaint (empirically verified April 2026 with progressive PNG sizes). New behaviour: * _PROVIDER_BASE64_CEILING table: only anthropic and bedrock have a ceiling (5 MB, since bedrock-on-Claude shares Anthropic's decoder). * Providers NOT in the table get no ceiling — images attach at native size and we trust the provider to return its own error if it disagrees. A provider-specific 400 message is clearer than us guessing wrong and silently degrading image quality. * build_native_content_parts() gains a keyword-only provider arg; gateway/CLI/TUI pass the active provider so Anthropic users get auto-resize protection while OpenAI users don't pay it. * Resize target dropped from 5 MB to 4 MB to slide safely under Anthropic's boundary with header overhead. Empirical measurements (direct API, no Hermes in the loop): image b64 anthropic openrouter/gpt5.5 codex-oauth/gpt5.5 0.19 MB ✓ ✓ ✓ 12.37 MB ✗ 400 5MB ✓ ✓ 23.85 MB ✗ 400 5MB ✓ ✓ 49.46 MB ✗ 413 ✓ ✓ Tests: rewrote TestOversizeHandling (5 tests): no-ceiling pass-through, Anthropic resize fires, Anthropic skip on resize-fail, build_native_parts routes ceiling by provider, unknown provider gets no ceiling. All 52 targeted tests pass. * refactor(image-input): attempt native, shrink-and-retry on provider reject Replace proactive per-provider size ceilings with a reactive shrink path on the provider's actual rejection. All providers now attempt native full-size attachment first; if the provider returns an image-too-large error, the agent silently shrinks and retries once. Why the previous design was wrong: hardcoding provider ceilings (anthropic=5MB, others=unlimited) meant OpenAI users on a 10MB image paid no tax, but Anthropic users lost quality on anything >5MB even though the empirical behaviour at provider-reject time is the same (shrink + retry). Baking the table into the routing layer also requires updating Hermes every time a provider's limit changes. Reactive design: - image_routing.py: _file_to_data_url encodes native size, no ceiling. build_native_content_parts drops its provider kwarg. - error_classifier.py: new FailoverReason.image_too_large + pattern match ("image exceeds", "image too large", etc.) checked BEFORE context_overflow so Anthropic's 5MB rejection lands in the right bucket. - run_agent.py: new _try_shrink_image_parts_in_messages walks api messages in-place, re-encodes oversized data: URL image parts through vision_tools._resize_image_for_vision to fit under 4MB, handles both chat.completions (dict image_url) and Responses (string image_url) shapes, ignores http URLs (provider-fetched). New image_shrink_retry_attempted flag in the retry loop fires the shrink exactly once per turn after credential-pool recovery but before auth retries. E2E verified live against Anthropic claude-sonnet-4-6: - 17.9MB PNG (23.9MB b64) attached at native size - Anthropic returns 400 "image exceeds 5 MB maximum" - Agent logs '📐 Image(s) exceeded provider size limit — shrank and retrying...' - Retry succeeds, correct response delivered in 6.8s total. Tests: 12 new (8 shrink-helper shapes + 4 classifier signals), replaces 5 proactive-ceiling tests with 3 simpler 'native attach works' tests. 181 targeted tests pass. test_enum_members_exist in test_error_classifier.py updated for the new enum value. 2026-04-27 06:27:59 -07:00			`"""Tests for the vision-aware image preprocessing in run_agent.py.`

			`Covers:`

			* ``_prepare_anthropic_messages_for_api`` — passes image parts through
			unchanged when the active model reports ``supports_vision=True`` (the
			`adapter handles them natively), and falls back to text-description`
			`replacement when the model lacks vision.`

			* ``_prepare_messages_for_non_vision_model`` — the mirror method for the
			`chat.completions / codex_responses paths. Same contract.`
			`"""`

			`from __future__ import annotations`

			`from unittest.mock import MagicMock, patch`

			`import pytest`

			`from run_agent import AIAgent`


			`def _make_agent() -> AIAgent:`
			`"""Build a bare-bones AIAgent instance without running __init__.`

			`Avoids the heavy provider/credential setup for these pure-method tests.`
			`"""`
			`agent = object.__new__(AIAgent)`
			`agent.provider = "anthropic"`
			`agent.model = "claude-sonnet-4"`
			`agent._anthropic_image_fallback_cache = {}`
			`return agent`


			`IMG_PARTS_USER_MSG = {`
			`"role": "user",`
			`"content": [`
			`{"type": "text", "text": "What's in this image?"},`
			`{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}},`
			`],`
			`}`

			`PLAIN_USER_MSG = {"role": "user", "content": "hello, no images here"}`


			`# ─── _prepare_anthropic_messages_for_api ─────────────────────────────────────`


			`class TestPrepareAnthropicMessages:`
			`def test_no_images_passes_through(self):`
			`agent = _make_agent()`
			`msgs = [PLAIN_USER_MSG]`
			`out = agent._prepare_anthropic_messages_for_api(msgs)`
			`assert out is msgs # unchanged reference`

			`def test_vision_capable_passes_images_through(self):`
			`"""The Anthropic adapter handles image_url/input_image natively."""`
			`agent = _make_agent()`
			`with patch.object(agent, "_model_supports_vision", return_value=True):`
			`out = agent._prepare_anthropic_messages_for_api([IMG_PARTS_USER_MSG])`
			`# Passes through unchanged — image_url parts still present.`
			`assert out[0]["content"][1]["type"] == "image_url"`

			`def test_non_vision_replaces_images_with_text(self):`
			`agent = _make_agent()`
			`with patch.object(agent, "_model_supports_vision", return_value=False), \`
			`patch.object(`
			`agent,`
			`"_describe_image_for_anthropic_fallback",`
			`return_value="[Image description: a cat]",`
			`):`
			`out = agent._prepare_anthropic_messages_for_api([IMG_PARTS_USER_MSG])`
			`# Content collapsed to a string containing the description + user text.`
			`content = out[0]["content"]`
			`assert isinstance(content, str)`
			`assert "[Image description: a cat]" in content`
			`assert "What's in this image?" in content`
			`# No more image parts.`
			`assert "image_url" not in content`


			`# ─── _prepare_messages_for_non_vision_model ──────────────────────────────────`


			`class TestPrepareMessagesForNonVision:`
			`def test_no_images_passes_through(self):`
			`agent = _make_agent()`
			`msgs = [PLAIN_USER_MSG]`
			`out = agent._prepare_messages_for_non_vision_model(msgs)`
			`assert out is msgs`

			`def test_vision_capable_passes_through(self):`
			`"""For vision-capable models on chat.completions path, provider handles pixels."""`
			`agent = _make_agent()`
			`agent.provider = "openrouter"`
			`agent.model = "anthropic/claude-sonnet-4"`
			`with patch.object(agent, "_model_supports_vision", return_value=True):`
			`out = agent._prepare_messages_for_non_vision_model([IMG_PARTS_USER_MSG])`
			`assert out[0]["content"][1]["type"] == "image_url"`

			`def test_non_vision_strips_images(self):`
			`agent = _make_agent()`
			`agent.provider = "openrouter"`
			`agent.model = "qwen/qwen3-235b-a22b"`
			`with patch.object(agent, "_model_supports_vision", return_value=False), \`
			`patch.object(`
			`agent,`
			`"_describe_image_for_anthropic_fallback",`
			`return_value="[Image description: a dog]",`
			`):`
			`out = agent._prepare_messages_for_non_vision_model([IMG_PARTS_USER_MSG])`
			`content = out[0]["content"]`
			`assert isinstance(content, str)`
			`assert "[Image description: a dog]" in content`
			`assert "image_url" not in content`

			`def test_multiple_messages_with_mixed_content(self):`
			`agent = _make_agent()`
			`agent.model = "qwen/qwen3-235b"`
			`msgs = [`
			`{"role": "user", "content": "first turn"},`
			`{"role": "assistant", "content": "ack"},`
			`IMG_PARTS_USER_MSG,`
			`]`
			`with patch.object(agent, "_model_supports_vision", return_value=False), \`
			`patch.object(`
			`agent,`
			`"_describe_image_for_anthropic_fallback",`
			`return_value="[Image: thing]",`
			`):`
			`out = agent._prepare_messages_for_non_vision_model(msgs)`
			`# First two messages unchanged (no images), third stripped.`
			`assert out[0]["content"] == "first turn"`
			`assert out[1]["content"] == "ack"`
			`assert isinstance(out[2]["content"], str)`
			`assert "[Image: thing]" in out[2]["content"]`


			`# ─── _model_supports_vision ──────────────────────────────────────────────────`


			`class TestModelSupportsVision:`
			`def test_missing_provider_or_model_returns_false(self):`
			`agent = _make_agent()`
			`agent.provider = ""`
			`agent.model = "claude-sonnet-4"`
			`assert agent._model_supports_vision() is False`
			`agent.provider = "anthropic"`
			`agent.model = ""`
			`assert agent._model_supports_vision() is False`

			`def test_uses_get_model_capabilities(self):`
			`agent = _make_agent()`
			`fake_caps = MagicMock()`
			`fake_caps.supports_vision = True`
			`with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps):`
			`assert agent._model_supports_vision() is True`
			`fake_caps.supports_vision = False`
			`with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps):`
			`assert agent._model_supports_vision() is False`

			`def test_none_caps_returns_false(self):`
			`agent = _make_agent()`
			`with patch("agent.models_dev.get_model_capabilities", return_value=None):`
			`assert agent._model_supports_vision() is False`

			`def test_exception_returns_false(self):`
			`agent = _make_agent()`
			`with patch("agent.models_dev.get_model_capabilities", side_effect=RuntimeError("boom")):`
			`assert agent._model_supports_vision() is False`