hermes-agent/agent/image_routing.py

feat(image-input): native multimodal routing based on model vision capability (#16506)

* feat(image-input): native multimodal routing based on model vision capability

  Attach user-sent images as OpenAI-style content parts on the user turn when the
  active model supports native vision, so vision-capable models see real pixels
  instead of a lossy text description from vision_analyze.

  Routing decision (agent/image_routing.py::decide_image_input_mode):
  agent.image_input_mode = auto | native | text (default: auto)

  In auto mode:
  - If auxiliary.vision.provider/model is explicitly configured, keep the text
    pipeline (user paid for a dedicated vision backend).
  - Else if models.dev reports supports_vision=True for the active provider/model,
    attach natively.
  - Else fall back to text (current behaviour).

  Call sites updated: gateway/run.py (all messaging platforms), tui_gateway
  (dashboard/Ink), cli.py (interactive /attach + drag-drop).

  run_agent.py changes:
  - _prepare_anthropic_messages_for_api now passes image parts through unchanged
    when the model supports vision — the Anthropic adapter translates them to
    native image blocks. Previous behaviour (vision_analyze → text) only runs for
    non-vision Anthropic models.
  - New _prepare_messages_for_non_vision_model mirrors the same contract for the
    chat.completions and codex_responses paths, so non-vision models on any
    provider get text-fallback instead of failing at the provider.
  - New _model_supports_vision() helper reads models.dev caps.

  vision_analyze description rewritten: positions it as a tool for images NOT
  already visible in the conversation (URLs, tool output, deeper inspection).
  Prevents the model from redundantly calling it on images already attached
  natively.

  Config default: agent.image_input_mode = auto.

  Tests: 35 new (test_image_routing.py + test_vision_aware_preprocessing.py); all
  existing tests that reference _prepare_anthropic_messages_for_api still pass
  (198 targeted + new tests green).

* feat(image-input): size-cap + resize oversized images, charge image tokens in compressor

  Two follow-ups that make the native image routing safer for long / heavy sessions:

  1) Oversize handling in build_native_content_parts:
     - 20 MB ceiling per image (matches vision_tools._MAX_BASE64_BYTES, the most
       restrictive provider — Gemini inline data).
     - Delegates to vision_tools._resize_image_for_vision (Pillow-based, already
       battle-tested) to downscale to 5 MB first-try.
     - If Pillow is missing or resize still overshoots, the image is dropped and
       reported back in skipped[]; the caller falls back to text enrichment for
       that image.

  2) Image-token accounting in context_compressor:
     - New _IMAGE_TOKEN_ESTIMATE = 1600 (matches Claude Code's constant; within the
       realistic range for Anthropic/GPT-4o/Gemini billing).
     - _content_length_for_budget() helper: sums text-part lengths and charges
       _IMAGE_CHAR_EQUIVALENT (1600 * 4 chars) per image/image_url/input_image part.
       Base64 payload inside image_url is NOT counted as chars — dimensions don't
       matter, only image-presence.
     - Both tail-cut sites (_prune_old_tool_results L527 and
       _find_tail_cut_by_tokens L1126) now call the helper so multi-image
       conversations don't slip past the compression budget.
  Tests: 9 new in test_image_routing.py (oversize triggers resize,
  resize-fails-returns-None, oversize-skipped-reported), 11 new in
  test_compressor_image_tokens.py (flat charge per image, multiple images,
  Responses-API / Anthropic-native / OpenAI-chat shapes, no-inflation on raw
  base64, bounds-check on the constant, integration test that an image-heavy tail
  actually gets trimmed).

* fix(image-input): replace blanket 20MB ceiling with empirically-verified per-provider limits

  The previous commit imposed a hardcoded 20 MB base64 ceiling on all providers,
  triggering auto-resize on anything larger. This was wrong in both directions:

  * Too loose for Anthropic — actual limit is 5 MB (returns HTTP 400 'image exceeds
    5 MB maximum' above that).
  * Too strict for OpenAI / Codex / OpenRouter — they accept 49 MB+ without
    complaint (empirically verified April 2026 with progressive PNG sizes).

  New behaviour:

  * _PROVIDER_BASE64_CEILING table: only anthropic and bedrock have a ceiling
    (5 MB, since bedrock-on-Claude shares Anthropic's decoder).
  * Providers NOT in the table get no ceiling — images attach at native size and we
    trust the provider to return its own error if it disagrees. A provider-specific
    400 message is clearer than us guessing wrong and silently degrading image
    quality.
  * build_native_content_parts() gains a keyword-only provider arg; gateway/CLI/TUI
    pass the active provider so Anthropic users get auto-resize protection while
    OpenAI users don't pay it.
  * Resize target dropped from 5 MB to 4 MB to slide safely under Anthropic's
    boundary with header overhead.

  Empirical measurements (direct API, no Hermes in the loop):

    image b64    anthropic    openrouter/gpt5.5    codex-oauth/gpt5.5
    0.19 MB      ✓            ✓                    ✓
    12.37 MB     ✗ 400 5MB    ✓                    ✓
    23.85 MB     ✗ 400 5MB    ✓                    ✓
    49.46 MB     ✗ 413        ✓                    ✓

  Tests: rewrote TestOversizeHandling (5 tests): no-ceiling pass-through, Anthropic
  resize fires, Anthropic skip on resize-fail, build_native_parts routes ceiling by
  provider, unknown provider gets no ceiling. All 52 targeted tests pass.

* refactor(image-input): attempt native, shrink-and-retry on provider reject

  Replace proactive per-provider size ceilings with a reactive shrink path on the
  provider's actual rejection. All providers now attempt native full-size
  attachment first; if the provider returns an image-too-large error, the agent
  silently shrinks and retries once.

  Why the previous design was wrong: hardcoding provider ceilings (anthropic=5MB,
  others=unlimited) meant OpenAI users on a 10MB image paid no tax, but Anthropic
  users lost quality on anything >5MB even though the empirical behaviour at
  provider-reject time is the same (shrink + retry). Baking the table into the
  routing layer also requires updating Hermes every time a provider's limit
  changes.

  Reactive design:
  - image_routing.py: _file_to_data_url encodes native size, no ceiling.
    build_native_content_parts drops its provider kwarg.
  - error_classifier.py: new FailoverReason.image_too_large + pattern match
    ("image exceeds", "image too large", etc.) checked BEFORE context_overflow so
    Anthropic's 5MB rejection lands in the right bucket.
  - run_agent.py: new _try_shrink_image_parts_in_messages walks api messages
    in-place, re-encodes oversized data: URL image parts through
    vision_tools._resize_image_for_vision to fit under 4MB, handles both
    chat.completions (dict image_url) and Responses (string image_url) shapes,
    and ignores http URLs (provider-fetched).
  New image_shrink_retry_attempted flag in the retry loop fires the shrink exactly
  once per turn, after credential-pool recovery but before auth retries.

  E2E verified live against Anthropic claude-sonnet-4-6:
  - 17.9MB PNG (23.9MB b64) attached at native size
  - Anthropic returns 400 "image exceeds 5 MB maximum"
  - Agent logs '📐 Image(s) exceeded provider size limit — shrank and retrying...'
  - Retry succeeds, correct response delivered in 6.8s total.

  Tests: 12 new (8 shrink-helper shapes + 4 classifier signals); replaces 5
  proactive-ceiling tests with 3 simpler 'native attach works' tests. 181 targeted
  tests pass. test_enum_members_exist in test_error_classifier.py updated for the
  new enum value.
2026-04-27 06:27:59 -07:00
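
The image-token accounting described in the second commit lives in
agent/context_compressor.py, not in the file below. A minimal sketch of the idea,
using the constants and helper name from the commit message; the actual
implementation may differ in detail:

    # Flat per-image charge so image-heavy tails count against the compression budget.
    _IMAGE_TOKEN_ESTIMATE = 1600                        # rough per-image token cost
    _IMAGE_CHAR_EQUIVALENT = _IMAGE_TOKEN_ESTIMATE * 4  # ~4 chars per token

    def _content_length_for_budget(content):
        """Character-equivalent length of a message's content for budgeting (sketch)."""
        if isinstance(content, str):
            return len(content)
        total = 0
        for part in content or []:
            if not isinstance(part, dict):
                continue
            ptype = part.get("type", "")
            if ptype in ("text", "input_text"):
                total += len(part.get("text") or "")
            elif ptype in ("image", "image_url", "input_image"):
                # One flat charge per image part; the base64 payload is not counted.
                total += _IMAGE_CHAR_EQUIVALENT
        return total
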
"""Routing helpers for inbound user-attached images.
Two modes:
native attach images as OpenAI-style ``image_url`` content parts on the
user turn. Provider adapters (Anthropic, Gemini, Bedrock, Codex,
OpenAI chat.completions) already translate these into their
vendor-specific multimodal formats.
text run ``vision_analyze`` on each image up-front and prepend the
description to the user's text. The model never sees the pixels;
it only sees a lossy text summary. This is the pre-existing
behaviour and still the right choice for non-vision models.
The decision is made once per message turn by :func:`decide_image_input_mode`.
It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native``
| ``text``, default ``auto``) and the active model's capability metadata.
In ``auto`` mode:
- If the user has explicitly configured ``auxiliary.vision.provider``
(i.e. not ``auto`` and not empty), we assume they want the text pipeline
regardless of the main model they've opted in to a specific vision
backend for a reason (cost, quality, local-only, etc.).
- Otherwise, if the active model reports ``supports_vision=True`` in its
models.dev metadata, we attach natively.
- Otherwise (non-vision model, no explicit override), we fall back to text.
This keeps ``vision_analyze`` surfaced as a tool in every session skills
and agent flows that chain it (browser screenshots, deeper inspection of
URL-referenced images, style-gating loops) keep working. The routing only
affects *how user-attached images on the current turn* are presented to the
main model.
"""

from __future__ import annotations

import base64
import logging
import mimetypes
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

_VALID_MODES = frozenset({"auto", "native", "text"})


def _coerce_mode(raw: Any) -> str:
    """Normalize a config value into one of the valid modes."""
    if not isinstance(raw, str):
        return "auto"
    val = raw.strip().lower()
    if val in _VALID_MODES:
        return val
    return "auto"


def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
    """True when the user configured a specific auxiliary vision backend.

    An explicit override means the user *wants* the text pipeline (they're
    paying for a dedicated vision model), so we don't silently bypass it.
    """
    if not isinstance(cfg, dict):
        return False
    aux = cfg.get("auxiliary") or {}
    if not isinstance(aux, dict):
        return False
    vision = aux.get("vision") or {}
    if not isinstance(vision, dict):
        return False
    provider = str(vision.get("provider") or "").strip().lower()
    model = str(vision.get("model") or "").strip()
    base_url = str(vision.get("base_url") or "").strip()
    # "auto" / "" / blank = not explicit
    if provider in ("", "auto") and not model and not base_url:
        return False
    return True


def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]:
    """Return True/False if we can resolve caps, None if unknown."""
    if not provider or not model:
        return None
    try:
        from agent.models_dev import get_model_capabilities
        caps = get_model_capabilities(provider, model)
    except Exception as exc:  # pragma: no cover - defensive
        logger.debug("image_routing: caps lookup failed for %s:%s: %s", provider, model, exc)
        return None
    if caps is None:
        return None
    return bool(caps.supports_vision)


def decide_image_input_mode(
    provider: str,
    model: str,
    cfg: Optional[Dict[str, Any]],
) -> str:
    """Return ``"native"`` or ``"text"`` for the given turn.

    Args:
        provider: active inference provider ID (e.g. ``"anthropic"``, ``"openrouter"``).
        model: active model slug as it would be sent to the provider.
        cfg: loaded config.yaml dict, or None. When None, behaves as auto.
    """
    mode_cfg = "auto"
    if isinstance(cfg, dict):
        agent_cfg = cfg.get("agent") or {}
        if isinstance(agent_cfg, dict):
            mode_cfg = _coerce_mode(agent_cfg.get("image_input_mode"))
    if mode_cfg == "native":
        return "native"
    if mode_cfg == "text":
        return "text"
    # auto
    if _explicit_aux_vision_override(cfg):
        return "text"
    supports = _lookup_supports_vision(provider, model)
    if supports is True:
        return "native"
    return "text"
# Image size handling is REACTIVE rather than proactive: we attempt native
# attachment at full size regardless of provider, and rely on
# ``run_agent._try_shrink_image_parts_in_messages`` to shrink + retry if
# the provider rejects the request (e.g. Anthropic's hard 5 MB per-image
# ceiling returned as HTTP 400 "image exceeds 5 MB maximum").
#
# Why reactive: our knowledge of provider ceilings is partial and evolving
# (OpenAI accepts 49 MB+, Anthropic 5 MB, Gemini 100 MB, others unknown).
# A proactive per-provider table would be stale the moment a provider raises
# or lowers its limit, and silently degrading quality for users on providers
# that would have accepted the full image is the worse failure mode.
# The shrink-on-reject path loses 1 API call + maybe 1s of Pillow work when
# it fires, which is cheaper than permanent quality loss.
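
# The shrink-and-retry machinery referenced above lives in run_agent.py and
# error_classifier.py, not here. A rough sketch of that retry-loop step; names
# other than the ones already cited in this module (FailoverReason.image_too_large,
# image_shrink_retry_attempted, _try_shrink_image_parts_in_messages,
# vision_tools._resize_image_for_vision) are hypothetical:
#
#   reason = classify_error(exc)  # error_classifier.py; yields a FailoverReason
#   if reason is FailoverReason.image_too_large and not image_shrink_retry_attempted:
#       image_shrink_retry_attempted = True
#       # Walk the outgoing messages in-place: every image part carried as a
#       # ``data:`` URL is re-encoded through vision_tools._resize_image_for_vision
#       # to fit under ~4 MB; http(s) URLs are provider-fetched and left alone.
#       _try_shrink_image_parts_in_messages(api_messages)
#       continue  # retry the same turn once with the shrunken images
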

def _guess_mime(path: Path) -> str:
    mime, _ = mimetypes.guess_type(str(path))
    if mime and mime.startswith("image/"):
        return mime
    # mimetypes on some Linux distros mis-maps .jpg; default to jpeg when
    # the suffix looks imagey.
    suffix = path.suffix.lower()
    return {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
    }.get(suffix, "image/jpeg")


def _file_to_data_url(path: Path) -> Optional[str]:
    """Encode a local image as a base64 data URL at its native size.

    Size limits are NOT enforced here; the agent retry loop
    (``run_agent._try_shrink_image_parts_in_messages``) shrinks on the
    provider's first rejection. Keeping this simple means providers that
    accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent
    quality tax just because one other provider is stricter.

    Returns None only if the file can't be read (missing, permission
    denied, etc.); the caller reports those paths in ``skipped``.
    """
    try:
        raw = path.read_bytes()
    except Exception as exc:
        logger.warning("image_routing: failed to read %s: %s", path, exc)
        return None
    mime = _guess_mime(path)
    b64 = base64.b64encode(raw).decode("ascii")
    return f"data:{mime};base64,{b64}"


def build_native_content_parts(
    user_text: str,
    image_paths: List[str],
) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Build an OpenAI-style ``content`` list for a user turn.

    Shape:

        [{"type": "text", "text": "..."},
         {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
         ...]

    Images are attached at their native size. If a provider rejects the
    request because an image is too large (e.g. Anthropic's 5 MB per-image
    ceiling), the agent's retry loop transparently shrinks and retries
    once; see ``run_agent._try_shrink_image_parts_in_messages``.

    Returns (content_parts, skipped_paths). Skipped paths are files that
    couldn't be read from disk.
    """
    parts: List[Dict[str, Any]] = []
    skipped: List[str] = []
    text = (user_text or "").strip()
    if text:
        parts.append({"type": "text", "text": text})
    for raw_path in image_paths:
        p = Path(raw_path)
        if not p.exists() or not p.is_file():
            skipped.append(str(raw_path))
            continue
        data_url = _file_to_data_url(p)
        if not data_url:
            skipped.append(str(raw_path))
            continue
        parts.append({
            "type": "image_url",
            "image_url": {"url": data_url},
        })
    # If the text was empty, add a neutral prompt so the turn isn't just images.
    if not text and any(part.get("type") == "image_url" for part in parts):
        parts.insert(0, {"type": "text", "text": "What do you see in this image?"})
    return parts, skipped
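
# Hedged call-site sketch (variable names are illustrative, not the actual call-site
# code; real callers are gateway/run.py, tui_gateway and cli.py):
#
#   if decide_image_input_mode(provider, model, cfg) == "native":
#       content, skipped = build_native_content_parts(user_text, image_paths)
#       messages.append({"role": "user", "content": content})
#       # Paths in ``skipped`` fall back to the text (vision_analyze) pipeline.
#   else:
#       ...  # pre-existing text pipeline via vision_analyze
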

__all__ = [
    "decide_image_input_mode",
    "build_native_content_parts",
]