diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index 887be7f7bf..9bed919503 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -61,9 +61,52 @@
 _PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
 
 # Chars per token rough estimate
 _CHARS_PER_TOKEN = 4
+# Flat token cost per attached image part. Real cost varies by provider and
+# dimensions (Anthropic ≈ width×height/750, GPT-4o detail-high capped at
+# 1445 for an 8-tile image, Gemini 258/tile), but 1600 is a realistic ceiling
+# that keeps compression budgeting honest for multi-image conversations.
+# Matches Claude Code's IMAGE_TOKEN_ESTIMATE constant.
+_IMAGE_TOKEN_ESTIMATE = 1600
+# Same figure expressed in the char-budget currency the rest of the
+# compressor speaks in. Used when accumulating message "content length"
+# for tail-cut decisions.
+_IMAGE_CHAR_EQUIVALENT = _IMAGE_TOKEN_ESTIMATE * _CHARS_PER_TOKEN
 
 _SUMMARY_FAILURE_COOLDOWN_SECONDS = 600
 
+
+def _content_length_for_budget(raw_content: Any) -> int:
+    """Return the effective char-length of a message's content for token budgeting.
+
+    Plain strings: ``len(content)``. Multimodal lists: sum of text-part
+    ``len(text)`` plus a flat ``_IMAGE_CHAR_EQUIVALENT`` per image part
+    (``image_url`` / ``input_image`` / Anthropic-style ``image``). This
+    keeps the compressor from treating a turn with 5 attached images as
+    near-zero tokens just because the text part is empty.
+    """
+    if isinstance(raw_content, str):
+        return len(raw_content)
+    if not isinstance(raw_content, list):
+        return len(str(raw_content or ""))
+
+    total = 0
+    for p in raw_content:
+        if isinstance(p, str):
+            total += len(p)
+            continue
+        if not isinstance(p, dict):
+            total += len(str(p))
+            continue
+        ptype = p.get("type")
+        if ptype in {"image_url", "input_image", "image"}:
+            total += _IMAGE_CHAR_EQUIVALENT
+        else:
+            # text / input_text / tool_result-with-text / anything else with
+            # a text field. Ignore the raw base64 payload inside image_url
+            # dicts — dimensions don't matter, only whether it's an image.
+            total += len(p.get("text", "") or "")
+    return total
+
+
 def _content_text_for_contains(content: Any) -> str:
     """Return a best-effort text view of message content.
@@ -484,18 +527,7 @@ class ContextCompressor(ContextEngine):
         for i in range(len(result) - 1, -1, -1):
             msg = result[i]
             raw_content = msg.get("content") or ""
-            content_len = (
-                sum(
-                    len(p.get("text", ""))
-                    if isinstance(p, dict)
-                    else len(p)
-                    if isinstance(p, str)
-                    else len(str(p))
-                    for p in raw_content
-                )
-                if isinstance(raw_content, list)
-                else len(raw_content)
-            )
+            content_len = _content_length_for_budget(raw_content)
             msg_tokens = content_len // _CHARS_PER_TOKEN + 10
             for tc in msg.get("tool_calls") or []:
                 if isinstance(tc, dict):
@@ -1094,18 +1126,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
         for i in range(n - 1, head_end - 1, -1):
             msg = messages[i]
             raw_content = msg.get("content") or ""
-            content_len = (
-                sum(
-                    len(p.get("text", ""))
-                    if isinstance(p, dict)
-                    else len(p)
-                    if isinstance(p, str)
-                    else len(str(p))
-                    for p in raw_content
-                )
-                if isinstance(raw_content, list)
-                else len(raw_content)
-            )
+            content_len = _content_length_for_budget(raw_content)
             msg_tokens = content_len // _CHARS_PER_TOKEN + 10  # +10 for role/metadata
             # Include tool call arguments in estimate
             for tc in msg.get("tool_calls") or []:
diff --git a/agent/error_classifier.py b/agent/error_classifier.py
index 87324d6767..0780bde90d 100644
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -42,6 +42,7 @@ class FailoverReason(enum.Enum):
     # Context / payload
     context_overflow = "context_overflow"  # Context too large — compress, not failover
     payload_too_large = "payload_too_large"  # 413 — compress payload
+    image_too_large = "image_too_large"  # Native image part exceeds provider's per-image limit — shrink and retry
 
     # Model
     model_not_found = "model_not_found"  # 404 or invalid model — fallback to different model
@@ -147,6 +148,20 @@ _PAYLOAD_TOO_LARGE_PATTERNS = [
     "error code: 413",
 ]
 
+# Image-size patterns. Matched against 400 bodies (not 413) because most
+# providers return a 400 with a specific image-too-big message before the
+# whole request hits the 413 size limit. Anthropic's wording is the most
+# important here (hard 5 MB per image, returned as
+# "messages.N.content.K.image.source.base64: image exceeds 5 MB maximum").
+_IMAGE_TOO_LARGE_PATTERNS = [
+    "image exceeds",  # Anthropic: "image exceeds 5 MB maximum"
+    "image too large",  # generic
+    "image_too_large",  # error_code variant
+    "image size exceeds",  # variant
+    # "request_too_large" is deliberately not listed: on its own it doesn't
+    # prove an image is the culprit, so the 413 payload_too_large path owns it.
+]
+
 # Context overflow patterns
 _CONTEXT_OVERFLOW_PATTERNS = [
     "context length",
@@ -671,6 +686,15 @@ def _classify_400(
 ) -> ClassifiedError:
     """Classify 400 Bad Request — context overflow, format error, or generic."""
 
+    # Image-too-large from 400 (Anthropic's 5 MB per-image check fires this way).
+    # Must be checked BEFORE context_overflow because messages can trip both
+    # patterns ("exceeds" + "image") and image-shrink is a cheaper recovery.
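+    # Anthropic's wording in the wild, as exercised by the shrink-recovery
+    # tests: "messages.0.content.1.image.source.base64: image exceeds 5 MB
+    # maximum: 12966600 bytes > 5242880 bytes".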
+    if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS):
+        return result_fn(
+            FailoverReason.image_too_large,
+            retryable=True,
+        )
+
     # Context overflow from 400
     if any(p in error_msg for p in _CONTEXT_OVERFLOW_PATTERNS):
         return result_fn(
@@ -798,6 +822,13 @@ def _classify_by_message(
             should_compress=True,
         )
 
+    # Image-too-large patterns (from message text when no status_code)
+    if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS):
+        return result_fn(
+            FailoverReason.image_too_large,
+            retryable=True,
+        )
+
     # Usage-limit patterns need the same disambiguation as 402: some providers
     # surface "usage limit" errors without an HTTP status code. A transient
     # signal ("try again", "resets at", …) means it's a periodic quota, not
diff --git a/agent/image_routing.py b/agent/image_routing.py
new file mode 100644
index 0000000000..bd2ba83c87
--- /dev/null
+++ b/agent/image_routing.py
@@ -0,0 +1,236 @@
+"""Routing helpers for inbound user-attached images.
+
+Two modes:
+
+  native — attach images as OpenAI-style ``image_url`` content parts on the
+      user turn. Provider adapters (Anthropic, Gemini, Bedrock, Codex,
+      OpenAI chat.completions) already translate these into their
+      vendor-specific multimodal formats.
+
+  text — run ``vision_analyze`` on each image up-front and prepend the
+      description to the user's text. The model never sees the pixels;
+      it only sees a lossy text summary. This is the pre-existing
+      behaviour and still the right choice for non-vision models.
+
+The decision is made once per message turn by :func:`decide_image_input_mode`.
+It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native``
+| ``text``, default ``auto``) and the active model's capability metadata.
+
+In ``auto`` mode:
+  - If the user has explicitly configured ``auxiliary.vision.provider``
+    (i.e. not ``auto`` and not empty), we assume they want the text pipeline
+    regardless of the main model — they've opted in to a specific vision
+    backend for a reason (cost, quality, local-only, etc.).
+  - Otherwise, if the active model reports ``supports_vision=True`` in its
+    models.dev metadata, we attach natively.
+  - Otherwise (non-vision model, no explicit override), we fall back to text.
+
+This keeps ``vision_analyze`` surfaced as a tool in every session — skills
+and agent flows that chain it (browser screenshots, deeper inspection of
+URL-referenced images, style-gating loops) keep working. The routing only
+affects *how user-attached images on the current turn* are presented to the
+main model.
+"""
+
+from __future__ import annotations
+
+import base64
+import logging
+import mimetypes
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+_VALID_MODES = frozenset({"auto", "native", "text"})
+
+
+def _coerce_mode(raw: Any) -> str:
+    """Normalize a config value into one of the valid modes."""
+    if not isinstance(raw, str):
+        return "auto"
+    val = raw.strip().lower()
+    if val in _VALID_MODES:
+        return val
+    return "auto"
+
+
+def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
+    """True when the user configured a specific auxiliary vision backend.
+
+    An explicit override means the user *wants* the text pipeline (they're
+    paying for a dedicated vision model), so we don't silently bypass it.
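+
+    Illustrative config shape that counts as explicit (the values are
+    examples drawn from this PR's tests, not defaults):
+
+        auxiliary:
+          vision:
+            provider: openrouter
+            model: google/gemini-2.5-flash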
+ """ + if not isinstance(cfg, dict): + return False + aux = cfg.get("auxiliary") or {} + if not isinstance(aux, dict): + return False + vision = aux.get("vision") or {} + if not isinstance(vision, dict): + return False + + provider = str(vision.get("provider") or "").strip().lower() + model = str(vision.get("model") or "").strip() + base_url = str(vision.get("base_url") or "").strip() + + # "auto" / "" / blank = not explicit + if provider in ("", "auto") and not model and not base_url: + return False + return True + + +def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]: + """Return True/False if we can resolve caps, None if unknown.""" + if not provider or not model: + return None + try: + from agent.models_dev import get_model_capabilities + caps = get_model_capabilities(provider, model) + except Exception as exc: # pragma: no cover - defensive + logger.debug("image_routing: caps lookup failed for %s:%s — %s", provider, model, exc) + return None + if caps is None: + return None + return bool(caps.supports_vision) + + +def decide_image_input_mode( + provider: str, + model: str, + cfg: Optional[Dict[str, Any]], +) -> str: + """Return ``"native"`` or ``"text"`` for the given turn. + + Args: + provider: active inference provider ID (e.g. ``"anthropic"``, ``"openrouter"``). + model: active model slug as it would be sent to the provider. + cfg: loaded config.yaml dict, or None. When None, behaves as auto. + """ + mode_cfg = "auto" + if isinstance(cfg, dict): + agent_cfg = cfg.get("agent") or {} + if isinstance(agent_cfg, dict): + mode_cfg = _coerce_mode(agent_cfg.get("image_input_mode")) + + if mode_cfg == "native": + return "native" + if mode_cfg == "text": + return "text" + + # auto + if _explicit_aux_vision_override(cfg): + return "text" + + supports = _lookup_supports_vision(provider, model) + if supports is True: + return "native" + return "text" + + +# Image size handling is REACTIVE rather than proactive: we attempt native +# attachment at full size regardless of provider, and rely on +# ``run_agent._try_shrink_image_parts_in_messages`` to shrink + retry if +# the provider rejects the request (e.g. Anthropic's hard 5 MB per-image +# ceiling returned as HTTP 400 "image exceeds 5 MB maximum"). +# +# Why reactive: our knowledge of provider ceilings is partial and evolving +# (OpenAI accepts 49 MB+, Anthropic 5 MB, Gemini 100 MB, others unknown). +# A proactive per-provider table would be stale the moment a provider raises +# or lowers its limit, and silently degrading quality for users on providers +# that would have accepted the full image is the worse failure mode. +# The shrink-on-reject path loses 1 API call + maybe 1s of Pillow work when +# it fires, which is cheaper than permanent quality loss. + + +def _guess_mime(path: Path) -> str: + mime, _ = mimetypes.guess_type(str(path)) + if mime and mime.startswith("image/"): + return mime + # mimetypes on some Linux distros mis-maps .jpg; default to jpeg when + # the suffix looks imagey. + suffix = path.suffix.lower() + return { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", + }.get(suffix, "image/jpeg") + + +def _file_to_data_url(path: Path) -> Optional[str]: + """Encode a local image as a base64 data URL at its native size. + + Size limits are NOT enforced here — the agent retry loop + (``run_agent._try_shrink_image_parts_in_messages``) shrinks on the + provider's first rejection. 
Keeping this simple means providers that + accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent + quality tax just because one other provider is stricter. + + Returns None only if the file can't be read (missing, permission + denied, etc.); the caller reports those paths in ``skipped``. + """ + try: + raw = path.read_bytes() + except Exception as exc: + logger.warning("image_routing: failed to read %s — %s", path, exc) + return None + mime = _guess_mime(path) + b64 = base64.b64encode(raw).decode("ascii") + return f"data:{mime};base64,{b64}" + + +def build_native_content_parts( + user_text: str, + image_paths: List[str], +) -> Tuple[List[Dict[str, Any]], List[str]]: + """Build an OpenAI-style ``content`` list for a user turn. + + Shape: + [{"type": "text", "text": "..."}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}, + ...] + + Images are attached at their native size. If a provider rejects the + request because an image is too large (e.g. Anthropic's 5 MB per-image + ceiling), the agent's retry loop transparently shrinks and retries + once — see ``run_agent._try_shrink_image_parts_in_messages``. + + Returns (content_parts, skipped_paths). Skipped paths are files that + couldn't be read from disk. + """ + parts: List[Dict[str, Any]] = [] + skipped: List[str] = [] + + text = (user_text or "").strip() + if text: + parts.append({"type": "text", "text": text}) + + for raw_path in image_paths: + p = Path(raw_path) + if not p.exists() or not p.is_file(): + skipped.append(str(raw_path)) + continue + data_url = _file_to_data_url(p) + if not data_url: + skipped.append(str(raw_path)) + continue + parts.append({ + "type": "image_url", + "image_url": {"url": data_url}, + }) + + # If the text was empty, add a neutral prompt so the turn isn't just images. + if not text and any(p.get("type") == "image_url" for p in parts): + parts.insert(0, {"type": "text", "text": "What do you see in this image?"}) + + return parts, skipped + + +__all__ = [ + "decide_image_input_mode", + "build_native_content_parts", +] diff --git a/cli.py b/cli.py index 3b65bd547d..39dab48445 100644 --- a/cli.py +++ b/cli.py @@ -8433,13 +8433,62 @@ class HermesCLI: ): return None - # Pre-process images through the vision tool (Gemini Flash) so the - # main model receives text descriptions instead of raw base64 image - # content — works with any model, not just vision-capable ones. + # Route image attachments based on the active model's vision capability. + # "native" → pass pixels as OpenAI-style content parts (adapters + # translate for Anthropic/Gemini/Bedrock). + # "text" → pre-analyze each image with vision_analyze and prepend the + # description as text — works with non-vision models. + # See agent/image_routing.py for the decision table. 
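+        # In "auto" the order of precedence is: explicit auxiliary.vision
+        # override → text; supports_vision=True on the active model → native;
+        # unknown or non-vision model → text.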
         if images:
-            message = self._preprocess_images_with_vision(
-                message if isinstance(message, str) else "", images
-            )
+            try:
+                from agent.image_routing import (
+                    build_native_content_parts,
+                    decide_image_input_mode,
+                )
+                from hermes_cli.config import load_config
+
+                _img_mode = decide_image_input_mode(
+                    (self.provider or "").strip(),
+                    (self.model or "").strip(),
+                    load_config(),
+                )
+            except Exception as _img_exc:
+                logging.debug("image_routing decision failed, defaulting to text: %s", _img_exc)
+                _img_mode = "text"
+
+            if _img_mode == "native":
+                try:
+                    _text_for_parts = message if isinstance(message, str) else ""
+                    _img_str_paths = [str(p) for p in images]
+                    _parts, _skipped = build_native_content_parts(
+                        _text_for_parts,
+                        _img_str_paths,
+                    )
+                    if _skipped:
+                        _cprint(
+                            f" {_DIM}⚠ skipped {len(_skipped)} unreadable image path(s){_RST}"
+                        )
+                    if any(p.get("type") == "image_url" for p in _parts):
+                        _img_names = ", ".join(Path(p).name for p in _img_str_paths if p not in _skipped)
+                        _cprint(
+                            f" {_DIM}📎 attaching {len(_img_str_paths) - len(_skipped)} image(s) natively "
+                            f"(model supports vision): {_img_names}{_RST}"
+                        )
+                        message = _parts
+                    else:
+                        # All images unreadable — fall back to text enrichment.
+                        message = self._preprocess_images_with_vision(
+                            message if isinstance(message, str) else "", images
+                        )
+                except Exception as _img_exc:
+                    logging.warning("native image attach failed, falling back to text: %s", _img_exc)
+                    message = self._preprocess_images_with_vision(
+                        message if isinstance(message, str) else "", images
+                    )
+            else:
+                message = self._preprocess_images_with_vision(
+                    message if isinstance(message, str) else "", images
+                )
 
         # Expand @ context references (e.g. @file:main.py, @diff, @folder:src/)
         if isinstance(message, str) and "@" in message:
diff --git a/gateway/run.py b/gateway/run.py
index 01eb529693..51e1fc612e 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -4199,9 +4199,18 @@ class GatewayRunner:
         Keep the normal inbound path and the queued follow-up path on the same
         preprocessing pipeline so sender attribution, image enrichment, STT,
         document notes, reply context, and @ references all behave the same.
+
+        Side effect: writes ``self._pending_native_image_paths`` to a list of
+        local image paths when the active model supports native vision AND
+        the user has images attached. The caller consumes and clears this
+        attribute at the ``run_conversation`` site to build a multimodal user
+        turn. When the list is empty, the ``_enrich_message_with_vision``
+        text path has already run and images are represented in-text.
         """
         history = history or []
         message_text = event.text or ""
+        # Reset per-call buffer; set only when native routing is chosen.
+        self._pending_native_image_paths = []
 
         _is_shared_multi_user = is_shared_multi_user_session(
             source,
@@ -4222,10 +4231,25 @@ class GatewayRunner:
                     audio_paths.append(path)
 
         if image_paths:
-            message_text = await self._enrich_message_with_vision(
-                message_text,
-                image_paths,
-            )
+            # Decide routing: native (attach pixels) vs text (vision_analyze
+            # pre-run + prepend description). See agent/image_routing.py.
+            _img_mode = self._decide_image_input_mode()
+            if _img_mode == "native":
+                # Defer attachment to the run_conversation call site.
+                self._pending_native_image_paths = list(image_paths)
+                logger.info(
+                    "Image routing: native (model supports vision). %d image(s) will be attached inline.",
+                    len(image_paths),
+                )
+            else:
+                logger.info(
+                    "Image routing: text (mode=%s). Pre-analyzing %d image(s) via vision_analyze.",
+                    _img_mode, len(image_paths),
+                )
+                message_text = await self._enrich_message_with_vision(
+                    message_text,
+                    image_paths,
+                )
 
         if audio_paths:
             message_text = await self._enrich_message_with_transcription(
@@ -8378,6 +8402,29 @@ class GatewayRunner:
             ctx = copy_context()
             return await loop.run_in_executor(None, ctx.run, func, *args)
 
+    def _decide_image_input_mode(self) -> str:
+        """Resolve the image-input routing for the currently active model.
+
+        Returns ``"native"`` (attach pixels on the user turn) or ``"text"``
+        (pre-analyze with vision_analyze and prepend the description). See
+        agent/image_routing.py for the full decision table.
+
+        The active provider/model are read from config.yaml so the decision
+        tracks ``/model`` switches automatically on the next message.
+        """
+        try:
+            from agent.image_routing import decide_image_input_mode
+            from agent.auxiliary_client import _read_main_model, _read_main_provider
+            from hermes_cli.config import load_config
+
+            cfg = load_config()
+            provider = _read_main_provider()
+            model = _read_main_model()
+            return decide_image_input_mode(provider, model, cfg)
+        except Exception as exc:
+            logger.debug("image_routing: decision failed, falling back to text — %s", exc)
+            return "text"
+
     async def _enrich_message_with_vision(
         self,
         user_text: str,
@@ -10394,7 +10441,39 @@ class GatewayRunner:
         _approval_session_token = set_current_session_key(_approval_session_key)
         register_gateway_notify(_approval_session_key, _approval_notify_sync)
         try:
-            result = agent.run_conversation(message, conversation_history=agent_history, task_id=session_id)
+            # If _prepare_inbound_message_text buffered image paths for native
+            # attachment, wrap the user turn as an OpenAI-style multimodal
+            # content list. Consume-and-clear so subsequent turns on the same
+            # runner instance don't re-attach stale images.
+            _native_imgs = list(getattr(self, "_pending_native_image_paths", []) or [])
+            self._pending_native_image_paths = []
+            if _native_imgs:
+                try:
+                    from agent.image_routing import build_native_content_parts
+                    _parts, _skipped = build_native_content_parts(
+                        message,
+                        _native_imgs,
+                    )
+                    if _skipped:
+                        logger.warning(
+                            "Native image attachment: skipped %d unreadable path(s): %s",
+                            len(_skipped), _skipped,
+                        )
+                    if any(p.get("type") == "image_url" for p in _parts):
+                        _run_message: Any = _parts
+                    else:
+                        # All images failed to read — fall back to plain text.
+                        _run_message = message
+                except Exception as _img_exc:
+                    logger.warning(
+                        "Native image attachment failed, falling back to text: %s",
+                        _img_exc,
+                    )
+                    _run_message = message
+            else:
+                _run_message = message
+
+            result = agent.run_conversation(_run_message, conversation_history=agent_history, task_id=session_id)
         finally:
             unregister_gateway_notify(_approval_session_key)
             reset_current_session_key(_approval_session_token)
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index f0777b80aa..6d5356890a 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -389,6 +389,20 @@ DEFAULT_CONFIG = {
         # (60+ tool iterations with tiny output) before users assume the
         # bot is dead and /restart.
         "gateway_notify_interval": 180,
+        # How user-attached images are presented to the main model on each turn.
+        #   "auto"   — attach natively when the active model reports
+        #              supports_vision=True AND the user hasn't explicitly
+        #              configured auxiliary.vision.provider. Otherwise fall
+        #              back to text (vision_analyze pre-analysis).
+ # "native" — always attach natively; non-vision models will either + # error at the provider or get a last-chance text fallback + # (see run_agent._prepare_messages_for_api). + # "text" — always pre-analyze with vision_analyze and prepend the + # description as text; the main model never sees pixels. + # Affects gateway platforms, the TUI, and CLI /attach. vision_analyze + # remains available as a tool regardless of this setting — the routing + # only controls how inbound user images are presented. + "image_input_mode": "auto", }, "terminal": { diff --git a/run_agent.py b/run_agent.py index 224845c110..8c5c79bb14 100644 --- a/run_agent.py +++ b/run_agent.py @@ -7287,6 +7287,26 @@ class AIAgent: self._anthropic_image_fallback_cache[cache_key] = note return note + def _model_supports_vision(self) -> bool: + """Return True if the active provider+model reports native vision. + + Used to decide whether to strip image content parts from API-bound + messages (for non-vision models) or let the provider adapter handle + them natively (for vision-capable models). + """ + try: + from agent.models_dev import get_model_capabilities + provider = (getattr(self, "provider", "") or "").strip() + model = (getattr(self, "model", "") or "").strip() + if not provider or not model: + return False + caps = get_model_capabilities(provider, model) + if caps is None: + return False + return bool(caps.supports_vision) + except Exception: + return False + def _preprocess_anthropic_content(self, content: Any, role: str) -> Any: if not self._content_has_image_parts(content): return content @@ -7350,12 +7370,23 @@ class AIAgent: return t def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list: + # Fast exit when no message carries image content at all. if not any( isinstance(msg, dict) and self._content_has_image_parts(msg.get("content")) for msg in api_messages ): return api_messages + # The Anthropic adapter (agent/anthropic_adapter.py:_convert_content_part_to_anthropic) + # already translates OpenAI-style image_url/input_image parts into + # native Anthropic ``{"type": "image", "source": ...}`` blocks. When + # the active model supports vision we let the adapter do its job and + # skip this legacy text-fallback preprocessor entirely. + if self._model_supports_vision(): + return api_messages + + # Non-vision Anthropic model (rare today, but keep the fallback for + # compat): replace each image part with a vision_analyze text note. transformed = copy.deepcopy(api_messages) for msg in transformed: if not isinstance(msg, dict): @@ -7366,6 +7397,150 @@ class AIAgent: ) return transformed + def _prepare_messages_for_non_vision_model(self, api_messages: list) -> list: + """Strip native image parts when the active model lacks vision. + + Runs on the chat.completions / codex_responses paths. Vision-capable + models pass through unchanged (provider and any downstream translator + handle the image parts natively). Non-vision models get each image + replaced by a cached vision_analyze text description so the turn + doesn't fail with "model does not support image input". 
+ """ + if not any( + isinstance(msg, dict) and self._content_has_image_parts(msg.get("content")) + for msg in api_messages + ): + return api_messages + + if self._model_supports_vision(): + return api_messages + + transformed = copy.deepcopy(api_messages) + for msg in transformed: + if not isinstance(msg, dict): + continue + # Reuse the Anthropic text-fallback preprocessor — the behaviour is + # identical (walk content parts, replace images with cached + # descriptions, merge back into a single text or structured + # content). Naming is historical. + msg["content"] = self._preprocess_anthropic_content( + msg.get("content"), + str(msg.get("role", "user") or "user"), + ) + return transformed + + def _try_shrink_image_parts_in_messages(self, api_messages: list) -> bool: + """Re-encode all native image parts at a smaller size to recover from + image-too-large errors (Anthropic 5 MB, unknown other providers). + + Mutates ``api_messages`` in place. Returns True if any image part was + actually replaced, False if there were no image parts to shrink or + Pillow couldn't help (caller should surface the original error). + + Strategy: look for ``image_url`` / ``input_image`` parts carrying a + ``data:image/...;base64,...`` payload. For each one whose encoded + size exceeds 4 MB (a safe target that slides under Anthropic's 5 MB + ceiling with header overhead), write the base64 to a tempfile, call + ``vision_tools._resize_image_for_vision`` to produce a smaller data + URL, and substitute it in place. + + Non-data-URL images (http/https URLs) are not touched — the provider + fetches those itself and the size limit is different. + """ + if not api_messages: + return False + + try: + from tools.vision_tools import _resize_image_for_vision + except Exception as exc: + logger.warning("image-shrink recovery: vision_tools unavailable — %s", exc) + return False + + # 4 MB target leaves comfortable headroom under Anthropic's 5 MB. + # Non-Anthropic providers we haven't observed rejecting are fine with + # much larger; shrinking to 4 MB here loses quality but only fires + # after a confirmed provider rejection, so the alternative is failure. + target_bytes = 4 * 1024 * 1024 + changed_count = 0 + + def _shrink_data_url(url: str) -> Optional[str]: + """Return a smaller data URL, or None if shrink can't help.""" + if not isinstance(url, str) or not url.startswith("data:"): + return None + if len(url) <= target_bytes: + # This specific image wasn't the oversized one. + return None + try: + header, _, data = url.partition(",") + mime = "image/jpeg" + if header.startswith("data:"): + mime_part = header[len("data:"):].split(";", 1)[0].strip() + if mime_part.startswith("image/"): + mime = mime_part + import base64 as _b64 + raw = _b64.b64decode(data) + suffix = { + "image/png": ".png", "image/gif": ".gif", "image/webp": ".webp", + "image/jpeg": ".jpg", "image/jpg": ".jpg", "image/bmp": ".bmp", + }.get(mime, ".jpg") + tmp = tempfile.NamedTemporaryFile( + prefix="hermes_shrink_", suffix=suffix, delete=False, + ) + try: + tmp.write(raw) + tmp.close() + resized = _resize_image_for_vision( + Path(tmp.name), + mime_type=mime, + max_base64_bytes=target_bytes, + ) + finally: + try: + Path(tmp.name).unlink(missing_ok=True) + except Exception: + pass + if not resized or len(resized) >= len(url): + # Shrink didn't help (or made it bigger — corrupt input?). 
+                    return None
+                return resized
+            except Exception as exc:
+                logger.warning("image-shrink recovery: re-encode failed — %s", exc)
+                return None
+
+        for msg in api_messages:
+            if not isinstance(msg, dict):
+                continue
+            content = msg.get("content")
+            if not isinstance(content, list):
+                continue
+            for part in content:
+                if not isinstance(part, dict):
+                    continue
+                ptype = part.get("type")
+                if ptype not in {"image_url", "input_image"}:
+                    continue
+                image_value = part.get("image_url")
+                # OpenAI chat.completions: {"image_url": {"url": "data:..."}}
+                # OpenAI Responses: {"image_url": "data:..."}
+                if isinstance(image_value, dict):
+                    url = image_value.get("url", "")
+                    resized = _shrink_data_url(url)
+                    if resized:
+                        image_value["url"] = resized
+                        changed_count += 1
+                elif isinstance(image_value, str):
+                    resized = _shrink_data_url(image_value)
+                    if resized:
+                        part["image_url"] = resized
+                        changed_count += 1
+
+        if changed_count:
+            logger.info(
+                "image-shrink recovery: re-encoded %d image part(s) to fit under %.0f MB",
+                changed_count, target_bytes / (1024 * 1024),
+            )
+        return changed_count > 0
+
     def _anthropic_preserve_dots(self) -> bool:
         """True when using an anthropic-compatible endpoint that preserves dots
         in model names. Alibaba/DashScope keeps dots (e.g. qwen3.5-plus).
@@ -7514,9 +7689,10 @@ class AIAgent:
                 )
             )
         is_xai_responses = self.provider == "xai" or self._base_url_hostname == "api.x.ai"
+        _msgs_for_codex = self._prepare_messages_for_non_vision_model(api_messages)
         return _ct.build_kwargs(
             model=self.model,
-            messages=api_messages,
+            messages=_msgs_for_codex,
             tools=self.tools,
             reasoning_config=self.reasoning_config,
             session_id=getattr(self, "session_id", None),
@@ -7595,9 +7771,12 @@ class AIAgent:
         if _ephemeral_out is not None:
             self._ephemeral_max_output_tokens = None
 
+        # Strip image parts for non-vision models (no-op when vision-capable).
+        _msgs_for_chat = self._prepare_messages_for_non_vision_model(api_messages)
+
         return _ct.build_kwargs(
             model=self.model,
-            messages=api_messages,
+            messages=_msgs_for_chat,
             tools=self.tools,
             timeout=self._resolved_api_call_timeout(),
             max_tokens=self.max_tokens,
@@ -9891,6 +10070,7 @@ class AIAgent:
         nous_auth_retry_attempted = False
         copilot_auth_retry_attempted = False
         thinking_sig_retry_attempted = False
+        image_shrink_retry_attempted = False
         has_retried_429 = False
         restart_with_compressed_messages = False
         restart_with_length_continuation = False
@@ -10812,6 +10992,31 @@ class AIAgent:
                         )
                         if recovered_with_pool:
                             continue
+
+                        # Image-too-large recovery: shrink oversized native image
+                        # parts in-place and retry once. Triggered by Anthropic's
+                        # per-image 5 MB ceiling (400 with "image exceeds 5 MB
+                        # maximum") or any other provider that complains about
+                        # image size. If shrink fails or a second attempt still
+                        # fails, fall through to normal error handling.
+                        if (
+                            classified.reason == FailoverReason.image_too_large
+                            and not image_shrink_retry_attempted
+                        ):
+                            image_shrink_retry_attempted = True
+                            if self._try_shrink_image_parts_in_messages(api_messages):
+                                self._vprint(
+                                    f"{self.log_prefix}📐 Image(s) exceeded provider size limit — "
+                                    f"shrank and retrying...",
+                                    force=True,
+                                )
+                                continue
+                            else:
+                                logger.info(
+                                    "image-shrink recovery: no data-URL image parts found "
+                                    "or shrink didn't reduce size; surfacing original error."
+ ) + if ( self.api_mode == "codex_responses" and self.provider == "openai-codex" diff --git a/tests/agent/test_compressor_image_tokens.py b/tests/agent/test_compressor_image_tokens.py new file mode 100644 index 0000000000..83198e5de9 --- /dev/null +++ b/tests/agent/test_compressor_image_tokens.py @@ -0,0 +1,141 @@ +"""Tests for image-token accounting in the context compressor. + +Covers the native-image-routing PR's companion change: the compressor's +multimodal message length counter now charges ~1600 tokens per attached +image part instead of 0, so tail-cut / prune decisions are accurate for +creative workflows that iterate on images across many turns. +""" + +from __future__ import annotations + +import pytest + +from agent.context_compressor import ( + _CHARS_PER_TOKEN, + _IMAGE_CHAR_EQUIVALENT, + _IMAGE_TOKEN_ESTIMATE, + _content_length_for_budget, +) + + +class TestContentLengthForBudget: + def test_plain_string(self): + assert _content_length_for_budget("hello world") == 11 + + def test_empty_string(self): + assert _content_length_for_budget("") == 0 + + def test_none_coerces_to_zero(self): + assert _content_length_for_budget(None) == 0 + + def test_text_only_list(self): + content = [ + {"type": "text", "text": "first"}, + {"type": "text", "text": "second"}, + ] + assert _content_length_for_budget(content) == 5 + 6 + + def test_single_image_part_charges_fixed_budget(self): + content = [ + {"type": "text", "text": "look"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,XXXX"}}, + ] + # 4 chars of text + 1 image at fixed char-equivalent + assert _content_length_for_budget(content) == 4 + _IMAGE_CHAR_EQUIVALENT + + def test_image_url_raw_base64_is_not_counted_as_chars(self): + """A 1MB base64 blob inside an image_url must NOT inflate token count. + + The flat image estimate is what the provider actually bills; the raw + base64 is transport payload, not context tokens. + """ + huge_url = "data:image/png;base64," + ("A" * 1_000_000) + content = [ + {"type": "image_url", "image_url": {"url": huge_url}}, + ] + # Exactly one image's worth, not 1M + something. 
+        assert _content_length_for_budget(content) == _IMAGE_CHAR_EQUIVALENT
+
+    def test_multiple_image_parts(self):
+        content = [
+            {"type": "text", "text": "compare"},
+            {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAA"}},
+            {"type": "image_url", "image_url": {"url": "data:image/png;base64,BBB"}},
+            {"type": "image_url", "image_url": {"url": "data:image/png;base64,CCC"}},
+        ]
+        assert _content_length_for_budget(content) == 7 + 3 * _IMAGE_CHAR_EQUIVALENT
+
+    def test_openai_responses_input_image_shape(self):
+        """Responses API uses type=input_image with top-level image_url string."""
+        content = [
+            {"type": "input_text", "text": "hey"},
+            {"type": "input_image", "image_url": "data:image/png;base64,XX"},
+        ]
+        # input_text has .text "hey" (3 chars) + 1 image
+        assert _content_length_for_budget(content) == 3 + _IMAGE_CHAR_EQUIVALENT
+
+    def test_anthropic_native_image_shape(self):
+        """Anthropic native shape: {type: image, source: {...}}."""
+        content = [
+            {"type": "text", "text": "hi"},
+            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": "XX"}},
+        ]
+        assert _content_length_for_budget(content) == 2 + _IMAGE_CHAR_EQUIVALENT
+
+    def test_bare_string_part_in_list(self):
+        """Older code paths sometimes produce mixed list-of-strings content."""
+        content = ["hello", {"type": "text", "text": "world"}]
+        assert _content_length_for_budget(content) == 5 + 5
+
+    def test_image_estimate_constant_is_reasonable(self):
+        """Sanity-check the estimate aligns with real provider billing.
+
+        Anthropic ≈ width*height/750 → ~1600 for 1000×1200.
+        OpenAI GPT-4o detail-high caps at 1445 (8 tiles × 170 + 85).
+        Gemini 258/tile → ~1548 for a six-tile crop.
+        Anything in the 800-2000 range is defensible. Enforce bounds so an
+        accidental edit doesn't drop it to e.g. 16.
+        """
+        assert 800 <= _IMAGE_TOKEN_ESTIMATE <= 2500
+        assert _IMAGE_CHAR_EQUIVALENT == _IMAGE_TOKEN_ESTIMATE * _CHARS_PER_TOKEN
+
+
+class TestTokenBudgetWithImages:
+    """Integration: the compressor's tail-cut decision now respects image cost."""
+
+    def test_image_heavy_turns_count_toward_budget(self):
+        """A tail with 5 image-bearing turns should blow past a 5K token budget."""
+        from agent.context_compressor import ContextCompressor
+
+        # Minimal compressor fixture — just enough to call _find_tail_cut_by_tokens
+        cc = object.__new__(ContextCompressor)
+        cc.tail_token_budget = 5000
+
+        # Build 10 messages: 5 with images, 5 with short text. Without the
+        # image-tokens fix, the compressor would think all 10 fit in 5K and
+        # protect them all. With the fix, images alone cost 5 × 1600 = 8K,
+        # so the tail should be trimmed.
+        messages = [{"role": "system", "content": "sys"}]
+        for i in range(5):
+            messages.append({
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": f"turn {i}"},
+                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAA"}},
+                ],
+            })
+            messages.append({
+                "role": "assistant",
+                "content": f"response {i}",
+            })
+
+        cut = cc._find_tail_cut_by_tokens(messages, head_end=0, token_budget=5000)
+
+        # Budget is 5K, soft ceiling 7.5K. 5 images alone = 8000 image-tokens.
+        # Walking backward, the compressor should stop before including all 5.
+        # Exact cut depends on text lengths and min_tail, but it MUST be > 1
+        # (at least some head-side messages should be compressible).
+        assert cut > 1, (
+            f"Expected image-heavy tail to be trimmed; compressor placed cut at "
+            f"{cut} out of {len(messages)} (image tokens were likely ignored)."
+ ) diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index e8a92774b4..d6598b66a3 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -54,7 +54,7 @@ class TestFailoverReason: expected = { "auth", "auth_permanent", "billing", "rate_limit", "overloaded", "server_error", "timeout", - "context_overflow", "payload_too_large", + "context_overflow", "payload_too_large", "image_too_large", "model_not_found", "format_error", "provider_policy_blocked", "thinking_signature", "long_context_tier", "unknown", diff --git a/tests/agent/test_image_routing.py b/tests/agent/test_image_routing.py new file mode 100644 index 0000000000..9fd02eeecc --- /dev/null +++ b/tests/agent/test_image_routing.py @@ -0,0 +1,213 @@ +"""Tests for agent/image_routing.py — the per-turn image input mode decision.""" + +from __future__ import annotations + +import base64 +from pathlib import Path +from unittest.mock import patch + +import pytest + +from agent.image_routing import ( + _coerce_mode, + _explicit_aux_vision_override, + build_native_content_parts, + decide_image_input_mode, +) + + +# ─── _coerce_mode ──────────────────────────────────────────────────────────── + + +class TestCoerceMode: + def test_valid_modes_pass_through(self): + assert _coerce_mode("auto") == "auto" + assert _coerce_mode("native") == "native" + assert _coerce_mode("text") == "text" + + def test_case_insensitive(self): + assert _coerce_mode("NATIVE") == "native" + assert _coerce_mode("Auto") == "auto" + + def test_invalid_falls_back_to_auto(self): + assert _coerce_mode("nonsense") == "auto" + assert _coerce_mode("") == "auto" + assert _coerce_mode(None) == "auto" + assert _coerce_mode(42) == "auto" + + def test_strips_whitespace(self): + assert _coerce_mode(" native ") == "native" + + +# ─── _explicit_aux_vision_override ─────────────────────────────────────────── + + +class TestExplicitAuxVisionOverride: + def test_none_config(self): + assert _explicit_aux_vision_override(None) is False + + def test_empty_config(self): + assert _explicit_aux_vision_override({}) is False + + def test_default_auto_is_not_explicit(self): + cfg = {"auxiliary": {"vision": {"provider": "auto", "model": "", "base_url": ""}}} + assert _explicit_aux_vision_override(cfg) is False + + def test_provider_set_is_explicit(self): + cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": ""}}} + assert _explicit_aux_vision_override(cfg) is True + + def test_model_set_is_explicit(self): + cfg = {"auxiliary": {"vision": {"provider": "auto", "model": "google/gemini-2.5-flash"}}} + assert _explicit_aux_vision_override(cfg) is True + + def test_base_url_set_is_explicit(self): + cfg = {"auxiliary": {"vision": {"provider": "auto", "base_url": "http://localhost:11434"}}} + assert _explicit_aux_vision_override(cfg) is True + + +# ─── decide_image_input_mode ───────────────────────────────────────────────── + + +class TestDecideImageInputMode: + def test_explicit_native_overrides_everything(self): + cfg = {"agent": {"image_input_mode": "native"}} + # Non-vision model, aux-vision explicitly configured: native still wins. 
+ cfg["auxiliary"] = {"vision": {"provider": "openrouter", "model": "foo"}} + with patch("agent.image_routing._lookup_supports_vision", return_value=False): + assert decide_image_input_mode("openrouter", "some-non-vision-model", cfg) == "native" + + def test_explicit_text_overrides_everything(self): + cfg = {"agent": {"image_input_mode": "text"}} + with patch("agent.image_routing._lookup_supports_vision", return_value=True): + assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text" + + def test_auto_with_vision_capable_model(self): + with patch("agent.image_routing._lookup_supports_vision", return_value=True): + assert decide_image_input_mode("anthropic", "claude-sonnet-4", {}) == "native" + + def test_auto_with_non_vision_model(self): + with patch("agent.image_routing._lookup_supports_vision", return_value=False): + assert decide_image_input_mode("openrouter", "qwen/qwen3-235b", {}) == "text" + + def test_auto_with_unknown_model(self): + with patch("agent.image_routing._lookup_supports_vision", return_value=None): + assert decide_image_input_mode("openrouter", "brand-new-slug", {}) == "text" + + def test_auto_respects_aux_vision_override_even_for_vision_model(self): + """If the user configured a dedicated vision backend, don't bypass it.""" + cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}} + with patch("agent.image_routing._lookup_supports_vision", return_value=True): + assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text" + + def test_none_config_is_auto(self): + with patch("agent.image_routing._lookup_supports_vision", return_value=True): + assert decide_image_input_mode("anthropic", "claude-sonnet-4", None) == "native" + + def test_invalid_mode_coerces_to_auto(self): + cfg = {"agent": {"image_input_mode": "weird-value"}} + with patch("agent.image_routing._lookup_supports_vision", return_value=True): + assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native" + + +# ─── build_native_content_parts ────────────────────────────────────────────── + + +def _png_bytes() -> bytes: + """Return a tiny valid 1x1 transparent PNG.""" + return base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGNgYGBgAAAABQABpfZFQAAAAABJRU5ErkJggg==" + ) + + +class TestBuildNativeContentParts: + def test_text_then_image(self, tmp_path: Path): + img = tmp_path / "cat.png" + img.write_bytes(_png_bytes()) + parts, skipped = build_native_content_parts("hello", [str(img)]) + assert skipped == [] + assert len(parts) == 2 + assert parts[0] == {"type": "text", "text": "hello"} + assert parts[1]["type"] == "image_url" + assert parts[1]["image_url"]["url"].startswith("data:image/png;base64,") + + def test_empty_text_inserts_default_prompt(self, tmp_path: Path): + img = tmp_path / "cat.jpg" + img.write_bytes(_png_bytes()) + parts, skipped = build_native_content_parts("", [str(img)]) + assert skipped == [] + # Even with empty user text, we insert a neutral prompt so the turn + # isn't just pixels. + assert parts[0]["type"] == "text" + assert parts[0]["text"] == "What do you see in this image?" + assert parts[1]["type"] == "image_url" + + def test_missing_file_is_skipped(self, tmp_path: Path): + parts, skipped = build_native_content_parts("hi", [str(tmp_path / "missing.png")]) + assert skipped == [str(tmp_path / "missing.png")] + # Only text remains. 
+ assert parts == [{"type": "text", "text": "hi"}] + + def test_multiple_images(self, tmp_path: Path): + img1 = tmp_path / "a.png" + img2 = tmp_path / "b.png" + img1.write_bytes(_png_bytes()) + img2.write_bytes(_png_bytes()) + parts, skipped = build_native_content_parts("compare these", [str(img1), str(img2)]) + assert skipped == [] + image_parts = [p for p in parts if p.get("type") == "image_url"] + assert len(image_parts) == 2 + + def test_mime_inference_jpg(self, tmp_path: Path): + img = tmp_path / "photo.jpg" + img.write_bytes(_png_bytes()) # bytes are PNG but extension is jpg + parts, _ = build_native_content_parts("x", [str(img)]) + url = parts[1]["image_url"]["url"] + assert url.startswith("data:image/jpeg;base64,") + + def test_mime_inference_webp(self, tmp_path: Path): + img = tmp_path / "pic.webp" + img.write_bytes(_png_bytes()) + parts, _ = build_native_content_parts("", [str(img)]) + url = parts[1]["image_url"]["url"] + assert url.startswith("data:image/webp;base64,") + + +# ─── Oversize handling ─────────────────────────────────────────────────────── + + +class TestLargeImageHandling: + """Large images attach at native size; shrink is handled reactively at + retry time in ``run_agent._try_shrink_image_parts_in_messages`` rather + than proactively here. + """ + + def test_large_image_passes_through_unchanged(self, tmp_path: Path): + """A multi-MB image is attached as-is — no resize, no skip.""" + from agent import image_routing as _ir + + img = tmp_path / "medium.png" + # 200 KB of real bytes; not huge but enough to verify no size gate fires. + img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"X" * 200_000) + url = _ir._file_to_data_url(img) + assert url is not None + assert url.startswith("data:image/png;base64,") + # Base64 expansion means output is ~4/3 of input, plus header. + assert len(url) > 200_000 + + def test_missing_file_returns_none(self, tmp_path: Path): + from agent import image_routing as _ir + missing = tmp_path / "does_not_exist.png" + assert _ir._file_to_data_url(missing) is None + + def test_build_native_parts_no_provider_kwarg(self, tmp_path: Path): + """build_native_content_parts takes text + paths, no provider kwarg.""" + from agent import image_routing as _ir + + img = tmp_path / "cat.png" + img.write_bytes(_png_bytes()) + parts, skipped = _ir.build_native_content_parts("hi", [str(img)]) + assert skipped == [] + assert len(parts) == 2 + assert parts[0]["type"] == "text" + assert parts[1]["type"] == "image_url" diff --git a/tests/run_agent/test_image_shrink_recovery.py b/tests/run_agent/test_image_shrink_recovery.py new file mode 100644 index 0000000000..7435bb7a13 --- /dev/null +++ b/tests/run_agent/test_image_shrink_recovery.py @@ -0,0 +1,277 @@ +"""Tests for reactive image-shrink recovery. + +Covers the full chain for Anthropic's 5 MB per-image ceiling (and any +future provider that returns an image-too-large error): + + 1. agent/error_classifier.py: 400 with "image exceeds 5 MB maximum" + gets FailoverReason.image_too_large, not context_overflow. + 2. run_agent._try_shrink_image_parts_in_messages mutates the API + payload in-place, re-encoding native data: URL image parts to fit + under 4 MB using vision_tools._resize_image_for_vision. + +The end-to-end wiring in the retry loop is not unit-tested here — it's +covered by the live E2E in the PR description. These tests lock in the +two pieces that matter independently: the classifier signal and the +payload rewriter. 
+""" + +from __future__ import annotations + +import base64 +from pathlib import Path + +import pytest + +from agent.error_classifier import FailoverReason, classify_api_error + + +class _FakeApiError(Exception): + """Stand-in for an openai.BadRequestError with status_code + body.""" + + def __init__(self, status_code: int, message: str, body: dict | None = None): + super().__init__(message) + self.status_code = status_code + self.body = body or {"error": {"message": message}} + self.response = None # required by some code paths + + +# ─── Classifier ────────────────────────────────────────────────────────────── + + +class TestImageTooLargeClassification: + def test_anthropic_400_image_exceeds_message(self): + """Anthropic's exact wording must classify as image_too_large, not context.""" + err = _FakeApiError( + status_code=400, + message=( + "messages.0.content.1.image.source.base64: image exceeds 5 MB " + "maximum: 12966600 bytes > 5242880 bytes" + ), + ) + result = classify_api_error(err, provider="anthropic", model="claude-sonnet-4-6") + assert result.reason == FailoverReason.image_too_large + assert result.retryable is True + + def test_generic_image_too_large_no_status(self): + """No status_code path: message text alone triggers classification.""" + err = Exception("image too large for this endpoint") + result = classify_api_error(err, provider="some-provider", model="some-model") + assert result.reason == FailoverReason.image_too_large + assert result.retryable is True + + def test_image_too_large_not_confused_with_context_overflow(self): + """'image exceeds' must NOT be mis-classified as context_overflow. + + The context_overflow patterns include 'exceeds the limit' which is a + superstring risk — verify the image-too-large check fires first. + """ + err = _FakeApiError( + status_code=400, + message="image exceeds the limit for this model", + ) + result = classify_api_error(err, provider="anthropic", model="claude-sonnet-4-6") + assert result.reason == FailoverReason.image_too_large + + def test_regular_context_overflow_unaffected(self): + """Context-overflow errors without image keywords still classify correctly.""" + err = _FakeApiError( + status_code=400, + message="prompt is too long: context length 300000 exceeds max of 200000", + ) + result = classify_api_error(err, provider="anthropic", model="claude-sonnet-4-6") + assert result.reason == FailoverReason.context_overflow + + +# ─── Shrink helper ─────────────────────────────────────────────────────────── + + +def _big_png_data_url(size_kb: int) -> str: + """Build a data URL with a plausible large base64 payload.""" + # Use real PNG header so MIME detection works; fill to target size. 
+ raw = b"\x89PNG\r\n\x1a\n" + b"X" * (size_kb * 1024) + return "data:image/png;base64," + base64.b64encode(raw).decode("ascii") + + +def _make_agent(): + """Build a bare AIAgent for method-level testing, no provider setup.""" + from run_agent import AIAgent + agent = object.__new__(AIAgent) + agent.provider = "anthropic" + agent.model = "claude-sonnet-4-6" + return agent + + +class TestShrinkImagePartsHelper: + def test_no_messages_returns_false(self): + agent = _make_agent() + assert agent._try_shrink_image_parts_in_messages([]) is False + assert agent._try_shrink_image_parts_in_messages(None) is False + + def test_no_image_parts_returns_false(self): + agent = _make_agent() + msgs = [ + {"role": "user", "content": "plain text"}, + {"role": "assistant", "content": "ack"}, + ] + assert agent._try_shrink_image_parts_in_messages(msgs) is False + + def test_small_image_part_not_shrunk(self, monkeypatch): + """An image under 4 MB is left alone — shrink helper only touches oversized ones.""" + agent = _make_agent() + small_url = _big_png_data_url(100) # ~100 KB + b64 overhead + + resize_hits = {"count": 0} + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + lambda *a, **kw: resize_hits.__setitem__("count", resize_hits["count"] + 1) or small_url, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "text", "text": "hi"}, + {"type": "image_url", "image_url": {"url": small_url}}, + ], + }] + assert agent._try_shrink_image_parts_in_messages(msgs) is False + assert resize_hits["count"] == 0 + # URL unchanged. + assert msgs[0]["content"][1]["image_url"]["url"] == small_url + + def test_oversized_image_url_dict_shape_rewritten(self, monkeypatch): + """OpenAI chat.completions shape: {image_url: {url: data:...}}.""" + agent = _make_agent() + oversized_url = _big_png_data_url(5000) # ~5 MB raw → ~6.7 MB b64 + shrunk = "data:image/jpeg;base64," + "A" * 1000 # small + + def _fake_resize(path, mime_type=None, max_base64_bytes=None): + return shrunk + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + _fake_resize, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "text", "text": "look"}, + {"type": "image_url", "image_url": {"url": oversized_url}}, + ], + }] + changed = agent._try_shrink_image_parts_in_messages(msgs) + assert changed is True + assert msgs[0]["content"][1]["image_url"]["url"] == shrunk + + def test_oversized_input_image_string_shape_rewritten(self, monkeypatch): + """OpenAI Responses shape: {type: input_image, image_url: "data:..."}.""" + agent = _make_agent() + oversized_url = _big_png_data_url(5000) + shrunk = "data:image/jpeg;base64," + "B" * 1000 + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + lambda *a, **kw: shrunk, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "input_text", "text": "look"}, + {"type": "input_image", "image_url": oversized_url}, + ], + }] + changed = agent._try_shrink_image_parts_in_messages(msgs) + assert changed is True + assert msgs[0]["content"][1]["image_url"] == shrunk + + def test_multiple_images_all_shrunk(self, monkeypatch): + agent = _make_agent() + big1 = _big_png_data_url(5000) + big2 = _big_png_data_url(6000) + shrunk = "data:image/jpeg;base64," + "C" * 500 + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + lambda *a, **kw: shrunk, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "text", "text": "compare"}, + {"type": "image_url", "image_url": 
{"url": big1}}, + {"type": "image_url", "image_url": {"url": big2}}, + ], + }] + changed = agent._try_shrink_image_parts_in_messages(msgs) + assert changed is True + assert msgs[0]["content"][1]["image_url"]["url"] == shrunk + assert msgs[0]["content"][2]["image_url"]["url"] == shrunk + + def test_http_url_images_not_touched(self, monkeypatch): + """Only data: URLs are candidates — http URLs are server-fetched.""" + agent = _make_agent() + + resize_hits = {"count": 0} + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + lambda *a, **kw: resize_hits.__setitem__("count", resize_hits["count"] + 1) or "shrunk", + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "text", "text": "at this url"}, + {"type": "image_url", "image_url": {"url": "https://example.com/big.png"}}, + ], + }] + assert agent._try_shrink_image_parts_in_messages(msgs) is False + assert resize_hits["count"] == 0 + + def test_shrink_failure_returns_false_and_leaves_url_intact(self, monkeypatch): + """If re-encode fails, leave the URL alone so the caller surfaces the original error.""" + agent = _make_agent() + oversized_url = _big_png_data_url(5000) + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + lambda *a, **kw: None, # resize returned nothing usable + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": oversized_url}}, + ], + }] + assert agent._try_shrink_image_parts_in_messages(msgs) is False + assert msgs[0]["content"][0]["image_url"]["url"] == oversized_url + + def test_shrink_that_makes_it_bigger_rejected(self, monkeypatch): + """If the 'shrink' somehow produces a larger payload, skip it.""" + agent = _make_agent() + oversized_url = _big_png_data_url(5000) + even_bigger = "data:image/png;base64," + "Z" * (10 * 1024 * 1024) + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + lambda *a, **kw: even_bigger, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": oversized_url}}, + ], + }] + assert agent._try_shrink_image_parts_in_messages(msgs) is False + # Original URL still in place, not replaced by the bigger one. + assert msgs[0]["content"][0]["image_url"]["url"] == oversized_url diff --git a/tests/run_agent/test_vision_aware_preprocessing.py b/tests/run_agent/test_vision_aware_preprocessing.py new file mode 100644 index 0000000000..5211ead2a4 --- /dev/null +++ b/tests/run_agent/test_vision_aware_preprocessing.py @@ -0,0 +1,170 @@ +"""Tests for the vision-aware image preprocessing in run_agent.py. + +Covers: + +* ``_prepare_anthropic_messages_for_api`` — passes image parts through + unchanged when the active model reports ``supports_vision=True`` (the + adapter handles them natively), and falls back to text-description + replacement when the model lacks vision. + +* ``_prepare_messages_for_non_vision_model`` — the mirror method for the + chat.completions / codex_responses paths. Same contract. +""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from run_agent import AIAgent + + +def _make_agent() -> AIAgent: + """Build a bare-bones AIAgent instance without running __init__. + + Avoids the heavy provider/credential setup for these pure-method tests. 
+ """ + agent = object.__new__(AIAgent) + agent.provider = "anthropic" + agent.model = "claude-sonnet-4" + agent._anthropic_image_fallback_cache = {} + return agent + + +IMG_PARTS_USER_MSG = { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}}, + ], +} + +PLAIN_USER_MSG = {"role": "user", "content": "hello, no images here"} + + +# ─── _prepare_anthropic_messages_for_api ───────────────────────────────────── + + +class TestPrepareAnthropicMessages: + def test_no_images_passes_through(self): + agent = _make_agent() + msgs = [PLAIN_USER_MSG] + out = agent._prepare_anthropic_messages_for_api(msgs) + assert out is msgs # unchanged reference + + def test_vision_capable_passes_images_through(self): + """The Anthropic adapter handles image_url/input_image natively.""" + agent = _make_agent() + with patch.object(agent, "_model_supports_vision", return_value=True): + out = agent._prepare_anthropic_messages_for_api([IMG_PARTS_USER_MSG]) + # Passes through unchanged — image_url parts still present. + assert out[0]["content"][1]["type"] == "image_url" + + def test_non_vision_replaces_images_with_text(self): + agent = _make_agent() + with patch.object(agent, "_model_supports_vision", return_value=False), \ + patch.object( + agent, + "_describe_image_for_anthropic_fallback", + return_value="[Image description: a cat]", + ): + out = agent._prepare_anthropic_messages_for_api([IMG_PARTS_USER_MSG]) + # Content collapsed to a string containing the description + user text. + content = out[0]["content"] + assert isinstance(content, str) + assert "[Image description: a cat]" in content + assert "What's in this image?" in content + # No more image parts. + assert "image_url" not in content + + +# ─── _prepare_messages_for_non_vision_model ────────────────────────────────── + + +class TestPrepareMessagesForNonVision: + def test_no_images_passes_through(self): + agent = _make_agent() + msgs = [PLAIN_USER_MSG] + out = agent._prepare_messages_for_non_vision_model(msgs) + assert out is msgs + + def test_vision_capable_passes_through(self): + """For vision-capable models on chat.completions path, provider handles pixels.""" + agent = _make_agent() + agent.provider = "openrouter" + agent.model = "anthropic/claude-sonnet-4" + with patch.object(agent, "_model_supports_vision", return_value=True): + out = agent._prepare_messages_for_non_vision_model([IMG_PARTS_USER_MSG]) + assert out[0]["content"][1]["type"] == "image_url" + + def test_non_vision_strips_images(self): + agent = _make_agent() + agent.provider = "openrouter" + agent.model = "qwen/qwen3-235b-a22b" + with patch.object(agent, "_model_supports_vision", return_value=False), \ + patch.object( + agent, + "_describe_image_for_anthropic_fallback", + return_value="[Image description: a dog]", + ): + out = agent._prepare_messages_for_non_vision_model([IMG_PARTS_USER_MSG]) + content = out[0]["content"] + assert isinstance(content, str) + assert "[Image description: a dog]" in content + assert "image_url" not in content + + def test_multiple_messages_with_mixed_content(self): + agent = _make_agent() + agent.model = "qwen/qwen3-235b" + msgs = [ + {"role": "user", "content": "first turn"}, + {"role": "assistant", "content": "ack"}, + IMG_PARTS_USER_MSG, + ] + with patch.object(agent, "_model_supports_vision", return_value=False), \ + patch.object( + agent, + "_describe_image_for_anthropic_fallback", + return_value="[Image: thing]", + ): + out = 
agent._prepare_messages_for_non_vision_model(msgs) + # First two messages unchanged (no images), third stripped. + assert out[0]["content"] == "first turn" + assert out[1]["content"] == "ack" + assert isinstance(out[2]["content"], str) + assert "[Image: thing]" in out[2]["content"] + + +# ─── _model_supports_vision ────────────────────────────────────────────────── + + +class TestModelSupportsVision: + def test_missing_provider_or_model_returns_false(self): + agent = _make_agent() + agent.provider = "" + agent.model = "claude-sonnet-4" + assert agent._model_supports_vision() is False + agent.provider = "anthropic" + agent.model = "" + assert agent._model_supports_vision() is False + + def test_uses_get_model_capabilities(self): + agent = _make_agent() + fake_caps = MagicMock() + fake_caps.supports_vision = True + with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps): + assert agent._model_supports_vision() is True + fake_caps.supports_vision = False + with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps): + assert agent._model_supports_vision() is False + + def test_none_caps_returns_false(self): + agent = _make_agent() + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert agent._model_supports_vision() is False + + def test_exception_returns_false(self): + agent = _make_agent() + with patch("agent.models_dev.get_model_capabilities", side_effect=RuntimeError("boom")): + assert agent._model_supports_vision() is False diff --git a/tools/vision_tools.py b/tools/vision_tools.py index d3019b1d0b..32a1a68938 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -754,7 +754,15 @@ from tools.registry import registry, tool_error VISION_ANALYZE_SCHEMA = { "name": "vision_analyze", - "description": "Analyze images using AI vision. Provides a comprehensive description and answers a specific question about the image content.", + "description": ( + "Inspect an image from a URL, file path, or tool output when you need " + "closer detail than what's visible in the conversation. If the user's " + "image is already attached to the conversation and you can see it, " + "just answer directly — only call this tool for images referenced by " + "URL/path, images returned inside other tool results (browser " + "screenshots, search thumbnails), or when you need a deeper look at " + "a specific region the main model's vision may have missed." + ), "parameters": { "type": "object", "properties": { diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 3818248047..ae1c0d90fb 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -13,7 +13,7 @@ import time import uuid from datetime import datetime from pathlib import Path -from typing import Optional +from typing import Any, Optional from hermes_constants import get_hermes_home from hermes_cli.env_loader import load_hermes_dotenv @@ -2274,7 +2274,60 @@ def _(rid, params: dict) -> dict: return prompt = ctx.message - prompt = _enrich_with_attached_images(prompt, images) if images else prompt + # Decide image routing per-turn based on active provider/model. + # "native" → pass pixels to the main model as OpenAI-style content + # parts (adapters translate for Anthropic/Gemini/Bedrock/etc.). + # "text" → pre-analyze with vision_analyze and prepend the text. + # See agent/image_routing.py for the full decision table. 
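+        # Illustrative "native" payload for one attached PNG (a sketch of
+        # the part shape only, not captured output; the data URL is elided):
+        #     [
+        #         {"type": "text", "text": "<user prompt>"},
+        #         {"type": "image_url",
+        #          "image_url": {"url": "data:image/png;base64,..."}},
+        #     ]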
+        run_message: Any = prompt
+        if images:
+            try:
+                from agent.image_routing import (
+                    decide_image_input_mode,
+                    build_native_content_parts,
+                )
+                from agent.auxiliary_client import (
+                    _read_main_model,
+                    _read_main_provider,
+                )
+                from hermes_cli.config import load_config as _tui_load_config
+
+                _cfg = _tui_load_config()
+                _mode = decide_image_input_mode(
+                    _read_main_provider(),
+                    _read_main_model(),
+                    _cfg,
+                )
+            except Exception as _img_exc:
+                print(
+                    f"[tui_gateway] image_routing decision failed, defaulting to text: {_img_exc}",
+                    file=sys.stderr,
+                )
+                _mode = "text"
+
+            if _mode == "native":
+                try:
+                    _parts, _skipped = build_native_content_parts(
+                        prompt,
+                        images,
+                    )
+                    if _skipped:
+                        print(
+                            f"[tui_gateway] native image attachment skipped {len(_skipped)} unreadable path(s)",
+                            file=sys.stderr,
+                        )
+                    if any(p.get("type") == "image_url" for p in _parts):
+                        run_message = _parts
+                    else:
+                        run_message = _enrich_with_attached_images(prompt, images)
+                except Exception as _img_exc:
+                    print(
+                        f"[tui_gateway] native attach failed, falling back to text: {_img_exc}",
+                        file=sys.stderr,
+                    )
+                    run_message = _enrich_with_attached_images(prompt, images)
+            else:
+                run_message = _enrich_with_attached_images(prompt, images)
 
         def _stream(delta):
             payload = {"text": delta}
@@ -2283,7 +2336,7 @@ def _(rid, params: dict) -> dict:
             _emit("message.delta", sid, payload)
 
         result = agent.run_conversation(
-            prompt,
+            run_message,
             conversation_history=list(history),
             stream_callback=_stream,
         )