Compare commits

...

8 Commits

Author SHA1 Message Date
yoniebans
f3058fbec7 test(image-shrink): accept clamp_dimensions kwarg in mock resize signature
The _fake_resize mock in test_image_shrink_recovery.py predates the
clamp_dimensions kwarg on _resize_image_for_vision. Add it to keep the
mock signature aligned.
2026-05-22 19:09:17 +02:00
yoniebans
bd3bad232b refactor(vision): revert Pillow-missing warning to match codebase pattern 2026-05-22 18:49:31 +02:00
yoniebans
a2546ed4fe refactor(vision): simplify _is_anthropic_provider; drop manual probe script
- Inline the provider check via _ANTHROPIC_IMAGE_PROVIDERS frozenset
  instead of duplicating the predicate logic in a function body.
- Drop scripts/verify_anthropic_pixel_cap.py — it was a one-off
  development probe, not a repeatable utility. Moved to local workspace.
2026-05-22 18:31:28 +02:00
yoniebans
2f253a4f55 fix(vision): broaden Anthropic detection + wire clamp into browser/compression paths
- Broaden _is_anthropic_provider to cover claude/claude-code aliases and
  aggregators that proxy Claude (openrouter, nous, vertex, bedrock,
  anthropic-vertex, google-vertex) — same set as
  _supports_media_in_tool_results.
- Wire clamp_dimensions through browser_tool screenshot resize and
  conversation_compression image-shrink recovery, both of which were
  bypassing the clamp.
- Promote Pillow-missing log to warning when clamp was requested.
- Add parametrized tests for _is_anthropic_provider covering 19 cases.
2026-05-22 18:17:22 +02:00
yoniebans
80a0c829d7 test(vision): add wiring regression for provider-gated dimension clamp 2026-05-21 22:21:20 +02:00
yoniebans
f4b32301ce test(vision): add manual verification script for Anthropic pixel cap
Manual script that hits real Anthropic API to confirm: (1) >8000 px images
are still rejected with the same error message, (2) our clamp produces an
image Anthropic accepts. Run when threshold drift is suspected.
2026-05-21 22:16:43 +02:00
yoniebans
b3309f3c0f fix(vision): gate pixel-dimension clamp on Anthropic provider
Anthropic is the only major provider that hard-rejects >8000 px images.
Clamping unconditionally silently downscaled images for OpenAI/Gemini/custom
hosts that could handle larger inputs. Gate the clamp on the active provider
and add an opt-in clamp_dimensions kwarg to _resize_image_for_vision.
2026-05-21 21:53:45 +02:00
yoniebans
2882899925 fix(vision): clamp image dimensions before inline base64 encode
Anthropic's Messages API rejects any image whose width or height
exceeds 8000 px with a non_retryable_client_error 400:

  messages.N.content.M.image.source.base64.data:
    At least one of the image dimensions exceed max allowed size: 8000 pixels

The native vision fast path inlined oversized screenshots (e.g. tall
or panoramic captures from browser_vision / vision_analyze) directly
into the tool-result envelope before any size check.  Once present in
the message history, every subsequent request replayed the same
oversized image and got the same 400 — permanently bricking the
session, since the error is non-retryable.  Recovery required manually
editing the session JSON to drop the poisoned tool result.

Fix:

  * Add _MAX_IMAGE_DIMENSION = 7999 (one px under Anthropic's cap).
  * Add _get_image_dimensions / _image_exceeds_pixel_cap helpers
    (header-only Pillow read, no full decode).
  * _resize_image_for_vision now clamps proportionally to the cap
    before any byte-size work.
  * Three call sites (native fast path + legacy path initial check)
    trigger resize on dimension overflow as well as byte overflow.

Pillow remains a soft dependency: when missing, the dimension check
returns False and the existing byte-size guard remains the last line
of defence (same behaviour as today).

Adds TestPixelDimensionCap covering the helpers, the Pillow-missing
fallback, and the 10000x100 / 100x10000 regression cases.  All 125
tests pass across vision_tools, vision_native_fast_path,
image_shrink_recovery, and image_rejection_fallback.
2026-05-21 11:27:24 +02:00
5 changed files with 329 additions and 13 deletions

View File

@@ -504,7 +504,7 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
return False
try:
from tools.vision_tools import _resize_image_for_vision
from tools.vision_tools import _resize_image_for_vision, _is_anthropic_provider
except Exception as exc:
logger.warning("image-shrink recovery: vision_tools unavailable — %s", exc)
return False
@@ -546,6 +546,7 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
Path(tmp.name),
mime_type=mime,
max_base64_bytes=target_bytes,
clamp_dimensions=_is_anthropic_provider(),
)
finally:
try:

View File

@@ -145,7 +145,7 @@ class TestShrinkImagePartsHelper:
oversized_url = _big_png_data_url(5000) # ~5 MB raw → ~6.7 MB b64
shrunk = "data:image/jpeg;base64," + "A" * 1000 # small
def _fake_resize(path, mime_type=None, max_base64_bytes=None):
def _fake_resize(path, mime_type=None, max_base64_bytes=None, clamp_dimensions=False):
return shrunk
monkeypatch.setattr(

View File

@@ -17,8 +17,11 @@ from tools.vision_tools import (
_image_to_base64_data_url,
_resize_image_for_vision,
_is_image_size_error,
_image_exceeds_pixel_cap,
_get_image_dimensions,
_MAX_BASE64_BYTES,
_RESIZE_TARGET_BYTES,
_MAX_IMAGE_DIMENSION,
vision_analyze_tool,
check_vision_requirements,
)
@@ -890,6 +893,239 @@ class TestResizeImageForVision:
assert len(result) > 100
# ---------------------------------------------------------------------------
# Pixel-dimension cap — Anthropic rejects >8000 px on either axis as
# non_retryable_client_error, which permanently bricks the session via the
# native fast path (image gets inlined into the tool-result envelope before
# the API call fails). See `_MAX_IMAGE_DIMENSION`.
# ---------------------------------------------------------------------------
class TestPixelDimensionCap:
    """Tests for the per-axis pixel cap enforcement."""

    # ------------------------------------------------------------------
    # Private helpers (not collected by pytest: names do not start with
    # ``test``). They replace the Pillow-skip / PNG-write / data-URL-decode
    # boilerplate that every test below would otherwise repeat.
    # ------------------------------------------------------------------

    @staticmethod
    def _require_pillow():
        """Return ``PIL.Image``, skipping the calling test when Pillow is absent."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        return Image

    @classmethod
    def _write_png(cls, tmp_path, name, size, color=(0, 0, 0)):
        """Write a solid-colour RGB PNG of ``size`` under ``tmp_path``; return its path."""
        Image = cls._require_pillow()
        path = tmp_path / name
        Image.new("RGB", size, color).save(path, "PNG")
        return path

    @classmethod
    def _decoded_size(cls, data_url):
        """Decode a base64 data URL and return the image's ``(width, height)``."""
        import base64
        from io import BytesIO

        Image = cls._require_pillow()
        _, b64data = data_url.split(",", 1)
        # Context manager so the decoded image is closed once its size is read.
        with Image.open(BytesIO(base64.b64decode(b64data))) as decoded:
            return decoded.size

    # ------------------------------------------------------------------
    # Constant and helper checks
    # ------------------------------------------------------------------

    def test_max_dimension_below_anthropic_limit(self):
        """The cap must stay strictly below Anthropic's 8000 px hard limit."""
        assert _MAX_IMAGE_DIMENSION < 8000

    def test_get_dimensions_reads_header(self, tmp_path):
        path = self._write_png(tmp_path, "dims.png", (123, 456))
        assert _get_image_dimensions(path) == (123, 456)

    def test_exceeds_cap_wide(self, tmp_path):
        path = self._write_png(
            tmp_path, "wide.png", (_MAX_IMAGE_DIMENSION + 100, 200))
        assert _image_exceeds_pixel_cap(path) is True

    def test_exceeds_cap_tall(self, tmp_path):
        path = self._write_png(
            tmp_path, "tall.png", (200, _MAX_IMAGE_DIMENSION + 100))
        assert _image_exceeds_pixel_cap(path) is True

    def test_within_cap(self, tmp_path):
        path = self._write_png(tmp_path, "ok.png", (4000, 3000))
        assert _image_exceeds_pixel_cap(path) is False

    def test_no_pillow_returns_false(self, tmp_path):
        """Without Pillow, the dimension check is a no-op (byte guard remains)."""
        path = tmp_path / "fake.png"
        path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)
        # A ``None`` entry in sys.modules makes ``from PIL import Image`` raise
        # ImportError, simulating an environment without Pillow.
        with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
            assert _image_exceeds_pixel_cap(path) is False
            assert _get_image_dimensions(path) is None

    # ------------------------------------------------------------------
    # _resize_image_for_vision clamp behaviour
    # ------------------------------------------------------------------

    def test_resize_clamps_oversized_dimension(self, tmp_path):
        """A 10000x100 image must come back ≤ _MAX_IMAGE_DIMENSION on the long side.

        This is the regression test for the bug: an image well under 20 MB
        base64 but >8000 px on one axis would slip through and brick the
        session. After the fix, ``_resize_image_for_vision`` must clamp
        dimensions to the cap regardless of byte size.
        """
        # 10000x100 solid-colour PNG compresses to a few KB — well under any
        # byte cap — but violates the pixel-dimension cap.
        path = self._write_png(
            tmp_path, "wide.png", (10000, 100), (50, 100, 150))

        result = _resize_image_for_vision(
            path, mime_type="image/png", clamp_dimensions=True)
        assert result.startswith("data:image/png;base64,")

        width, height = self._decoded_size(result)
        assert width <= _MAX_IMAGE_DIMENSION
        assert height <= _MAX_IMAGE_DIMENSION

        # Proportional clamp ⇒ ratio preserved within rounding tolerance.
        # Integer truncation of the short side costs ~1% on extreme aspect
        # ratios, so allow 5% drift — in BOTH directions, so that a result
        # whose short side was squashed is rejected as well as one whose
        # long side was over-shrunk.
        original_ratio = 10000 / 100  # 100:1
        new_ratio = width / max(height, 1)
        assert abs(new_ratio - original_ratio) <= original_ratio * 0.05, (
            f"Aspect ratio drifted: {width}x{height} "
            f"(ratio {new_ratio:.1f}, original {original_ratio:.1f})"
        )

    def test_resize_clamps_oversized_tall_image(self, tmp_path):
        """Mirror of the wide case for the tall axis."""
        path = self._write_png(
            tmp_path, "tall.png", (100, 10000), (150, 100, 50))

        result = _resize_image_for_vision(
            path, mime_type="image/png", clamp_dimensions=True)

        width, height = self._decoded_size(result)
        assert width <= _MAX_IMAGE_DIMENSION
        assert height <= _MAX_IMAGE_DIMENSION

    def test_resize_within_cap_not_dimension_clamped(self, tmp_path):
        """An image already within the cap shouldn't be dimension-clamped.

        (Byte-size resize may still apply for very large files, but a 4000x3000
        image small enough in bytes should pass through untouched.)
        """
        path = self._write_png(
            tmp_path, "ok.png", (4000, 3000), (0, 128, 255))

        result = _resize_image_for_vision(
            path, mime_type="image/png", clamp_dimensions=True)

        assert self._decoded_size(result) == (4000, 3000)

    def test_clamp_disabled_by_default_preserves_oversized(self, tmp_path):
        """Non-Anthropic providers (clamp_dimensions=False) keep original dims."""
        path = self._write_png(
            tmp_path, "wide.png", (10000, 100), (50, 100, 150))

        # Default: clamp_dimensions=False — small enough in bytes, so fast-exit.
        result = _resize_image_for_vision(path, mime_type="image/png")

        assert self._decoded_size(result) == (10000, 100)
class TestNativeVisionDimensionWiring:
    """Wiring regression for the native vision path.

    ``_vision_analyze_native`` has to forward a ``clamp_dimensions`` value to
    ``_resize_image_for_vision`` that matches the active provider.
    """

    def _make_oversized_png(self, tmp_path):
        """Create a 10000x100 grey PNG under ``tmp_path`` (skips without Pillow)."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        target = tmp_path / "wide.png"
        canvas = Image.new("RGB", (10000, 100), (128, 128, 128))
        canvas.save(target, "PNG")
        return target

    def test_anthropic_provider_passes_clamp_true(self, tmp_path):
        from tools.vision_tools import _vision_analyze_native

        image_path = self._make_oversized_png(tmp_path)

        provider_patch = patch(
            "tools.vision_tools._is_anthropic_provider", return_value=True)
        resize_patch = patch(
            "tools.vision_tools._resize_image_for_vision",
            return_value="data:image/png;base64,AAAA")
        result_patch = patch(
            "tools.vision_tools._build_native_vision_tool_result",
            return_value={"ok": True})

        with provider_patch, resize_patch as resize_spy, result_patch:
            asyncio.run(_vision_analyze_native(str(image_path), "describe"))

        resize_spy.assert_called_once()
        assert resize_spy.call_args.kwargs.get("clamp_dimensions") is True

    def test_non_anthropic_provider_passes_clamp_false(self, tmp_path):
        from tools.vision_tools import _vision_analyze_native

        image_path = self._make_oversized_png(tmp_path)

        # Non-Anthropic: oversized-but-small-bytes image should skip the resize
        # entirely (byte cap not exceeded, pixel guard gated off).
        provider_patch = patch(
            "tools.vision_tools._is_anthropic_provider", return_value=False)
        resize_patch = patch("tools.vision_tools._resize_image_for_vision")
        result_patch = patch(
            "tools.vision_tools._build_native_vision_tool_result",
            return_value={"ok": True})

        with provider_patch, resize_patch as resize_spy, result_patch:
            asyncio.run(_vision_analyze_native(str(image_path), "describe"))

        resize_spy.assert_not_called()
class TestIsAnthropicProvider:
    """_is_anthropic_provider must cover native Anthropic, common aliases,
    and aggregators that proxy Claude. Same provider set as
    _supports_media_in_tool_results."""

    @staticmethod
    def _detect(**patch_kwargs):
        """Run ``_is_anthropic_provider`` with ``_read_main_provider`` patched.

        ``patch_kwargs`` are forwarded to ``unittest.mock.patch`` (for example
        ``return_value=...`` or ``side_effect=...``).
        """
        from tools.vision_tools import _is_anthropic_provider

        with patch("agent.auxiliary_client._read_main_provider",
                   **patch_kwargs):
            return _is_anthropic_provider()

    @pytest.mark.parametrize("provider", [
        "anthropic", "claude", "claude-code", "anthropic-direct",
        "openrouter", "nous", "vertex", "bedrock",
        "anthropic-vertex", "google-vertex",
    ])
    def test_matches(self, provider):
        assert self._detect(return_value=provider) is True

    @pytest.mark.parametrize("provider", [
        "openai", "openai-chat", "openai-codex", "azure-openai",
        "gemini", "google", "xai", "deepseek", "custom", "",
    ])
    def test_does_not_match(self, provider):
        assert self._detect(return_value=provider) is False

    def test_uppercase_normalized(self):
        # Covers both case folding and surrounding-whitespace stripping.
        assert self._detect(return_value="  ANTHROPIC  ") is True

    def test_none_provider_does_not_match(self):
        """A missing provider (``None``) is treated like an empty name."""
        assert self._detect(return_value=None) is False

    def test_lookup_failure_does_not_match(self):
        """If reading the provider raises, detection falls back to False."""
        assert self._detect(side_effect=RuntimeError("boom")) is False
# ---------------------------------------------------------------------------
# _is_image_size_error — detect size-related API errors
# ---------------------------------------------------------------------------

View File

@@ -3241,6 +3241,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
except Exception as _api_err:
from tools.vision_tools import (
_is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
_is_anthropic_provider,
)
if (_is_image_size_error(_api_err)
and len(data_url) > _RESIZE_TARGET_BYTES):
@@ -3251,7 +3252,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
_RESIZE_TARGET_BYTES / (1024 * 1024),
)
data_url = _resize_image_for_vision(
screenshot_path, mime_type="image/png")
screenshot_path, mime_type="image/png",
clamp_dimensions=_is_anthropic_provider())
call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
response = call_llm(**call_kwargs)
else:

View File

@@ -287,6 +287,63 @@ _MAX_BASE64_BYTES = 20 * 1024 * 1024
# rejects an image, we downscale to this target and retry once.
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
# Anthropic Messages API rejects any image with width or height >8000 px with
# a non_retryable_client_error 400, which poisons message history on the native
# fast path. We clamp to 7999 only when sending to Anthropic; other providers
# (OpenAI, Gemini, custom hosts) auto-downscale or handle their own limits.
# The value is a per-axis maximum in pixels and is deliberately kept one pixel
# under the 8000 px limit.
_MAX_IMAGE_DIMENSION = 7999
# Providers that use Anthropic image-block format and enforce the 8000 px cap.
# Mirrors the Anthropic + aggregators-proxying-Claude subset of
# ``_supports_media_in_tool_results`` below.
# Entries are lower-case: ``_is_anthropic_provider`` strips and lower-cases the
# active provider name before testing membership in this set.
_ANTHROPIC_IMAGE_PROVIDERS = frozenset({
"anthropic", "claude", "claude-code", "anthropic-direct",
"openrouter", "nous", "vertex", "bedrock",
"anthropic-vertex", "google-vertex",
})
def _is_anthropic_provider() -> bool:
"""True if the active main provider uses Anthropic image-block format."""
try:
from agent.auxiliary_client import _read_main_provider
return (_read_main_provider() or "").strip().lower() in _ANTHROPIC_IMAGE_PROVIDERS
except Exception:
return False
def _get_image_dimensions(image_path: Path) -> Optional[tuple]:
    """Look up the pixel size of the image at ``image_path``.

    Returns a ``(width, height)`` tuple, or ``None`` when Pillow is not
    installed or the file cannot be opened as an image.

    Only ``Image.open`` is used, which reads the header without a full
    decode, so the call stays cheap even for large files.
    """
    try:
        from PIL import Image
    except ImportError:
        # Pillow is a soft dependency; callers treat None as "unknown".
        return None

    try:
        with Image.open(image_path) as opened:
            width = opened.width
            height = opened.height
    except Exception as exc:
        logger.debug("Could not read image dimensions for %s: %s", image_path, exc)
        return None
    return (width, height)
def _image_exceeds_pixel_cap(image_path: Path,
                             max_dim: int = _MAX_IMAGE_DIMENSION) -> bool:
    """Tell whether the image is wider or taller than ``max_dim`` pixels.

    Returns ``True`` if either axis is larger than ``max_dim``. Returns
    ``False`` when the dimensions are unknown (Pillow missing or the file is
    unreadable); the caller's byte-size guard is then the last line of
    defence.
    """
    size = _get_image_dimensions(image_path)
    if size is None:
        return False
    width, height = size[0], size[1]
    if width > max_dim:
        return True
    return height > max_dim
def _is_image_size_error(error: Exception) -> bool:
"""Detect if an API error is related to image or payload size."""
@@ -299,7 +356,8 @@ def _is_image_size_error(error: Exception) -> bool:
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
max_base64_bytes: int = _RESIZE_TARGET_BYTES,
clamp_dimensions: bool = False) -> str:
"""Convert an image to a base64 data URL, auto-resizing if too large.
Tries Pillow first to progressively downscale oversized images. If Pillow
@@ -309,10 +367,11 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
Returns the base64 data URL string.
"""
# Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
# Skip the expensive full-read + encode if Pillow can resize directly.
# Skip the fast-exit when caller wants dimension clamping (Anthropic).
file_size = image_path.stat().st_size
estimated_b64 = (file_size * 4) // 3 + 100 # ~header overhead
if estimated_b64 <= max_base64_bytes:
needs_dim_clamp = clamp_dimensions and _image_exceeds_pixel_cap(image_path)
if estimated_b64 <= max_base64_bytes and not needs_dim_clamp:
# Small enough — just encode directly.
data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
if len(data_url) <= max_base64_bytes:
@@ -350,6 +409,18 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
if pil_format == "JPEG" and img.mode in {"RGBA", "P"}:
img = img.convert("RGB")
# Dimension clamp (Anthropic only): proportionally shrink to fit the cap
# before any byte-size work, since Anthropic rejects on dimensions alone.
if clamp_dimensions and (img.width > _MAX_IMAGE_DIMENSION or img.height > _MAX_IMAGE_DIMENSION):
scale = _MAX_IMAGE_DIMENSION / max(img.width, img.height)
new_w = max(int(img.width * scale), 1)
new_h = max(int(img.height * scale), 1)
logger.info(
"Image %dx%d exceeds %d px cap, clamping to %dx%d",
img.width, img.height, _MAX_IMAGE_DIMENSION, new_w, new_h,
)
img = img.resize((new_w, new_h), Image.LANCZOS)
# Strategy: halve dimensions until base64 fits, up to 4 rounds.
# For JPEG, also try reducing quality at each size step.
# For PNG, quality is irrelevant — only dimension reduction helps.
@@ -594,10 +665,13 @@ async def _vision_analyze_native(
temp_image_path, mime_type=detected_mime_type,
)
# Honour the same hard cap as the legacy path. Resize if needed.
if len(image_data_url) > _MAX_BASE64_BYTES:
# Honour the byte cap; also apply the pixel cap for Anthropic only.
is_anthropic = _is_anthropic_provider()
if (len(image_data_url) > _MAX_BASE64_BYTES
or (is_anthropic and _image_exceeds_pixel_cap(temp_image_path))):
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type,
clamp_dimensions=is_anthropic,
)
if len(image_data_url) > _MAX_BASE64_BYTES:
return tool_error(
@@ -738,11 +812,13 @@ async def vision_analyze_tool(
data_size_kb = len(image_data_url) / 1024
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
# Hard limit (20 MB) — no provider accepts payloads this large.
if len(image_data_url) > _MAX_BASE64_BYTES:
# Try to resize down to 5 MB before giving up.
# Hard 20 MB byte cap; pixel cap applies to Anthropic only.
is_anthropic = _is_anthropic_provider()
if (len(image_data_url) > _MAX_BASE64_BYTES
or (is_anthropic and _image_exceeds_pixel_cap(temp_image_path))):
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
temp_image_path, mime_type=detected_mime_type,
clamp_dimensions=is_anthropic)
if len(image_data_url) > _MAX_BASE64_BYTES:
raise ValueError(
f"Image too large for vision API: base64 payload is "
@@ -818,7 +894,8 @@ async def vision_analyze_tool(
_RESIZE_TARGET_BYTES / (1024 * 1024),
)
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
temp_image_path, mime_type=detected_mime_type,
clamp_dimensions=_is_anthropic_provider())
messages[0]["content"][1]["image_url"]["url"] = image_data_url
response = await async_call_llm(**call_kwargs)
else: