mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-10 20:29:00 +08:00
Compare commits
8 Commits
ethie/fix-
...
fix/vision
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f3058fbec7 | ||
|
|
bd3bad232b | ||
|
|
a2546ed4fe | ||
|
|
2f253a4f55 | ||
|
|
80a0c829d7 | ||
|
|
f4b32301ce | ||
|
|
b3309f3c0f | ||
|
|
2882899925 |
@@ -504,7 +504,7 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
|
||||
return False
|
||||
|
||||
try:
|
||||
from tools.vision_tools import _resize_image_for_vision
|
||||
from tools.vision_tools import _resize_image_for_vision, _is_anthropic_provider
|
||||
except Exception as exc:
|
||||
logger.warning("image-shrink recovery: vision_tools unavailable — %s", exc)
|
||||
return False
|
||||
@@ -546,6 +546,7 @@ def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
|
||||
Path(tmp.name),
|
||||
mime_type=mime,
|
||||
max_base64_bytes=target_bytes,
|
||||
clamp_dimensions=_is_anthropic_provider(),
|
||||
)
|
||||
finally:
|
||||
try:
|
||||
|
||||
@@ -145,7 +145,7 @@ class TestShrinkImagePartsHelper:
|
||||
oversized_url = _big_png_data_url(5000) # ~5 MB raw → ~6.7 MB b64
|
||||
shrunk = "data:image/jpeg;base64," + "A" * 1000 # small
|
||||
|
||||
def _fake_resize(path, mime_type=None, max_base64_bytes=None):
|
||||
def _fake_resize(path, mime_type=None, max_base64_bytes=None, clamp_dimensions=False):
|
||||
return shrunk
|
||||
|
||||
monkeypatch.setattr(
|
||||
|
||||
@@ -17,8 +17,11 @@ from tools.vision_tools import (
|
||||
_image_to_base64_data_url,
|
||||
_resize_image_for_vision,
|
||||
_is_image_size_error,
|
||||
_image_exceeds_pixel_cap,
|
||||
_get_image_dimensions,
|
||||
_MAX_BASE64_BYTES,
|
||||
_RESIZE_TARGET_BYTES,
|
||||
_MAX_IMAGE_DIMENSION,
|
||||
vision_analyze_tool,
|
||||
check_vision_requirements,
|
||||
)
|
||||
@@ -890,6 +893,239 @@ class TestResizeImageForVision:
|
||||
assert len(result) > 100
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pixel-dimension cap — Anthropic rejects >8000 px on either axis as
|
||||
# non_retryable_client_error, which permanently bricks the session via the
|
||||
# native fast path (image gets inlined into the tool-result envelope before
|
||||
# the API call fails). See `_MAX_IMAGE_DIMENSION`.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPixelDimensionCap:
    """Per-axis pixel cap: the constant, the detection helpers, and the
    clamping performed by ``_resize_image_for_vision``."""

    @staticmethod
    def _pil_image():
        """Return the ``PIL.Image`` module, skipping the test without Pillow."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        return Image

    @classmethod
    def _save_png(cls, path, size, colour):
        """Write a solid-colour RGB PNG of ``size`` to ``path``; return ``path``."""
        cls._pil_image().new("RGB", size, colour).save(path, "PNG")
        return path

    @classmethod
    def _decode_data_url(cls, data_url):
        """Open the image carried in the base64 payload of ``data_url``."""
        import base64
        from io import BytesIO

        _, b64data = data_url.split(",", 1)
        return cls._pil_image().open(BytesIO(base64.b64decode(b64data)))

    def test_max_dimension_below_anthropic_limit(self):
        """The cap has to sit strictly under Anthropic's 8000 px hard limit."""
        assert _MAX_IMAGE_DIMENSION < 8000

    def test_get_dimensions_reads_header(self, tmp_path):
        """``_get_image_dimensions`` reports ``(width, height)`` of a saved PNG."""
        path = self._save_png(tmp_path / "dims.png", (123, 456), (0, 0, 0))
        assert _get_image_dimensions(path) == (123, 456)

    def test_exceeds_cap_wide(self, tmp_path):
        """Width over the cap is flagged."""
        path = self._save_png(
            tmp_path / "wide.png", (_MAX_IMAGE_DIMENSION + 100, 200), (0, 0, 0)
        )
        assert _image_exceeds_pixel_cap(path) is True

    def test_exceeds_cap_tall(self, tmp_path):
        """Height over the cap is flagged."""
        path = self._save_png(
            tmp_path / "tall.png", (200, _MAX_IMAGE_DIMENSION + 100), (0, 0, 0)
        )
        assert _image_exceeds_pixel_cap(path) is True

    def test_within_cap(self, tmp_path):
        """An image with both axes under the cap is not flagged."""
        path = self._save_png(tmp_path / "ok.png", (4000, 3000), (0, 0, 0))
        assert _image_exceeds_pixel_cap(path) is False

    def test_no_pillow_returns_false(self, tmp_path):
        """With Pillow unimportable the dimension check degrades to a no-op
        (the byte-size guard is what remains)."""
        path = tmp_path / "fake.png"
        path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)
        # A ``None`` entry in sys.modules makes ``import PIL`` raise ImportError.
        with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
            assert _image_exceeds_pixel_cap(path) is False
            assert _get_image_dimensions(path) is None

    def test_resize_clamps_oversized_dimension(self, tmp_path):
        """A 10000x100 image must come back within the cap on the long side.

        Regression test: an image far below the byte cap but over 8000 px on
        one axis used to slip through. With ``clamp_dimensions=True`` the
        resize helper must clamp dimensions irrespective of byte size.
        """
        # A solid-colour 10000x100 PNG is only a few KB, so only the pixel
        # cap (never the byte cap) can trigger here.
        path = self._save_png(tmp_path / "wide.png", (10000, 100), (50, 100, 150))

        result = _resize_image_for_vision(
            path, mime_type="image/png", clamp_dimensions=True
        )
        assert result.startswith("data:image/png;base64,")

        decoded = self._decode_data_url(result)
        assert decoded.width <= _MAX_IMAGE_DIMENSION
        assert decoded.height <= _MAX_IMAGE_DIMENSION

        # The clamp is proportional, so the aspect ratio should roughly
        # survive; allow some slack for integer rounding of the new size.
        original_ratio = 10000 / 100  # 100:1
        new_ratio = decoded.width / max(decoded.height, 1)
        assert new_ratio >= original_ratio * 0.95, (
            f"Aspect ratio drifted: {decoded.width}x{decoded.height} "
            f"(ratio {new_ratio:.1f}, original {original_ratio:.1f})"
        )

    def test_resize_clamps_oversized_tall_image(self, tmp_path):
        """Same as the wide case, with the oversized axis being the height."""
        path = self._save_png(tmp_path / "tall.png", (100, 10000), (150, 100, 50))

        result = _resize_image_for_vision(
            path, mime_type="image/png", clamp_dimensions=True
        )
        decoded = self._decode_data_url(result)
        assert decoded.width <= _MAX_IMAGE_DIMENSION
        assert decoded.height <= _MAX_IMAGE_DIMENSION

    def test_resize_within_cap_not_dimension_clamped(self, tmp_path):
        """An image already inside the cap keeps its dimensions.

        (A byte-size resize could still kick in for very large files, but a
        4000x3000 image that is small in bytes should pass through untouched.)
        """
        path = self._save_png(tmp_path / "ok.png", (4000, 3000), (0, 128, 255))
        result = _resize_image_for_vision(
            path, mime_type="image/png", clamp_dimensions=True
        )
        assert self._decode_data_url(result).size == (4000, 3000)

    def test_clamp_disabled_by_default_preserves_oversized(self, tmp_path):
        """With the default ``clamp_dimensions=False`` (non-Anthropic
        providers) the original dimensions are kept."""
        path = self._save_png(tmp_path / "wide.png", (10000, 100), (50, 100, 150))
        # No clamp requested and the file is small in bytes, so the helper
        # takes its direct-encode fast exit.
        result = _resize_image_for_vision(path, mime_type="image/png")
        assert self._decode_data_url(result).size == (10000, 100)
|
||||
|
||||
|
||||
class TestNativeVisionDimensionWiring:
    """Wiring regression: ``_vision_analyze_native`` must hand
    ``_resize_image_for_vision`` a ``clamp_dimensions`` value that matches the
    active provider."""

    def _make_oversized_png(self, tmp_path):
        """Create a 10000x100 grey PNG under ``tmp_path`` and return its path."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        path = tmp_path / "wide.png"
        Image.new("RGB", (10000, 100), (128, 128, 128)).save(path, "PNG")
        return path

    def test_anthropic_provider_passes_clamp_true(self, tmp_path):
        """Anthropic provider: resize is called once with ``clamp_dimensions=True``."""
        from tools.vision_tools import _vision_analyze_native

        path = self._make_oversized_png(tmp_path)

        provider_patch = patch(
            "tools.vision_tools._is_anthropic_provider", return_value=True
        )
        resize_patch = patch(
            "tools.vision_tools._resize_image_for_vision",
            return_value="data:image/png;base64,AAAA",
        )
        result_patch = patch(
            "tools.vision_tools._build_native_vision_tool_result",
            return_value={"ok": True},
        )
        with provider_patch, resize_patch as resize_spy, result_patch:
            asyncio.run(_vision_analyze_native(str(path), "describe"))

        resize_spy.assert_called_once()
        assert resize_spy.call_args.kwargs.get("clamp_dimensions") is True

    def test_non_anthropic_provider_passes_clamp_false(self, tmp_path):
        """Non-Anthropic provider: an oversized-but-small-bytes image skips
        the resize entirely (byte cap not hit, pixel guard gated off)."""
        from tools.vision_tools import _vision_analyze_native

        path = self._make_oversized_png(tmp_path)

        provider_patch = patch(
            "tools.vision_tools._is_anthropic_provider", return_value=False
        )
        resize_patch = patch("tools.vision_tools._resize_image_for_vision")
        result_patch = patch(
            "tools.vision_tools._build_native_vision_tool_result",
            return_value={"ok": True},
        )
        with provider_patch, resize_patch as resize_spy, result_patch:
            asyncio.run(_vision_analyze_native(str(path), "describe"))

        resize_spy.assert_not_called()
|
||||
|
||||
|
||||
class TestIsAnthropicProvider:
    """``_is_anthropic_provider`` must recognise native Anthropic, its common
    aliases, and aggregators that proxy Claude. Same provider set as
    ``_supports_media_in_tool_results``."""

    @staticmethod
    def _detect(provider_name):
        """Run ``_is_anthropic_provider`` with the main provider stubbed."""
        from tools.vision_tools import _is_anthropic_provider

        stub = patch(
            "agent.auxiliary_client._read_main_provider",
            return_value=provider_name,
        )
        with stub:
            return _is_anthropic_provider()

    @pytest.mark.parametrize("provider", [
        "anthropic", "claude", "claude-code", "anthropic-direct",
        "openrouter", "nous", "vertex", "bedrock",
        "anthropic-vertex", "google-vertex",
    ])
    def test_matches(self, provider):
        """Every listed provider name is reported as Anthropic-style."""
        assert self._detect(provider) is True

    @pytest.mark.parametrize("provider", [
        "openai", "openai-chat", "openai-codex", "azure-openai",
        "gemini", "google", "xai", "deepseek", "custom", "",
    ])
    def test_does_not_match(self, provider):
        """Other providers, and the empty string, are not matched."""
        assert self._detect(provider) is False

    def test_uppercase_normalized(self):
        """Surrounding whitespace and upper case are normalised before lookup."""
        assert self._detect("  ANTHROPIC  ") is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _is_image_size_error — detect size-related API errors
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -3241,6 +3241,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
||||
except Exception as _api_err:
|
||||
from tools.vision_tools import (
|
||||
_is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
|
||||
_is_anthropic_provider,
|
||||
)
|
||||
if (_is_image_size_error(_api_err)
|
||||
and len(data_url) > _RESIZE_TARGET_BYTES):
|
||||
@@ -3251,7 +3252,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
||||
_RESIZE_TARGET_BYTES / (1024 * 1024),
|
||||
)
|
||||
data_url = _resize_image_for_vision(
|
||||
screenshot_path, mime_type="image/png")
|
||||
screenshot_path, mime_type="image/png",
|
||||
clamp_dimensions=_is_anthropic_provider())
|
||||
call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
|
||||
response = call_llm(**call_kwargs)
|
||||
else:
|
||||
|
||||
@@ -287,6 +287,63 @@ _MAX_BASE64_BYTES = 20 * 1024 * 1024
|
||||
# rejects an image, we downscale to this target and retry once.
|
||||
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
|
||||
|
||||
# Anthropic Messages API rejects any image with width or height >8000 px with
|
||||
# a non_retryable_client_error 400, which poisons message history on the native
|
||||
# fast path. We clamp to 7999 only when sending to Anthropic; other providers
|
||||
# (OpenAI, Gemini, custom hosts) auto-downscale or handle their own limits.
|
||||
_MAX_IMAGE_DIMENSION = 7999
|
||||
|
||||
|
||||
# Providers that use Anthropic image-block format and enforce the 8000 px cap.
|
||||
# Mirrors the Anthropic + aggregators-proxying-Claude subset of
|
||||
# ``_supports_media_in_tool_results`` below.
|
||||
_ANTHROPIC_IMAGE_PROVIDERS = frozenset({
|
||||
"anthropic", "claude", "claude-code", "anthropic-direct",
|
||||
"openrouter", "nous", "vertex", "bedrock",
|
||||
"anthropic-vertex", "google-vertex",
|
||||
})
|
||||
|
||||
|
||||
def _is_anthropic_provider() -> bool:
    """Report whether the active main provider uses Anthropic image blocks.

    The name returned by ``agent.auxiliary_client._read_main_provider`` is
    stripped and lower-cased, then looked up in ``_ANTHROPIC_IMAGE_PROVIDERS``.

    Returns:
        True when the normalised provider name is in the set. False otherwise,
        including when the provider cannot be determined (import failure or
        any error raised while reading it).
    """
    try:
        from agent.auxiliary_client import _read_main_provider

        provider = (_read_main_provider() or "").strip().lower()
    except Exception as exc:
        # Deliberately best-effort: detection must never break the caller.
        # Log the failure so a silently disabled pixel clamp can be diagnosed.
        logger.debug("Could not determine main provider: %s", exc)
        return False
    return provider in _ANTHROPIC_IMAGE_PROVIDERS
|
||||
|
||||
|
||||
def _get_image_dimensions(image_path: Path) -> Optional[tuple]:
    """Look up the pixel size of an image file.

    Args:
        image_path: Location of the image on disk.

    Returns:
        ``(width, height)``, or ``None`` when Pillow cannot be imported or the
        file cannot be opened as an image.

    ``Image.open`` is lazy (it identifies the file without decoding the pixel
    data), so the lookup stays cheap even for large files.
    """
    try:
        from PIL import Image
    except ImportError:
        return None

    try:
        with Image.open(image_path) as handle:
            width, height = handle.size
    except Exception as exc:
        logger.debug("Could not read image dimensions for %s: %s", image_path, exc)
        return None
    return (width, height)
|
||||
|
||||
|
||||
def _image_exceeds_pixel_cap(image_path: Path,
                             max_dim: int = _MAX_IMAGE_DIMENSION) -> bool:
    """Tell whether the image's width or height is larger than ``max_dim``.

    Args:
        image_path: Location of the image on disk.
        max_dim: Largest allowed size, in pixels, for either axis.

    Returns:
        True if at least one axis is over ``max_dim``. False if both fit, and
        also when the dimensions are unavailable (Pillow missing or unreadable
        file); the caller's byte-size guard is then the remaining defence.
    """
    size = _get_image_dimensions(image_path)
    if size is None:
        return False
    width, height = size
    return max(width, height) > max_dim
|
||||
|
||||
|
||||
def _is_image_size_error(error: Exception) -> bool:
|
||||
"""Detect if an API error is related to image or payload size."""
|
||||
@@ -299,7 +356,8 @@ def _is_image_size_error(error: Exception) -> bool:
|
||||
|
||||
|
||||
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
|
||||
max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
|
||||
max_base64_bytes: int = _RESIZE_TARGET_BYTES,
|
||||
clamp_dimensions: bool = False) -> str:
|
||||
"""Convert an image to a base64 data URL, auto-resizing if too large.
|
||||
|
||||
Tries Pillow first to progressively downscale oversized images. If Pillow
|
||||
@@ -309,10 +367,11 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
|
||||
Returns the base64 data URL string.
|
||||
"""
|
||||
# Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
|
||||
# Skip the expensive full-read + encode if Pillow can resize directly.
|
||||
# Skip the fast-exit when caller wants dimension clamping (Anthropic).
|
||||
file_size = image_path.stat().st_size
|
||||
estimated_b64 = (file_size * 4) // 3 + 100 # ~header overhead
|
||||
if estimated_b64 <= max_base64_bytes:
|
||||
needs_dim_clamp = clamp_dimensions and _image_exceeds_pixel_cap(image_path)
|
||||
if estimated_b64 <= max_base64_bytes and not needs_dim_clamp:
|
||||
# Small enough — just encode directly.
|
||||
data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
|
||||
if len(data_url) <= max_base64_bytes:
|
||||
@@ -350,6 +409,18 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
|
||||
if pil_format == "JPEG" and img.mode in {"RGBA", "P"}:
|
||||
img = img.convert("RGB")
|
||||
|
||||
# Dimension clamp (Anthropic only): proportionally shrink to fit the cap
|
||||
# before any byte-size work, since Anthropic rejects on dimensions alone.
|
||||
if clamp_dimensions and (img.width > _MAX_IMAGE_DIMENSION or img.height > _MAX_IMAGE_DIMENSION):
|
||||
scale = _MAX_IMAGE_DIMENSION / max(img.width, img.height)
|
||||
new_w = max(int(img.width * scale), 1)
|
||||
new_h = max(int(img.height * scale), 1)
|
||||
logger.info(
|
||||
"Image %dx%d exceeds %d px cap, clamping to %dx%d",
|
||||
img.width, img.height, _MAX_IMAGE_DIMENSION, new_w, new_h,
|
||||
)
|
||||
img = img.resize((new_w, new_h), Image.LANCZOS)
|
||||
|
||||
# Strategy: halve dimensions until base64 fits, up to 4 rounds.
|
||||
# For JPEG, also try reducing quality at each size step.
|
||||
# For PNG, quality is irrelevant — only dimension reduction helps.
|
||||
@@ -594,10 +665,13 @@ async def _vision_analyze_native(
|
||||
temp_image_path, mime_type=detected_mime_type,
|
||||
)
|
||||
|
||||
# Honour the same hard cap as the legacy path. Resize if needed.
|
||||
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||
# Honour the byte cap; also apply the pixel cap for Anthropic only.
|
||||
is_anthropic = _is_anthropic_provider()
|
||||
if (len(image_data_url) > _MAX_BASE64_BYTES
|
||||
or (is_anthropic and _image_exceeds_pixel_cap(temp_image_path))):
|
||||
image_data_url = _resize_image_for_vision(
|
||||
temp_image_path, mime_type=detected_mime_type,
|
||||
clamp_dimensions=is_anthropic,
|
||||
)
|
||||
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||
return tool_error(
|
||||
@@ -738,11 +812,13 @@ async def vision_analyze_tool(
|
||||
data_size_kb = len(image_data_url) / 1024
|
||||
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
|
||||
|
||||
# Hard limit (20 MB) — no provider accepts payloads this large.
|
||||
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||
# Try to resize down to 5 MB before giving up.
|
||||
# Hard 20 MB byte cap; pixel cap applies to Anthropic only.
|
||||
is_anthropic = _is_anthropic_provider()
|
||||
if (len(image_data_url) > _MAX_BASE64_BYTES
|
||||
or (is_anthropic and _image_exceeds_pixel_cap(temp_image_path))):
|
||||
image_data_url = _resize_image_for_vision(
|
||||
temp_image_path, mime_type=detected_mime_type)
|
||||
temp_image_path, mime_type=detected_mime_type,
|
||||
clamp_dimensions=is_anthropic)
|
||||
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||
raise ValueError(
|
||||
f"Image too large for vision API: base64 payload is "
|
||||
@@ -818,7 +894,8 @@ async def vision_analyze_tool(
|
||||
_RESIZE_TARGET_BYTES / (1024 * 1024),
|
||||
)
|
||||
image_data_url = _resize_image_for_vision(
|
||||
temp_image_path, mime_type=detected_mime_type)
|
||||
temp_image_path, mime_type=detected_mime_type,
|
||||
clamp_dimensions=_is_anthropic_provider())
|
||||
messages[0]["content"][1]["image_url"]["url"] = image_data_url
|
||||
response = await async_call_llm(**call_kwargs)
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user