mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-30 16:01:49 +08:00
Compare commits
1 Commits
fix/plugin
...
hermes/her
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c92b93a118 |
@@ -383,7 +383,14 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
|
|||||||
|
|
||||||
# Extract capability flags (default to False if missing)
|
# Extract capability flags (default to False if missing)
|
||||||
supports_tools = bool(entry.get("tool_call", False))
|
supports_tools = bool(entry.get("tool_call", False))
|
||||||
supports_vision = bool(entry.get("attachment", False))
|
# Vision: check both the `attachment` flag and `modalities.input` for "image".
|
||||||
|
# Some models (e.g. gemma-4) list image in input modalities but not attachment.
|
||||||
|
input_mods = entry.get("modalities", {})
|
||||||
|
if isinstance(input_mods, dict):
|
||||||
|
input_mods = input_mods.get("input", [])
|
||||||
|
else:
|
||||||
|
input_mods = []
|
||||||
|
supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
|
||||||
supports_reasoning = bool(entry.get("reasoning", False))
|
supports_reasoning = bool(entry.get("reasoning", False))
|
||||||
|
|
||||||
# Extract limits
|
# Extract limits
|
||||||
|
|||||||
@@ -381,7 +381,7 @@ DEFAULT_CONFIG = {
|
|||||||
"model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o"
|
"model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o"
|
||||||
"base_url": "", # direct OpenAI-compatible endpoint (takes precedence over provider)
|
"base_url": "", # direct OpenAI-compatible endpoint (takes precedence over provider)
|
||||||
"api_key": "", # API key for base_url (falls back to OPENAI_API_KEY)
|
"api_key": "", # API key for base_url (falls back to OPENAI_API_KEY)
|
||||||
"timeout": 30, # seconds — LLM API call timeout; increase for slow local vision models
|
"timeout": 120, # seconds — LLM API call timeout; vision payloads need generous timeout
|
||||||
"download_timeout": 30, # seconds — image HTTP download timeout; increase for slow connections
|
"download_timeout": 30, # seconds — image HTTP download timeout; increase for slow connections
|
||||||
},
|
},
|
||||||
"web_extract": {
|
"web_extract": {
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from agent.models_dev import (
|
|||||||
PROVIDER_TO_MODELS_DEV,
|
PROVIDER_TO_MODELS_DEV,
|
||||||
_extract_context,
|
_extract_context,
|
||||||
fetch_models_dev,
|
fetch_models_dev,
|
||||||
|
get_model_capabilities,
|
||||||
lookup_models_dev_context,
|
lookup_models_dev_context,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -195,3 +196,88 @@ class TestFetchModelsDev:
|
|||||||
result = fetch_models_dev()
|
result = fetch_models_dev()
|
||||||
mock_get.assert_not_called()
|
mock_get.assert_not_called()
|
||||||
assert result == SAMPLE_REGISTRY
|
assert result == SAMPLE_REGISTRY
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# get_model_capabilities — vision via modalities.input
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# Minimal registry fixture for the capability tests below.  Each provider
# entry carries an "id" and a "models" mapping keyed by model id.
CAPS_REGISTRY = {
    "google": {
        "id": "google",
        "models": {
            # Image listed under modalities.input while attachment is False.
            "gemma-4-31b-it": {
                "id": "gemma-4-31b-it",
                "attachment": False,
                "tool_call": True,
                "modalities": {"input": ["text", "image"]},
                "limit": {"context": 128000, "output": 8192},
            },
            # Neither an attachment flag nor a modalities entry.
            "gemma-3-1b": {
                "id": "gemma-3-1b",
                "tool_call": True,
                "limit": {"context": 32000, "output": 8192},
            },
        },
    },
    "anthropic": {
        "id": "anthropic",
        "models": {
            # attachment=True and no modalities entry.
            "claude-sonnet-4": {
                "id": "claude-sonnet-4",
                "attachment": True,
                "tool_call": True,
                "limit": {"context": 200000, "output": 64000},
            },
        },
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetModelCapabilities:
    """Vision-detection behaviour of ``get_model_capabilities``."""

    @staticmethod
    def _lookup(provider, model, registry=CAPS_REGISTRY):
        """Call get_model_capabilities with fetch_models_dev patched to *registry*."""
        with patch("agent.models_dev.fetch_models_dev", return_value=registry):
            return get_model_capabilities(provider, model)

    def test_vision_from_attachment_flag(self):
        """attachment=True alone is enough to report vision support."""
        caps = self._lookup("anthropic", "claude-sonnet-4")
        assert caps is not None
        assert caps.supports_vision is True

    def test_vision_from_modalities_input_image(self):
        """'image' in modalities.input counts as vision even when
        attachment=False (the core fix in this PR)."""
        caps = self._lookup("google", "gemma-4-31b-it")
        assert caps is not None
        assert caps.supports_vision is True

    def test_no_vision_without_attachment_or_modalities(self):
        """No attachment flag and no image modality means no vision."""
        caps = self._lookup("google", "gemma-3-1b")
        assert caps is not None
        assert caps.supports_vision is False

    def test_modalities_non_dict_handled(self):
        """A modalities value that is not a dict must not crash the lookup."""
        weird_entry = {
            "id": "weird-model",
            "modalities": "text",  # not a dict
            "limit": {"context": 200000, "output": 8192},
        }
        registry = {
            "google": {"id": "google", "models": {"weird-model": weird_entry}},
        }
        caps = self._lookup("gemini", "weird-model", registry)
        assert caps is not None
        assert caps.supports_vision is False

    def test_model_not_found_returns_none(self):
        """An unknown model id yields None."""
        caps = self._lookup("anthropic", "nonexistent-model")
        assert caps is None
|
||||||
|
|||||||
@@ -15,6 +15,10 @@ from tools.vision_tools import (
|
|||||||
_handle_vision_analyze,
|
_handle_vision_analyze,
|
||||||
_determine_mime_type,
|
_determine_mime_type,
|
||||||
_image_to_base64_data_url,
|
_image_to_base64_data_url,
|
||||||
|
_resize_image_for_vision,
|
||||||
|
_is_image_size_error,
|
||||||
|
_MAX_BASE64_BYTES,
|
||||||
|
_RESIZE_TARGET_BYTES,
|
||||||
vision_analyze_tool,
|
vision_analyze_tool,
|
||||||
check_vision_requirements,
|
check_vision_requirements,
|
||||||
get_debug_session_info,
|
get_debug_session_info,
|
||||||
@@ -590,11 +594,13 @@ class TestBase64SizeLimit:
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_oversized_image_rejected_before_api_call(self, tmp_path):
|
async def test_oversized_image_rejected_before_api_call(self, tmp_path):
|
||||||
"""Images exceeding 5 MB base64 should fail with a clear size error."""
|
"""Images exceeding the 20 MB hard limit should fail with a clear error."""
|
||||||
img = tmp_path / "huge.png"
|
img = tmp_path / "huge.png"
|
||||||
img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (4 * 1024 * 1024))
|
img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (4 * 1024 * 1024))
|
||||||
|
|
||||||
with patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
|
# Patch the hard limit to a small value so the test runs fast.
|
||||||
|
with patch("tools.vision_tools._MAX_BASE64_BYTES", 1000), \
|
||||||
|
patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
|
||||||
result = json.loads(await vision_analyze_tool(str(img), "describe this"))
|
result = json.loads(await vision_analyze_tool(str(img), "describe this"))
|
||||||
|
|
||||||
assert result["success"] is False
|
assert result["success"] is False
|
||||||
@@ -686,3 +692,124 @@ class TestVisionRegistration:
|
|||||||
|
|
||||||
entry = registry._tools.get("vision_analyze")
|
entry = registry._tools.get("vision_analyze")
|
||||||
assert callable(entry.handler)
|
assert callable(entry.handler)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _resize_image_for_vision — auto-resize oversized images
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestResizeImageForVision:
    """Tests for the auto-resize function."""

    def test_small_image_returned_as_is(self, tmp_path):
        """Images under the limit should be returned unchanged."""
        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed")
        # Create a small 10x10 red PNG
        img = Image.new("RGB", (10, 10), (255, 0, 0))
        path = tmp_path / "small.png"
        img.save(path, "PNG")

        result = _resize_image_for_vision(path, mime_type="image/png")
        assert result.startswith("data:image/png;base64,")
        assert len(result) < _MAX_BASE64_BYTES
        # "Unchanged" means byte-identical to the plain encoding of the file.
        assert result == _image_to_base64_data_url(path, mime_type="image/png")

    def test_large_image_is_resized(self, tmp_path):
        """Images over the default target should be auto-resized to fit."""
        import base64
        import io
        import os

        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed")
        # A solid-colour image compresses to a few KB as PNG and would never
        # reach the resize path.  Random noise is incompressible, so this
        # file really does exceed the 5 MB base64 target.
        side = 1400
        img = Image.frombytes("RGB", (side, side), os.urandom(side * side * 3))
        path = tmp_path / "large.png"
        img.save(path, "PNG")
        # Precondition: the fixture is genuinely oversized.
        assert path.stat().st_size * 4 // 3 > _RESIZE_TARGET_BYTES

        result = _resize_image_for_vision(path, mime_type="image/png")
        assert result.startswith("data:image/png;base64,")
        # Default target is _RESIZE_TARGET_BYTES (5 MB), not _MAX_BASE64_BYTES (20 MB)
        assert len(result) <= _RESIZE_TARGET_BYTES
        # The payload must still decode to an image, and a smaller one.
        payload = result.split(",", 1)[1]
        with Image.open(io.BytesIO(base64.b64decode(payload))) as resized:
            assert resized.width < side
            assert resized.height < side

    def test_custom_max_bytes(self, tmp_path):
        """The max_base64_bytes parameter should be respected."""
        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed")
        img = Image.new("RGB", (200, 200), (0, 128, 255))
        path = tmp_path / "medium.png"
        img.save(path, "PNG")

        # Set a very low limit to force resizing
        result = _resize_image_for_vision(path, max_base64_bytes=500)
        # Should still return a valid data URL
        assert result.startswith("data:image/")

    def test_jpeg_output_for_non_png(self, tmp_path):
        """Non-PNG images should be resized as JPEG."""
        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed")
        img = Image.new("RGB", (2000, 2000), (255, 128, 0))
        path = tmp_path / "photo.jpg"
        img.save(path, "JPEG", quality=95)

        result = _resize_image_for_vision(path, mime_type="image/jpeg",
                                          max_base64_bytes=50_000)
        assert result.startswith("data:image/jpeg;base64,")

    def test_constants_sane(self):
        """Hard limit should be larger than resize target."""
        assert _MAX_BASE64_BYTES == 20 * 1024 * 1024
        assert _RESIZE_TARGET_BYTES == 5 * 1024 * 1024
        assert _MAX_BASE64_BYTES > _RESIZE_TARGET_BYTES

    def test_no_pillow_returns_original(self, tmp_path):
        """Without Pillow, oversized images should be returned as-is."""
        # Create a dummy file with enough bytes to exceed a tiny limit
        path = tmp_path / "test.png"
        path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 1000)

        oversized_url = "data:image/png;base64," + "A" * 200
        # A None entry in sys.modules makes ``from PIL import Image`` raise
        # ImportError, simulating a missing Pillow install.
        with patch("tools.vision_tools._image_to_base64_data_url") as mock_b64, \
                patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
            # Simulate a large base64 result
            mock_b64.return_value = oversized_url
            result = _resize_image_for_vision(path, max_base64_bytes=100)

        # Should return the original (oversized) data url
        assert len(result) > 100
        assert result == oversized_url
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _is_image_size_error — detect size-related API errors
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsImageSizeError:
    """Checks for the helper that spots size-related API failures."""

    def test_too_large_message(self):
        """A 'too large' payload message is classified as a size error."""
        err = Exception("Request payload too large")
        assert _is_image_size_error(err)

    def test_413_status(self):
        """An HTTP 413 message is classified as a size error."""
        err = Exception("HTTP 413 Payload Too Large")
        assert _is_image_size_error(err)

    def test_invalid_request(self):
        """An invalid_request_error message is classified as a size error."""
        err = Exception("invalid_request_error: image too big")
        assert _is_image_size_error(err)

    def test_exceeds_limit(self):
        """An 'exceeds' message is classified as a size error."""
        err = Exception("Image exceeds maximum size")
        assert _is_image_size_error(err)

    def test_unrelated_error(self):
        """A connection failure is not a size error."""
        err = Exception("Connection refused")
        assert not _is_image_size_error(err)

    def test_auth_error(self):
        """An authentication failure is not a size error."""
        err = Exception("401 Unauthorized")
        assert not _is_image_size_error(err)

    def test_empty_message(self):
        """An exception with an empty message is not a size error."""
        err = Exception("")
        assert not _is_image_size_error(err)
|
||||||
|
|||||||
@@ -1873,10 +1873,10 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||||||
),
|
),
|
||||||
}, ensure_ascii=False)
|
}, ensure_ascii=False)
|
||||||
|
|
||||||
# Read and convert to base64
|
# Convert screenshot to base64 at full resolution.
|
||||||
image_data = screenshot_path.read_bytes()
|
_screenshot_bytes = screenshot_path.read_bytes()
|
||||||
image_base64 = base64.b64encode(image_data).decode("ascii")
|
_screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
|
||||||
data_url = f"data:image/png;base64,{image_base64}"
|
data_url = f"data:image/png;base64,{_screenshot_b64}"
|
||||||
|
|
||||||
vision_prompt = (
|
vision_prompt = (
|
||||||
f"You are analyzing a screenshot of a web browser.\n\n"
|
f"You are analyzing a screenshot of a web browser.\n\n"
|
||||||
@@ -1890,7 +1890,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||||||
# Use the centralized LLM router
|
# Use the centralized LLM router
|
||||||
vision_model = _get_vision_model()
|
vision_model = _get_vision_model()
|
||||||
logger.debug("browser_vision: analysing screenshot (%d bytes)",
|
logger.debug("browser_vision: analysing screenshot (%d bytes)",
|
||||||
len(image_data))
|
len(_screenshot_bytes))
|
||||||
|
|
||||||
# Read vision timeout from config (auxiliary.vision.timeout), default 120s.
|
# Read vision timeout from config (auxiliary.vision.timeout), default 120s.
|
||||||
# Local vision models (llama.cpp, ollama) can take well over 30s for
|
# Local vision models (llama.cpp, ollama) can take well over 30s for
|
||||||
@@ -1922,7 +1922,27 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||||||
}
|
}
|
||||||
if vision_model:
|
if vision_model:
|
||||||
call_kwargs["model"] = vision_model
|
call_kwargs["model"] = vision_model
|
||||||
|
# Try full-size screenshot; on size-related rejection, downscale and retry.
|
||||||
|
try:
|
||||||
response = call_llm(**call_kwargs)
|
response = call_llm(**call_kwargs)
|
||||||
|
except Exception as _api_err:
|
||||||
|
from tools.vision_tools import (
|
||||||
|
_is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
|
||||||
|
)
|
||||||
|
if (_is_image_size_error(_api_err)
|
||||||
|
and len(data_url) > _RESIZE_TARGET_BYTES):
|
||||||
|
logger.info(
|
||||||
|
"Vision API rejected screenshot (%.1f MB); "
|
||||||
|
"auto-resizing to ~%.0f MB and retrying...",
|
||||||
|
len(data_url) / (1024 * 1024),
|
||||||
|
_RESIZE_TARGET_BYTES / (1024 * 1024),
|
||||||
|
)
|
||||||
|
data_url = _resize_image_for_vision(
|
||||||
|
screenshot_path, mime_type="image/png")
|
||||||
|
call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
|
||||||
|
response = call_llm(**call_kwargs)
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
analysis = (response.choices[0].message.content or "").strip()
|
analysis = (response.choices[0].message.content or "").strip()
|
||||||
# Redact secrets the vision LLM may have read from the screenshot.
|
# Redact secrets the vision LLM may have read from the screenshot.
|
||||||
|
|||||||
@@ -277,6 +277,120 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
|
|||||||
return data_url
|
return data_url
|
||||||
|
|
||||||
|
|
||||||
|
# Hard limit for vision API payloads (20 MB) — matches the most restrictive
|
||||||
|
# major provider (Gemini inline data limit). Images above this are rejected.
|
||||||
|
_MAX_BASE64_BYTES = 20 * 1024 * 1024
|
||||||
|
|
||||||
|
# Target size when auto-resizing on API failure (5 MB). After a provider
|
||||||
|
# rejects an image, we downscale to this target and retry once.
|
||||||
|
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
|
||||||
|
|
||||||
|
|
||||||
|
def _is_image_size_error(error: Exception) -> bool:
|
||||||
|
"""Detect if an API error is related to image or payload size."""
|
||||||
|
err_str = str(error).lower()
|
||||||
|
return any(hint in err_str for hint in (
|
||||||
|
"too large", "payload", "413", "content_too_large",
|
||||||
|
"request_too_large", "image_url", "invalid_request",
|
||||||
|
"exceeds", "size limit",
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
|
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
                             max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
    """Convert an image to a base64 data URL, auto-resizing if too large.

    Files whose estimated encoding fits are encoded as-is.  Otherwise Pillow
    (a soft dependency) re-encodes the image, halving its dimensions up to
    four times and, for JPEG output, stepping the quality down at each size.
    If Pillow is not installed or cannot decode the file, the original bytes
    are encoded unchanged and the caller handles the size check.

    Args:
        image_path: Path of the image file on disk.
        mime_type: MIME type of the file; when omitted it is determined with
            ``_determine_mime_type``.
        max_base64_bytes: Target upper bound on the length of the returned
            data URL, header included.

    Returns:
        The base64 data URL string.  It may still be longer than
        ``max_base64_bytes`` if no attempt fits.
    """
    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
    # Skip the expensive full-read + encode if Pillow can resize directly.
    file_size = image_path.stat().st_size
    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
    data_url = None  # stays None when the full encode is deferred
    if estimated_b64 <= max_base64_bytes:
        # Small enough — just encode directly.
        data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        if len(data_url) <= max_base64_bytes:
            return data_url

    # Attempt auto-resize with Pillow (soft dependency)
    try:
        from PIL import Image
        import io as _io
    except ImportError:
        logger.info("Pillow not installed — cannot auto-resize oversized image")
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # caller will raise the size error

    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
                max_base64_bytes / (1024 * 1024))

    mime = mime_type or _determine_mime_type(image_path)
    # Choose output format: JPEG for photos (smaller), PNG for transparency
    pil_format = "PNG" if mime == "image/png" else "JPEG"
    out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"
    # Modes Pillow's JPEG encoder can write directly; anything else (RGBA, P,
    # LA, PA, I, F, ...) must be converted or save() raises.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    try:
        # copy() forces a full decode inside the context manager, so the file
        # handle is released promptly and truncated/corrupt files fail here
        # (falling back to the raw bytes) rather than later during save().
        with Image.open(image_path) as src:
            img = src.copy()
        if pil_format == "JPEG" and img.mode not in jpeg_modes:
            img = img.convert("RGB")
    except Exception as exc:
        logger.info("Pillow cannot open image for resizing: %s", exc)
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # fall through to size-check in caller

    # Strategy: halve dimensions until base64 fits, up to 4 rounds.
    # For JPEG, also try reducing quality at each size step.
    # For PNG, quality is irrelevant — only dimension reduction helps.
    quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
    prev_dims = (img.width, img.height)
    candidate = None  # will be set on first loop iteration

    for attempt in range(5):
        if attempt > 0:
            # Halve each side with a 64 px floor, but never exceed the current
            # size: a side already below 64 px is kept, not upscaled.
            new_w = max(img.width // 2, min(img.width, 64))
            new_h = max(img.height // 2, min(img.height, 64))
            # Stop if dimensions can't shrink further
            if (new_w, new_h) == prev_dims:
                break
            img = img.resize((new_w, new_h), Image.LANCZOS)
            prev_dims = (new_w, new_h)
            logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)

        for q in quality_steps:
            buf = _io.BytesIO()
            save_kwargs = {"format": pil_format}
            if q is not None:
                save_kwargs["quality"] = q
            img.save(buf, **save_kwargs)
            encoded = base64.b64encode(buf.getvalue()).decode("ascii")
            candidate = f"data:{out_mime};base64,{encoded}"
            if len(candidate) <= max_base64_bytes:
                logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
                            len(candidate) / (1024 * 1024), q,
                            img.width, img.height)
                return candidate

    # If we still can't get it small enough, return the best attempt
    # and let the caller decide
    if candidate is not None:
        logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
                       max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
        return candidate

    # Shouldn't reach here, but fall back to full encode
    return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
|
||||||
|
|
||||||
|
|
||||||
async def vision_analyze_tool(
|
async def vision_analyze_tool(
|
||||||
image_url: str,
|
image_url: str,
|
||||||
user_prompt: str,
|
user_prompt: str,
|
||||||
@@ -376,23 +490,26 @@ async def vision_analyze_tool(
|
|||||||
if not detected_mime_type:
|
if not detected_mime_type:
|
||||||
raise ValueError("Only real image files are supported for vision analysis.")
|
raise ValueError("Only real image files are supported for vision analysis.")
|
||||||
|
|
||||||
# Convert image to base64 data URL
|
# Convert image to base64 — send at full resolution first.
|
||||||
|
# If the provider rejects it as too large, we auto-resize and retry.
|
||||||
logger.info("Converting image to base64...")
|
logger.info("Converting image to base64...")
|
||||||
image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
|
image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
|
||||||
# Calculate size in KB for better readability
|
|
||||||
data_size_kb = len(image_data_url) / 1024
|
data_size_kb = len(image_data_url) / 1024
|
||||||
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
|
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
|
||||||
|
|
||||||
# Pre-flight size check: most vision APIs cap base64 payloads at 5 MB.
|
# Hard limit (20 MB) — no provider accepts payloads this large.
|
||||||
# Reject early with a clear message instead of a cryptic provider 400.
|
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||||
_MAX_BASE64_BYTES = 5 * 1024 * 1024 # 5 MB
|
# Try to resize down to 5 MB before giving up.
|
||||||
# The data URL includes the header (e.g. "data:image/jpeg;base64,") which
|
image_data_url = _resize_image_for_vision(
|
||||||
# is negligible, but measure the full string to be safe.
|
temp_image_path, mime_type=detected_mime_type)
|
||||||
if len(image_data_url) > _MAX_BASE64_BYTES:
|
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Image too large for vision API: base64 payload is "
|
f"Image too large for vision API: base64 payload is "
|
||||||
f"{len(image_data_url) / (1024 * 1024):.1f} MB (limit 5 MB). "
|
f"{len(image_data_url) / (1024 * 1024):.1f} MB "
|
||||||
f"Resize or compress the image and try again."
|
f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
|
||||||
|
f"even after resizing. "
|
||||||
|
f"Install Pillow (`pip install Pillow`) for better auto-resize, "
|
||||||
|
f"or compress the image manually."
|
||||||
)
|
)
|
||||||
|
|
||||||
debug_call_data["image_size_bytes"] = image_size_bytes
|
debug_call_data["image_size_bytes"] = image_size_bytes
|
||||||
@@ -442,7 +559,24 @@ async def vision_analyze_tool(
|
|||||||
}
|
}
|
||||||
if model:
|
if model:
|
||||||
call_kwargs["model"] = model
|
call_kwargs["model"] = model
|
||||||
|
# Try full-size image first; on size-related rejection, downscale and retry.
|
||||||
|
try:
|
||||||
response = await async_call_llm(**call_kwargs)
|
response = await async_call_llm(**call_kwargs)
|
||||||
|
except Exception as _api_err:
|
||||||
|
if (_is_image_size_error(_api_err)
|
||||||
|
and len(image_data_url) > _RESIZE_TARGET_BYTES):
|
||||||
|
logger.info(
|
||||||
|
"API rejected image (%.1f MB, likely too large); "
|
||||||
|
"auto-resizing to ~%.0f MB and retrying...",
|
||||||
|
len(image_data_url) / (1024 * 1024),
|
||||||
|
_RESIZE_TARGET_BYTES / (1024 * 1024),
|
||||||
|
)
|
||||||
|
image_data_url = _resize_image_for_vision(
|
||||||
|
temp_image_path, mime_type=detected_mime_type)
|
||||||
|
messages[0]["content"][1]["image_url"]["url"] = image_data_url
|
||||||
|
response = await async_call_llm(**call_kwargs)
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
# Extract the analysis — fall back to reasoning if content is empty
|
# Extract the analysis — fall back to reasoning if content is empty
|
||||||
analysis = extract_content_or_reasoning(response)
|
analysis = extract_content_or_reasoning(response)
|
||||||
@@ -498,8 +632,8 @@ async def vision_analyze_tool(
|
|||||||
elif "invalid_request" in err_str or "image_url" in err_str:
|
elif "invalid_request" in err_str or "image_url" in err_str:
|
||||||
analysis = (
|
analysis = (
|
||||||
"The vision API rejected the image. This can happen when the "
|
"The vision API rejected the image. This can happen when the "
|
||||||
"image is too large, in an unsupported format, or corrupted. "
|
"image is in an unsupported format, corrupted, or still too "
|
||||||
"Try a smaller JPEG/PNG (under 3.5 MB) and retry. "
|
"large after auto-resize. Try a smaller JPEG/PNG and retry. "
|
||||||
f"Error: {e}"
|
f"Error: {e}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user