test(image_generate): replace unit tests with thin e2e

Drop the granular unit/invariant tests in favor of one thin e2e that drives the real image_generate handler end-to-end (real catalog, real payload build, real local-file→data-URI encoding; only the FAL HTTP submit is stubbed): one image-edit happy path + one text-to-image regression.
feat(image_generate): support image input for image-to-image / editing
2026-06-17 07:31:21 +08:00 · 2026-06-15 16:33:41 +05:30 · 2026-06-15 16:31:26 +05:30
3 changed files with 477 additions and 23 deletions
--- a/tests/tools/test_image_generation.py
+++ b/tests/tools/test_image_generation.py
@@ -363,11 +363,12 @@ class TestAspectRatioNormalization:

 class TestRegistryIntegration:

-    def test_schema_exposes_only_prompt_and_aspect_ratio_to_agent(self, image_tool):
+    def test_schema_exposes_only_prompt_aspect_ratio_image_urls(self, image_tool):
        """The agent-facing schema must stay tight — model selection is a
-        user-level config choice, not an agent-level arg."""
+        user-level config choice, not an agent-level arg. (image_urls added
+        for image-to-image editing.)"""
        props = image_tool.IMAGE_GENERATE_SCHEMA["parameters"]["properties"]
-        assert set(props.keys()) == {"prompt", "aspect_ratio"}
+        assert set(props.keys()) == {"prompt", "aspect_ratio", "image_urls"}

    def test_aspect_ratio_enum_is_three_values(self, image_tool):
        enum = image_tool.IMAGE_GENERATE_SCHEMA["parameters"]["properties"]["aspect_ratio"]["enum"]
--- a/tests/tools/test_image_generation_image_input.py
+++ b/tests/tools/test_image_generation_image_input.py
@@ -0,0 +1,90 @@
+"""Thin end-to-end test for image_generate's image-to-image / edit path.
+
+Drives the real registered ``image_generate`` handler through the real module
+— real catalog, real payload construction, real local-file → data-URI
+encoding — and only stubs the outbound FAL HTTP submit (so it needs no FAL key
+and spends no credits). One happy-path edit and one no-edit fallback; that's
+the whole feature surface.
+"""
+
+from __future__ import annotations
+
+import json
+import struct
+import zlib
+
+import pytest
+
+
+@pytest.fixture
+def image_tool():
+    """Return a freshly reloaded ``tools.image_generation_tool`` module."""
+    import importlib
+    return importlib.reload(importlib.import_module("tools.image_generation_tool"))
+
+
+def _tiny_png(path) -> str:
+    """Write a 1x1 8-bit RGB PNG to ``path`` and return the path as ``str``."""
+    def _chunk(tag, body):
+        crc = zlib.crc32(tag + body) & 0xFFFFFFFF  # CRC covers type + data
+        return b"".join((struct.pack(">I", len(body)), tag, body, struct.pack(">I", crc)))
+    header = struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0)  # 1x1, 8-bit, colour type 2
+    pixels = zlib.compress(b"\x00\xff\x00\x00")  # filter byte 0 + one RGB pixel
+    parts = [b"\x89PNG\r\n\x1a\n", _chunk(b"IHDR", header),
+             _chunk(b"IDAT", pixels), _chunk(b"IEND", b"")]
+    path.write_bytes(b"".join(parts))
+    return str(path)
+
+
+def _stub_fal(image_tool, monkeypatch, captured):
+    """Patch ``image_tool`` so that only the outbound FAL submit is faked."""
+    # Edit-capable generate model, pinned so local config cannot change it.
+    pinned_id = "fal-ai/nano-banana-pro"
+
+    class _FakeHandle:
+        def get(self):
+            return {"images": [{"url": "https://out/result.png", "width": 1, "height": 1}]}
+
+    def _fake_submit(model, arguments=None, **kw):
+        captured.update(model=model, arguments=arguments)
+        return _FakeHandle()
+
+    def _pinned_model():
+        return pinned_id, image_tool.FAL_MODELS[pinned_id]
+
+    # Exactly four module attributes are replaced; the rest runs for real.
+    monkeypatch.setattr(image_tool, "fal_key_is_configured", lambda: True)
+    monkeypatch.setattr(image_tool, "_resolve_managed_fal_gateway", lambda: None)
+    monkeypatch.setattr(image_tool, "_submit_fal_request", _fake_submit)
+    monkeypatch.setattr(image_tool, "_resolve_fal_model", _pinned_model)
+
+
+def test_image_edit_e2e(image_tool, monkeypatch, tmp_path):
+    """Edit happy path: prompt + local PNG → edit endpoint, inline data URI."""
+    seen = {}
+    _stub_fal(image_tool, monkeypatch, seen)
+    local_ref = _tiny_png(tmp_path / "ref.png")
+    request = {"prompt": "make it night", "image_urls": [local_ref]}
+
+    result = json.loads(image_tool._handle_image_generate(request))
+
+    # The stubbed result URL comes back as a successful tool result.
+    assert result["success"] is True
+    assert result["image"] == "https://out/result.png"
+    # Submitted to the sibling edit endpoint with the local file inlined.
+    assert seen["model"] == "fal-ai/nano-banana-pro/edit"
+    sent = seen["arguments"]
+    assert sent["image_urls"][0].startswith("data:image/")
+    assert sent["prompt"] == "make it night"
+
+
+def test_text_to_image_still_works(image_tool, monkeypatch):
+    """Regression: a prompt-only call stays on the generate endpoint."""
+    seen = {}
+    _stub_fal(image_tool, monkeypatch, seen)
+
+    raw = image_tool._handle_image_generate({"prompt": "a cat"})
+
+    assert json.loads(raw)["success"] is True
+    assert seen["model"] == "fal-ai/nano-banana-pro"  # not the /edit sibling
+    assert "image_urls" not in seen["arguments"]
--- a/tools/image_generation_tool.py
+++ b/tools/image_generation_tool.py
@@ -26,7 +26,7 @@ import os
 import datetime
 import threading
 import uuid
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional

 # fal_client is imported lazily — see _load_fal_client(). Pulling it
 # eagerly added ~64 ms to every CLI cold start because
@@ -116,6 +116,24 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "output_format", "enable_safety_checker",
        },
        "upscale": False,
+        # Image-to-image / edit: when the caller supplies input images we
+        # switch to the model's sibling edit endpoint. Each edit block declares
+        # the endpoint's OWN accepted-param whitelist (edit and generate
+        # endpoints diverge — e.g. flux-2-pro/edit rejects num_inference_steps),
+        # the native key the reference images go under, and whether the endpoint
+        # takes a list (``multi``) or a single image. Edits never inject an
+        # explicit image_size — FAL infers output size from the input.
+        "edit": {
+            "model": "fal-ai/flux-2/klein/9b/edit",
+            "image_key": "image_urls",
+            "multi": True,
+            "max_images": 4,
+            "supports": {
+                "prompt", "image_urls", "image_size", "num_inference_steps",
+                "num_images", "seed", "output_format", "enable_safety_checker",
+                "sync_mode",
+            },
+        },
    },
    "fal-ai/flux-2-pro": {
        "display": "FLUX 2 Pro",
@@ -143,6 +161,19 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "safety_tolerance", "sync_mode", "seed",
        },
        "upscale": True,   # Backward-compat: current default behavior.
+        "edit": {
+            "model": "fal-ai/flux-2-pro/edit",
+            "image_key": "image_urls",
+            "multi": True,
+            "max_images": 9,   # FLUX 2 Pro edit accepts up to 9 reference images.
+            # NOTE: flux-2-pro/edit rejects num_inference_steps / guidance_scale
+            # / num_images that the generate endpoint accepts.
+            "supports": {
+                "prompt", "image_urls", "image_size", "seed",
+                "safety_tolerance", "enable_safety_checker", "output_format",
+                "sync_mode",
+            },
+        },
    },
    "fal-ai/z-image/turbo": {
        "display": "Z-Image Turbo",
@@ -194,6 +225,20 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "enable_web_search", "limit_generations",
        },
        "upscale": False,
+        "edit": {
+            "model": "fal-ai/nano-banana-pro/edit",
+            "image_key": "image_urls",
+            "multi": True,
+            # Gemini 3 Pro Image edit accepts multiple reference images; cap
+            # conservatively to keep payloads (and cost) bounded.
+            "max_images": 6,
+            "supports": {
+                "prompt", "image_urls", "aspect_ratio", "resolution",
+                "num_images", "output_format", "safety_tolerance", "seed",
+                "sync_mode", "enable_web_search", "limit_generations",
+                "system_prompt",
+            },
+        },
    },
    "fal-ai/gpt-image-1.5": {
        "display": "GPT Image 1.5",
@@ -218,6 +263,17 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "background", "sync_mode",
        },
        "upscale": False,
+        "edit": {
+            "model": "fal-ai/gpt-image-1.5/edit",
+            "image_key": "image_urls",
+            "multi": True,
+            "max_images": 4,
+            "supports": {
+                "prompt", "image_urls", "image_size", "quality", "num_images",
+                "output_format", "background", "input_fidelity",
+                "mask_image_url", "sync_mode",
+            },
+        },
    },
    "fal-ai/gpt-image-2": {
        "display": "GPT Image 2",
@@ -250,6 +306,18 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            # through the shared FAL billing path.
        },
        "upscale": False,
+        "edit": {
+            # GPT Image 2's edit endpoint lives under the ``openai/`` namespace
+            # (not ``fal-ai/``), unlike its text-to-image generate endpoint.
+            "model": "openai/gpt-image-2/edit",
+            "image_key": "image_urls",
+            "multi": True,
+            "max_images": 4,
+            "supports": {
+                "prompt", "image_urls", "image_size", "quality", "num_images",
+                "output_format", "mask_url", "sync_mode",
+            },
+        },
    },
    "fal-ai/ideogram/v3": {
        "display": "Ideogram V3",
@@ -317,6 +385,21 @@ FAL_MODELS: Dict[str, Dict[str, Any]] = {
            "num_images", "output_format", "acceleration", "seed", "sync_mode",
        },
        "upscale": False,
+        "edit": {
+            # Qwen's edit endpoint takes a SINGLE image under the singular key
+            # ``image_url`` (not a list) — unlike the flux/gpt/nano edit
+            # endpoints which take an ``image_urls`` list.
+            "model": "fal-ai/qwen-image-edit",
+            "image_key": "image_url",
+            "multi": False,
+            "max_images": 1,
+            "supports": {
+                "prompt", "image_url", "image_size", "num_inference_steps",
+                "guidance_scale", "num_images", "acceleration", "seed",
+                "output_format", "negative_prompt", "enable_safety_checker",
+                "sync_mode",
+            },
+        },
    },
    # Krea 2 — Krea's first foundation image model, day-0 partner launch on
    # fal (2026-05-27). Same model family as our direct ``plugins/image_gen/krea``
@@ -517,14 +600,20 @@ def _build_fal_payload(
    aspect_ratio: str = DEFAULT_ASPECT_RATIO,
    seed: Optional[int] = None,
    overrides: Optional[Dict[str, Any]] = None,
+    meta: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
    """Build a FAL request payload for `model_id` from unified inputs.

    Translates aspect_ratio into the model's native size spec (preset enum,
    aspect-ratio enum, or GPT literal string), merges model defaults, applies
    caller overrides, then filters to the model's ``supports`` whitelist.
+
+    ``meta`` may be passed explicitly for synthesized endpoints that are not in
+    the static ``FAL_MODELS`` catalog (e.g. the edit endpoint produced by
+    :func:`_resolve_edit_target`). When omitted it's looked up by ``model_id``.
    """
-    meta = FAL_MODELS[model_id]
+    if meta is None:
+        meta = FAL_MODELS[model_id]
    size_style = meta["size_style"]
    sizes = meta["sizes"]

@@ -539,6 +628,10 @@ def _build_fal_payload(
        payload["image_size"] = sizes[aspect]
    elif size_style == "aspect_ratio":
        payload["aspect_ratio"] = sizes[aspect]
+    elif size_style == "none":
+        # Edit endpoints infer output size from the input image — don't inject
+        # an explicit size/aspect_ratio.
+        pass
    else:
        raise ValueError(f"Unknown size_style: {size_style!r}")

@@ -554,6 +647,163 @@ def _build_fal_payload(
    return {k: v for k, v in payload.items() if k in supports}


+def _resolve_edit_target(
+    model_id: str, input_images: List[str],
+) -> Optional[tuple]:
+    """Pick the image-edit endpoint for ``model_id``, if one applies.
+
+    Returns ``None`` (stay on text-to-image) unless at least one input image
+    was supplied AND the catalog entry for ``model_id`` carries an ``edit``
+    dict naming a ``model``; otherwise ``(edit_model_id, edit_meta)``.
+
+    ``edit_meta`` is shaped like a FAL_MODELS entry but assembled only from
+    the edit block: its own ``supports`` whitelist (plus the image key) and
+    its own ``defaults``. Nothing is inherited from the generate endpoint,
+    whose accepted params may differ. ``size_style`` is pinned to ``"none"``
+    and ``upscale`` to ``False``; the ``_image_key`` / ``_multi`` /
+    ``_max_images`` extras record how reference images are to be attached.
+    """
+    # No images, unknown model, or no usable edit block all mean the same
+    # thing to the caller: keep the normal text-to-image path.
+    catalog_entry = FAL_MODELS.get(model_id) if input_images else None
+    edit_block = catalog_entry.get("edit") if catalog_entry else None
+    if not isinstance(edit_block, dict) or not edit_block.get("model"):
+        return None
+
+    ref_key = edit_block.get("image_key", "image_urls")
+    # The union keeps the image key inside the whitelist even when the
+    # catalog entry leaves it out of ``supports``.
+    allowed = set(edit_block.get("supports") or set()) | {ref_key}
+
+    synthesized: Dict[str, Any] = dict(
+        display=catalog_entry.get("display", model_id) + " (edit)",
+        # "none" tells the payload builder to add no size/aspect_ratio; the
+        # abstract aspect ratio is not mapped onto edit requests.
+        size_style="none",
+        sizes={},
+        defaults=dict(edit_block.get("defaults") or {}),
+        supports=allowed,
+        # Edit results are never handed to the upscaler.
+        upscale=False,
+        # Underscore-prefixed keys are routing hints for the caller, not
+        # payload fields.
+        _image_key=ref_key,
+        _multi=bool(edit_block.get("multi", True)),
+        _max_images=edit_block.get("max_images"),
+    )
+    return edit_block["model"], synthesized
+
+
+# ---------------------------------------------------------------------------
+# Image input (image-to-image / edit) resolution
+# ---------------------------------------------------------------------------
+#
+# FAL edit/image-to-image endpoints accept each reference image as one of:
+#   - a public http(s) URL (FAL fetches it server-side), or
+#   - a base64 ``data:`` URI (FAL decodes it inline).
+#
+# A gateway-delivered image (Telegram/Discord/WhatsApp photo) lands on disk as
+# a local cache file (see gateway adapters' ``cache_image_from_url``), so we
+# must encode local paths to a data URI before submission — FAL cannot read the
+# host filesystem. We reuse the vision tool's encoder, which already sniffs the
+# MIME type and applies a size-aware downscale so we never blow past provider
+# payload ceilings.
+#
+# Per-image byte cap mirrors the vision pipeline (20 MB). It is handed to the
+# encoder as its downscale target; an image that still cannot be encoded is
+# dropped with a warning rather than failing the whole generation.
+
+_MAX_INPUT_IMAGE_BYTES = 20 * 1024 * 1024
+
+
+def _resolve_input_image_ref(ref: str) -> Optional[str]:
+    """Turn one caller-supplied image reference into a FAL-submittable string.
+
+    Handling by shape:
+
+      * ``http://`` / ``https://`` / ``data:`` prefix (matched
+        case-insensitively) — returned unchanged apart from stripping.
+      * anything else — taken as a filesystem path (``~`` is expanded), then
+        encoded to a ``data:`` URI by the vision tool's
+        ``_resize_image_for_vision`` helper.
+
+    Returns ``None`` for a non-string or blank ref, for a path that is not an
+    existing file, and when encoding raises. The last two cases log a warning
+    so the caller can simply skip the ref.
+    """
+    cleaned = ref.strip() if isinstance(ref, str) else ""
+    if not cleaned:
+        return None
+
+    # URLs and inline data URIs need no local work.
+    if cleaned.lower().startswith(("http://", "https://", "data:")):
+        return cleaned
+
+    # Everything else is interpreted as a local path; expanding ``~`` lets
+    # home-relative paths resolve alongside absolute ones.
+    local_path = os.path.expanduser(cleaned)
+    if not os.path.isfile(local_path):
+        logger.warning(
+            "image input ref is not an http(s)/data URL and no file exists at "
+            "%r — skipping", cleaned,
+        )
+        return None
+
+    try:
+        # Imported lazily, inside the guard, so an import failure is handled
+        # like any other encoding failure.
+        from pathlib import Path as _Path
+        from tools.vision_tools import (
+            _EMBED_MAX_DIMENSION,
+            _resize_image_for_vision,
+        )
+
+        # Both caps are passed: the byte cap bounds the encoded payload and
+        # the dimension cap bounds pixel size (some edit endpoints reportedly
+        # reject by pixel count even when the byte size is small).
+        data_uri = _resize_image_for_vision(
+            _Path(local_path),
+            max_base64_bytes=_MAX_INPUT_IMAGE_BYTES,
+            max_dimension=_EMBED_MAX_DIMENSION,
+        )
+    except Exception as exc:  # noqa: BLE001 - one bad image must not kill the call
+        logger.warning("Could not encode local image input %r: %s", cleaned, exc)
+        return None
+    return data_uri
+
+
+def _resolve_input_images(
+    image_urls: Optional[Any], max_images: Optional[int] = None,
+) -> List[str]:
+    """Resolve the raw ``image_urls`` argument to a list of usable refs.
+
+    A bare string is treated as a one-element list; anything that is not a
+    string, list or tuple (including ``None``) yields ``[]``. Each entry goes
+    through :func:`_resolve_input_image_ref` and falsy results are dropped.
+    A positive ``max_images`` keeps only the first that many resolved refs.
+    """
+    if isinstance(image_urls, str):
+        candidates = [image_urls]
+    elif isinstance(image_urls, (list, tuple)):
+        candidates = image_urls
+    else:
+        return []
+
+    usable: List[str] = [
+        item for item in map(_resolve_input_image_ref, candidates) if item
+    ]
+    # The cap is applied after resolution, so refs that failed to resolve do
+    # not count against the limit.
+    cap_applies = max_images is not None and max_images > 0
+    if cap_applies and len(usable) > max_images:
+        logger.info(
+            "Got %d input images but the active model caps at %d — using "
+            "the first %d", len(usable), max_images, max_images,
+        )
+        usable = usable[:max_images]
+    return usable
+
+
 # ---------------------------------------------------------------------------
 # Upscaler
 # ---------------------------------------------------------------------------
@@ -729,13 +979,21 @@ def image_generate_tool(
    num_images: Optional[int] = None,
    output_format: Optional[str] = None,
    seed: Optional[int] = None,
+    image_urls: Optional[Any] = None,
 ) -> str:
    """Generate an image from a text prompt using the configured FAL model.

-    The agent-facing schema exposes only ``prompt`` and ``aspect_ratio``; the
-    remaining kwargs are overrides for direct Python callers and are filtered
-    per-model via the ``supports`` whitelist (unsupported overrides are
-    silently dropped so legacy callers don't break when switching models).
+    The agent-facing schema exposes ``prompt``, ``aspect_ratio``, and
+    ``image_urls``; the remaining kwargs are overrides for direct Python
+    callers and are filtered per-model via the ``supports`` whitelist
+    (unsupported overrides are silently dropped so legacy callers don't break
+    when switching models).
+
+    ``image_urls`` (a string or list of strings — http(s) URL, ``data:`` URI,
+    or local file path) turns this into an image-to-image / edit call when the
+    active model declares an ``edit`` endpoint. Models without an edit endpoint
+    ignore the images and fall back to text-to-image (logged), so the call
+    never hard-fails just because the user attached a reference.

    Returns a JSON string with ``{"success": bool, "image": url | None,
    "error": str, "error_type": str}``.
@@ -752,6 +1010,10 @@ def image_generate_tool(
            "num_images": num_images,
            "output_format": output_format,
            "seed": seed,
+            "image_urls_count": (
+                len(image_urls) if isinstance(image_urls, (list, tuple))
+                else (1 if isinstance(image_urls, str) and image_urls.strip() else 0)
+            ),
        },
        "error": None,
        "success": False,
@@ -786,16 +1048,59 @@ def image_generate_tool(
        if output_format is not None:
            overrides["output_format"] = output_format

+        # ---- Image-to-image / edit routing -------------------------------
+        # Resolve user-supplied reference images (url / data-uri / local path)
+        # to FAL-submittable strings. If the active model has an edit endpoint
+        # and we have at least one usable image, switch model + payload to it.
+        # Otherwise we keep text-to-image (warning if images were supplied but
+        # the model can't use them).
+        submit_model_id = model_id
+        submit_meta = meta
+        edit_target = None
+        edit_note = None
+        if image_urls is not None:
+            edit = meta.get("edit") if isinstance(meta, dict) else None
+            max_imgs = edit.get("max_images") if isinstance(edit, dict) else None
+            resolved_images = _resolve_input_images(image_urls, max_images=max_imgs)
+            if resolved_images:
+                edit_target = _resolve_edit_target(model_id, resolved_images)
+                if edit_target is not None:
+                    submit_model_id, submit_meta = edit_target
+                    image_key = submit_meta["_image_key"]
+                    if submit_meta.get("_multi", True):
+                        overrides[image_key] = resolved_images
+                    else:
+                        # Single-image endpoint (e.g. qwen): the payload key is
+                        # a bare string, not a list.
+                        overrides[image_key] = resolved_images[0]
+                else:
+                    edit_note = (
+                        f"The active model '{meta.get('display', model_id)}' "
+                        f"does not support image input/editing; generated from "
+                        f"the prompt alone. To edit an image, switch to an "
+                        f"edit-capable model via `hermes tools` → Image "
+                        f"Generation (e.g. Nano Banana Pro, FLUX 2 Pro, GPT "
+                        f"Image)."
+                    )
+                    logger.warning(
+                        "image_urls supplied but model '%s' has no edit "
+                        "endpoint — ignoring images and generating from text",
+                        model_id,
+                    )
+
        arguments = _build_fal_payload(
-            model_id, prompt, aspect_lc, seed=seed, overrides=overrides,
+            submit_model_id, prompt, aspect_lc, seed=seed, overrides=overrides,
+            meta=(submit_meta if edit_target is not None else None),
        )

        logger.info(
-            "Generating image with %s (%s) — prompt: %s",
-            meta.get("display", model_id), model_id, prompt[:80],
+            "%s image with %s (%s) — prompt: %s",
+            "Editing" if edit_target is not None else "Generating",
+            submit_meta.get("display", submit_model_id), submit_model_id,
+            prompt[:80],
        )

-        handler = _submit_fal_request(model_id, arguments=arguments)
+        handler = _submit_fal_request(submit_model_id, arguments=arguments)
        result = handler.get()

        generation_time = (datetime.datetime.now() - start_time).total_seconds()
@@ -807,7 +1112,7 @@ def image_generate_tool(
        if not images:
            raise ValueError("No images were generated")

-        should_upscale = bool(meta.get("upscale", False))
+        should_upscale = bool(submit_meta.get("upscale", False))

        formatted_images = []
        for img in images:
@@ -842,6 +1147,13 @@ def image_generate_tool(
            "success": True,
            "image": formatted_images[0]["url"] if formatted_images else None,
        }
+        if edit_note:
+            # Surface the "images were ignored" reason in the tool RESULT (not
+            # just a server log) so the agent can react (switch model / tell
+            # the user) instead of silently returning a from-scratch image.
+            response_data["note"] = edit_note
+        elif edit_target is not None:
+            response_data["edited"] = True

        debug_call_data["success"] = True
        debug_call_data["images_generated"] = len(formatted_images)
@@ -1002,10 +1314,11 @@ from tools.registry import registry, tool_error
 IMAGE_GENERATE_SCHEMA = {
    "name": "image_generate",
    "description": (
-        "Generate high-quality images from text prompts. The underlying "
-        "backend (FAL, OpenAI, etc.) and model are user-configured and not "
-        "selectable by the agent. Returns either a URL or an absolute file "
-        "path in the `image` field; display it with markdown "
+        "Generate high-quality images from text prompts, or edit/transform "
+        "existing images when `image_urls` is supplied (image-to-image). The "
+        "underlying backend (FAL, OpenAI, etc.) and model are user-configured "
+        "and not selectable by the agent. Returns either a URL or an absolute "
+        "file path in the `image` field; display it with markdown "
        "![description](url-or-path) and the gateway will deliver it. When "
        "the active terminal backend has a different filesystem, successful "
        "local-file results may also include `agent_visible_image` for "
@@ -1016,7 +1329,7 @@ IMAGE_GENERATE_SCHEMA = {
        "properties": {
            "prompt": {
                "type": "string",
-                "description": "The text prompt describing the desired image. Be detailed and descriptive.",
+                "description": "The text prompt describing the desired image. Be detailed and descriptive. When editing (image_urls set), describe the change you want.",
            },
            "aspect_ratio": {
                "type": "string",
@@ -1024,6 +1337,21 @@ IMAGE_GENERATE_SCHEMA = {
                "description": "The aspect ratio of the generated image. 'landscape' is 16:9 wide, 'portrait' is 16:9 tall, 'square' is 1:1.",
                "default": DEFAULT_ASPECT_RATIO,
            },
+            "image_urls": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": (
+                    "Optional list of input images to edit or use as "
+                    "references (image-to-image). Each item is an http(s) URL, "
+                    "a data: URI, or a local file path (e.g. an image a user "
+                    "sent in chat). When supplied, the call routes to the "
+                    "active model's edit endpoint if it has one; models "
+                    "without edit support ignore the images and generate from "
+                    "the prompt alone. Reference multiple images in the prompt "
+                    "by index (e.g. 'put the person from image 1 in the scene "
+                    "from image 2')."
+                ),
+            },
        },
        "required": ["prompt"],
    },
@@ -1153,18 +1481,53 @@ def _handle_image_generate(args, **kw):
    if not prompt:
        return tool_error("prompt is required for image generation")
    aspect_ratio = args.get("aspect_ratio", DEFAULT_ASPECT_RATIO)
+    image_urls = args.get("image_urls")
    task_id = kw.get("task_id")

    # Route to a plugin-registered provider if one is active (and it's
-    # not the in-tree FAL path).
-    dispatched = _dispatch_to_plugin_provider(prompt, aspect_ratio)
-    if dispatched is not None:
-        return _postprocess_image_generate_result(dispatched, task_id=task_id)
+    # not the in-tree FAL path). Plugin providers don't yet accept input
+    # images through this tool, so when the caller supplied image_urls we
+    # skip plugin dispatch and use the in-tree FAL edit path rather than
+    # silently dropping the images.
+    has_input_images = bool(
+        image_urls if isinstance(image_urls, (list, tuple)) else
+        (isinstance(image_urls, str) and image_urls.strip())
+    )
+    if not has_input_images:
+        dispatched = _dispatch_to_plugin_provider(prompt, aspect_ratio)
+        if dispatched is not None:
+            return _postprocess_image_generate_result(dispatched, task_id=task_id)
+
+    # If a non-FAL plugin provider is configured but the caller supplied input
+    # images, we fall back to the in-tree FAL edit path (plugin providers don't
+    # accept image input through this tool yet). Note the backend switch in the
+    # result so it isn't a silent surprise.
+    reroute_note = None
+    if has_input_images:
+        configured = _read_configured_image_provider()
+        if configured and configured.lower() != "fal":
+            reroute_note = (
+                f"Image input was supplied, so the call used the built-in FAL "
+                f"edit path instead of the configured image_gen.provider="
+                f"'{configured}' (that provider does not accept image input "
+                f"through this tool)."
+            )

    raw = image_generate_tool(
        prompt=prompt,
        aspect_ratio=aspect_ratio,
+        image_urls=image_urls,
    )
+    if reroute_note:
+        try:
+            payload = json.loads(raw)
+            if isinstance(payload, dict):
+                # Don't clobber an existing edit_note; append.
+                existing = payload.get("note")
+                payload["note"] = (existing + " " + reroute_note) if existing else reroute_note
+                raw = json.dumps(payload, ensure_ascii=False)
+        except Exception:
+            pass
    return _postprocess_image_generate_result(raw, task_id=task_id)