mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-01 00:11:39 +08:00
Adds tools.schema_sanitizer.strip_nullable_unions as the single
implementation for collapsing anyOf/oneOf nullable unions. Both the
MCP input-schema normalizer and the Anthropic tool-schema guard now
delegate to it instead of re-implementing the same walk three times.
The global sanitizer also gains a final pass so any tool that slips
past the two earlier hooks (plugin tools, non-MCP custom tools with
Pydantic-shaped schemas) still gets safe input_schemas on Anthropic.
- tools/schema_sanitizer.py:
* New public strip_nullable_unions(schema, keep_nullable_hint=True).
* _sanitize_single_tool() calls it as a final pass (hint preserved
so coerce_tool_args can still map string "null" to None).
- tools/mcp_tool.py: _normalize_mcp_input_schema delegates.
- agent/anthropic_adapter.py: _normalize_tool_input_schema delegates
with keep_nullable_hint=False (Anthropic does not recognize nullable).
No behavioral change for the fix itself; tests (73/73 targeted +
E2E across MCP→sanitizer→Anthropic paths) pass.
258 lines
11 KiB
Python
258 lines
11 KiB
Python
"""Sanitize tool JSON schemas for broad LLM-backend compatibility.
|
|
|
|
Some local inference backends (notably llama.cpp's ``json-schema-to-grammar``
|
|
converter used to build GBNF tool-call parsers) are strict about what JSON
|
|
Schema shapes they accept. Schemas that OpenAI / Anthropic / most cloud
|
|
providers silently accept can make llama.cpp fail the entire request with:
|
|
|
|
HTTP 400: Unable to generate parser for this template.
|
|
Automatic parser generation failed: JSON schema conversion failed:
|
|
Unrecognized schema: "object"
|
|
|
|
The failure modes we've seen in the wild:
|
|
|
|
* ``{"type": "object"}`` with no ``properties`` — rejected as a node the
|
|
grammar generator can't constrain.
|
|
* A schema value that is the bare string ``"object"`` instead of a dict
|
|
(malformed MCP server output, e.g. ``additionalProperties: "object"``).
|
|
* ``"type": ["string", "null"]`` array types — many converters only accept
|
|
single-string ``type``.
|
|
* ``anyOf`` / ``oneOf`` unions whose only purpose is to permit ``null`` for
|
|
optional fields (common Pydantic/MCP shape). Anthropic rejects these at
|
|
the top of ``input_schema``; collapse them to the non-null branch.
|
|
* Unconstrained ``additionalProperties`` on objects with empty properties.
|
|
|
|
This module walks the final tool schema tree (after MCP-level normalization
|
|
and any per-tool dynamic rebuilds) and fixes the known-hostile constructs
|
|
in-place on a deep copy. It is intentionally conservative: it only modifies
|
|
shapes the LLM backend couldn't use anyway.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import copy
|
|
import logging
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def sanitize_tool_schemas(tools: list[dict]) -> list[dict]:
    """Build a sanitized copy of an OpenAI-format tool list.

    Each entry is expected to look like
    ``{"type": "function", "function": {"name": ..., "parameters": {...}}}``.
    Every entry is run through ``_sanitize_single_tool``, which deep-copies
    it before touching the parameter schema, so mutating the result never
    leaks back into the entries that were passed in.

    A falsy ``tools`` value (``None`` or an empty sequence) has nothing to
    sanitize and is handed back as-is — the very same object, not a copy.
    """
    if not tools:
        return tools
    return [_sanitize_single_tool(entry) for entry in tools]
|
|
|
|
|
|
def _sanitize_single_tool(tool: dict) -> dict:
|
|
"""Deep-copy and sanitize a single OpenAI-format tool entry."""
|
|
out = copy.deepcopy(tool)
|
|
fn = out.get("function") if isinstance(out, dict) else None
|
|
if not isinstance(fn, dict):
|
|
return out
|
|
|
|
params = fn.get("parameters")
|
|
# Missing / non-dict parameters → substitute the minimal valid shape.
|
|
if not isinstance(params, dict):
|
|
fn["parameters"] = {"type": "object", "properties": {}}
|
|
return out
|
|
|
|
fn["parameters"] = _sanitize_node(params, path=fn.get("name", "<tool>"))
|
|
# After recursion, guarantee the top-level is an object with properties.
|
|
top = fn["parameters"]
|
|
if not isinstance(top, dict):
|
|
fn["parameters"] = {"type": "object", "properties": {}}
|
|
else:
|
|
if top.get("type") != "object":
|
|
top["type"] = "object"
|
|
if "properties" not in top or not isinstance(top.get("properties"), dict):
|
|
top["properties"] = {}
|
|
# Final pass: collapse nullable anyOf/oneOf unions that the recursive
|
|
# sanitizer above leaves intact (it only handles the array-form
|
|
# ``type: [X, "null"]``). Keep the ``nullable: true`` hint so runtime
|
|
# argument coercion (``model_tools._schema_allows_null``) can still
|
|
# map a model-emitted ``"null"`` string to Python ``None``.
|
|
fn["parameters"] = strip_nullable_unions(fn["parameters"], keep_nullable_hint=True)
|
|
return out
|
|
|
|
|
|
def strip_nullable_unions(
    schema: Any,
    *,
    keep_nullable_hint: bool = True,
) -> Any:
    """Replace "X or null" ``anyOf`` / ``oneOf`` unions with plain ``X``.

    Optional fields produced by MCP / Pydantic typically look like::

        {"anyOf": [{"type": "string"}, {"type": "null"}], "default": null}

    Whenever a union contains at least one ``{"type": "null"}`` branch and
    exactly one other branch, the whole union node is replaced by that other
    branch. ``title``, ``description``, ``default`` and ``examples`` found on
    the union node are copied onto the replacement unless the replacement
    already defines them. Unions with several non-null branches, or with no
    null branch at all, are left as they are.

    The walk is recursive over every dict value and list element, and the
    input is not mutated — new containers are built along the way.

    Args:
        schema: JSON-Schema fragment (dict, list, or scalar).
        keep_nullable_hint: When True, the replacement gets
            ``nullable: true`` (unless it already has a ``nullable`` key) so
            downstream consumers can still tell the field may be ``None``.
            Pass False for consumers that do not want the keyword.

    Returns:
        The rewritten fragment; scalars are returned unchanged.
    """
    if isinstance(schema, list):
        rewritten_items = []
        for element in schema:
            rewritten_items.append(
                strip_nullable_unions(element, keep_nullable_hint=keep_nullable_hint)
            )
        return rewritten_items

    if not isinstance(schema, dict):
        return schema

    # Children first, so nested unions are already collapsed when this node
    # is inspected.
    walked = {}
    for name, child in schema.items():
        walked[name] = strip_nullable_unions(child, keep_nullable_hint=keep_nullable_hint)

    for union_key in ("anyOf", "oneOf"):
        branches = walked.get(union_key)
        if not isinstance(branches, list):
            continue

        survivors = [
            branch for branch in branches
            if not isinstance(branch, dict) or branch.get("type") != "null"
        ]
        # A collapse needs a dropped null branch AND a single survivor;
        # anything else is a meaningful union and stays untouched.
        if len(survivors) != 1 or len(survivors) == len(branches):
            continue

        only_branch = survivors[0]
        merged = dict(only_branch) if isinstance(only_branch, dict) else {}
        if keep_nullable_hint and "nullable" not in merged:
            merged["nullable"] = True
        for meta_key in ("title", "description", "default", "examples"):
            if meta_key in walked and meta_key not in merged:
                merged[meta_key] = walked[meta_key]
        return strip_nullable_unions(merged, keep_nullable_hint=keep_nullable_hint)

    return walked
|
|
|
|
|
|
def _sanitize_node(node: Any, path: str) -> Any:
|
|
"""Recursively sanitize a JSON-Schema fragment.
|
|
|
|
- Replaces bare-string schema values ("object", "string", ...) with
|
|
``{"type": <value>}`` so downstream consumers see a dict.
|
|
- Injects ``properties: {}`` into object-typed nodes missing it.
|
|
- Normalizes ``type: [X, "null"]`` arrays to single ``type: X`` (keeping
|
|
``nullable: true`` as a hint).
|
|
- Recurses into ``properties``, ``items``, ``additionalProperties``,
|
|
``anyOf``, ``oneOf``, ``allOf``, and ``$defs`` / ``definitions``.
|
|
"""
|
|
# Malformed: the schema position holds a bare string like "object".
|
|
if isinstance(node, str):
|
|
if node in {"object", "string", "number", "integer", "boolean", "array", "null"}:
|
|
logger.debug(
|
|
"schema_sanitizer[%s]: replacing bare-string schema %r "
|
|
"with {'type': %r}",
|
|
path, node, node,
|
|
)
|
|
return {"type": node} if node != "object" else {
|
|
"type": "object",
|
|
"properties": {},
|
|
}
|
|
# Any other stray string is not a schema — drop it by replacing with
|
|
# a permissive object schema rather than propagate something the
|
|
# backend will reject.
|
|
logger.debug(
|
|
"schema_sanitizer[%s]: replacing non-schema string %r "
|
|
"with empty object schema", path, node,
|
|
)
|
|
return {"type": "object", "properties": {}}
|
|
|
|
if isinstance(node, list):
|
|
return [_sanitize_node(item, f"{path}[{i}]") for i, item in enumerate(node)]
|
|
|
|
if not isinstance(node, dict):
|
|
return node
|
|
|
|
out: dict = {}
|
|
for key, value in node.items():
|
|
# type: [X, "null"] → type: X (the backend's tool-call parser only
|
|
# accepts singular string types; nullable is lost but the call still
|
|
# succeeds, and the model can still pass null on its own.)
|
|
if key == "type" and isinstance(value, list):
|
|
non_null = [t for t in value if t != "null"]
|
|
if len(non_null) == 1 and isinstance(non_null[0], str):
|
|
out["type"] = non_null[0]
|
|
if "null" in value:
|
|
out.setdefault("nullable", True)
|
|
continue
|
|
# Fallback: pick the first string type, drop the rest.
|
|
first_str = next((t for t in value if isinstance(t, str) and t != "null"), None)
|
|
if first_str:
|
|
out["type"] = first_str
|
|
continue
|
|
# All-null or empty list → treat as object.
|
|
out["type"] = "object"
|
|
continue
|
|
|
|
if key in {"properties", "$defs", "definitions"} and isinstance(value, dict):
|
|
out[key] = {
|
|
sub_k: _sanitize_node(sub_v, f"{path}.{key}.{sub_k}")
|
|
for sub_k, sub_v in value.items()
|
|
}
|
|
elif key in {"items", "additionalProperties"}:
|
|
if isinstance(value, bool):
|
|
# Keep bool ``additionalProperties`` as-is — it's a valid form
|
|
# and widely accepted. ``items: true/false`` is non-standard
|
|
# but we preserve rather than drop.
|
|
out[key] = value
|
|
else:
|
|
out[key] = _sanitize_node(value, f"{path}.{key}")
|
|
elif key in {"anyOf", "oneOf", "allOf"} and isinstance(value, list):
|
|
out[key] = [
|
|
_sanitize_node(item, f"{path}.{key}[{i}]")
|
|
for i, item in enumerate(value)
|
|
]
|
|
elif key in {"required", "enum", "examples"}:
|
|
# Schema "sibling" keywords whose values are NOT schemas:
|
|
# - ``required``: list of property-name strings
|
|
# - ``enum``: list of literal values (any JSON type)
|
|
# - ``examples``: list of example values (any JSON type)
|
|
# Recursing into these with _sanitize_node() would mis-interpret
|
|
# literal strings like "path" as bare-string schemas and replace
|
|
# them with {"type": "object"} dicts. Pass through unchanged.
|
|
out[key] = copy.deepcopy(value) if isinstance(value, (list, dict)) else value
|
|
else:
|
|
out[key] = _sanitize_node(value, f"{path}.{key}") if isinstance(value, (dict, list)) else value
|
|
|
|
# Object nodes without properties: inject empty properties dict.
|
|
# llama.cpp's grammar generator can't constrain a free-form object.
|
|
if out.get("type") == "object" and not isinstance(out.get("properties"), dict):
|
|
out["properties"] = {}
|
|
|
|
# Prune ``required`` entries that don't exist in properties (defense
|
|
# against malformed MCP schemas; also caught upstream for MCP tools, but
|
|
# built-in tools or plugin tools may not have been through that path).
|
|
if out.get("type") == "object" and isinstance(out.get("required"), list):
|
|
props = out.get("properties") or {}
|
|
valid = [r for r in out["required"] if isinstance(r, str) and r in props]
|
|
if not valid:
|
|
out.pop("required", None)
|
|
elif len(valid) != len(out["required"]):
|
|
out["required"] = valid
|
|
|
|
return out
|