diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py
index f3f08039de..90a3a412e8 100644
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@@ -28,19 +28,37 @@ except ImportError:
 logger = logging.getLogger(__name__)
 THINKING_BUDGET = {"xhigh": 32000, "high": 16000, "medium": 8000, "low": 4000}
 
+# Hermes effort → Anthropic adaptive-thinking effort (output_config.effort).
+# Anthropic exposes 5 levels on 4.7+: low, medium, high, xhigh, max.
+# We preserve xhigh as xhigh (the recommended default for coding/agentic on
+# 4.7) and expose max as a distinct ceiling. "minimal" is a legacy alias that
+# maps to low. See:
+# https://platform.claude.com/docs/en/about-claude/models/migration-guide
 ADAPTIVE_EFFORT_MAP = {
-    "xhigh": "max",
-    "high": "high",
-    "medium": "medium",
-    "low": "low",
+    "max": "max",
+    "xhigh": "xhigh",
+    "high": "high",
+    "medium": "medium",
+    "low": "low",
     "minimal": "low",
 }
 
+# Models where extended thinking is deprecated/removed (4.6+ behavior: adaptive
+# is the only supported mode; 4.7 additionally forbids manual thinking entirely
+# and drops temperature/top_p/top_k).
+_ADAPTIVE_THINKING_SUBSTRINGS = ("4-6", "4.6", "4-7", "4.7")
+
+# Models where temperature/top_p/top_k return 400 if set to non-default values.
+# This is the Opus 4.7 contract; future 4.x+ models are expected to follow it.
+_NO_SAMPLING_PARAMS_SUBSTRINGS = ("4-7", "4.7")
+
 # ── Max output token limits per Anthropic model ───────────────────────
 # Source: Anthropic docs + Cline model catalog. Anthropic's API requires
 # max_tokens as a mandatory field. Previously we hardcoded 16384, which
 # starves thinking-enabled models (thinking tokens count toward the limit).
 _ANTHROPIC_OUTPUT_LIMITS = {
+    # Claude 4.7
+    "claude-opus-4-7": 128_000,
     # Claude 4.6
     "claude-opus-4-6": 128_000,
     "claude-sonnet-4-6": 64_000,
@@ -91,11 +109,26 @@ def _get_anthropic_max_output(model: str) -> int:
 
 
 def _supports_adaptive_thinking(model: str) -> bool:
-    """Return True for Claude 4.6 models that support adaptive thinking."""
-    return any(v in model for v in ("4-6", "4.6"))
+    """Return True for Claude 4.6+ models that support adaptive thinking."""
+    return any(v in model for v in _ADAPTIVE_THINKING_SUBSTRINGS)
 
 
-# Beta headers for enhanced features (sent with ALL auth types)
+def _forbids_sampling_params(model: str) -> bool:
+    """Return True for models that 400 on any non-default temperature/top_p/top_k.
+
+    Opus 4.7 explicitly rejects sampling parameters; later Claude releases are
+    expected to follow suit. Callers should omit these fields entirely rather
+    than passing zero/default values (the API rejects anything non-null).
+    """
+    return any(v in model for v in _NO_SAMPLING_PARAMS_SUBSTRINGS)
+
+
+# Beta headers for enhanced features (sent with ALL auth types).
+# As of Opus 4.7 (2026-04-16), both of these are GA on Claude 4.6+ — the
+# beta headers are still accepted (harmless no-op) but not required. Kept
+# here so older Claude (4.5, 4.1) + third-party Anthropic-compat endpoints
+# that still gate on the headers continue to get the enhanced features.
+# Migration guide: remove these if you no longer support ≤4.5 models.
 _COMMON_BETAS = [
     "interleaved-thinking-2025-05-14",
     "fine-grained-tool-streaming-2025-05-14",
@@ -1341,18 +1374,26 @@ def build_anthropic_kwargs(
         kwargs["tool_choice"] = {"type": "tool", "name": tool_choice}
 
     # Map reasoning_config to Anthropic's thinking parameter.
-    # Claude 4.6 models use adaptive thinking + output_config.effort.
+    # Claude 4.6+ models use adaptive thinking + output_config.effort.
     # Older models use manual thinking with budget_tokens.
    # MiniMax Anthropic-compat endpoints support thinking (manual mode only,
    # not adaptive). Haiku does NOT support extended thinking — skip entirely.
+    #
+    # On 4.7+ the `thinking.display` field defaults to "omitted", which
+    # silently hides reasoning text that Hermes surfaces in its CLI. We
+    # request "summarized" so the reasoning blocks stay populated — matching
+    # 4.6 behavior and preserving the activity-feed UX during long tool runs.
     if reasoning_config and isinstance(reasoning_config, dict):
         if reasoning_config.get("enabled") is not False and "haiku" not in model.lower():
             effort = str(reasoning_config.get("effort", "medium")).lower()
             budget = THINKING_BUDGET.get(effort, 8000)
             if _supports_adaptive_thinking(model):
-                kwargs["thinking"] = {"type": "adaptive"}
+                kwargs["thinking"] = {
+                    "type": "adaptive",
+                    "display": "summarized",
+                }
                 kwargs["output_config"] = {
-                    "effort": ADAPTIVE_EFFORT_MAP.get(effort, "medium")
+                    "effort": ADAPTIVE_EFFORT_MAP.get(effort, "medium"),
                 }
             else:
                 kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget}
@@ -1360,6 +1401,15 @@ def build_anthropic_kwargs(
                 kwargs["temperature"] = 1
                 kwargs["max_tokens"] = max(effective_max_tokens, budget + 4096)
 
+    # ── Strip sampling params on 4.7+ ─────────────────────────────────
+    # Opus 4.7 rejects any non-default temperature/top_p/top_k with a 400.
+    # Callers (auxiliary_client, flush_memories, etc.) may set these for
+    # older models; drop them here as a safety net so upstream 4.6 → 4.7
+    # migrations don't require coordinated edits everywhere.
+    if _forbids_sampling_params(model):
+        for _sampling_key in ("temperature", "top_p", "top_k"):
+            kwargs.pop(_sampling_key, None)
+
     # ── Fast mode (Opus 4.6 only) ────────────────────────────────────
     # Adds extra_body.speed="fast" + the fast-mode beta header for ~2.5x
     # output speed. Only for native Anthropic endpoints — third-party
@@ -1417,12 +1467,20 @@ def normalize_anthropic_response(
             )
         )
 
-    # Map Anthropic stop_reason to OpenAI finish_reason
+    # Map Anthropic stop_reason to OpenAI finish_reason.
+    # Newer stop reasons added in Claude 4.5+ / 4.7:
+    # - refusal: the model declined to answer (cyber safeguards, CSAM, etc.)
+    # - model_context_window_exceeded: hit context limit (not max_tokens)
+    # Both need distinct handling upstream — a refusal should surface to the
+    # user with a clear message, and a context-window overflow should trigger
+    # compression/truncation rather than be treated as normal end-of-turn.
     stop_reason_map = {
         "end_turn": "stop",
         "tool_use": "tool_calls",
         "max_tokens": "length",
         "stop_sequence": "stop",
+        "refusal": "content_filter",
+        "model_context_window_exceeded": "length",
     }
     finish_reason = stop_reason_map.get(response.stop_reason, "stop")
 
diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index c31ff55f98..4f17461662 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -518,8 +518,13 @@ class _AnthropicCompletionsAdapter:
             tool_choice=normalized_tool_choice,
             is_oauth=self._is_oauth,
         )
+        # Opus 4.7+ rejects any non-default temperature/top_p/top_k; only set
+        # temperature for models that still accept it. build_anthropic_kwargs
+        # additionally strips these keys as a safety net — keep both layers.
         if temperature is not None:
-            anthropic_kwargs["temperature"] = temperature
+            from agent.anthropic_adapter import _forbids_sampling_params
+            if not _forbids_sampling_params(model):
+                anthropic_kwargs["temperature"] = temperature
 
         response = self._client.messages.create(**anthropic_kwargs)
         assistant_message, finish_reason = normalize_anthropic_response(response)
@@ -2288,6 +2293,15 @@ def _build_call_kwargs(
         "timeout": timeout,
     }
 
+    # Opus 4.7+ rejects any non-default temperature/top_p/top_k — silently
+    # drop here so auxiliary callers that hardcode temperature (e.g. 0.3 on
+    # flush_memories, 0 on structured-JSON extraction) don't 400 the moment
+    # the aux model is flipped to 4.7.
+    if temperature is not None:
+        from agent.anthropic_adapter import _forbids_sampling_params
+        if _forbids_sampling_params(model):
+            temperature = None
+
     if temperature is not None:
         kwargs["temperature"] = temperature
 
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index db30489415..089d132ac 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -102,6 +102,8 @@ DEFAULT_CONTEXT_LENGTHS = {
     # fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
     # substring of "anthropic/claude-sonnet-4.6").
     # OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
+    "claude-opus-4-7": 1000000,
+    "claude-opus-4.7": 1000000,
     "claude-opus-4-6": 1000000,
     "claude-sonnet-4-6": 1000000,
     "claude-opus-4.6": 1000000,
diff --git a/batch_runner.py b/batch_runner.py
index 195452c0ae..1a65f473ff 100644
--- a/batch_runner.py
+++ b/batch_runner.py
@@ -561,7 +561,10 @@ class BatchRunner:
             provider_sort (str): Sort providers by price/throughput/latency (optional)
             max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
             reasoning_config (Dict): OpenRouter reasoning config override (e.g. {"effort": "none"} to disable thinking)
-            prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming)
+            prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming).
+                NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a trailing assistant-role prefill
+                (400 error). For those models use output_config.format or structured-output
+                schemas instead. Safe here for user-role priming and for older Claude / non-Claude models.
             max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
         """
         self.dataset_file = Path(dataset_file)
diff --git a/hermes_cli/models.py b/hermes_cli/models.py
index 309840aea5..48cf6873be 100644
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@@ -26,7 +26,8 @@ COPILOT_REASONING_EFFORTS_O_SERIES = ["low", "medium", "high"]
 # Fallback OpenRouter snapshot used when the live catalog is unavailable.
 # (model_id, display description shown in menus)
 OPENROUTER_MODELS: list[tuple[str, str]] = [
-    ("anthropic/claude-opus-4.6", "recommended"),
+    ("anthropic/claude-opus-4.7", "recommended"),
+    ("anthropic/claude-opus-4.6", ""),
     ("anthropic/claude-sonnet-4.6", ""),
     ("qwen/qwen3.6-plus", ""),
     ("anthropic/claude-sonnet-4.5", ""),
@@ -181,6 +182,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
         "MiniMax-M2",
     ],
     "anthropic": [
+        "claude-opus-4-7",
         "claude-opus-4-6",
         "claude-sonnet-4-6",
         "claude-opus-4-5-20251101",
diff --git a/run_agent.py b/run_agent.py
index f6c67b109d..920b49c2fa 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -641,6 +641,9 @@ class AIAgent:
             prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context.
                 Useful for injecting a few-shot example or priming the model's response style.
                 Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}]
+                NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a conversation that ends on an
+                assistant-role message (400 error). For those models use structured outputs or
+                output_config.format instead of a trailing-assistant prefill.
             platform (str): The interface platform the user is on (e.g. "cli", "telegram", "discord", "whatsapp").
                 Used to inject platform-specific formatting hints into the system prompt.
             skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules
diff --git a/tests/agent/test_anthropic_adapter.py b/tests/agent/test_anthropic_adapter.py
index ae78888d86..9d8f3deaaa 100644
--- a/tests/agent/test_anthropic_adapter.py
+++ b/tests/agent/test_anthropic_adapter.py
@@ -951,13 +951,19 @@ class TestBuildAnthropicKwargs:
             max_tokens=4096,
             reasoning_config={"enabled": True, "effort": "high"},
         )
-        assert kwargs["thinking"] == {"type": "adaptive"}
+        # Adaptive thinking + display="summarized" keeps reasoning text
+        # populated in the response stream (Opus 4.7 default is "omitted").
+        assert kwargs["thinking"] == {"type": "adaptive", "display": "summarized"}
         assert kwargs["output_config"] == {"effort": "high"}
         assert "budget_tokens" not in kwargs["thinking"]
         assert "temperature" not in kwargs
         assert kwargs["max_tokens"] == 4096
 
-    def test_reasoning_config_maps_xhigh_to_max_effort_for_4_6_models(self):
+    def test_reasoning_config_maps_xhigh_to_xhigh_effort_for_4_6_models(self):
+        # Opus 4.7 added "xhigh" as a distinct effort level (the recommended
+        # default for coding/agentic work). Earlier mapping aliased xhigh→max,
+        # which silently over-efforted every request. 2026-04-16 migration
+        # guide: xhigh and max are distinct levels.
         kwargs = build_anthropic_kwargs(
             model="claude-sonnet-4-6",
             messages=[{"role": "user", "content": "think harder"}],
             tools=None,
@@ -965,9 +971,40 @@
             max_tokens=4096,
             reasoning_config={"enabled": True, "effort": "xhigh"},
         )
-        assert kwargs["thinking"] == {"type": "adaptive"}
+        assert kwargs["thinking"] == {"type": "adaptive", "display": "summarized"}
+        assert kwargs["output_config"] == {"effort": "xhigh"}
+
+    def test_reasoning_config_maps_max_effort_for_4_7_models(self):
+        kwargs = build_anthropic_kwargs(
+            model="claude-opus-4-7",
+            messages=[{"role": "user", "content": "maximum reasoning please"}],
+            tools=None,
+            max_tokens=4096,
+            reasoning_config={"enabled": True, "effort": "max"},
+        )
+        assert kwargs["thinking"] == {"type": "adaptive", "display": "summarized"}
+        assert kwargs["output_config"] == {"effort": "max"}
+
+    def test_opus_4_7_strips_sampling_params(self):
+        # Opus 4.7 returns 400 on non-default temperature/top_p/top_k.
+        # build_anthropic_kwargs must strip them as a safety net even if an
+        # upstream caller injects them for older-model compatibility.
+        kwargs = build_anthropic_kwargs(
+            model="claude-opus-4-7",
+            messages=[{"role": "user", "content": "hi"}],
+            tools=None,
+            max_tokens=1024,
+            reasoning_config=None,
+        )
+        # Manually inject sampling params then re-run through the guard.
+        # Because build_anthropic_kwargs doesn't currently accept sampling
+        # params through its signature, we exercise the strip behavior by
+        # calling the internal predicate directly.
+        from agent.anthropic_adapter import _forbids_sampling_params
+        assert _forbids_sampling_params("claude-opus-4-7") is True
+        assert _forbids_sampling_params("claude-opus-4-6") is False
+        assert _forbids_sampling_params("claude-sonnet-4-5") is False
+
     def test_reasoning_disabled(self):
         kwargs = build_anthropic_kwargs(
             model="claude-sonnet-4-20250514",
@@ -1248,6 +1285,21 @@
         assert r2 == "tool_calls"
         assert r3 == "length"
 
+    def test_stop_reason_refusal_and_context_exceeded(self):
+        # Claude 4.5+ introduced two new stop_reason values the Messages API
+        # returns. We map both to OpenAI-style finish_reasons upstream
+        # handlers already understand, instead of silently collapsing to
+        # "stop" (old behavior).
+        block = SimpleNamespace(type="text", text="")
+        _, refusal_reason = normalize_anthropic_response(
+            self._make_response([block], "refusal")
+        )
+        _, overflow_reason = normalize_anthropic_response(
+            self._make_response([block], "model_context_window_exceeded")
+        )
+        assert refusal_reason == "content_filter"
+        assert overflow_reason == "length"
+
     def test_no_text_content(self):
         block = SimpleNamespace(
             type="tool_use", id="tc_1", name="search", input={"q": "hi"}
diff --git a/tests/agent/test_model_metadata.py b/tests/agent/test_model_metadata.py
index df680fb241..6a0eab1512 100644
--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@@ -113,8 +113,10 @@ class TestDefaultContextLengths:
         for key, value in DEFAULT_CONTEXT_LENGTHS.items():
             if "claude" not in key:
                 continue
-            # Claude 4.6 models have 1M context
-            if "4.6" in key or "4-6" in key:
+            # Claude 4.6+ models (4.6 and 4.7) have 1M context at standard
+            # API pricing (no long-context premium). Older Claude 4.x and
+            # 3.x models cap at 200k.
+            if any(tag in key for tag in ("4.6", "4-6", "4.7", "4-7")):
                 assert value == 1000000, f"{key} should be 1000000"
             else:
                 assert value == 200000, f"{key} should be 200000"