fix(kimi): route temperature override by base_url — kimi-k2.5 needs 1.0 on api.moonshot.ai

Follow-up to #12144.  That PR standardized the kimi-k2.* temperature lock
to match the Coding Plan endpoint (api.kimi.com/coding/v1) documentation,
where non-thinking models require 0.6.  We verified empirically against
Moonshot (April 2026) that the public chat endpoint (api.moonshot.ai/v1)
has a different contract for kimi-k2.5: it only accepts temperature=1, and
rejects 0.6 with:

    HTTP 400 "invalid temperature: only 1 is allowed for this model"

Users hit the public endpoint when KIMI_API_KEY is a legacy sk-* key (the
sk-kimi-* prefix routes to Coding Plan — see hermes_cli/auth.py).  So for
Coding Plan subscribers the fix from #12144 is correct, but for public-API
users it reintroduces the exact 400 reported in #9125.

Reproduction on api.moonshot.ai/v1 + kimi-k2.5:
  temperature=1.0 → 200 OK
  temperature=0.6 → 400 "only 1 is allowed"     ← #12144 default
  temperature=None → 200 OK

Other kimi-k2.* models are unaffected empirically — turbo-preview accepts
0.6 and thinking-turbo accepts 1.0 on both endpoints — so only kimi-k2.5
diverges.

Fix: thread the client's actual base_url through _build_call_kwargs (the
parameter already existed but callers passed config-level resolved_base_url;
for auto-detected routes that was often empty).  _fixed_temperature_for_model
now checks api.moonshot.ai first via an explicit _KIMI_PUBLIC_API_OVERRIDES
map, then falls back to the Coding Plan defaults.  Tests parametrize over
endpoint + model to lock both contracts.

Closes #9125.
This commit is contained in:
taeng0204
2026-04-20 04:18:49 +09:00
committed by Teknium
parent 0d353ca6a8
commit 6f79b8f01d
2 changed files with 135 additions and 8 deletions

View File

@@ -116,8 +116,25 @@ _KIMI_THINKING_MODELS: frozenset = frozenset({
"kimi-k2-thinking-turbo",
})
# Moonshot's public chat endpoint (api.moonshot.ai/v1) enforces a different
# temperature contract than the Coding Plan endpoint above. Empirically,
# `kimi-k2.5` on the public API rejects 0.6 with HTTP 400
# "invalid temperature: only 1 is allowed for this model" — the Coding Plan
# lock (0.6 for non-thinking) does not apply. `kimi-k2-turbo-preview` and the
# thinking variants already match the Coding Plan contract on the public
# endpoint, so we only override the models that diverge.
# Users hit this endpoint when `KIMI_API_KEY` is a legacy `sk-*` key (the
# `sk-kimi-*` prefix routes to api.kimi.com/coding/v1 instead — see
# hermes_cli/auth.py:_kimi_base_url_for_key).
_KIMI_PUBLIC_API_OVERRIDES: Dict[str, float] = {
"kimi-k2.5": 1.0,
}
def _fixed_temperature_for_model(model: Optional[str]) -> Optional[float]:
def _fixed_temperature_for_model(
model: Optional[str],
base_url: Optional[str] = None,
) -> Optional[float]:
"""Return a required temperature override for models with strict contracts.
Moonshot's kimi-for-coding endpoint rejects any non-approved temperature on
@@ -125,15 +142,31 @@ def _fixed_temperature_for_model(model: Optional[str]) -> Optional[float]:
variants require 1.0. An optional ``vendor/`` prefix (e.g.
``moonshotai/kimi-k2.5``) is tolerated for aggregator routings.
When ``base_url`` points to Moonshot's public chat endpoint
(``api.moonshot.ai``), the contract changes for ``kimi-k2.5``: the public
API only accepts ``temperature=1``, not 0.6. That override takes precedence
over the Coding Plan defaults above.
Returns ``None`` for every other model, including ``kimi-k2-instruct*``
which is the separate non-coding K2 family with variable temperature.
"""
normalized = (model or "").strip().lower()
bare = normalized.rsplit("/", 1)[-1]
# Public Moonshot API has a stricter contract for some models than the
# Coding Plan endpoint — check it first so it wins on conflict.
if base_url and "api.moonshot.ai" in base_url.lower():
public = _KIMI_PUBLIC_API_OVERRIDES.get(bare)
if public is not None:
logger.debug(
"Forcing temperature=%s for %r on public Moonshot API", public, model
)
return public
fixed = _FIXED_TEMPERATURE_MODELS.get(normalized)
if fixed is not None:
logger.debug("Forcing temperature=%s for model %r (fixed map)", fixed, model)
return fixed
bare = normalized.rsplit("/", 1)[-1]
if bare in _KIMI_THINKING_MODELS:
logger.debug("Forcing temperature=1.0 for kimi thinking model %r", model)
return 1.0
@@ -2417,7 +2450,7 @@ def _build_call_kwargs(
"timeout": timeout,
}
fixed_temperature = _fixed_temperature_for_model(model)
fixed_temperature = _fixed_temperature_for_model(model, base_url)
if fixed_temperature is not None:
temperature = fixed_temperature
@@ -2598,11 +2631,14 @@ def call_llm(
task, resolved_provider or "auto", final_model or "default",
f" at {_base_info}" if _base_info and "openrouter" not in _base_info else "")
# Pass the client's actual base_url (not just resolved_base_url) so
# endpoint-specific temperature overrides can distinguish
# api.moonshot.ai vs api.kimi.com/coding even on auto-detected routes.
kwargs = _build_call_kwargs(
resolved_provider, final_model, messages,
temperature=temperature, max_tokens=max_tokens,
tools=tools, timeout=effective_timeout, extra_body=extra_body,
base_url=resolved_base_url)
base_url=_base_info or resolved_base_url)
# Convert image blocks for Anthropic-compatible endpoints (e.g. MiniMax)
_client_base = str(getattr(client, "base_url", "") or "")
@@ -2656,7 +2692,8 @@ def call_llm(
fb_label, fb_model, messages,
temperature=temperature, max_tokens=max_tokens,
tools=tools, timeout=effective_timeout,
extra_body=extra_body)
extra_body=extra_body,
base_url=str(getattr(fb_client, "base_url", "") or ""))
return _validate_llm_response(
fb_client.chat.completions.create(**fb_kwargs), task)
raise
@@ -2791,14 +2828,17 @@ async def async_call_llm(
effective_timeout = timeout if timeout is not None else _get_task_timeout(task)
# Pass the client's actual base_url (not just resolved_base_url) so
# endpoint-specific temperature overrides can distinguish
# api.moonshot.ai vs api.kimi.com/coding even on auto-detected routes.
_client_base = str(getattr(client, "base_url", "") or "")
kwargs = _build_call_kwargs(
resolved_provider, final_model, messages,
temperature=temperature, max_tokens=max_tokens,
tools=tools, timeout=effective_timeout, extra_body=extra_body,
base_url=resolved_base_url)
base_url=_client_base or resolved_base_url)
# Convert image blocks for Anthropic-compatible endpoints (e.g. MiniMax)
_client_base = str(getattr(client, "base_url", "") or "")
if _is_anthropic_compat_endpoint(resolved_provider, _client_base):
kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])
@@ -2834,7 +2874,8 @@ async def async_call_llm(
fb_label, fb_model, messages,
temperature=temperature, max_tokens=max_tokens,
tools=tools, timeout=effective_timeout,
extra_body=extra_body)
extra_body=extra_body,
base_url=str(getattr(fb_client, "base_url", "") or ""))
# Convert sync fallback client to async
async_fb, async_fb_model = _to_async_client(fb_client, fb_model or "")
if async_fb_model and async_fb_model != fb_kwargs.get("model"):

View File

@@ -832,6 +832,92 @@ class TestKimiForCodingTemperature:
assert kwargs["temperature"] == 0.3
# ── Endpoint-aware overrides: api.moonshot.ai vs api.kimi.com/coding ──
# The public Moonshot chat endpoint and the Coding Plan endpoint enforce
# different temperature contracts for the same model name. `kimi-k2.5` on
# api.moonshot.ai rejects 0.6 with HTTP 400 "only 1 is allowed for this
# model", while the Coding Plan docs mandate 0.6. Override must pick the
# right value per base_url.
@pytest.mark.parametrize(
    "base_url",
    [
        "https://api.moonshot.ai/v1",
        "https://api.moonshot.ai/v1/",
        "https://API.MOONSHOT.AI/v1",
    ],
)
def test_kimi_k2_5_public_api_forces_temperature_1(self, base_url):
    """The public Moonshot endpoint locks kimi-k2.5 to temperature=1.

    Parametrized over trailing-slash and upper-case variants of the URL so
    the host match is shown to be case- and suffix-insensitive.
    """
    from agent.auxiliary_client import _build_call_kwargs

    chat = [{"role": "user", "content": "hello"}]
    built = _build_call_kwargs(
        model="kimi-k2.5",
        provider="kimi-coding",
        base_url=base_url,
        temperature=0.1,
        messages=chat,
    )
    assert built["temperature"] == 1.0
def test_kimi_k2_5_coding_plan_keeps_temperature_0_6(self):
    """The Coding Plan endpoint still enforces the 0.6 lock for kimi-k2.5."""
    from agent.auxiliary_client import _build_call_kwargs

    built = _build_call_kwargs(
        model="kimi-k2.5",
        provider="kimi-coding",
        base_url="https://api.kimi.com/coding/v1",
        temperature=0.1,
        messages=[{"role": "user", "content": "hello"}],
    )
    assert built["temperature"] == 0.6
def test_kimi_k2_5_no_base_url_falls_back_to_coding_plan_lock(self):
    """With no base_url hint, kimi-k2.5 keeps the Coding Plan 0.6 default.

    Guards the PR #12144 behavior for callers that never thread the
    client's base_url into the kwargs builder.
    """
    from agent.auxiliary_client import _build_call_kwargs

    built = _build_call_kwargs(
        model="kimi-k2.5",
        provider="kimi-coding",
        temperature=0.1,
        messages=[{"role": "user", "content": "hello"}],
    )
    assert built["temperature"] == 0.6
@pytest.mark.parametrize(
    "model,expected",
    [
        # kimi-k2.5 is the only model whose contract diverges on
        # api.moonshot.ai (verified empirically against Moonshot, April
        # 2026: turbo-preview accepts 0.6, thinking-turbo accepts 1.0);
        # every other family member keeps the Coding Plan lock.
        ("kimi-k2-turbo-preview", 0.6),
        ("kimi-k2-0905-preview", 0.6),
        ("kimi-k2-thinking", 1.0),
        ("kimi-k2-thinking-turbo", 1.0),
        ("moonshotai/kimi-k2-thinking-turbo", 1.0),
    ],
)
def test_other_kimi_k2_family_unchanged_on_public_api(self, model, expected):
    """Non-k2.5 kimi models keep their Coding Plan temperature on the public API."""
    from agent.auxiliary_client import _build_call_kwargs

    built = _build_call_kwargs(
        model=model,
        provider="kimi-coding",
        base_url="https://api.moonshot.ai/v1",
        temperature=0.1,
        messages=[{"role": "user", "content": "hello"}],
    )
    assert built["temperature"] == expected
# ---------------------------------------------------------------------------
# async_call_llm payment / connection fallback (#7512 bug 2)