mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
fix(kimi): route temperature override by base_url — kimi-k2.5 needs 1.0 on api.moonshot.ai
Follow-up to #12144. That PR standardized the kimi-k2.* temperature lock against the Coding Plan endpoint (api.kimi.com/coding/v1) docs, where non-thinking models require 0.6. Verified empirically against Moonshot (April 2026) that the public chat endpoint (api.moonshot.ai/v1) has a different contract for kimi-k2.5: it only accepts temperature=1, and rejects 0.6 with: HTTP 400 "invalid temperature: only 1 is allowed for this model" Users hit the public endpoint when KIMI_API_KEY is a legacy sk-* key (the sk-kimi-* prefix routes to Coding Plan — see hermes_cli/auth.py). So for Coding Plan subscribers the fix from #12144 is correct, but for public-API users it reintroduces the exact 400 reported in #9125. Reproduction on api.moonshot.ai/v1 + kimi-k2.5: temperature=1.0 → 200 OK temperature=0.6 → 400 "only 1 is allowed" ← #12144 default temperature=None → 200 OK Other kimi-k2.* models are unaffected empirically — turbo-preview accepts 0.6 and thinking-turbo accepts 1.0 on both endpoints — so only kimi-k2.5 diverges. Fix: thread the client's actual base_url through _build_call_kwargs (the parameter already existed but callers passed config-level resolved_base_url; for auto-detected routes that was often empty). _fixed_temperature_for_model now checks api.moonshot.ai first via an explicit _KIMI_PUBLIC_API_OVERRIDES map, then falls back to the Coding Plan defaults. Tests parametrize over endpoint + model to lock both contracts. Closes #9125.
This commit is contained in:
@@ -116,8 +116,25 @@ _KIMI_THINKING_MODELS: frozenset = frozenset({
|
||||
"kimi-k2-thinking-turbo",
|
||||
})
|
||||
|
||||
# Moonshot's public chat endpoint (api.moonshot.ai/v1) enforces a different
|
||||
# temperature contract than the Coding Plan endpoint above. Empirically,
|
||||
# `kimi-k2.5` on the public API rejects 0.6 with HTTP 400
|
||||
# "invalid temperature: only 1 is allowed for this model" — the Coding Plan
|
||||
# lock (0.6 for non-thinking) does not apply. `kimi-k2-turbo-preview` and the
|
||||
# thinking variants already match the Coding Plan contract on the public
|
||||
# endpoint, so we only override the models that diverge.
|
||||
# Users hit this endpoint when `KIMI_API_KEY` is a legacy `sk-*` key (the
|
||||
# `sk-kimi-*` prefix routes to api.kimi.com/coding/v1 instead — see
|
||||
# hermes_cli/auth.py:_kimi_base_url_for_key).
|
||||
_KIMI_PUBLIC_API_OVERRIDES: Dict[str, float] = {
|
||||
"kimi-k2.5": 1.0,
|
||||
}
|
||||
|
||||
def _fixed_temperature_for_model(model: Optional[str]) -> Optional[float]:
|
||||
|
||||
def _fixed_temperature_for_model(
|
||||
model: Optional[str],
|
||||
base_url: Optional[str] = None,
|
||||
) -> Optional[float]:
|
||||
"""Return a required temperature override for models with strict contracts.
|
||||
|
||||
Moonshot's kimi-for-coding endpoint rejects any non-approved temperature on
|
||||
@@ -125,15 +142,31 @@ def _fixed_temperature_for_model(model: Optional[str]) -> Optional[float]:
|
||||
variants require 1.0. An optional ``vendor/`` prefix (e.g.
|
||||
``moonshotai/kimi-k2.5``) is tolerated for aggregator routings.
|
||||
|
||||
When ``base_url`` points to Moonshot's public chat endpoint
|
||||
(``api.moonshot.ai``), the contract changes for ``kimi-k2.5``: the public
|
||||
API only accepts ``temperature=1``, not 0.6. That override takes precedence
|
||||
over the Coding Plan defaults above.
|
||||
|
||||
Returns ``None`` for every other model, including ``kimi-k2-instruct*``
|
||||
which is the separate non-coding K2 family with variable temperature.
|
||||
"""
|
||||
normalized = (model or "").strip().lower()
|
||||
bare = normalized.rsplit("/", 1)[-1]
|
||||
|
||||
# Public Moonshot API has a stricter contract for some models than the
|
||||
# Coding Plan endpoint — check it first so it wins on conflict.
|
||||
if base_url and "api.moonshot.ai" in base_url.lower():
|
||||
public = _KIMI_PUBLIC_API_OVERRIDES.get(bare)
|
||||
if public is not None:
|
||||
logger.debug(
|
||||
"Forcing temperature=%s for %r on public Moonshot API", public, model
|
||||
)
|
||||
return public
|
||||
|
||||
fixed = _FIXED_TEMPERATURE_MODELS.get(normalized)
|
||||
if fixed is not None:
|
||||
logger.debug("Forcing temperature=%s for model %r (fixed map)", fixed, model)
|
||||
return fixed
|
||||
bare = normalized.rsplit("/", 1)[-1]
|
||||
if bare in _KIMI_THINKING_MODELS:
|
||||
logger.debug("Forcing temperature=1.0 for kimi thinking model %r", model)
|
||||
return 1.0
|
||||
@@ -2417,7 +2450,7 @@ def _build_call_kwargs(
|
||||
"timeout": timeout,
|
||||
}
|
||||
|
||||
fixed_temperature = _fixed_temperature_for_model(model)
|
||||
fixed_temperature = _fixed_temperature_for_model(model, base_url)
|
||||
if fixed_temperature is not None:
|
||||
temperature = fixed_temperature
|
||||
|
||||
@@ -2598,11 +2631,14 @@ def call_llm(
|
||||
task, resolved_provider or "auto", final_model or "default",
|
||||
f" at {_base_info}" if _base_info and "openrouter" not in _base_info else "")
|
||||
|
||||
# Pass the client's actual base_url (not just resolved_base_url) so
|
||||
# endpoint-specific temperature overrides can distinguish
|
||||
# api.moonshot.ai vs api.kimi.com/coding even on auto-detected routes.
|
||||
kwargs = _build_call_kwargs(
|
||||
resolved_provider, final_model, messages,
|
||||
temperature=temperature, max_tokens=max_tokens,
|
||||
tools=tools, timeout=effective_timeout, extra_body=extra_body,
|
||||
base_url=resolved_base_url)
|
||||
base_url=_base_info or resolved_base_url)
|
||||
|
||||
# Convert image blocks for Anthropic-compatible endpoints (e.g. MiniMax)
|
||||
_client_base = str(getattr(client, "base_url", "") or "")
|
||||
@@ -2656,7 +2692,8 @@ def call_llm(
|
||||
fb_label, fb_model, messages,
|
||||
temperature=temperature, max_tokens=max_tokens,
|
||||
tools=tools, timeout=effective_timeout,
|
||||
extra_body=extra_body)
|
||||
extra_body=extra_body,
|
||||
base_url=str(getattr(fb_client, "base_url", "") or ""))
|
||||
return _validate_llm_response(
|
||||
fb_client.chat.completions.create(**fb_kwargs), task)
|
||||
raise
|
||||
@@ -2791,14 +2828,17 @@ async def async_call_llm(
|
||||
|
||||
effective_timeout = timeout if timeout is not None else _get_task_timeout(task)
|
||||
|
||||
# Pass the client's actual base_url (not just resolved_base_url) so
|
||||
# endpoint-specific temperature overrides can distinguish
|
||||
# api.moonshot.ai vs api.kimi.com/coding even on auto-detected routes.
|
||||
_client_base = str(getattr(client, "base_url", "") or "")
|
||||
kwargs = _build_call_kwargs(
|
||||
resolved_provider, final_model, messages,
|
||||
temperature=temperature, max_tokens=max_tokens,
|
||||
tools=tools, timeout=effective_timeout, extra_body=extra_body,
|
||||
base_url=resolved_base_url)
|
||||
base_url=_client_base or resolved_base_url)
|
||||
|
||||
# Convert image blocks for Anthropic-compatible endpoints (e.g. MiniMax)
|
||||
_client_base = str(getattr(client, "base_url", "") or "")
|
||||
if _is_anthropic_compat_endpoint(resolved_provider, _client_base):
|
||||
kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])
|
||||
|
||||
@@ -2834,7 +2874,8 @@ async def async_call_llm(
|
||||
fb_label, fb_model, messages,
|
||||
temperature=temperature, max_tokens=max_tokens,
|
||||
tools=tools, timeout=effective_timeout,
|
||||
extra_body=extra_body)
|
||||
extra_body=extra_body,
|
||||
base_url=str(getattr(fb_client, "base_url", "") or ""))
|
||||
# Convert sync fallback client to async
|
||||
async_fb, async_fb_model = _to_async_client(fb_client, fb_model or "")
|
||||
if async_fb_model and async_fb_model != fb_kwargs.get("model"):
|
||||
|
||||
@@ -832,6 +832,92 @@ class TestKimiForCodingTemperature:
|
||||
|
||||
assert kwargs["temperature"] == 0.3
|
||||
|
||||
# ── Endpoint-aware overrides: api.moonshot.ai vs api.kimi.com/coding ──
|
||||
# The public Moonshot chat endpoint and the Coding Plan endpoint enforce
|
||||
# different temperature contracts for the same model name. `kimi-k2.5` on
|
||||
# api.moonshot.ai rejects 0.6 with HTTP 400 "only 1 is allowed for this
|
||||
# model", while the Coding Plan docs mandate 0.6. Override must pick the
|
||||
# right value per base_url.
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"base_url",
|
||||
[
|
||||
"https://api.moonshot.ai/v1",
|
||||
"https://api.moonshot.ai/v1/",
|
||||
"https://API.MOONSHOT.AI/v1",
|
||||
],
|
||||
)
|
||||
def test_kimi_k2_5_public_api_forces_temperature_1(self, base_url):
|
||||
"""kimi-k2.5 on the public Moonshot API only accepts temperature=1."""
|
||||
from agent.auxiliary_client import _build_call_kwargs
|
||||
|
||||
kwargs = _build_call_kwargs(
|
||||
provider="kimi-coding",
|
||||
model="kimi-k2.5",
|
||||
messages=[{"role": "user", "content": "hello"}],
|
||||
temperature=0.1,
|
||||
base_url=base_url,
|
||||
)
|
||||
|
||||
assert kwargs["temperature"] == 1.0
|
||||
|
||||
def test_kimi_k2_5_coding_plan_keeps_temperature_0_6(self):
|
||||
"""kimi-k2.5 on api.kimi.com/coding keeps the Coding Plan's 0.6 lock."""
|
||||
from agent.auxiliary_client import _build_call_kwargs
|
||||
|
||||
kwargs = _build_call_kwargs(
|
||||
provider="kimi-coding",
|
||||
model="kimi-k2.5",
|
||||
messages=[{"role": "user", "content": "hello"}],
|
||||
temperature=0.1,
|
||||
base_url="https://api.kimi.com/coding/v1",
|
||||
)
|
||||
|
||||
assert kwargs["temperature"] == 0.6
|
||||
|
||||
def test_kimi_k2_5_no_base_url_falls_back_to_coding_plan_lock(self):
|
||||
"""Without a base_url hint, the Coding Plan default (0.6) applies.
|
||||
|
||||
Preserves PR #12144 backward compatibility for callers that don't thread
|
||||
the client's base_url through.
|
||||
"""
|
||||
from agent.auxiliary_client import _build_call_kwargs
|
||||
|
||||
kwargs = _build_call_kwargs(
|
||||
provider="kimi-coding",
|
||||
model="kimi-k2.5",
|
||||
messages=[{"role": "user", "content": "hello"}],
|
||||
temperature=0.1,
|
||||
)
|
||||
|
||||
assert kwargs["temperature"] == 0.6
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model,expected",
|
||||
[
|
||||
# Only kimi-k2.5 diverges on api.moonshot.ai; the rest keep the
|
||||
# Coding Plan lock (empirically verified against Moonshot in April
|
||||
# 2026: turbo-preview accepts 0.6, thinking-turbo accepts 1.0).
|
||||
("kimi-k2-turbo-preview", 0.6),
|
||||
("kimi-k2-0905-preview", 0.6),
|
||||
("kimi-k2-thinking", 1.0),
|
||||
("kimi-k2-thinking-turbo", 1.0),
|
||||
("moonshotai/kimi-k2-thinking-turbo", 1.0),
|
||||
],
|
||||
)
|
||||
def test_other_kimi_k2_family_unchanged_on_public_api(self, model, expected):
|
||||
from agent.auxiliary_client import _build_call_kwargs
|
||||
|
||||
kwargs = _build_call_kwargs(
|
||||
provider="kimi-coding",
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": "hello"}],
|
||||
temperature=0.1,
|
||||
base_url="https://api.moonshot.ai/v1",
|
||||
)
|
||||
|
||||
assert kwargs["temperature"] == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# async_call_llm payment / connection fallback (#7512 bug 2)
|
||||
|
||||
Reference in New Issue
Block a user