Compare commits

...

3 Commits

Author SHA1 Message Date
teknium1
df0745fb86 fix: improve Kimi model selection — auto-detect endpoint, add missing models
Kimi Coding Plan setup:
- New dedicated _model_flow_kimi() replaces the generic API-key flow
  for kimi-coding. Removes the confusing 'Base URL' prompt entirely —
  the endpoint is auto-detected from the API key prefix:
    sk-kimi-* → api.kimi.com/coding/v1 (Kimi Coding Plan)
    other     → api.moonshot.ai/v1 (legacy Moonshot)

- Shows appropriate models for each endpoint:
    Coding Plan: kimi-for-coding, kimi-k2.5, kimi-k2-thinking, kimi-k2-thinking-turbo
    Moonshot:    full model catalog

- Clears any stale KIMI_BASE_URL override so runtime auto-detection
  via _resolve_kimi_base_url() works correctly.

Model catalog updates:
- Added kimi-for-coding (primary Coding Plan model) and kimi-k2-thinking-turbo
  to models.py, main.py _PROVIDER_MODELS, and model_metadata.py context windows.

- Updated User-Agent from KimiCLI/1.0 to KimiCLI/1.3 (Kimi's coding
  endpoint whitelists known coding agents via User-Agent sniffing).
2026-03-12 05:57:08 -07:00
teknium1
483eb86fcb feat: add /reasoning command to gateway (Telegram/Discord/etc)
The /reasoning command only existed in the CLI — messaging platforms
had no way to view or change reasoning settings. This adds:

1. /reasoning command handler in the gateway:
   - No args: shows current effort level and display state
   - /reasoning <level>: sets reasoning effort (none/low/medium/high/xhigh)
   - /reasoning show|hide: toggles reasoning display in responses
   - All changes saved to config.yaml immediately

2. Reasoning display in gateway responses:
   - When show_reasoning is enabled, prepends a 'Reasoning' block
     with the model's last_reasoning content before the response
   - Collapses long reasoning (>15 lines) to keep messages readable
   - Uses last_reasoning from run_conversation result dict

3. Plumbing:
   - Added _show_reasoning attribute loaded from config at startup
   - Propagated last_reasoning through _run_agent return dict
   - Added /reasoning to help text and known_commands set
   - Uses getattr for _show_reasoning to handle test stubs
2026-03-12 04:02:21 -07:00
teknium1
2dcea21361 fix: /reasoning command output ordering, display, and inline think extraction
Three issues with the /reasoning command:

1. Output interleaving: The command echo used print() while feedback
   used _cprint(), causing them to render out-of-order under
   prompt_toolkit's patch_stdout. Changed echo to use _cprint() so
   all output renders through the same path in correct order.

2. Reasoning display not working: /reasoning show toggled a flag
   but reasoning never appeared for models that embed thinking in
   inline <think> blocks rather than structured API fields. Added
   fallback extraction in _build_assistant_message to capture
   <think> block content as reasoning when no structured reasoning
   fields (reasoning, reasoning_content, reasoning_details) are
   present. This feeds into both the reasoning callback (during
   tool loops) and the post-response reasoning box display.

3. Feedback clarity: Added checkmarks to confirm actions, persisted
   show/hide to config (was session-only before), and aligned the
   status display for readability.

Tests: 7 new tests for inline think block extraction (41 total).
2026-03-12 03:53:14 -07:00
7 changed files with 336 additions and 13 deletions

View File

@@ -53,8 +53,10 @@ DEFAULT_CONTEXT_LENGTHS = {
"glm-5": 202752,
"glm-4.5": 131072,
"glm-4.5-flash": 131072,
"kimi-for-coding": 262144,
"kimi-k2.5": 262144,
"kimi-k2-thinking": 262144,
"kimi-k2-thinking-turbo": 262144,
"kimi-k2-turbo-preview": 262144,
"kimi-k2-0905-preview": 131072,
"MiniMax-M2.5": 204800,

18
cli.py
View File

@@ -3120,8 +3120,8 @@ class HermesCLI:
level = "none (disabled)"
else:
level = rc.get("effort", "medium")
display_state = "on" if self.show_reasoning else "off"
_cprint(f" {_GOLD}Reasoning effort: {level}{_RST}")
display_state = "on" if self.show_reasoning else "off"
_cprint(f" {_GOLD}Reasoning effort: {level}{_RST}")
_cprint(f" {_GOLD}Reasoning display: {display_state}{_RST}")
_cprint(f" {_DIM}Usage: /reasoning <none|low|medium|high|xhigh|show|hide>{_RST}")
return
@@ -3133,14 +3133,16 @@ class HermesCLI:
self.show_reasoning = True
if self.agent:
self.agent.reasoning_callback = self._on_reasoning
_cprint(f" {_GOLD}Reasoning display: ON{_RST}")
_cprint(f" {_DIM}Model thinking will be shown during and after each response.{_RST}")
save_config_value("display.show_reasoning", True)
_cprint(f" {_GOLD}✓ Reasoning display: ON (saved){_RST}")
_cprint(f" {_DIM} Model thinking will be shown during and after each response.{_RST}")
return
if arg in ("hide", "off"):
self.show_reasoning = False
if self.agent:
self.agent.reasoning_callback = None
_cprint(f" {_GOLD}Reasoning display: OFF{_RST}")
save_config_value("display.show_reasoning", False)
_cprint(f" {_GOLD}✓ Reasoning display: OFF (saved){_RST}")
return
# Effort level change
@@ -3155,9 +3157,9 @@ class HermesCLI:
self.agent = None # Force agent re-init with new reasoning config
if save_config_value("agent.reasoning_effort", arg):
_cprint(f" {_GOLD}Reasoning effort set to '{arg}' (saved to config){_RST}")
_cprint(f" {_GOLD}Reasoning effort set to '{arg}' (saved to config){_RST}")
else:
_cprint(f" {_GOLD}Reasoning effort set to '{arg}' (session only){_RST}")
_cprint(f" {_GOLD}Reasoning effort set to '{arg}' (session only){_RST}")
def _on_reasoning(self, reasoning_text: str):
"""Callback for intermediate reasoning display during tool-call loops."""
@@ -4544,7 +4546,7 @@ class HermesCLI:
# Check for commands
if isinstance(user_input, str) and user_input.startswith("/"):
print(f"\n⚙️ {user_input}")
_cprint(f"\n⚙️ {user_input}")
if not self.process_command(user_input):
self._should_exit = True
# Schedule app exit

View File

@@ -228,6 +228,7 @@ class GatewayRunner:
self._prefill_messages = self._load_prefill_messages()
self._ephemeral_system_prompt = self._load_ephemeral_system_prompt()
self._reasoning_config = self._load_reasoning_config()
self._show_reasoning = self._load_show_reasoning()
self._provider_routing = self._load_provider_routing()
self._fallback_model = self._load_fallback_model()
@@ -421,6 +422,20 @@ class GatewayRunner:
logger.warning("Unknown reasoning_effort '%s', using default (medium)", effort)
return None
@staticmethod
def _load_show_reasoning() -> bool:
"""Load show_reasoning toggle from config.yaml display section."""
try:
import yaml as _y
cfg_path = _hermes_home / "config.yaml"
if cfg_path.exists():
with open(cfg_path, encoding="utf-8") as _f:
cfg = _y.safe_load(_f) or {}
return bool(cfg.get("display", {}).get("show_reasoning", False))
except Exception:
pass
return False
@staticmethod
def _load_background_notifications_mode() -> str:
"""Load background process notification mode from config or env var.
@@ -846,7 +861,7 @@ class GatewayRunner:
"personality", "retry", "undo", "sethome", "set-home",
"compress", "usage", "insights", "reload-mcp", "reload_mcp",
"update", "title", "resume", "provider", "rollback",
"background"}
"background", "reasoning"}
if command and command in _known_commands:
await self.hooks.emit(f"command:{command}", {
"platform": source.platform.value if source.platform else "",
@@ -911,6 +926,9 @@ class GatewayRunner:
if command == "background":
return await self._handle_background_command(event)
if command == "reasoning":
return await self._handle_reasoning_command(event)
# User-defined quick commands (bypass agent loop, no LLM call)
if command:
@@ -1352,7 +1370,20 @@ class GatewayRunner:
response = agent_result.get("final_response", "")
agent_messages = agent_result.get("messages", [])
# Prepend reasoning/thinking if display is enabled
if getattr(self, "_show_reasoning", False) and response:
last_reasoning = agent_result.get("last_reasoning")
if last_reasoning:
# Collapse long reasoning to keep messages readable
lines = last_reasoning.strip().splitlines()
if len(lines) > 15:
display_reasoning = "\n".join(lines[:15])
display_reasoning += f"\n_... ({len(lines) - 15} more lines)_"
else:
display_reasoning = last_reasoning.strip()
response = f"💭 **Reasoning:**\n```\n{display_reasoning}\n```\n\n{response}"
# Emit agent:end hook
await self.hooks.emit("agent:end", {
**hook_ctx,
@@ -1543,6 +1574,7 @@ class GatewayRunner:
"`/resume [name]` — Resume a previously-named session",
"`/usage` — Show token usage for this session",
"`/insights [days]` — Show usage insights and analytics",
"`/reasoning [level|show|hide]` — Set reasoning effort or toggle display",
"`/rollback [number]` — List or restore filesystem checkpoints",
"`/background <prompt>` — Run a prompt in a separate background session",
"`/reload-mcp` — Reload MCP servers from config",
@@ -2170,6 +2202,88 @@ class GatewayRunner:
except Exception:
pass
async def _handle_reasoning_command(self, event: MessageEvent) -> str:
"""Handle /reasoning command — manage reasoning effort and display toggle.
Usage:
/reasoning Show current effort level and display state
/reasoning <level> Set reasoning effort (none, low, medium, high, xhigh)
/reasoning show|on Show model reasoning in responses
/reasoning hide|off Hide model reasoning from responses
"""
import yaml
args = event.get_command_args().strip().lower()
config_path = _hermes_home / "config.yaml"
def _save_config_key(key_path: str, value):
"""Save a dot-separated key to config.yaml."""
try:
user_config = {}
if config_path.exists():
with open(config_path, encoding="utf-8") as f:
user_config = yaml.safe_load(f) or {}
keys = key_path.split(".")
current = user_config
for k in keys[:-1]:
if k not in current or not isinstance(current[k], dict):
current[k] = {}
current = current[k]
current[keys[-1]] = value
with open(config_path, "w", encoding="utf-8") as f:
yaml.dump(user_config, f, default_flow_style=False, sort_keys=False)
return True
except Exception as e:
logger.error("Failed to save config key %s: %s", key_path, e)
return False
if not args:
# Show current state
rc = self._reasoning_config
if rc is None:
level = "medium (default)"
elif rc.get("enabled") is False:
level = "none (disabled)"
else:
level = rc.get("effort", "medium")
display_state = "on ✓" if self._show_reasoning else "off"
return (
"🧠 **Reasoning Settings**\n\n"
f"**Effort:** `{level}`\n"
f"**Display:** {display_state}\n\n"
"_Usage:_ `/reasoning <none|low|medium|high|xhigh|show|hide>`"
)
# Display toggle
if args in ("show", "on"):
self._show_reasoning = True
_save_config_key("display.show_reasoning", True)
return "🧠 ✓ Reasoning display: **ON**\nModel thinking will be shown before each response."
if args in ("hide", "off"):
self._show_reasoning = False
_save_config_key("display.show_reasoning", False)
return "🧠 ✓ Reasoning display: **OFF**"
# Effort level change
effort = args.strip()
if effort == "none":
parsed = {"enabled": False}
elif effort in ("xhigh", "high", "medium", "low", "minimal"):
parsed = {"enabled": True, "effort": effort}
else:
return (
f"⚠️ Unknown argument: `{effort}`\n\n"
"**Valid levels:** none, low, minimal, medium, high, xhigh\n"
"**Display:** show, hide"
)
self._reasoning_config = parsed
if _save_config_key("agent.reasoning_effort", effort):
return f"🧠 ✓ Reasoning effort set to `{effort}` (saved to config)\n_(takes effect on next message)_"
else:
return f"🧠 ✓ Reasoning effort set to `{effort}` (this session only)"
async def _handle_compress_command(self, event: MessageEvent) -> str:
"""Handle /compress command -- manually compress conversation context."""
source = event.source
@@ -3273,6 +3387,7 @@ class GatewayRunner:
return {
"final_response": final_response,
"last_reasoning": result.get("last_reasoning"),
"messages": result_holder[0].get("messages", []) if result_holder[0] else [],
"api_calls": result_holder[0].get("api_calls", 0) if result_holder[0] else 0,
"tools": tools_holder[0] or [],

View File

@@ -831,7 +831,9 @@ def cmd_model(args):
_model_flow_named_custom(config, _custom_provider_map[selected_provider])
elif selected_provider == "remove-custom":
_remove_custom_provider(config)
elif selected_provider in ("zai", "kimi-coding", "minimax", "minimax-cn"):
elif selected_provider == "kimi-coding":
_model_flow_kimi(config, current_model)
elif selected_provider in ("zai", "minimax", "minimax-cn"):
_model_flow_api_key_provider(config, selected_provider, current_model)
@@ -1342,8 +1344,10 @@ _PROVIDER_MODELS = {
"glm-4.5-flash",
],
"kimi-coding": [
"kimi-for-coding",
"kimi-k2.5",
"kimi-k2-thinking",
"kimi-k2-thinking-turbo",
"kimi-k2-turbo-preview",
"kimi-k2-0905-preview",
],
@@ -1360,8 +1364,112 @@ _PROVIDER_MODELS = {
}
def _model_flow_kimi(config, current_model: str = "") -> None:
    """Kimi / Moonshot model selection with automatic endpoint routing.

    - sk-kimi-* keys → api.kimi.com/coding/v1 (Kimi Coding Plan)
    - Other keys → api.moonshot.ai/v1 (legacy Moonshot)

    No manual base URL prompt — endpoint is determined by key prefix.

    Args:
        config: CLI configuration object (accepted for flow-signature parity;
            not read directly in this function — config writes go through
            load_config/save_config).
        current_model: Currently selected model name, used to highlight the
            default choice in the selection prompt.
    """
    from hermes_cli.auth import (
        PROVIDER_REGISTRY, KIMI_CODE_BASE_URL, _prompt_model_selection,
        _save_model_choice, deactivate_provider,
    )
    from hermes_cli.config import get_env_value, save_env_value, load_config, save_config

    provider_id = "kimi-coding"
    pconfig = PROVIDER_REGISTRY[provider_id]
    key_env = pconfig.api_key_env_vars[0] if pconfig.api_key_env_vars else ""
    base_url_env = pconfig.base_url_env_var or ""

    # Step 1: Check / prompt for API key.
    # First non-empty value across the provider's env-var aliases wins;
    # saved config values take priority over the process environment.
    existing_key = ""
    for ev in pconfig.api_key_env_vars:
        existing_key = get_env_value(ev) or os.getenv(ev, "")
        if existing_key:
            break
    if not existing_key:
        print(f"No {pconfig.name} API key configured.")
        if key_env:
            try:
                new_key = input(f"{key_env} (or Enter to cancel): ").strip()
            except (KeyboardInterrupt, EOFError):
                # Ctrl-C / Ctrl-D at the prompt aborts the whole flow.
                print()
                return
            if not new_key:
                print("Cancelled.")
                return
            save_env_value(key_env, new_key)
            existing_key = new_key
            print("API key saved.")
            print()
    else:
        # Show only a short prefix of the key, never the full secret.
        print(f" {pconfig.name} API key: {existing_key[:8]}... ✓")
        print()

    # Step 2: Auto-detect endpoint from key prefix
    is_coding_plan = existing_key.startswith("sk-kimi-")
    if is_coding_plan:
        effective_base = KIMI_CODE_BASE_URL
        print(f" Detected Kimi Coding Plan key → {effective_base}")
    else:
        effective_base = pconfig.inference_base_url
        print(f" Using Moonshot endpoint → {effective_base}")
    # Clear any manual base URL override so auto-detection works at runtime
    # (NOTE(review): this writes an empty string rather than deleting the key —
    # presumably save_env_value("") reads back as falsy; confirm in config module).
    if base_url_env and get_env_value(base_url_env):
        save_env_value(base_url_env, "")
    print()

    # Step 3: Model selection — show appropriate models for the endpoint
    if is_coding_plan:
        # Coding Plan models (kimi-for-coding first)
        model_list = [
            "kimi-for-coding",
            "kimi-k2.5",
            "kimi-k2-thinking",
            "kimi-k2-thinking-turbo",
        ]
    else:
        # Legacy Moonshot models
        model_list = _PROVIDER_MODELS.get(provider_id, [])
    if model_list:
        selected = _prompt_model_selection(model_list, current_model=current_model)
    else:
        # No known catalog — fall back to free-text entry.
        try:
            selected = input("Enter model name: ").strip()
        except (KeyboardInterrupt, EOFError):
            selected = None
    if selected:
        # Clear custom endpoint if set (avoid confusion)
        if get_env_value("OPENAI_BASE_URL"):
            save_env_value("OPENAI_BASE_URL", "")
            save_env_value("OPENAI_API_KEY", "")
        _save_model_choice(selected)
        # Update config with provider and base URL
        cfg = load_config()
        model = cfg.get("model")
        if not isinstance(model, dict):
            # Legacy scalar form: wrap the old value as the default model.
            model = {"default": model} if model else {}
            cfg["model"] = model
        model["provider"] = provider_id
        model["base_url"] = effective_base
        save_config(cfg)
        deactivate_provider()
        endpoint_label = "Kimi Coding" if is_coding_plan else "Moonshot"
        print(f"Default model set to: {selected} (via {endpoint_label})")
    else:
        print("No change.")
def _model_flow_api_key_provider(config, provider_id, current_model=""):
"""Generic flow for API-key providers (z.ai, Kimi, MiniMax)."""
"""Generic flow for API-key providers (z.ai, MiniMax)."""
from hermes_cli.auth import (
PROVIDER_REGISTRY, _prompt_model_selection, _save_model_choice,
_update_config_for_provider, deactivate_provider,

View File

@@ -51,8 +51,10 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
"glm-4.5-flash",
],
"kimi-coding": [
"kimi-for-coding",
"kimi-k2.5",
"kimi-k2-thinking",
"kimi-k2-thinking-turbo",
"kimi-k2-turbo-preview",
"kimi-k2-0905-preview",
],

View File

@@ -436,7 +436,7 @@ class AIAgent:
}
elif "api.kimi.com" in effective_base.lower():
client_kwargs["default_headers"] = {
"User-Agent": "KimiCLI/1.0",
"User-Agent": "KimiCLI/1.3",
}
else:
# No explicit creds — use the centralized provider router
@@ -2442,6 +2442,16 @@ class AIAgent:
"""
reasoning_text = self._extract_reasoning(assistant_message)
# Fallback: extract inline <think> blocks from content when no structured
# reasoning fields are present (some models/providers embed thinking
# directly in the content rather than returning separate API fields).
if not reasoning_text:
content = assistant_message.content or ""
think_blocks = re.findall(r'<think>(.*?)</think>', content, flags=re.DOTALL)
if think_blocks:
combined = "\n\n".join(b.strip() for b in think_blocks if b.strip())
reasoning_text = combined or None
if reasoning_text and self.verbose_logging:
preview = reasoning_text[:100] + "..." if len(reasoning_text) > 100 else reasoning_text
logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {preview}")

View File

@@ -342,6 +342,90 @@ class TestExtractReasoningFormats(unittest.TestCase):
self.assertIsNone(result)
# ---------------------------------------------------------------------------
# Inline <think> block extraction fallback
# ---------------------------------------------------------------------------
class TestInlineThinkBlockExtraction(unittest.TestCase):
"""Test _build_assistant_message extracts inline <think> blocks as reasoning
when no structured API-level reasoning fields are present."""
def _build_msg(self, content, reasoning=None, reasoning_content=None, reasoning_details=None, tool_calls=None):
"""Create a mock API response message."""
msg = SimpleNamespace(content=content, tool_calls=tool_calls)
if reasoning is not None:
msg.reasoning = reasoning
if reasoning_content is not None:
msg.reasoning_content = reasoning_content
if reasoning_details is not None:
msg.reasoning_details = reasoning_details
return msg
def _make_agent(self):
"""Create a minimal agent with _build_assistant_message."""
from run_agent import AIAgent
agent = MagicMock(spec=AIAgent)
agent._build_assistant_message = AIAgent._build_assistant_message.__get__(agent)
agent._extract_reasoning = AIAgent._extract_reasoning.__get__(agent)
agent.verbose_logging = False
agent.reasoning_callback = None
return agent
def test_single_think_block_extracted(self):
agent = self._make_agent()
api_msg = self._build_msg("<think>Let me calculate 2+2=4.</think>The answer is 4.")
result = agent._build_assistant_message(api_msg, "stop")
self.assertEqual(result["reasoning"], "Let me calculate 2+2=4.")
def test_multiple_think_blocks_extracted(self):
agent = self._make_agent()
api_msg = self._build_msg("<think>First thought.</think>Some text<think>Second thought.</think>More text")
result = agent._build_assistant_message(api_msg, "stop")
self.assertIn("First thought.", result["reasoning"])
self.assertIn("Second thought.", result["reasoning"])
def test_no_think_blocks_no_reasoning(self):
agent = self._make_agent()
api_msg = self._build_msg("Just a plain response.")
result = agent._build_assistant_message(api_msg, "stop")
# No structured reasoning AND no inline think blocks → None
self.assertIsNone(result["reasoning"])
def test_structured_reasoning_takes_priority(self):
"""When structured API reasoning exists, inline think blocks should NOT override."""
agent = self._make_agent()
api_msg = self._build_msg(
"<think>Inline thought.</think>Response text.",
reasoning="Structured reasoning from API.",
)
result = agent._build_assistant_message(api_msg, "stop")
self.assertEqual(result["reasoning"], "Structured reasoning from API.")
def test_empty_think_block_ignored(self):
agent = self._make_agent()
api_msg = self._build_msg("<think></think>Hello!")
result = agent._build_assistant_message(api_msg, "stop")
# Empty think block should not produce reasoning
self.assertIsNone(result["reasoning"])
def test_multiline_think_block(self):
agent = self._make_agent()
api_msg = self._build_msg("<think>\nStep 1: Analyze.\nStep 2: Solve.\n</think>Done.")
result = agent._build_assistant_message(api_msg, "stop")
self.assertIn("Step 1: Analyze.", result["reasoning"])
self.assertIn("Step 2: Solve.", result["reasoning"])
def test_callback_fires_for_inline_think(self):
"""Reasoning callback should fire when reasoning is extracted from inline think blocks."""
agent = self._make_agent()
captured = []
agent.reasoning_callback = lambda t: captured.append(t)
api_msg = self._build_msg("<think>Deep analysis here.</think>Answer.")
agent._build_assistant_message(api_msg, "stop")
self.assertEqual(len(captured), 1)
self.assertIn("Deep analysis", captured[0])
# ---------------------------------------------------------------------------
# Config defaults
# ---------------------------------------------------------------------------