mirror of https://github.com/NousResearch/hermes-agent.git
synced 2026-05-09 20:27:24 +08:00

Compare commits: fix/browse ... hermes/her

2 commits: 11c926cc9d, 154fd88b76
@@ -59,6 +59,15 @@ OPENCODE_ZEN_API_KEY=
# OpenCode Go provides access to open models (GLM-5, Kimi K2.5, MiniMax M2.5)
# $10/month subscription. Get your key at: https://opencode.ai/auth
OPENCODE_GO_API_KEY=

# =============================================================================
# LLM PROVIDER (Hugging Face Inference Providers)
# =============================================================================
# Hugging Face routes to 20+ open models via unified OpenAI-compatible endpoint.
# Free tier included ($0.10/month), no markup on provider rates.
# Get your token at: https://huggingface.co/settings/tokens
# Required permission: "Make calls to Inference Providers"
HF_TOKEN=
# OPENCODE_GO_BASE_URL=https://opencode.ai/zen/go/v1  # Override default base URL

# =============================================================================
@@ -364,7 +364,7 @@ Rendering bugs in tmux/iTerm2 — ghosting on scroll. Use `curses` (stdlib) inst

Leaks as literal `\x1b[K` text under `prompt_toolkit`'s `patch_stdout`. Use space-padding: `f"\r{line}{' ' * pad}"`, as sketched below.
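A minimal sketch of the space-padding workaround (the loop and messages are illustrative; the point is padding to the previous line's length so stale characters are overwritten instead of emitting an erase sequence):

```python
import sys

prev_len = 0
for line in ("downloading 10%", "downloading 100%", "done"):
    # Pad with spaces to the previous line's length instead of writing
    # the \x1b[K erase sequence, which leaks as text under patch_stdout.
    pad = max(0, prev_len - len(line))
    sys.stdout.write(f"\r{line}{' ' * pad}")
    sys.stdout.flush()
    prev_len = len(line)
print()
```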
### `_last_resolved_tool_names` is a process-global in `model_tools.py`

`_run_single_child()` in `delegate_tool.py` saves and restores this global around subagent execution (see the sketch below). If you add new code that reads this global, be aware it may be temporarily stale during child agent runs.

When subagents overwrite this global, `execute_code` calls after delegation may fail with missing tool imports. Known bug.
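A hedged sketch of the save/restore pattern described above (`run_subagent` is a hypothetical stand-in for the real subagent entry point):

```python
import model_tools

def _run_single_child(task):
    # Snapshot the process-global before the subagent mutates it,
    # then restore it so the parent's tool resolution survives.
    saved = model_tools._last_resolved_tool_names
    try:
        return run_subagent(task)  # hypothetical entry point
    finally:
        model_tools._last_resolved_tool_names = saved
```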
### Tests must not write to `~/.hermes/`

The `_isolate_hermes_home` autouse fixture in `tests/conftest.py` redirects `HERMES_HOME` to a temp dir. Never hardcode `~/.hermes/` paths in tests.
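A sketch of what such an autouse fixture typically looks like (the exact body in `tests/conftest.py` may differ):

```python
import pytest

@pytest.fixture(autouse=True)
def _isolate_hermes_home(tmp_path, monkeypatch):
    # Point HERMES_HOME at a per-test temp dir so nothing touches ~/.hermes/.
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes_home"))
```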
@@ -311,7 +311,6 @@ Write only the summary body. Do not include any preamble or prefix; the system w
            )
            compressed.append(msg)

        _merge_summary_into_tail = False
        if summary:
            last_head_role = messages[compress_start - 1].get("role", "user") if compress_start > 0 else "user"
            first_tail_role = messages[compress_end].get("role", "user") if compress_end < n_messages else "user"
@@ -327,25 +326,13 @@ Write only the summary body. Do not include any preamble or prefix; the system w
            flipped = "assistant" if summary_role == "user" else "user"
            if flipped != last_head_role:
                summary_role = flipped
            else:
                # Both roles would create consecutive same-role messages
                # (e.g. head=assistant, tail=user — neither role works).
                # Merge the summary into the first tail message instead
                # of inserting a standalone message that breaks alternation.
                _merge_summary_into_tail = True
            if not _merge_summary_into_tail:
                compressed.append({"role": summary_role, "content": summary})
            compressed.append({"role": summary_role, "content": summary})
        else:
            if not self.quiet_mode:
                print("  ⚠️ No summary model available — middle turns dropped without summary")

        for i in range(compress_end, n_messages):
            msg = messages[i].copy()
            if _merge_summary_into_tail and i == compress_end:
                original = msg.get("content") or ""
                msg["content"] = summary + "\n\n" + original
                _merge_summary_into_tail = False
            compressed.append(msg)
            compressed.append(messages[i].copy())

        self.compression_count += 1
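A hedged standalone illustration of the alternation problem the comments above describe: when the kept head ends with an assistant turn and the kept tail starts with a user turn, a standalone summary message of either role would sit next to a same-role neighbor, so the summary is prepended to the first tail message instead:

```python
head = [{"role": "user", "content": "q1"}, {"role": "assistant", "content": "a1"}]
tail = [{"role": "user", "content": "q9"}]
summary = "[Summary of turns 2-8]"

# A standalone {"role": "assistant"} collides with head[-1]; {"role": "user"}
# collides with tail[0] — so merge the summary into the first tail message.
merged_tail = [{"role": "user", "content": summary + "\n\n" + tail[0]["content"]}] + tail[1:]
print([m["role"] for m in head + merged_tail])  # ['user', 'assistant', 'user']
```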
@@ -119,6 +119,25 @@ DEFAULT_CONTEXT_LENGTHS = {
    "qwen-plus-latest": 131072,
    "qwen3.5-flash": 131072,
    "qwen-vl-max": 32768,
    # Hugging Face Inference Providers — model IDs use org/name format
    "Qwen/Qwen3.5-397B-A17B": 131072,
    "Qwen/Qwen3-235B-A22B-Thinking-2507": 131072,
    "Qwen/Qwen3-Coder-480B-A35B-Instruct": 131072,
    "Qwen/Qwen3-Coder-Next": 131072,
    "Qwen/Qwen3-Next-80B-A3B-Instruct": 131072,
    "Qwen/Qwen3-Next-80B-A3B-Thinking": 131072,
    "deepseek-ai/DeepSeek-R1-0528": 65536,
    "deepseek-ai/DeepSeek-V3.2": 65536,
    "moonshotai/Kimi-K2-Instruct": 262144,
    "moonshotai/Kimi-K2-Instruct-0905": 262144,
    "moonshotai/Kimi-K2.5": 262144,
    "moonshotai/Kimi-K2-Thinking": 262144,
    "MiniMaxAI/MiniMax-M2.5": 204800,
    "MiniMaxAI/MiniMax-M2.1": 204800,
    "XiaomiMiMo/MiMo-V2-Flash": 32768,
    "zai-org/GLM-5": 202752,
    "zai-org/GLM-4.7": 202752,
    "zai-org/GLM-4.7-Flash": 202752,
}
7 cli.py
@@ -219,6 +219,7 @@ def load_cli_config() -> Dict[str, Any]:
        "streaming": False,

        "skin": "default",
        "theme_mode": "auto",
    },
    "clarify": {
        "timeout": 120,  # Seconds to wait for a clarify answer before auto-proceeding
@@ -3271,7 +3272,7 @@ class HermesCLI:
            print("  To start the gateway:")
            print("    python cli.py --gateway")
            print()
            print("  Configuration file: ~/.hermes/config.yaml")
            print("  Configuration file: ~/.hermes/gateway.json")
            print()

        except Exception as e:
@@ -3281,7 +3282,7 @@ class HermesCLI:
            print("  1. Set environment variables:")
            print("     TELEGRAM_BOT_TOKEN=your_token")
            print("     DISCORD_BOT_TOKEN=your_token")
            print("  2. Or configure settings in ~/.hermes/config.yaml")
            print("  2. Or create ~/.hermes/gateway.json")
            print()

    def process_command(self, command: str) -> bool:
@@ -3560,7 +3561,7 @@ class HermesCLI:
        elif canonical == "reload-mcp":
            with self._busy_command(self._slow_command_status(cmd_original)):
                self._reload_mcp()
        elif canonical == "browser":
        elif _base_word == "browser":
            self._handle_browser_command(cmd_original)
        elif canonical == "plugins":
            try:
@@ -46,7 +46,6 @@ class Platform(Enum):
    EMAIL = "email"
    SMS = "sms"
    DINGTALK = "dingtalk"
    API_SERVER = "api_server"


@dataclass
@@ -239,9 +238,6 @@ class GatewayConfig:
        # SMS uses api_key (Twilio auth token) — SID checked via env
        elif platform == Platform.SMS and os.getenv("TWILIO_ACCOUNT_SID"):
            connected.append(platform)
        # API Server uses enabled flag only (no token needed)
        elif platform == Platform.API_SERVER:
            connected.append(platform)
        return connected

    def get_home_channel(self, platform: Platform) -> Optional[HomeChannel]:
@@ -350,73 +346,65 @@ GatewayConfig:
def load_gateway_config() -> GatewayConfig:
    """
    Load gateway configuration from multiple sources.

    Priority (highest to lowest):
    1. Environment variables
    2. ~/.hermes/config.yaml (primary user-facing config)
    3. ~/.hermes/gateway.json (legacy — provides defaults under config.yaml)
    4. Built-in defaults
    2. ~/.hermes/gateway.json
    3. cli-config.yaml gateway section
    4. Defaults
    """
    config = GatewayConfig()

    # Try loading from ~/.hermes/gateway.json
    _home = get_hermes_home()
    gw_data: dict = {}

    # Legacy fallback: gateway.json provides the base layer.
    # config.yaml keys always win when both specify the same setting.
    gateway_json_path = _home / "gateway.json"
    if gateway_json_path.exists():
    gateway_config_path = _home / "gateway.json"
    if gateway_config_path.exists():
        try:
            with open(gateway_json_path, "r", encoding="utf-8") as f:
                gw_data = json.load(f) or {}
            logger.info(
                "Loaded legacy %s — consider moving settings to config.yaml",
                gateway_json_path,
            )
            with open(gateway_config_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            config = GatewayConfig.from_dict(data)
        except Exception as e:
            logger.warning("Failed to load %s: %s", gateway_json_path, e)
            print(f"[gateway] Warning: Failed to load {gateway_config_path}: {e}")

    # Primary source: config.yaml
    # Bridge session_reset from config.yaml (the user-facing config file)
    # into the gateway config. config.yaml takes precedence over gateway.json
    # for session reset policy since that's where hermes setup writes it.
    try:
        import yaml
        config_yaml_path = _home / "config.yaml"
        if config_yaml_path.exists():
            with open(config_yaml_path, encoding="utf-8") as f:
                yaml_cfg = yaml.safe_load(f) or {}

            # Map config.yaml keys → GatewayConfig.from_dict() schema.
            # Each key overwrites whatever gateway.json may have set.
            sr = yaml_cfg.get("session_reset")
            if sr and isinstance(sr, dict):
                gw_data["default_reset_policy"] = sr
                config.default_reset_policy = SessionResetPolicy.from_dict(sr)

            # Bridge quick commands from config.yaml into gateway runtime config.
            # config.yaml is the user-facing config source, so when present it
            # should override gateway.json for this setting.
            qc = yaml_cfg.get("quick_commands")
            if qc is not None:
                if isinstance(qc, dict):
                    gw_data["quick_commands"] = qc
                    config.quick_commands = qc
                else:
                    logger.warning(
                        "Ignoring invalid quick_commands in config.yaml "
                        "(expected mapping, got %s)",
                        type(qc).__name__,
                    )
                    logger.warning("Ignoring invalid quick_commands in config.yaml (expected mapping, got %s)", type(qc).__name__)

            # Bridge STT enable/disable from config.yaml into gateway runtime.
            # This keeps the gateway aligned with the user-facing config source.
            stt_cfg = yaml_cfg.get("stt")
            if isinstance(stt_cfg, dict):
                gw_data["stt"] = stt_cfg
            if isinstance(stt_cfg, dict) and "enabled" in stt_cfg:
                config.stt_enabled = _coerce_bool(stt_cfg.get("enabled"), True)

            # Bridge group session isolation from config.yaml into gateway runtime.
            # Secure default is per-user isolation in shared chats.
            if "group_sessions_per_user" in yaml_cfg:
                gw_data["group_sessions_per_user"] = yaml_cfg["group_sessions_per_user"]
                config.group_sessions_per_user = _coerce_bool(
                    yaml_cfg.get("group_sessions_per_user"),
                    True,
                )

            streaming_cfg = yaml_cfg.get("streaming")
            if isinstance(streaming_cfg, dict):
                gw_data["streaming"] = streaming_cfg

            if "reset_triggers" in yaml_cfg:
                gw_data["reset_triggers"] = yaml_cfg["reset_triggers"]

            if "always_log_local" in yaml_cfg:
                gw_data["always_log_local"] = yaml_cfg["always_log_local"]

            # Discord settings → env vars (env vars take precedence)
            # Bridge discord settings from config.yaml to env vars
            # (env vars take precedence — only set if not already defined)
            discord_cfg = yaml_cfg.get("discord", {})
            if isinstance(discord_cfg, dict):
                if "require_mention" in discord_cfg and not os.getenv("DISCORD_REQUIRE_MENTION"):
@@ -428,18 +416,9 @@ def load_gateway_config() -> GatewayConfig:
                    os.environ["DISCORD_FREE_RESPONSE_CHANNELS"] = str(frc)
                if "auto_thread" in discord_cfg and not os.getenv("DISCORD_AUTO_THREAD"):
                    os.environ["DISCORD_AUTO_THREAD"] = str(discord_cfg["auto_thread"]).lower()

            # Bridge whatsapp settings from config.yaml into platform config
            whatsapp_cfg = yaml_cfg.get("whatsapp", {})
            if isinstance(whatsapp_cfg, dict) and "reply_prefix" in whatsapp_cfg:
                if Platform.WHATSAPP not in config.platforms:
                    config.platforms[Platform.WHATSAPP] = PlatformConfig()
                config.platforms[Platform.WHATSAPP].extra["reply_prefix"] = whatsapp_cfg["reply_prefix"]
    except Exception:
        pass

    config = GatewayConfig.from_dict(gw_data)

    # Override with environment variables
    _apply_env_overrides(config)
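For orientation, a hedged sketch of a `config.yaml` fragment using the keys this bridge reads (key names come from the code above; the values are purely illustrative):

```python
import yaml

# Illustrative ~/.hermes/config.yaml fragment with the bridged keys.
cfg = yaml.safe_load("""
session_reset: {mode: daily}          # value is illustrative
group_sessions_per_user: true
stt: {enabled: true}
quick_commands: {ping: "status report"}
""")
print(cfg["stt"]["enabled"])  # True
```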
@@ -655,25 +634,6 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
            name=os.getenv("SMS_HOME_CHANNEL_NAME", "Home"),
        )

    # API Server
    api_server_enabled = os.getenv("API_SERVER_ENABLED", "").lower() in ("true", "1", "yes")
    api_server_key = os.getenv("API_SERVER_KEY", "")
    api_server_port = os.getenv("API_SERVER_PORT")
    api_server_host = os.getenv("API_SERVER_HOST")
    if api_server_enabled or api_server_key:
        if Platform.API_SERVER not in config.platforms:
            config.platforms[Platform.API_SERVER] = PlatformConfig()
        config.platforms[Platform.API_SERVER].enabled = True
        if api_server_key:
            config.platforms[Platform.API_SERVER].extra["key"] = api_server_key
        if api_server_port:
            try:
                config.platforms[Platform.API_SERVER].extra["port"] = int(api_server_port)
            except ValueError:
                pass
        if api_server_host:
            config.platforms[Platform.API_SERVER].extra["host"] = api_server_host

    # Session settings
    idle_minutes = os.getenv("SESSION_IDLE_MINUTES")
    if idle_minutes:
@@ -690,4 +650,10 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
            pass


def save_gateway_config(config: GatewayConfig) -> None:
    """Save gateway configuration to ~/.hermes/gateway.json."""
    gateway_config_path = get_hermes_home() / "gateway.json"
    gateway_config_path.parent.mkdir(parents=True, exist_ok=True)

    with open(gateway_config_path, "w", encoding="utf-8") as f:
        json.dump(config.to_dict(), f, indent=2)
@@ -1,790 +0,0 @@
"""
OpenAI-compatible API server platform adapter.

Exposes an HTTP server with endpoints:
- POST /v1/chat/completions — OpenAI Chat Completions format (stateless)
- POST /v1/responses — OpenAI Responses API format (stateful via previous_response_id)
- GET /v1/responses/{response_id} — Retrieve a stored response
- DELETE /v1/responses/{response_id} — Delete a stored response
- GET /v1/models — lists hermes-agent as an available model
- GET /health — health check

Any OpenAI-compatible frontend (Open WebUI, LobeChat, LibreChat,
AnythingLLM, NextChat, ChatBox, etc.) can connect to hermes-agent
through this adapter by pointing at http://localhost:8642/v1.

Requires:
- aiohttp (already available in the gateway)
"""
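# Editor's usage sketch (not part of the original file): any OpenAI SDK can
# exercise this adapter once it is running on the default host/port, e.g.:
#
#     from openai import OpenAI
#     client = OpenAI(base_url="http://127.0.0.1:8642/v1", api_key="anything")
#     out = client.chat.completions.create(
#         model="hermes-agent",
#         messages=[{"role": "user", "content": "ping"}],
#     )
#     print(out.choices[0].message.content)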
import asyncio
import collections
import json
import logging
import os
import time
import uuid
from typing import Any, Dict, List, Optional

try:
    from aiohttp import web
    AIOHTTP_AVAILABLE = True
except ImportError:
    AIOHTTP_AVAILABLE = False
    web = None  # type: ignore[assignment]

from gateway.config import Platform, PlatformConfig
from gateway.platforms.base import (
    BasePlatformAdapter,
    SendResult,
)

logger = logging.getLogger(__name__)

# Default settings
DEFAULT_HOST = "127.0.0.1"
DEFAULT_PORT = 8642
MAX_STORED_RESPONSES = 100
def check_api_server_requirements() -> bool:
    """Check if API server dependencies are available."""
    return AIOHTTP_AVAILABLE


class ResponseStore:
    """
    In-memory LRU store for Responses API state.

    Each stored response includes the full internal conversation history
    (with tool calls and results) so it can be reconstructed on subsequent
    requests via previous_response_id.
    """

    def __init__(self, max_size: int = MAX_STORED_RESPONSES):
        self._store: collections.OrderedDict[str, Dict[str, Any]] = collections.OrderedDict()
        self._max_size = max_size

    def get(self, response_id: str) -> Optional[Dict[str, Any]]:
        """Retrieve a stored response by ID (moves to end for LRU)."""
        if response_id in self._store:
            self._store.move_to_end(response_id)
            return self._store[response_id]
        return None

    def put(self, response_id: str, data: Dict[str, Any]) -> None:
        """Store a response, evicting the oldest if at capacity."""
        if response_id in self._store:
            self._store.move_to_end(response_id)
        self._store[response_id] = data
        while len(self._store) > self._max_size:
            self._store.popitem(last=False)

    def delete(self, response_id: str) -> bool:
        """Remove a response from the store. Returns True if found and deleted."""
        if response_id in self._store:
            del self._store[response_id]
            return True
        return False

    def __len__(self) -> int:
        return len(self._store)
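# Editor's illustration (not part of the original file): the LRU behaviour
# of ResponseStore in isolation.
#
#     store = ResponseStore(max_size=2)
#     store.put("resp_a", {"n": 1})
#     store.put("resp_b", {"n": 2})
#     store.get("resp_a")            # touch: resp_a becomes most-recent
#     store.put("resp_c", {"n": 3})  # evicts resp_b, the least-recently used
#     assert store.get("resp_b") is None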
# ---------------------------------------------------------------------------
# CORS middleware
# ---------------------------------------------------------------------------

_CORS_HEADERS = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET, POST, DELETE, OPTIONS",
    "Access-Control-Allow-Headers": "Authorization, Content-Type",
}


if AIOHTTP_AVAILABLE:
    @web.middleware
    async def cors_middleware(request, handler):
        """Add CORS headers to every response; handle OPTIONS preflight."""
        if request.method == "OPTIONS":
            return web.Response(status=200, headers=_CORS_HEADERS)
        response = await handler(request)
        response.headers.update(_CORS_HEADERS)
        return response
else:
    cors_middleware = None  # type: ignore[assignment]
class APIServerAdapter(BasePlatformAdapter):
    """
    OpenAI-compatible HTTP API server adapter.

    Runs an aiohttp web server that accepts OpenAI-format requests
    and routes them through hermes-agent's AIAgent.
    """

    def __init__(self, config: PlatformConfig):
        super().__init__(config, Platform.API_SERVER)
        extra = config.extra or {}
        self._host: str = extra.get("host", os.getenv("API_SERVER_HOST", DEFAULT_HOST))
        self._port: int = int(extra.get("port", os.getenv("API_SERVER_PORT", str(DEFAULT_PORT))))
        self._api_key: str = extra.get("key", os.getenv("API_SERVER_KEY", ""))
        self._app: Optional["web.Application"] = None
        self._runner: Optional["web.AppRunner"] = None
        self._site: Optional["web.TCPSite"] = None
        self._response_store = ResponseStore()
        # Conversation name → latest response_id mapping
        self._conversations: Dict[str, str] = {}

    # ------------------------------------------------------------------
    # Auth helper
    # ------------------------------------------------------------------

    def _check_auth(self, request: "web.Request") -> Optional["web.Response"]:
        """
        Validate Bearer token from Authorization header.

        Returns None if auth is OK, or a 401 web.Response on failure.
        If no API key is configured, all requests are allowed.
        """
        if not self._api_key:
            return None  # No key configured — allow all (local-only use)

        auth_header = request.headers.get("Authorization", "")
        if auth_header.startswith("Bearer "):
            token = auth_header[7:].strip()
            if token == self._api_key:
                return None  # Auth OK

        return web.json_response(
            {"error": {"message": "Invalid API key", "type": "invalid_request_error", "code": "invalid_api_key"}},
            status=401,
        )
    # ------------------------------------------------------------------
    # Agent creation helper
    # ------------------------------------------------------------------

    def _create_agent(
        self,
        ephemeral_system_prompt: Optional[str] = None,
        session_id: Optional[str] = None,
        stream_delta_callback=None,
    ) -> Any:
        """
        Create an AIAgent instance using the gateway's runtime config.

        Uses _resolve_runtime_agent_kwargs() to pick up model, api_key,
        base_url, etc. from config.yaml / env vars.
        """
        from run_agent import AIAgent
        from gateway.run import _resolve_runtime_agent_kwargs, _resolve_gateway_model

        runtime_kwargs = _resolve_runtime_agent_kwargs()
        model = _resolve_gateway_model()

        max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "90"))

        agent = AIAgent(
            model=model,
            **runtime_kwargs,
            max_iterations=max_iterations,
            quiet_mode=True,
            verbose_logging=False,
            ephemeral_system_prompt=ephemeral_system_prompt or None,
            session_id=session_id,
            platform="api_server",
            stream_delta_callback=stream_delta_callback,
        )
        return agent
    # ------------------------------------------------------------------
    # HTTP Handlers
    # ------------------------------------------------------------------

    async def _handle_health(self, request: "web.Request") -> "web.Response":
        """GET /health — simple health check."""
        return web.json_response({"status": "ok", "platform": "hermes-agent"})

    async def _handle_models(self, request: "web.Request") -> "web.Response":
        """GET /v1/models — return hermes-agent as an available model."""
        auth_err = self._check_auth(request)
        if auth_err:
            return auth_err

        return web.json_response({
            "object": "list",
            "data": [
                {
                    "id": "hermes-agent",
                    "object": "model",
                    "created": int(time.time()),
                    "owned_by": "hermes",
                    "permission": [],
                    "root": "hermes-agent",
                    "parent": None,
                }
            ],
        })
    async def _handle_chat_completions(self, request: "web.Request") -> "web.Response":
        """POST /v1/chat/completions — OpenAI Chat Completions format."""
        auth_err = self._check_auth(request)
        if auth_err:
            return auth_err

        # Parse request body
        try:
            body = await request.json()
        except (json.JSONDecodeError, Exception):
            return web.json_response(
                {"error": {"message": "Invalid JSON in request body", "type": "invalid_request_error"}},
                status=400,
            )

        messages = body.get("messages")
        if not messages or not isinstance(messages, list):
            return web.json_response(
                {"error": {"message": "Missing or invalid 'messages' field", "type": "invalid_request_error"}},
                status=400,
            )

        stream = body.get("stream", False)

        # Extract system message (becomes ephemeral system prompt layered ON TOP of core)
        system_prompt = None
        conversation_messages: List[Dict[str, str]] = []

        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", "")
            if role == "system":
                # Accumulate system messages
                if system_prompt is None:
                    system_prompt = content
                else:
                    system_prompt = system_prompt + "\n" + content
            elif role in ("user", "assistant"):
                conversation_messages.append({"role": role, "content": content})

        # Extract the last user message as the primary input
        user_message = ""
        history = []
        if conversation_messages:
            user_message = conversation_messages[-1].get("content", "")
            history = conversation_messages[:-1]

        if not user_message:
            return web.json_response(
                {"error": {"message": "No user message found in messages", "type": "invalid_request_error"}},
                status=400,
            )

        session_id = str(uuid.uuid4())
        completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
        model_name = body.get("model", "hermes-agent")
        created = int(time.time())

        if stream:
            import queue as _q
            _stream_q: _q.Queue = _q.Queue()

            def _on_delta(delta):
                _stream_q.put(delta)

            # Start agent in background
            agent_task = asyncio.ensure_future(self._run_agent(
                user_message=user_message,
                conversation_history=history,
                ephemeral_system_prompt=system_prompt,
                session_id=session_id,
                stream_delta_callback=_on_delta,
            ))

            return await self._write_sse_chat_completion(
                request, completion_id, model_name, created, _stream_q, agent_task
            )

        # Non-streaming: run the agent and return full response
        try:
            result, usage = await self._run_agent(
                user_message=user_message,
                conversation_history=history,
                ephemeral_system_prompt=system_prompt,
                session_id=session_id,
            )
        except Exception as e:
            logger.error("Error running agent for chat completions: %s", e, exc_info=True)
            return web.json_response(
                {"error": {"message": f"Internal server error: {e}", "type": "server_error"}},
                status=500,
            )

        final_response = result.get("final_response", "")
        if not final_response:
            final_response = result.get("error", "(No response generated)")

        response_data = {
            "id": completion_id,
            "object": "chat.completion",
            "created": created,
            "model": model_name,
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": final_response,
                    },
                    "finish_reason": "stop",
                }
            ],
            "usage": {
                "prompt_tokens": usage.get("input_tokens", 0),
                "completion_tokens": usage.get("output_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }

        return web.json_response(response_data)
    async def _write_sse_chat_completion(
        self, request: "web.Request", completion_id: str, model: str,
        created: int, stream_q, agent_task,
    ) -> "web.StreamResponse":
        """Write real streaming SSE from agent's stream_delta_callback queue."""
        import queue as _q

        response = web.StreamResponse(
            status=200,
            headers={"Content-Type": "text/event-stream", "Cache-Control": "no-cache"},
        )
        await response.prepare(request)

        # Role chunk
        role_chunk = {
            "id": completion_id, "object": "chat.completion.chunk",
            "created": created, "model": model,
            "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}],
        }
        await response.write(f"data: {json.dumps(role_chunk)}\n\n".encode())

        # Stream content chunks as they arrive from the agent
        loop = asyncio.get_event_loop()
        while True:
            try:
                delta = await loop.run_in_executor(None, lambda: stream_q.get(timeout=0.5))
            except _q.Empty:
                if agent_task.done():
                    # Drain any remaining items
                    while True:
                        try:
                            delta = stream_q.get_nowait()
                            if delta is None:
                                break
                            content_chunk = {
                                "id": completion_id, "object": "chat.completion.chunk",
                                "created": created, "model": model,
                                "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}],
                            }
                            await response.write(f"data: {json.dumps(content_chunk)}\n\n".encode())
                        except _q.Empty:
                            break
                    break
                continue

            if delta is None:  # End of stream sentinel
                break

            content_chunk = {
                "id": completion_id, "object": "chat.completion.chunk",
                "created": created, "model": model,
                "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}],
            }
            await response.write(f"data: {json.dumps(content_chunk)}\n\n".encode())

        # Get usage from completed agent
        usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
        try:
            result, agent_usage = await agent_task
            usage = agent_usage or usage
        except Exception:
            pass

        # Finish chunk
        finish_chunk = {
            "id": completion_id, "object": "chat.completion.chunk",
            "created": created, "model": model,
            "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
            "usage": {
                "prompt_tokens": usage.get("input_tokens", 0),
                "completion_tokens": usage.get("output_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }
        await response.write(f"data: {json.dumps(finish_chunk)}\n\n".encode())
        await response.write(b"data: [DONE]\n\n")

        return response
    async def _handle_responses(self, request: "web.Request") -> "web.Response":
        """POST /v1/responses — OpenAI Responses API format."""
        auth_err = self._check_auth(request)
        if auth_err:
            return auth_err

        # Parse request body
        try:
            body = await request.json()
        except (json.JSONDecodeError, Exception):
            return web.json_response(
                {"error": {"message": "Invalid JSON in request body", "type": "invalid_request_error"}},
                status=400,
            )

        raw_input = body.get("input")
        if raw_input is None:
            return web.json_response(
                {"error": {"message": "Missing 'input' field", "type": "invalid_request_error"}},
                status=400,
            )

        instructions = body.get("instructions")
        previous_response_id = body.get("previous_response_id")
        conversation = body.get("conversation")
        store = body.get("store", True)

        # conversation and previous_response_id are mutually exclusive
        if conversation and previous_response_id:
            return web.json_response(
                {"error": {"message": "Cannot use both 'conversation' and 'previous_response_id'", "type": "invalid_request_error"}},
                status=400,
            )

        # Resolve conversation name to latest response_id
        if conversation:
            previous_response_id = self._conversations.get(conversation)
            # No error if conversation doesn't exist yet — it's a new conversation

        # Normalize input to message list
        input_messages: List[Dict[str, str]] = []
        if isinstance(raw_input, str):
            input_messages = [{"role": "user", "content": raw_input}]
        elif isinstance(raw_input, list):
            for item in raw_input:
                if isinstance(item, str):
                    input_messages.append({"role": "user", "content": item})
                elif isinstance(item, dict):
                    role = item.get("role", "user")
                    content = item.get("content", "")
                    # Handle content that may be a list of content parts
                    if isinstance(content, list):
                        text_parts = []
                        for part in content:
                            if isinstance(part, dict) and part.get("type") == "input_text":
                                text_parts.append(part.get("text", ""))
                            elif isinstance(part, dict) and part.get("type") == "output_text":
                                text_parts.append(part.get("text", ""))
                            elif isinstance(part, str):
                                text_parts.append(part)
                        content = "\n".join(text_parts)
                    input_messages.append({"role": role, "content": content})
        else:
            return web.json_response(
                {"error": {"message": "'input' must be a string or array", "type": "invalid_request_error"}},
                status=400,
            )

        # Reconstruct conversation history from previous_response_id
        conversation_history: List[Dict[str, str]] = []
        if previous_response_id:
            stored = self._response_store.get(previous_response_id)
            if stored is None:
                return web.json_response(
                    {"error": {"message": f"Previous response not found: {previous_response_id}", "type": "invalid_request_error"}},
                    status=404,
                )
            conversation_history = list(stored.get("conversation_history", []))
            # If no instructions provided, carry forward from previous
            if instructions is None:
                instructions = stored.get("instructions")

        # Append new input messages to history (all but the last become history)
        for msg in input_messages[:-1]:
            conversation_history.append(msg)

        # Last input message is the user_message
        user_message = input_messages[-1].get("content", "") if input_messages else ""
        if not user_message:
            return web.json_response(
                {"error": {"message": "No user message found in input", "type": "invalid_request_error"}},
                status=400,
            )

        # Truncation support
        if body.get("truncation") == "auto" and len(conversation_history) > 100:
            conversation_history = conversation_history[-100:]

        # Run the agent
        session_id = str(uuid.uuid4())
        try:
            result, usage = await self._run_agent(
                user_message=user_message,
                conversation_history=conversation_history,
                ephemeral_system_prompt=instructions,
                session_id=session_id,
            )
        except Exception as e:
            logger.error("Error running agent for responses: %s", e, exc_info=True)
            return web.json_response(
                {"error": {"message": f"Internal server error: {e}", "type": "server_error"}},
                status=500,
            )

        final_response = result.get("final_response", "")
        if not final_response:
            final_response = result.get("error", "(No response generated)")

        response_id = f"resp_{uuid.uuid4().hex[:28]}"
        created_at = int(time.time())

        # Build the full conversation history for storage
        # (includes tool calls from the agent run)
        full_history = list(conversation_history)
        full_history.append({"role": "user", "content": user_message})
        # Add agent's internal messages if available
        agent_messages = result.get("messages", [])
        if agent_messages:
            full_history.extend(agent_messages)
        else:
            full_history.append({"role": "assistant", "content": final_response})

        # Build output items (includes tool calls + final message)
        output_items = self._extract_output_items(result)

        response_data = {
            "id": response_id,
            "object": "response",
            "status": "completed",
            "created_at": created_at,
            "model": body.get("model", "hermes-agent"),
            "output": output_items,
            "usage": {
                "input_tokens": usage.get("input_tokens", 0),
                "output_tokens": usage.get("output_tokens", 0),
                "total_tokens": usage.get("total_tokens", 0),
            },
        }

        # Store the complete response object for future chaining / GET retrieval
        if store:
            self._response_store.put(response_id, {
                "response": response_data,
                "conversation_history": full_history,
                "instructions": instructions,
            })
            # Update conversation mapping so the next request with the same
            # conversation name automatically chains to this response
            if conversation:
                self._conversations[conversation] = response_id

        return web.json_response(response_data)
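    # Editor's sketch (not part of the original file): chaining two
    # /v1/responses calls through previous_response_id, assuming the server
    # is on the default host/port with no auth key configured:
    #
    #     import requests
    #     base = "http://127.0.0.1:8642/v1"
    #     r1 = requests.post(f"{base}/responses",
    #                        json={"input": "Remember the number 41."}).json()
    #     r2 = requests.post(f"{base}/responses", json={
    #         "input": "What number did I ask you to remember?",
    #         "previous_response_id": r1["id"],  # reuses stored history
    #     }).json()
    #     print(r2["output"][-1]["content"][0]["text"])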
    # ------------------------------------------------------------------
    # GET / DELETE response endpoints
    # ------------------------------------------------------------------

    async def _handle_get_response(self, request: "web.Request") -> "web.Response":
        """GET /v1/responses/{response_id} — retrieve a stored response."""
        auth_err = self._check_auth(request)
        if auth_err:
            return auth_err

        response_id = request.match_info["response_id"]
        stored = self._response_store.get(response_id)
        if stored is None:
            return web.json_response(
                {"error": {"message": f"Response not found: {response_id}", "type": "invalid_request_error"}},
                status=404,
            )

        return web.json_response(stored["response"])

    async def _handle_delete_response(self, request: "web.Request") -> "web.Response":
        """DELETE /v1/responses/{response_id} — delete a stored response."""
        auth_err = self._check_auth(request)
        if auth_err:
            return auth_err

        response_id = request.match_info["response_id"]
        deleted = self._response_store.delete(response_id)
        if not deleted:
            return web.json_response(
                {"error": {"message": f"Response not found: {response_id}", "type": "invalid_request_error"}},
                status=404,
            )

        return web.json_response({
            "id": response_id,
            "object": "response",
            "deleted": True,
        })
    # ------------------------------------------------------------------
    # Output extraction helper
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_output_items(result: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Build the full output item array from the agent's messages.

        Walks *result["messages"]* and emits:
        - ``function_call`` items for each tool_call on assistant messages
        - ``function_call_output`` items for each tool-role message
        - a final ``message`` item with the assistant's text reply
        """
        items: List[Dict[str, Any]] = []
        messages = result.get("messages", [])

        for msg in messages:
            role = msg.get("role")
            if role == "assistant" and msg.get("tool_calls"):
                for tc in msg["tool_calls"]:
                    func = tc.get("function", {})
                    items.append({
                        "type": "function_call",
                        "name": func.get("name", ""),
                        "arguments": func.get("arguments", ""),
                        "call_id": tc.get("id", ""),
                    })
            elif role == "tool":
                items.append({
                    "type": "function_call_output",
                    "call_id": msg.get("tool_call_id", ""),
                    "output": msg.get("content", ""),
                })

        # Final assistant message
        final = result.get("final_response", "")
        if not final:
            final = result.get("error", "(No response generated)")

        items.append({
            "type": "message",
            "role": "assistant",
            "content": [
                {
                    "type": "output_text",
                    "text": final,
                }
            ],
        })
        return items
    # ------------------------------------------------------------------
    # Agent execution
    # ------------------------------------------------------------------

    async def _run_agent(
        self,
        user_message: str,
        conversation_history: List[Dict[str, str]],
        ephemeral_system_prompt: Optional[str] = None,
        session_id: Optional[str] = None,
        stream_delta_callback=None,
    ) -> tuple:
        """
        Create an agent and run a conversation in a thread executor.

        Returns ``(result_dict, usage_dict)`` where *usage_dict* contains
        ``input_tokens``, ``output_tokens`` and ``total_tokens``.
        """
        loop = asyncio.get_event_loop()

        def _run():
            agent = self._create_agent(
                ephemeral_system_prompt=ephemeral_system_prompt,
                session_id=session_id,
                stream_delta_callback=stream_delta_callback,
            )
            result = agent.run_conversation(
                user_message=user_message,
                conversation_history=conversation_history,
            )
            usage = {
                "input_tokens": getattr(agent, "session_prompt_tokens", 0) or 0,
                "output_tokens": getattr(agent, "session_completion_tokens", 0) or 0,
                "total_tokens": getattr(agent, "session_total_tokens", 0) or 0,
            }
            return result, usage

        return await loop.run_in_executor(None, _run)
    # ------------------------------------------------------------------
    # BasePlatformAdapter interface
    # ------------------------------------------------------------------

    async def connect(self) -> bool:
        """Start the aiohttp web server."""
        if not AIOHTTP_AVAILABLE:
            logger.warning("[%s] aiohttp not installed", self.name)
            return False

        try:
            self._app = web.Application(middlewares=[cors_middleware])
            self._app.router.add_get("/health", self._handle_health)
            self._app.router.add_get("/v1/models", self._handle_models)
            self._app.router.add_post("/v1/chat/completions", self._handle_chat_completions)
            self._app.router.add_post("/v1/responses", self._handle_responses)
            self._app.router.add_get("/v1/responses/{response_id}", self._handle_get_response)
            self._app.router.add_delete("/v1/responses/{response_id}", self._handle_delete_response)

            self._runner = web.AppRunner(self._app)
            await self._runner.setup()
            self._site = web.TCPSite(self._runner, self._host, self._port)
            await self._site.start()

            self._mark_connected()
            logger.info(
                "[%s] API server listening on http://%s:%d",
                self.name, self._host, self._port,
            )
            return True

        except Exception as e:
            logger.error("[%s] Failed to start API server: %s", self.name, e)
            return False

    async def disconnect(self) -> None:
        """Stop the aiohttp web server."""
        self._mark_disconnected()
        if self._site:
            await self._site.stop()
            self._site = None
        if self._runner:
            await self._runner.cleanup()
            self._runner = None
        self._app = None
        logger.info("[%s] API server stopped", self.name)

    async def send(
        self,
        chat_id: str,
        content: str,
        reply_to: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> SendResult:
        """
        Not used — HTTP request/response cycle handles delivery directly.
        """
        return SendResult(success=False, error="API server uses HTTP request/response, not send()")

    async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
        """Return basic info about the API server."""
        return {
            "name": "API Server",
            "type": "api",
            "host": self._host,
            "port": self._port,
        }
@@ -414,10 +414,7 @@ class TelegramAdapter(BasePlatformAdapter):
                    text=formatted,
                    parse_mode=ParseMode.MARKDOWN_V2,
                )
            except Exception as fmt_err:
                # "Message is not modified" is a no-op, not an error
                if "not modified" in str(fmt_err).lower():
                    return SendResult(success=True, message_id=message_id)
            except Exception:
                # Fallback: retry without markdown formatting
                await self._bot.edit_message_text(
                    chat_id=int(chat_id),
@@ -426,46 +423,6 @@ class TelegramAdapter(BasePlatformAdapter):
                )
            return SendResult(success=True, message_id=message_id)
        except Exception as e:
            err_str = str(e).lower()
            # "Message is not modified" — content identical, treat as success
            if "not modified" in err_str:
                return SendResult(success=True, message_id=message_id)
            # Message too long — content exceeded 4096 chars (e.g. during
            # streaming). Truncate and succeed so the stream consumer can
            # split the overflow into a new message instead of dying.
            if "message_too_long" in err_str or "too long" in err_str:
                truncated = content[: self.MAX_MESSAGE_LENGTH - 20] + "…"
                try:
                    await self._bot.edit_message_text(
                        chat_id=int(chat_id),
                        message_id=int(message_id),
                        text=truncated,
                    )
                except Exception:
                    pass  # best-effort truncation
                return SendResult(success=True, message_id=message_id)
            # Flood control / RetryAfter — back off and retry once
            retry_after = getattr(e, "retry_after", None)
            if retry_after is not None or "retry after" in err_str:
                wait = retry_after if retry_after else 1.0
                logger.warning(
                    "[%s] Telegram flood control, waiting %.1fs",
                    self.name, wait,
                )
                await asyncio.sleep(wait)
                try:
                    await self._bot.edit_message_text(
                        chat_id=int(chat_id),
                        message_id=int(message_id),
                        text=content,
                    )
                    return SendResult(success=True, message_id=message_id)
                except Exception as retry_err:
                    logger.error(
                        "[%s] Edit retry failed after flood wait: %s",
                        self.name, retry_err,
                    )
                    return SendResult(success=False, error=str(retry_err))
            logger.error(
                "[%s] Failed to edit Telegram message %s: %s",
                self.name,
@@ -136,7 +136,6 @@ class WhatsAppAdapter(BasePlatformAdapter):
            "session_path",
            get_hermes_home() / "whatsapp" / "session"
        ))
        self._reply_prefix: Optional[str] = config.extra.get("reply_prefix")
        self._message_queue: asyncio.Queue = asyncio.Queue()
        self._bridge_log_fh = None
        self._bridge_log: Optional[Path] = None
@@ -194,14 +193,6 @@ class WhatsAppAdapter(BasePlatformAdapter):
        self._bridge_log = self._session_path.parent / "bridge.log"
        bridge_log_fh = open(self._bridge_log, "a")
        self._bridge_log_fh = bridge_log_fh

        # Build bridge subprocess environment.
        # Pass WHATSAPP_REPLY_PREFIX from config.yaml so the Node bridge
        # can use it without the user needing to set a separate env var.
        bridge_env = os.environ.copy()
        if self._reply_prefix is not None:
            bridge_env["WHATSAPP_REPLY_PREFIX"] = self._reply_prefix

        self._bridge_process = subprocess.Popen(
            [
                "node",
@@ -213,7 +204,6 @@ class WhatsAppAdapter(BasePlatformAdapter):
            stdout=bridge_log_fh,
            stderr=bridge_log_fh,
            preexec_fn=None if _IS_WINDOWS else os.setsid,
            env=bridge_env,
        )

        # Wait for the bridge to connect to WhatsApp.
@@ -1162,13 +1162,6 @@ class GatewayRunner:
                return None
            return MatrixAdapter(config)

        elif platform == Platform.API_SERVER:
            from gateway.platforms.api_server import APIServerAdapter, check_api_server_requirements
            if not check_api_server_requirements():
                logger.warning("API Server: aiohttp not installed")
                return None
            return APIServerAdapter(config)

        return None

    def _is_user_authorized(self, source: SessionSource) -> bool:
@@ -944,13 +944,7 @@ class SessionStore:
            for line in f:
                line = line.strip()
                if line:
                    try:
                        messages.append(json.loads(line))
                    except json.JSONDecodeError:
                        logger.warning(
                            "Skipping corrupt line in transcript %s: %s",
                            session_id, line[:120],
                        )
                    messages.append(json.loads(line))

        return messages
@@ -68,7 +68,6 @@ class GatewayStreamConsumer:
        self._already_sent = False
        self._edit_supported = True  # Disabled on first edit failure (Signal/Email/HA)
        self._last_edit_time = 0.0
        self._last_sent_text = ""  # Track last-sent text to skip redundant edits

    @property
    def already_sent(self) -> bool:
@@ -87,10 +86,6 @@ class GatewayStreamConsumer:

    async def run(self) -> None:
        """Async task that drains the queue and edits the platform message."""
        # Platform message length limit — leave room for cursor + formatting
        _raw_limit = getattr(self.adapter, "MAX_MESSAGE_LENGTH", 4096)
        _safe_limit = max(500, _raw_limit - len(self.cfg.cursor) - 100)

        try:
            while True:
                # Drain all available items from the queue
@@ -116,21 +111,6 @@ class GatewayStreamConsumer:
                )

                if should_edit and self._accumulated:
                    # Split overflow: if accumulated text exceeds the platform
                    # limit, finalize the current message and start a new one.
                    while (
                        len(self._accumulated) > _safe_limit
                        and self._message_id is not None
                    ):
                        split_at = self._accumulated.rfind("\n", 0, _safe_limit)
                        if split_at < _safe_limit // 2:
                            split_at = _safe_limit
                        chunk = self._accumulated[:split_at]
                        await self._send_or_edit(chunk)
                        self._accumulated = self._accumulated[split_at:].lstrip("\n")
                        self._message_id = None
                        self._last_sent_text = ""

                    display_text = self._accumulated
                    if not got_done:
                        display_text += self.cfg.cursor
@@ -161,9 +141,6 @@
        try:
            if self._message_id is not None:
                if self._edit_supported:
                    # Skip if text is identical to what we last sent
                    if text == self._last_sent_text:
                        return
                    # Edit existing message
                    result = await self.adapter.edit_message(
                        chat_id=self.chat_id,
@@ -172,7 +149,6 @@
                    )
                    if result.success:
                        self._already_sent = True
                        self._last_sent_text = text
                else:
                    # Edit not supported by this adapter — stop streaming,
                    # let the normal send path handle the final response.
@@ -194,7 +170,6 @@
            if result.success and result.message_id:
                self._message_id = result.message_id
                self._already_sent = True
                self._last_sent_text = text
            else:
                # Initial send failed — disable streaming for this session
                self._edit_supported = False
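A hedged standalone illustration of the overflow-split rule in the hunk above (split at the last newline before the limit, unless that would leave less than half a message, in which case cut hard at the limit):

```python
def split_point(text: str, safe_limit: int) -> int:
    # Prefer the last newline before the limit; fall back to a hard cut
    # when the newline would leave less than half the allowed length.
    split_at = text.rfind("\n", 0, safe_limit)
    if split_at < safe_limit // 2:
        split_at = safe_limit
    return split_at

text = ("a" * 90 + "\n") * 3
at = split_point(text, 100)
print(at)  # 90 — the chunk ends just before the newline at index 90
```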
@@ -195,6 +195,14 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
        api_key_env_vars=("KILOCODE_API_KEY",),
        base_url_env_var="KILOCODE_BASE_URL",
    ),
    "huggingface": ProviderConfig(
        id="huggingface",
        name="Hugging Face",
        auth_type="api_key",
        inference_base_url="https://router.huggingface.co/v1",
        api_key_env_vars=("HF_TOKEN",),
        base_url_env_var="HF_BASE_URL",
    ),
}


@@ -574,6 +582,7 @@ def resolve_provider(
    "claude": "anthropic", "claude-code": "anthropic",
    "aigateway": "ai-gateway", "vercel": "ai-gateway", "vercel-ai-gateway": "ai-gateway",
    "opencode": "opencode-zen", "zen": "opencode-zen",
    "hf": "huggingface", "hugging-face": "huggingface", "huggingface-hub": "huggingface",
    "go": "opencode-go", "opencode-go-sub": "opencode-go",
    "kilo": "kilocode", "kilo-code": "kilocode", "kilo-gateway": "kilocode",
}
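A sketch of what this provider entry enables: any OpenAI-compatible client can hit the Hugging Face router directly (base URL, env var, and model ID format taken from this diff; the specific model choice is illustrative):

```python
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],  # needs "Make calls to Inference Providers"
)
resp = client.chat.completions.create(
    model="Qwen/Qwen3-Next-80B-A3B-Instruct",  # org/name format
    messages=[{"role": "user", "content": "Hello"}],
)
print(resp.choices[0].message.content)
```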
@@ -1,5 +1,6 @@
"""Shared ANSI color utilities for Hermes CLI modules."""

import os
import sys


@@ -20,3 +21,123 @@ def color(text: str, *codes) -> str:
    if not sys.stdout.isatty():
        return text
    return "".join(codes) + text + Colors.RESET


# =============================================================================
# Terminal background detection (light vs dark)
# =============================================================================


def _detect_via_colorfgbg() -> str:
    """Check the COLORFGBG environment variable.

    Some terminals (rxvt, xterm, iTerm2) set COLORFGBG to ``<fg>;<bg>``
    where a high bg value (>= 7) usually means a light background.
    Returns "light", "dark", or "unknown".
    """
    val = os.environ.get("COLORFGBG", "")
    if not val:
        return "unknown"
    parts = val.split(";")
    try:
        bg = int(parts[-1])
    except (ValueError, IndexError):
        return "unknown"
    # Standard terminal colors 0-6 are dark, 7+ are light.
    # bg < 7 → dark background; bg >= 7 → light background.
    if bg >= 7:
        return "light"
    return "dark"
def _detect_via_macos_appearance() -> str:
    """Check macOS AppleInterfaceStyle via ``defaults read``.

    Returns "light", "dark", or "unknown".
    """
    if sys.platform != "darwin":
        return "unknown"
    try:
        import subprocess
        result = subprocess.run(
            ["defaults", "read", "-g", "AppleInterfaceStyle"],
            capture_output=True, text=True, timeout=2,
        )
        if result.returncode == 0 and "dark" in result.stdout.lower():
            return "dark"
        # If the key doesn't exist, macOS is in light mode.
        return "light"
    except Exception:
        return "unknown"
def _detect_via_osc11() -> str:
    """Query the terminal background colour via the OSC 11 escape sequence.

    Writes ``\\e]11;?\\a`` and reads the response to determine luminance.
    Only works when stdin/stdout are connected to a real TTY (not piped).
    Returns "light", "dark", or "unknown".
    """
    if sys.platform == "win32":
        return "unknown"
    if not (sys.stdin.isatty() and sys.stdout.isatty()):
        return "unknown"
    try:
        import select
        import termios
        import tty

        fd = sys.stdin.fileno()
        old_attrs = termios.tcgetattr(fd)
        try:
            tty.setraw(fd)
            # Send OSC 11 query
            sys.stdout.write("\x1b]11;?\x07")
            sys.stdout.flush()
            # Wait briefly for response
            if not select.select([fd], [], [], 0.1)[0]:
                return "unknown"
            response = b""
            while select.select([fd], [], [], 0.05)[0]:
                response += os.read(fd, 128)
        finally:
            termios.tcsetattr(fd, termios.TCSADRAIN, old_attrs)

        # Parse response: \x1b]11;rgb:RRRR/GGGG/BBBB\x07 (or \x1b\\)
        text = response.decode("latin-1", errors="replace")
        if "rgb:" not in text:
            return "unknown"
        rgb_part = text.split("rgb:")[-1].split("\x07")[0].split("\x1b")[0]
        channels = rgb_part.split("/")
        if len(channels) < 3:
            return "unknown"
        # Each channel is 2 or 4 hex digits; normalise to 0-255
        vals = []
        for ch in channels[:3]:
            ch = ch.strip()
            if len(ch) <= 2:
                vals.append(int(ch, 16))
            else:
                vals.append(int(ch[:2], 16))  # take high byte
        # Perceived luminance (ITU-R BT.601)
        luminance = 0.299 * vals[0] + 0.587 * vals[1] + 0.114 * vals[2]
        return "light" if luminance > 128 else "dark"
    except Exception:
        return "unknown"
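# Editor's illustration (not part of the original file): a sample OSC 11
# reply and what the parsing above extracts from it.
#
#     reply = "\x1b]11;rgb:1e1e/2a2a/3b3b\x07"
#     rgb_part = reply.split("rgb:")[-1].split("\x07")[0].split("\x1b")[0]
#     vals = [int(ch[:2], 16) for ch in rgb_part.split("/")[:3]]  # [30, 42, 59]
#     luminance = 0.299 * vals[0] + 0.587 * vals[1] + 0.114 * vals[2]  # ≈ 40.3 → dark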
def detect_terminal_background() -> str:
    """Detect whether the terminal has a light or dark background.

    Tries three strategies in order:
    1. COLORFGBG environment variable
    2. macOS appearance setting
    3. OSC 11 escape sequence query

    Returns "light", "dark", or "unknown" if detection fails.
    """
    for detector in (_detect_via_colorfgbg, _detect_via_macos_appearance, _detect_via_osc11):
        result = detector()
        if result != "unknown":
            return result
    return "unknown"
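A hedged sketch of how a caller might use this detection for the `theme_mode: "auto"` setting that appears elsewhere in this diff (the import path is an assumption):

```python
from colors import detect_terminal_background  # module path is an assumption

def resolve_theme(theme_mode: str) -> str:
    if theme_mode != "auto":
        return theme_mode
    bg = detect_terminal_background()
    # Fall back to dark when detection is inconclusive.
    return "light" if bg == "light" else "dark"
```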
@@ -104,9 +104,6 @@ COMMAND_REGISTRY: list[CommandDef] = [
               subcommands=("list", "add", "create", "edit", "pause", "resume", "run", "remove")),
    CommandDef("reload-mcp", "Reload MCP servers from config", "Tools & Skills",
               aliases=("reload_mcp",)),
    CommandDef("browser", "Connect browser tools to your live Chrome via CDP", "Tools & Skills",
               cli_only=True, args_hint="[connect|disconnect|status]",
               subcommands=("connect", "disconnect", "status")),
    CommandDef("plugins", "List installed plugins and their status",
               "Tools & Skills", cli_only=True),
@@ -236,6 +236,7 @@ DEFAULT_CONFIG = {
|
||||
"streaming": False,
|
||||
"show_cost": False, # Show $ cost in the status bar (off by default)
|
||||
"skin": "default",
|
||||
"theme_mode": "auto",
|
||||
},
|
||||
|
||||
# Privacy settings
|
||||
@@ -332,14 +333,6 @@ DEFAULT_CONFIG = {
        "auto_thread": True,  # Auto-create threads on @mention in channels (like Slack)
    },

    # WhatsApp platform settings (gateway mode)
    "whatsapp": {
        # Reply prefix prepended to every outgoing WhatsApp message.
        # Default (None) uses the built-in "⚕ *Hermes Agent*" header.
        # Set to "" (empty string) to disable the header entirely.
        # Supports \n for newlines, e.g. "🤖 *My Bot*\n──────\n"
    },

    # Approval mode for dangerous commands:
    #   manual — always prompt the user (default)
    #   smart — use auxiliary LLM to auto-approve low-risk commands, prompt for high-risk

@@ -372,7 +365,7 @@ DEFAULT_CONFIG = {
    },

    # Config schema version - bump this when adding new required fields
    "_config_version": 10,
    "_config_version": 9,
}

# =============================================================================

@@ -556,6 +549,21 @@ OPTIONAL_ENV_VARS = {
        "category": "provider",
        "advanced": True,
    },
    "HF_TOKEN": {
        "description": "Hugging Face token for Inference Providers (20+ open models via router.huggingface.co)",
        "prompt": "Hugging Face Token",
        "url": "https://huggingface.co/settings/tokens",
        "password": True,
        "category": "provider",
    },
    "HF_BASE_URL": {
        "description": "Hugging Face Inference Providers base URL override",
        "prompt": "HF base URL (leave empty for default)",
        "url": None,
        "password": False,
        "category": "provider",
        "advanced": True,
    },

    # ── Tool API keys ──
    "PARALLEL_API_KEY": {

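As a usage sketch (editor's addition, not part of this diff): with HF_TOKEN set, any OpenAI-compatible client can target the router. The /v1 path and the openai client usage below are assumptions; the model name comes from the curated list later in this diff.

# Hypothetical client usage; base URL path and client library are assumed.
import os
from openai import OpenAI

client = OpenAI(
    base_url=os.getenv("HF_BASE_URL") or "https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],  # token needs "Make calls to Inference Providers"
)
resp = client.chat.completions.create(
    model="moonshotai/Kimi-K2.5",
    messages=[{"role": "user", "content": "hello"}],
)
print(resp.choices[0].message.content)
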
@@ -775,38 +783,6 @@ OPTIONAL_ENV_VARS = {
        "category": "messaging",
        "advanced": True,
    },
    "API_SERVER_ENABLED": {
        "description": "Enable the OpenAI-compatible API server (true/false). Allows frontends like Open WebUI, LobeChat, etc. to connect.",
        "prompt": "Enable API server (true/false)",
        "url": None,
        "password": False,
        "category": "messaging",
        "advanced": True,
    },
    "API_SERVER_KEY": {
        "description": "Bearer token for API server authentication. If empty, all requests are allowed (local use only).",
        "prompt": "API server auth key (optional)",
        "url": None,
        "password": True,
        "category": "messaging",
        "advanced": True,
    },
    "API_SERVER_PORT": {
        "description": "Port for the API server (default: 8642).",
        "prompt": "API server port",
        "url": None,
        "password": False,
        "category": "messaging",
        "advanced": True,
    },
    "API_SERVER_HOST": {
        "description": "Host/bind address for the API server (default: 127.0.0.1). Use 0.0.0.0 for network access — requires API_SERVER_KEY for security.",
        "prompt": "API server host",
        "url": None,
        "password": False,
        "category": "messaging",
        "advanced": True,
    },

    # ── Agent settings ──
    "MESSAGING_CWD": {

@@ -6,7 +6,6 @@ Handles: hermes gateway [run|start|stop|restart|status|install|uninstall|setup]

import asyncio
import os
import shutil
import signal
import subprocess
import sys

@@ -402,14 +401,8 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
    venv_bin = str(PROJECT_ROOT / "venv" / "bin")
    node_bin = str(PROJECT_ROOT / "node_modules" / ".bin")

    path_entries = [venv_bin, node_bin]
    resolved_node = shutil.which("node")
    if resolved_node:
        resolved_node_dir = str(Path(resolved_node).resolve().parent)
        if resolved_node_dir not in path_entries:
            path_entries.append(resolved_node_dir)
    path_entries.extend(["/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"])
    sane_path = ":".join(path_entries)
    # Build a PATH that includes the venv, node_modules, and standard system dirs
    sane_path = f"{venv_bin}:{node_bin}:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

    hermes_home = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")).resolve())

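For illustration (paths hypothetical, and the Environment= key is assumed from systemd conventions rather than this diff): with PROJECT_ROOT at /opt/hermes and node resolved from an nvm install, the path_entries variant would emit a unit line like:

Environment=PATH=/opt/hermes/venv/bin:/opt/hermes/node_modules/.bin:/home/user/.nvm/versions/node/v24.14.0/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
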
@@ -785,6 +785,7 @@ def cmd_model(args):
        "ai-gateway": "AI Gateway",
        "kilocode": "Kilo Code",
        "alibaba": "Alibaba Cloud (DashScope)",
        "huggingface": "Hugging Face",
        "custom": "Custom endpoint",
    }
    active_label = provider_labels.get(active, active)

@@ -809,6 +810,7 @@ def cmd_model(args):
        ("opencode-go", "OpenCode Go (open models, $10/month subscription)"),
        ("ai-gateway", "AI Gateway (Vercel — 200+ models, pay-per-use)"),
        ("alibaba", "Alibaba Cloud / DashScope (Qwen models, Anthropic-compatible)"),
        ("huggingface", "Hugging Face Inference Providers (20+ open models)"),
    ]

    # Add user-defined custom providers from config.yaml

@@ -877,7 +879,7 @@ def cmd_model(args):
        _model_flow_anthropic(config, current_model)
    elif selected_provider == "kimi-coding":
        _model_flow_kimi(config, current_model)
    elif selected_provider in ("zai", "minimax", "minimax-cn", "kilocode", "opencode-zen", "opencode-go", "ai-gateway", "alibaba"):
    elif selected_provider in ("zai", "minimax", "minimax-cn", "kilocode", "opencode-zen", "opencode-go", "ai-gateway", "alibaba", "huggingface"):
        _model_flow_api_key_provider(config, selected_provider, current_model)

@@ -1444,6 +1446,27 @@ _PROVIDER_MODELS = {
        "google/gemini-3-pro-preview",
        "google/gemini-3-flash-preview",
    ],
    # Curated model list sourced from https://models.dev (huggingface provider)
    "huggingface": [
        "Qwen/Qwen3.5-397B-A17B",
        "Qwen/Qwen3-235B-A22B-Thinking-2507",
        "Qwen/Qwen3-Coder-480B-A35B-Instruct",
        "Qwen/Qwen3-Coder-Next",
        "Qwen/Qwen3-Next-80B-A3B-Instruct",
        "Qwen/Qwen3-Next-80B-A3B-Thinking",
        "deepseek-ai/DeepSeek-R1-0528",
        "deepseek-ai/DeepSeek-V3.2",
        "moonshotai/Kimi-K2-Instruct",
        "moonshotai/Kimi-K2-Instruct-0905",
        "moonshotai/Kimi-K2.5",
        "moonshotai/Kimi-K2-Thinking",
        "MiniMaxAI/MiniMax-M2.5",
        "MiniMaxAI/MiniMax-M2.1",
        "XiaomiMiMo/MiMo-V2-Flash",
        "zai-org/GLM-5",
        "zai-org/GLM-4.7",
        "zai-org/GLM-4.7-Flash",
    ],
}

@@ -2642,7 +2665,7 @@ For more help on a command:
    )
    chat_parser.add_argument(
        "--provider",
        choices=["auto", "openrouter", "nous", "openai-codex", "anthropic", "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode"],
        choices=["auto", "openrouter", "nous", "openai-codex", "anthropic", "huggingface", "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode"],
        default=None,
        help="Inference provider (default: auto)"
    )

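With that parser change, the new provider can be selected directly from the CLI (invocation sketch; the flag value is grounded in the choices list above):

hermes chat --provider huggingface
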
@@ -155,6 +155,28 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "qwen3.5-flash",
        "qwen-vl-max",
    ],
    # Curated model list for Hugging Face Inference Providers
    # sourced from https://models.dev (huggingface provider)
    "huggingface": [
        "Qwen/Qwen3.5-397B-A17B",
        "Qwen/Qwen3-235B-A22B-Thinking-2507",
        "Qwen/Qwen3-Coder-480B-A35B-Instruct",
        "Qwen/Qwen3-Coder-Next",
        "Qwen/Qwen3-Next-80B-A3B-Instruct",
        "Qwen/Qwen3-Next-80B-A3B-Thinking",
        "deepseek-ai/DeepSeek-R1-0528",
        "deepseek-ai/DeepSeek-V3.2",
        "moonshotai/Kimi-K2-Instruct",
        "moonshotai/Kimi-K2-Instruct-0905",
        "moonshotai/Kimi-K2.5",
        "moonshotai/Kimi-K2-Thinking",
        "MiniMaxAI/MiniMax-M2.5",
        "MiniMaxAI/MiniMax-M2.1",
        "XiaomiMiMo/MiMo-V2-Flash",
        "zai-org/GLM-5",
        "zai-org/GLM-4.7",
        "zai-org/GLM-4.7-Flash",
    ],
}

_PROVIDER_LABELS = {

@@ -172,6 +194,7 @@ _PROVIDER_LABELS = {
    "ai-gateway": "AI Gateway",
    "kilocode": "Kilo Code",
    "alibaba": "Alibaba Cloud (DashScope)",
    "huggingface": "Hugging Face",
    "custom": "Custom endpoint",
}

@@ -201,6 +224,9 @@ _PROVIDER_ALIASES = {
    "aliyun": "alibaba",
    "qwen": "alibaba",
    "alibaba-cloud": "alibaba",
    "hf": "huggingface",
    "hugging-face": "huggingface",
    "huggingface-hub": "huggingface",
}

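Sketch of the lookup this alias table enables; the helper name below is hypothetical, only the table itself comes from the diff:

def _canonical_provider(name: str) -> str:
    # Map user-facing spellings ("hf", "hugging-face", ...) to canonical IDs.
    key = name.strip().lower()
    return _PROVIDER_ALIASES.get(key, key)

assert _canonical_provider("hf") == "huggingface"
assert _canonical_provider("aliyun") == "alibaba"
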
@@ -234,7 +260,7 @@ def list_available_providers() -> list[dict[str, str]]:
# Canonical providers in display order
_PROVIDER_ORDER = [
    "openrouter", "nous", "openai-codex",
    "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode", "anthropic", "alibaba",
    "huggingface", "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode", "anthropic", "alibaba",
    "opencode-zen", "opencode-go",
    "ai-gateway", "deepseek", "custom",
]

@@ -61,6 +61,11 @@ _DEFAULT_PROVIDER_MODELS = {
    "minimax-cn": ["MiniMax-M2.5", "MiniMax-M2.5-highspeed", "MiniMax-M2.1"],
    "ai-gateway": ["anthropic/claude-opus-4.6", "anthropic/claude-sonnet-4.6", "openai/gpt-5", "google/gemini-3-flash"],
    "kilocode": ["anthropic/claude-opus-4.6", "anthropic/claude-sonnet-4.6", "openai/gpt-5.4", "google/gemini-3-pro-preview", "google/gemini-3-flash-preview"],
    "huggingface": [
        "Qwen/Qwen3.5-397B-A17B", "Qwen/Qwen3-235B-A22B-Thinking-2507",
        "Qwen/Qwen3-Coder-480B-A35B-Instruct", "deepseek-ai/DeepSeek-R1-0528",
        "deepseek-ai/DeepSeek-V3.2", "moonshotai/Kimi-K2.5",
    ],
}

@@ -741,6 +746,7 @@ def setup_model_provider(config: dict):
        "Alibaba Cloud / DashScope (Qwen models via Anthropic-compatible API)",
        "OpenCode Zen (35+ curated models, pay-as-you-go)",
        "OpenCode Go (open models, $10/month subscription)",
        "Hugging Face Inference Providers (20+ open models)",
    ]
    if keep_label:
        provider_choices.append(keep_label)

@@ -1360,12 +1366,12 @@ def setup_model_provider(config: dict):
        if existing_key:
            print_info(f"Current: {existing_key[:8]}... (configured)")
            if prompt_yes_no("Update API key?", False):
                api_key = prompt(" OpenCode Zen API key", password=True)
                api_key = prompt_text("OpenCode Zen API key", password=True)
                if api_key:
                    save_env_value("OPENCODE_ZEN_API_KEY", api_key)
                    print_success("OpenCode Zen API key updated")
        else:
            api_key = prompt(" OpenCode Zen API key", password=True)
            api_key = prompt_text("OpenCode Zen API key", password=True)
            if api_key:
                save_env_value("OPENCODE_ZEN_API_KEY", api_key)
                print_success("OpenCode Zen API key saved")

@@ -1393,12 +1399,12 @@ def setup_model_provider(config: dict):
        if existing_key:
            print_info(f"Current: {existing_key[:8]}... (configured)")
            if prompt_yes_no("Update API key?", False):
                api_key = prompt(" OpenCode Go API key", password=True)
                api_key = prompt_text("OpenCode Go API key", password=True)
                if api_key:
                    save_env_value("OPENCODE_GO_API_KEY", api_key)
                    print_success("OpenCode Go API key updated")
        else:
            api_key = prompt(" OpenCode Go API key", password=True)
            api_key = prompt_text("OpenCode Go API key", password=True)
            if api_key:
                save_env_value("OPENCODE_GO_API_KEY", api_key)
                print_success("OpenCode Go API key saved")

@@ -1412,7 +1418,29 @@ def setup_model_provider(config: dict):
        _set_model_provider(config, "opencode-go", pconfig.inference_base_url)
        selected_base_url = pconfig.inference_base_url

    # else: provider_idx == 14 (Keep current) — only shown when a provider already exists
    elif provider_idx == 14:  # Hugging Face Inference Providers
        selected_provider = "huggingface"
        print()
        print_header("Hugging Face API Token")
        pconfig = PROVIDER_REGISTRY["huggingface"]
        print_info(f"Provider: {pconfig.name}")
        print_info("Get your token at: https://huggingface.co/settings/tokens")
        print_info("Required permission: 'Make calls to Inference Providers'")
        print()

        api_key = prompt_fn(
            " HF_TOKEN: ",
            is_password=True,
        ).strip()
        if api_key:
            save_env_value("HF_TOKEN", api_key)
        # Clear OpenRouter env vars to prevent routing confusion
        save_env_value("OPENAI_BASE_URL", "")
        save_env_value("OPENAI_API_KEY", "")
        _set_model_provider(config, "huggingface", pconfig.inference_base_url)
        selected_base_url = pconfig.inference_base_url

    # else: provider_idx == 15 (Keep current) — only shown when a provider already exists
    # Normalize "keep current" to an explicit provider so downstream logic
    # doesn't fall back to the generic OpenRouter/static-model path.
    if selected_provider is None:

@@ -114,6 +114,7 @@ class SkinConfig:
    name: str
    description: str = ""
    colors: Dict[str, str] = field(default_factory=dict)
    colors_light: Dict[str, str] = field(default_factory=dict)
    spinner: Dict[str, Any] = field(default_factory=dict)
    branding: Dict[str, str] = field(default_factory=dict)
    tool_prefix: str = "┊"

@@ -122,7 +123,12 @@
    banner_hero: str = ""  # Rich-markup hero art (replaces HERMES_CADUCEUS)

    def get_color(self, key: str, fallback: str = "") -> str:
        """Get a color value with fallback."""
        """Get a color value with fallback.

        In light theme mode, returns the light override if available.
        """
        if get_theme_mode() == "light" and key in self.colors_light:
            return self.colors_light[key]
        return self.colors.get(key, fallback)

    def get_spinner_list(self, key: str) -> List[str]:

@@ -168,6 +174,21 @@ _BUILTIN_SKINS: Dict[str, Dict[str, Any]] = {
        "session_label": "#DAA520",
        "session_border": "#8B8682",
    },
    "colors_light": {
        "banner_border": "#7A5A00",
        "banner_title": "#6B4C00",
        "banner_accent": "#7A5500",
        "banner_dim": "#8B7355",
        "banner_text": "#3D2B00",
        "prompt": "#3D2B00",
        "ui_accent": "#7A5500",
        "ui_label": "#01579B",
        "ui_ok": "#1B5E20",
        "input_rule": "#7A5A00",
        "response_border": "#6B4C00",
        "session_label": "#5C4300",
        "session_border": "#8B7355",
    },
    "spinner": {
        # Empty = use hardcoded defaults in display.py
    },

@@ -201,6 +222,21 @@ _BUILTIN_SKINS: Dict[str, Dict[str, Any]] = {
        "session_label": "#C7A96B",
        "session_border": "#6E584B",
    },
    "colors_light": {
        "banner_border": "#6B1010",
        "banner_title": "#5C4300",
        "banner_accent": "#8B1A1A",
        "banner_dim": "#5C4030",
        "banner_text": "#3A1800",
        "prompt": "#3A1800",
        "ui_accent": "#8B1A1A",
        "ui_label": "#5C4300",
        "ui_ok": "#1B5E20",
        "input_rule": "#6B1010",
        "response_border": "#7A1515",
        "session_label": "#5C4300",
        "session_border": "#5C4A3A",
    },
    "spinner": {
        "waiting_faces": ["(⚔)", "(⛨)", "(▲)", "(<>)", "(/)"],
        "thinking_faces": ["(⚔)", "(⛨)", "(▲)", "(⌁)", "(<>)"],

@@ -265,6 +301,22 @@ _BUILTIN_SKINS: Dict[str, Dict[str, Any]] = {
        "session_label": "#888888",
        "session_border": "#555555",
    },
    "colors_light": {
        "banner_border": "#333333",
        "banner_title": "#222222",
        "banner_accent": "#333333",
        "banner_dim": "#555555",
        "banner_text": "#333333",
        "prompt": "#222222",
        "ui_accent": "#333333",
        "ui_label": "#444444",
        "ui_ok": "#444444",
        "ui_error": "#333333",
        "input_rule": "#333333",
        "response_border": "#444444",
        "session_label": "#444444",
        "session_border": "#666666",
    },
    "spinner": {},
    "branding": {
        "agent_name": "Hermes Agent",

@@ -296,6 +348,21 @@ _BUILTIN_SKINS: Dict[str, Dict[str, Any]] = {
        "session_label": "#7eb8f6",
        "session_border": "#4b5563",
    },
    "colors_light": {
        "banner_border": "#1A3A7A",
        "banner_title": "#1A3570",
        "banner_accent": "#1E4090",
        "banner_dim": "#3B4555",
        "banner_text": "#1A2A50",
        "prompt": "#1A2A50",
        "ui_accent": "#1A3570",
        "ui_label": "#1E3A80",
        "ui_ok": "#1B5E20",
        "input_rule": "#1A3A7A",
        "response_border": "#2A4FA0",
        "session_label": "#1A3570",
        "session_border": "#5A6070",
    },
    "spinner": {},
    "branding": {
        "agent_name": "Hermes Agent",

@@ -327,6 +394,21 @@ _BUILTIN_SKINS: Dict[str, Dict[str, Any]] = {
        "session_label": "#A9DFFF",
        "session_border": "#496884",
    },
    "colors_light": {
        "banner_border": "#0D3060",
        "banner_title": "#0D3060",
        "banner_accent": "#154080",
        "banner_dim": "#2A4565",
        "banner_text": "#0A2850",
        "prompt": "#0A2850",
        "ui_accent": "#0D3060",
        "ui_label": "#0D3060",
        "ui_ok": "#1B5E20",
        "input_rule": "#0D3060",
        "response_border": "#1A5090",
        "session_label": "#0D3060",
        "session_border": "#3A5575",
    },
    "spinner": {
        "waiting_faces": ["(≈)", "(Ψ)", "(∿)", "(◌)", "(◠)"],
        "thinking_faces": ["(Ψ)", "(∿)", "(≈)", "(⌁)", "(◌)"],

@@ -391,6 +473,23 @@ _BUILTIN_SKINS: Dict[str, Dict[str, Any]] = {
        "session_label": "#919191",
        "session_border": "#656565",
    },
    "colors_light": {
        "banner_border": "#666666",
        "banner_title": "#222222",
        "banner_accent": "#333333",
        "banner_dim": "#555555",
        "banner_text": "#333333",
        "prompt": "#222222",
        "ui_accent": "#333333",
        "ui_label": "#444444",
        "ui_ok": "#444444",
        "ui_error": "#333333",
        "ui_warn": "#444444",
        "input_rule": "#666666",
        "response_border": "#555555",
        "session_label": "#444444",
        "session_border": "#777777",
    },
    "spinner": {
        "waiting_faces": ["(◉)", "(◌)", "(◬)", "(⬤)", "(::)"],
        "thinking_faces": ["(◉)", "(◬)", "(◌)", "(○)", "(●)"],

@@ -456,6 +555,21 @@ _BUILTIN_SKINS: Dict[str, Dict[str, Any]] = {
        "session_label": "#FFD39A",
        "session_border": "#6C4724",
    },
    "colors_light": {
        "banner_border": "#7A3511",
        "banner_title": "#5C2D00",
        "banner_accent": "#8B4000",
        "banner_dim": "#5A3A1A",
        "banner_text": "#3A1E00",
        "prompt": "#3A1E00",
        "ui_accent": "#8B4000",
        "ui_label": "#5C2D00",
        "ui_ok": "#1B5E20",
        "input_rule": "#7A3511",
        "response_border": "#8B4513",
        "session_label": "#5C2D00",
        "session_border": "#6B5540",
    },
    "spinner": {
        "waiting_faces": ["(✦)", "(▲)", "(◇)", "(<>)", "(🔥)"],
        "thinking_faces": ["(✦)", "(▲)", "(◇)", "(⌁)", "(🔥)"],

@@ -509,6 +623,8 @@ _BUILTIN_SKINS: Dict[str, Dict[str, Any]] = {

_active_skin: Optional[SkinConfig] = None
_active_skin_name: str = "default"
_theme_mode: str = "auto"
_resolved_theme_mode: Optional[str] = None


def _skins_dir() -> Path:

@@ -536,6 +652,8 @@ def _build_skin_config(data: Dict[str, Any]) -> SkinConfig:
    default = _BUILTIN_SKINS["default"]
    colors = dict(default.get("colors", {}))
    colors.update(data.get("colors", {}))
    colors_light = dict(default.get("colors_light", {}))
    colors_light.update(data.get("colors_light", {}))
    spinner = dict(default.get("spinner", {}))
    spinner.update(data.get("spinner", {}))
    branding = dict(default.get("branding", {}))

@@ -545,6 +663,7 @@ def _build_skin_config(data: Dict[str, Any]) -> SkinConfig:
        name=data.get("name", "unknown"),
        description=data.get("description", ""),
        colors=colors,
        colors_light=colors_light,
        spinner=spinner,
        branding=branding,
        tool_prefix=data.get("tool_prefix", default.get("tool_prefix", "┊")),

@@ -625,6 +744,39 @@ def get_active_skin_name() -> str:
    return _active_skin_name


def get_theme_mode() -> str:
    """Return the resolved theme mode: "light" or "dark".

    When ``_theme_mode`` is ``"auto"``, detection is attempted once and cached.
    If detection returns ``"unknown"``, defaults to ``"dark"``.
    """
    global _resolved_theme_mode
    if _theme_mode in ("light", "dark"):
        return _theme_mode
    # Auto mode — detect and cache
    if _resolved_theme_mode is None:
        try:
            from hermes_cli.colors import detect_terminal_background
            detected = detect_terminal_background()
        except Exception:
            detected = "unknown"
        _resolved_theme_mode = detected if detected in ("light", "dark") else "dark"
    return _resolved_theme_mode


def set_theme_mode(mode: str) -> None:
    """Set the theme mode to "light", "dark", or "auto"."""
    global _theme_mode, _resolved_theme_mode
    _theme_mode = mode
    # Reset cached detection so it re-runs on next get_theme_mode() if auto
    _resolved_theme_mode = None


def get_theme_mode_setting() -> str:
    """Return the raw theme mode setting (may be "auto", "light", or "dark")."""
    return _theme_mode


def init_skin_from_config(config: dict) -> None:
    """Initialize the active skin from CLI config at startup.

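Putting the three theme functions together, the intended resolution flow is (editor's sketch, grounded in the code above):

set_theme_mode("auto")
mode = get_theme_mode()            # runs detect_terminal_background() once, caches result
assert mode in ("light", "dark")   # "unknown" detection falls back to "dark"
set_theme_mode("light")            # explicit mode wins and clears the cached detection
assert get_theme_mode() == "light"
assert get_theme_mode_setting() == "light"  # raw setting, not the resolved mode
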
@@ -637,6 +789,13 @@ def init_skin_from_config(config: dict) -> None:
    else:
        set_active_skin("default")

    # Theme mode
    theme_mode = display.get("theme_mode", "auto")
    if isinstance(theme_mode, str) and theme_mode.strip():
        set_theme_mode(theme_mode.strip())
    else:
        set_theme_mode("auto")


# =============================================================================
# Convenience helpers for CLI modules

@@ -690,6 +849,14 @@ def get_prompt_toolkit_style_overrides() -> Dict[str, str]:
    warn = skin.get_color("ui_warn", "#FF8C00")
    error = skin.get_color("ui_error", "#FF6B6B")

    # Use lighter background colours for completion menus in light mode
    if get_theme_mode() == "light":
        menu_bg = "bg:#e8e8e8"
        menu_sel_bg = "bg:#d0d0d0"
    else:
        menu_bg = "bg:#1a1a2e"
        menu_sel_bg = "bg:#333355"

    return {
        "input-area": prompt,
        "placeholder": f"{dim} italic",

@@ -698,11 +865,11 @@ def get_prompt_toolkit_style_overrides() -> Dict[str, str]:
        "hint": f"{dim} italic",
        "input-rule": input_rule,
        "image-badge": f"{label} bold",
        "completion-menu": f"bg:#1a1a2e {text}",
        "completion-menu.completion": f"bg:#1a1a2e {text}",
        "completion-menu.completion.current": f"bg:#333355 {title}",
        "completion-menu.meta.completion": f"bg:#1a1a2e {dim}",
        "completion-menu.meta.completion.current": f"bg:#333355 {label}",
        "completion-menu": f"{menu_bg} {text}",
        "completion-menu.completion": f"{menu_bg} {text}",
        "completion-menu.completion.current": f"{menu_sel_bg} {title}",
        "completion-menu.meta.completion": f"{menu_bg} {dim}",
        "completion-menu.meta.completion.current": f"{menu_sel_bg} {label}",
        "clarify-border": input_rule,
        "clarify-title": f"{title} bold",
        "clarify-question": f"{text} bold",

@@ -689,45 +689,21 @@ class SessionDB:
        ``NOT``) have special meaning. Passing raw user input directly to
        MATCH can cause ``sqlite3.OperationalError``.

        Strategy:
        - Preserve properly paired quoted phrases (``"exact phrase"``)
        - Strip unmatched FTS5-special characters that would cause errors
        - Wrap unquoted hyphenated terms in quotes so FTS5 matches them
          as exact phrases instead of splitting on the hyphen
        Strategy: strip characters that are only meaningful as FTS5 operators
        and would otherwise cause syntax errors. This preserves normal keyword
        search while preventing crashes on inputs like ``C++``, ``"unterminated``,
        or ``hello AND``.
        """
        # Step 1: Extract balanced double-quoted phrases and protect them
        # from further processing via numbered placeholders.
        _quoted_parts: list = []

        def _preserve_quoted(m: re.Match) -> str:
            _quoted_parts.append(m.group(0))
            return f"\x00Q{len(_quoted_parts) - 1}\x00"

        sanitized = re.sub(r'"[^"]*"', _preserve_quoted, query)

        # Step 2: Strip remaining (unmatched) FTS5-special characters
        sanitized = re.sub(r'[+{}()\"^]', " ", sanitized)

        # Step 3: Collapse repeated * (e.g. "***") into a single one,
        # and remove leading * (prefix-only needs at least one char before *)
        # Remove FTS5-special characters that are not useful in keyword search
        sanitized = re.sub(r'[+{}()"^]', " ", query)
        # Collapse repeated * (e.g. "***") into a single one, and remove
        # leading * (prefix-only matching requires at least one char before *)
        sanitized = re.sub(r"\*+", "*", sanitized)
        sanitized = re.sub(r"(^|\s)\*", r"\1", sanitized)

        # Step 4: Remove dangling boolean operators at start/end that would
        # cause syntax errors (e.g. "hello AND" or "OR world")
        # Remove dangling boolean operators at start/end that would cause
        # syntax errors (e.g. "hello AND" or "OR world")
        sanitized = re.sub(r"(?i)^(AND|OR|NOT)\b\s*", "", sanitized.strip())
        sanitized = re.sub(r"(?i)\s+(AND|OR|NOT)\s*$", "", sanitized.strip())

        # Step 5: Wrap unquoted hyphenated terms (e.g. ``chat-send``) in
        # double quotes. FTS5's tokenizer splits on hyphens, turning
        # ``chat-send`` into ``chat AND send``. Quoting preserves the
        # intended phrase match.
        sanitized = re.sub(r"\b(\w+(?:-\w+)+)\b", r'"\1"', sanitized)

        # Step 6: Restore preserved quoted phrases
        for i, quoted in enumerate(_quoted_parts):
            sanitized = sanitized.replace(f"\x00Q{i}\x00", quoted)

        return sanitized.strip()

    def search_messages(

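Worked examples for the simplified sanitizer (editor's additions, traced through the new code path above):

s = SessionDB._sanitize_fts5_query
assert s("C++") == "C"                        # bare FTS5 operators stripped
assert s('"unterminated') == "unterminated"   # stray quote removed
assert s("hello AND") == "hello"              # dangling boolean operator dropped
assert s("***deploy*") == "deploy*"           # stars collapsed, prefix match kept
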
@@ -44,14 +44,6 @@ const SESSION_DIR = getArg('session', path.join(process.env.HOME || '~', '.herme
const PAIR_ONLY = args.includes('--pair-only');
const WHATSAPP_MODE = getArg('mode', process.env.WHATSAPP_MODE || 'self-chat'); // "bot" or "self-chat"
const ALLOWED_USERS = (process.env.WHATSAPP_ALLOWED_USERS || '').split(',').map(s => s.trim()).filter(Boolean);
const DEFAULT_REPLY_PREFIX = '⚕ *Hermes Agent*\n────────────\n';
const REPLY_PREFIX = process.env.WHATSAPP_REPLY_PREFIX === undefined
  ? DEFAULT_REPLY_PREFIX
  : process.env.WHATSAPP_REPLY_PREFIX.replace(/\\n/g, '\n');

function formatOutgoingMessage(message) {
  return REPLY_PREFIX ? `${REPLY_PREFIX}${message}` : message;
}

mkdirSync(SESSION_DIR, { recursive: true });

@@ -196,7 +188,7 @@ async function startSocket() {
    }

    // Ignore Hermes' own reply messages in self-chat mode to avoid loops.
    if (msg.key.fromMe && ((REPLY_PREFIX && body.startsWith(REPLY_PREFIX)) || recentlySentIds.has(msg.key.id))) {
    if (msg.key.fromMe && (body.startsWith('⚕ *Hermes Agent*') || recentlySentIds.has(msg.key.id))) {
      if (WHATSAPP_DEBUG) {
        try { console.log(JSON.stringify({ event: 'ignored', reason: 'agent_echo', chatId, messageId: msg.key.id })); } catch {}
      }

@@ -259,7 +251,10 @@ app.post('/send', async (req, res) => {
  }

  try {
    const sent = await sock.sendMessage(chatId, { text: formatOutgoingMessage(message) });
    // Prefix responses so the user can distinguish agent replies from their
    // own messages (especially in self-chat / "Message Yourself").
    const prefixed = `⚕ *Hermes Agent*\n────────────\n${message}`;
    const sent = await sock.sendMessage(chatId, { text: prefixed });

    // Track sent message ID to prevent echo-back loops
    if (sent?.key?.id) {

@@ -287,8 +282,9 @@ app.post('/edit', async (req, res) => {
  }

  try {
    const prefixed = `⚕ *Hermes Agent*\n────────────\n${message}`;
    const key = { id: messageId, fromMe: true, remoteJid: chatId };
    await sock.sendMessage(chatId, { text: formatOutgoingMessage(message), edit: key });
    await sock.sendMessage(chatId, { text: prefixed, edit: key });
    res.json({ success: true });
  } catch (err) {
    res.status(500).json({ error: err.message });

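For reference, the configurable-prefix variant shown above reads WHATSAPP_REPLY_PREFIX from the environment; a .env entry matching the config comment earlier in this diff would look like:

WHATSAPP_REPLY_PREFIX=🤖 *My Bot*\n──────\n
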
@@ -111,11 +111,7 @@ class TestCompress:
        # First 2 messages should be preserved (protect_first_n=2)
        # Last 2 messages should be preserved (protect_last_n=2)
        assert result[-1]["content"] == msgs[-1]["content"]
        # The second-to-last tail message may have the summary merged
        # into it when a double-collision prevents a standalone summary
        # (head=assistant, tail=user in this fixture). Verify the
        # original content is present in either case.
        assert msgs[-2]["content"] in result[-2]["content"]
        assert result[-2]["content"] == msgs[-2]["content"]


class TestGenerateSummaryNoneContent:

@@ -333,146 +329,6 @@ class TestCompressWithClient:
        assert len(summary_msg) == 1
        assert summary_msg[0]["role"] == "assistant"

    def test_summary_role_flips_to_avoid_tail_collision(self):
        """When summary role collides with the first tail message but flipping
        doesn't collide with head, the role should be flipped."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "summary text"

        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
            c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)

            # Head ends with tool (index 1), tail starts with user (index 6).
            # Default: tool → summary_role="user" → collides with tail.
            # Flip to "assistant" → tool→assistant is fine.
            msgs = [
                {"role": "user", "content": "msg 0"},
                {"role": "assistant", "content": "", "tool_calls": [
                    {"id": "call_1", "type": "function", "function": {"name": "t", "arguments": "{}"}},
                ]},
                {"role": "tool", "tool_call_id": "call_1", "content": "result 1"},
                {"role": "assistant", "content": "msg 3"},
                {"role": "user", "content": "msg 4"},
                {"role": "assistant", "content": "msg 5"},
                {"role": "user", "content": "msg 6"},
                {"role": "assistant", "content": "msg 7"},
            ]
            with patch("agent.context_compressor.call_llm", return_value=mock_response):
                result = c.compress(msgs)
            # Verify no consecutive user or assistant messages
            for i in range(1, len(result)):
                r1 = result[i - 1].get("role")
                r2 = result[i].get("role")
                if r1 in ("user", "assistant") and r2 in ("user", "assistant"):
                    assert r1 != r2, f"consecutive {r1} at indices {i-1},{i}"

    def test_double_collision_merges_summary_into_tail(self):
        """When neither role avoids collision with both neighbors, the summary
        should be merged into the first tail message rather than creating a
        standalone message that breaks role alternation.

        Common scenario: head ends with 'assistant', tail starts with 'user'.
        summary='user' collides with tail, summary='assistant' collides with head.
        """
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "summary text"

        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
            c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=3)

            # Head: [system, user, assistant] → last head = assistant
            # Tail: [user, assistant, user] → first tail = user
            # summary_role="user" collides with tail, "assistant" collides with head → merge
            msgs = [
                {"role": "system", "content": "system prompt"},
                {"role": "user", "content": "msg 1"},
                {"role": "assistant", "content": "msg 2"},
                {"role": "user", "content": "msg 3"},       # compressed
                {"role": "assistant", "content": "msg 4"},  # compressed
                {"role": "user", "content": "msg 5"},       # compressed
                {"role": "user", "content": "msg 6"},       # tail start
                {"role": "assistant", "content": "msg 7"},
                {"role": "user", "content": "msg 8"},
            ]
            with patch("agent.context_compressor.call_llm", return_value=mock_response):
                result = c.compress(msgs)

            # Verify no consecutive user or assistant messages
            for i in range(1, len(result)):
                r1 = result[i - 1].get("role")
                r2 = result[i].get("role")
                if r1 in ("user", "assistant") and r2 in ("user", "assistant"):
                    assert r1 != r2, f"consecutive {r1} at indices {i-1},{i}"

            # The summary text should be merged into the first tail message
            first_tail = [m for m in result if "msg 6" in (m.get("content") or "")]
            assert len(first_tail) == 1
            assert "summary text" in first_tail[0]["content"]

    def test_double_collision_user_head_assistant_tail(self):
        """Reverse double collision: head ends with 'user', tail starts with 'assistant'.
        summary='assistant' collides with tail, 'user' collides with head → merge."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "summary text"

        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
            c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)

            # Head: [system, user] → last head = user
            # Tail: [assistant, user] → first tail = assistant
            # summary_role="assistant" collides with tail, "user" collides with head → merge
            msgs = [
                {"role": "system", "content": "system prompt"},
                {"role": "user", "content": "msg 1"},
                {"role": "assistant", "content": "msg 2"},  # compressed
                {"role": "user", "content": "msg 3"},       # compressed
                {"role": "assistant", "content": "msg 4"},  # compressed
                {"role": "assistant", "content": "msg 5"},  # tail start
                {"role": "user", "content": "msg 6"},
            ]
            with patch("agent.context_compressor.call_llm", return_value=mock_response):
                result = c.compress(msgs)

            # Verify no consecutive user or assistant messages
            for i in range(1, len(result)):
                r1 = result[i - 1].get("role")
                r2 = result[i].get("role")
                if r1 in ("user", "assistant") and r2 in ("user", "assistant"):
                    assert r1 != r2, f"consecutive {r1} at indices {i-1},{i}"

            # The summary should be merged into the first tail message (assistant)
            first_tail = [m for m in result if "msg 5" in (m.get("content") or "")]
            assert len(first_tail) == 1
            assert "summary text" in first_tail[0]["content"]

    def test_no_collision_scenarios_still_work(self):
        """Verify that the common no-collision cases (head=assistant/tail=assistant,
        head=user/tail=user) still produce a standalone summary message."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "summary text"

        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
            c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)

            # Head=assistant, Tail=assistant → summary_role="user", no collision
            msgs = [
                {"role": "user", "content": "msg 0"},
                {"role": "assistant", "content": "msg 1"},
                {"role": "user", "content": "msg 2"},
                {"role": "assistant", "content": "msg 3"},
                {"role": "assistant", "content": "msg 4"},
                {"role": "user", "content": "msg 5"},
            ]
            with patch("agent.context_compressor.call_llm", return_value=mock_response):
                result = c.compress(msgs)
            summary_msgs = [m for m in result if (m.get("content") or "").startswith(SUMMARY_PREFIX)]
            assert len(summary_msgs) == 1, "should have a standalone summary message"
            assert summary_msgs[0]["role"] == "user"

    def test_summarization_does_not_start_tail_with_tool_outputs(self):
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]

File diff suppressed because it is too large
@@ -336,56 +336,6 @@ class TestSessionStoreRewriteTranscript:
        assert reloaded == []


class TestLoadTranscriptCorruptLines:
    """Regression: corrupt JSONL lines (e.g. from mid-write crash) must be
    skipped instead of crashing the entire transcript load. GH-1193."""

    @pytest.fixture()
    def store(self, tmp_path):
        config = GatewayConfig()
        with patch("gateway.session.SessionStore._ensure_loaded"):
            s = SessionStore(sessions_dir=tmp_path, config=config)
        s._db = None
        s._loaded = True
        return s

    def test_corrupt_line_skipped(self, store, tmp_path):
        session_id = "corrupt_test"
        transcript_path = store.get_transcript_path(session_id)
        transcript_path.parent.mkdir(parents=True, exist_ok=True)
        with open(transcript_path, "w") as f:
            f.write('{"role": "user", "content": "hello"}\n')
            f.write('{"role": "assistant", "content": "hi th')  # truncated
            f.write("\n")
            f.write('{"role": "user", "content": "goodbye"}\n')

        messages = store.load_transcript(session_id)
        assert len(messages) == 2
        assert messages[0]["content"] == "hello"
        assert messages[1]["content"] == "goodbye"

    def test_all_lines_corrupt_returns_empty(self, store, tmp_path):
        session_id = "all_corrupt"
        transcript_path = store.get_transcript_path(session_id)
        transcript_path.parent.mkdir(parents=True, exist_ok=True)
        with open(transcript_path, "w") as f:
            f.write("not json at all\n")
            f.write("{truncated\n")

        messages = store.load_transcript(session_id)
        assert messages == []

    def test_valid_transcript_unaffected(self, store, tmp_path):
        session_id = "valid_test"
        store.append_to_transcript(session_id, {"role": "user", "content": "a"})
        store.append_to_transcript(session_id, {"role": "assistant", "content": "b"})

        messages = store.load_transcript(session_id)
        assert len(messages) == 2
        assert messages[0]["content"] == "a"
        assert messages[1]["content"] == "b"


class TestWhatsAppDMSessionKeyConsistency:
    """Regression: all session-key construction must go through build_session_key
    so DMs are isolated by chat_id across platforms."""

@@ -51,7 +51,6 @@ def _make_adapter():
    adapter._bridge_log_fh = None
    adapter._bridge_log = None
    adapter._bridge_process = None
    adapter._reply_prefix = None
    adapter._running = False
    adapter._message_queue = asyncio.Queue()
    return adapter

@@ -1,121 +0,0 @@
"""Tests for WhatsApp reply_prefix config.yaml support.

Covers:
- config.yaml whatsapp.reply_prefix bridging into PlatformConfig.extra
- WhatsAppAdapter reading reply_prefix from config.extra
- Bridge subprocess receiving WHATSAPP_REPLY_PREFIX env var
- Config version covers all ENV_VARS_BY_VERSION keys (regression guard)
"""

from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from gateway.config import GatewayConfig, Platform, PlatformConfig


# ---------------------------------------------------------------------------
# Config bridging from config.yaml
# ---------------------------------------------------------------------------


class TestConfigYamlBridging:
    """Test that whatsapp.reply_prefix in config.yaml flows into PlatformConfig."""

    def test_reply_prefix_bridged_from_yaml(self, tmp_path):
        """whatsapp.reply_prefix in config.yaml sets PlatformConfig.extra."""
        config_yaml = tmp_path / "config.yaml"
        config_yaml.write_text('whatsapp:\n reply_prefix: "Custom Bot"\n')

        with patch("gateway.config.get_hermes_home", return_value=tmp_path):
            from gateway.config import load_gateway_config
            # Need to also patch WHATSAPP_ENABLED so the platform exists
            with patch.dict("os.environ", {"WHATSAPP_ENABLED": "true"}, clear=False):
                config = load_gateway_config()

        wa_config = config.platforms.get(Platform.WHATSAPP)
        assert wa_config is not None
        assert wa_config.extra.get("reply_prefix") == "Custom Bot"

    def test_empty_reply_prefix_bridged(self, tmp_path):
        """Empty string reply_prefix disables the header."""
        config_yaml = tmp_path / "config.yaml"
        config_yaml.write_text('whatsapp:\n reply_prefix: ""\n')

        with patch("gateway.config.get_hermes_home", return_value=tmp_path):
            from gateway.config import load_gateway_config
            with patch.dict("os.environ", {"WHATSAPP_ENABLED": "true"}, clear=False):
                config = load_gateway_config()

        wa_config = config.platforms.get(Platform.WHATSAPP)
        assert wa_config is not None
        assert wa_config.extra.get("reply_prefix") == ""

    def test_no_whatsapp_section_no_extra(self, tmp_path):
        """Without whatsapp section, no reply_prefix is set."""
        config_yaml = tmp_path / "config.yaml"
        config_yaml.write_text("timezone: UTC\n")

        with patch("gateway.config.get_hermes_home", return_value=tmp_path):
            from gateway.config import load_gateway_config
            with patch.dict("os.environ", {"WHATSAPP_ENABLED": "true"}, clear=False):
                config = load_gateway_config()

        wa_config = config.platforms.get(Platform.WHATSAPP)
        assert wa_config is not None
        assert "reply_prefix" not in wa_config.extra

    def test_whatsapp_section_without_reply_prefix(self, tmp_path):
        """whatsapp section present but without reply_prefix key."""
        config_yaml = tmp_path / "config.yaml"
        config_yaml.write_text("whatsapp:\n other_setting: true\n")

        with patch("gateway.config.get_hermes_home", return_value=tmp_path):
            from gateway.config import load_gateway_config
            with patch.dict("os.environ", {"WHATSAPP_ENABLED": "true"}, clear=False):
                config = load_gateway_config()

        wa_config = config.platforms.get(Platform.WHATSAPP)
        assert "reply_prefix" not in wa_config.extra

# ---------------------------------------------------------------------------
# WhatsAppAdapter __init__
# ---------------------------------------------------------------------------


class TestAdapterInit:
    """Test that WhatsAppAdapter reads reply_prefix from config.extra."""

    def test_reply_prefix_from_extra(self):
        from gateway.platforms.whatsapp import WhatsAppAdapter
        config = PlatformConfig(enabled=True, extra={"reply_prefix": "Bot\\n"})
        adapter = WhatsAppAdapter(config)
        assert adapter._reply_prefix == "Bot\\n"

    def test_reply_prefix_default_none(self):
        from gateway.platforms.whatsapp import WhatsAppAdapter
        config = PlatformConfig(enabled=True)
        adapter = WhatsAppAdapter(config)
        assert adapter._reply_prefix is None

    def test_reply_prefix_empty_string(self):
        from gateway.platforms.whatsapp import WhatsAppAdapter
        config = PlatformConfig(enabled=True, extra={"reply_prefix": ""})
        adapter = WhatsAppAdapter(config)
        assert adapter._reply_prefix == ""


# ---------------------------------------------------------------------------
# Config version regression guard
# ---------------------------------------------------------------------------


class TestConfigVersionCoverage:
    """Ensure _config_version covers all ENV_VARS_BY_VERSION keys."""

    def test_default_config_version_covers_env_var_versions(self):
        """_config_version must be >= the highest ENV_VARS_BY_VERSION key."""
        from hermes_cli.config import DEFAULT_CONFIG, ENV_VARS_BY_VERSION
        assert DEFAULT_CONFIG["_config_version"] >= max(ENV_VARS_BY_VERSION)

@@ -85,13 +85,6 @@ class TestGeneratedSystemdUnits:
        assert "ExecStop=" not in unit
        assert "TimeoutStopSec=60" in unit

    def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch):
        monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None)

        unit = gateway_cli.generate_systemd_unit(system=False)

        assert "/home/test/.nvm/versions/node/v24.14.0/bin" in unit

    def test_system_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout(self):
        unit = gateway_cli.generate_systemd_unit(system=True)

@@ -13,9 +13,13 @@ def reset_skin_state():
    from hermes_cli import skin_engine
    skin_engine._active_skin = None
    skin_engine._active_skin_name = "default"
    skin_engine._theme_mode = "auto"
    skin_engine._resolved_theme_mode = None
    yield
    skin_engine._active_skin = None
    skin_engine._active_skin_name = "default"
    skin_engine._theme_mode = "auto"
    skin_engine._resolved_theme_mode = None


class TestSkinConfig:

@@ -312,3 +316,65 @@ class TestCliBrandingHelpers:
        assert overrides["clarify-title"] == f"{skin.get_color('banner_title')} bold"
        assert overrides["sudo-prompt"] == f"{skin.get_color('ui_error')} bold"
        assert overrides["approval-title"] == f"{skin.get_color('ui_warn')} bold"


class TestThemeMode:
    def test_get_theme_mode_defaults_to_dark_on_unknown(self):
        from hermes_cli.skin_engine import get_theme_mode, set_theme_mode

        set_theme_mode("auto")
        # In a test env, detection returns "unknown" → defaults to "dark"
        with patch("hermes_cli.colors.detect_terminal_background", return_value="unknown"):
            from hermes_cli import skin_engine
            skin_engine._resolved_theme_mode = None  # force re-detection
            assert get_theme_mode() == "dark"

    def test_set_theme_mode_light(self):
        from hermes_cli.skin_engine import get_theme_mode, set_theme_mode

        set_theme_mode("light")
        assert get_theme_mode() == "light"

    def test_set_theme_mode_dark(self):
        from hermes_cli.skin_engine import get_theme_mode, set_theme_mode

        set_theme_mode("dark")
        assert get_theme_mode() == "dark"

    def test_get_color_respects_light_mode(self):
        from hermes_cli.skin_engine import SkinConfig, set_theme_mode

        skin = SkinConfig(
            name="test",
            colors={"banner_title": "#FFD700", "prompt": "#FFF8DC"},
            colors_light={"banner_title": "#6B4C00"},
        )
        set_theme_mode("light")
        assert skin.get_color("banner_title") == "#6B4C00"
        # Key not in colors_light falls back to colors
        assert skin.get_color("prompt") == "#FFF8DC"

    def test_get_color_falls_back_in_dark_mode(self):
        from hermes_cli.skin_engine import SkinConfig, set_theme_mode

        skin = SkinConfig(
            name="test",
            colors={"banner_title": "#FFD700", "prompt": "#FFF8DC"},
            colors_light={"banner_title": "#6B4C00"},
        )
        set_theme_mode("dark")
        assert skin.get_color("banner_title") == "#FFD700"
        assert skin.get_color("prompt") == "#FFF8DC"

    def test_init_skin_from_config_reads_theme_mode(self):
        from hermes_cli.skin_engine import init_skin_from_config, get_theme_mode_setting

        init_skin_from_config({"display": {"skin": "default", "theme_mode": "light"}})
        assert get_theme_mode_setting() == "light"

    def test_builtin_skins_have_colors_light(self):
        from hermes_cli.skin_engine import _BUILTIN_SKINS, _build_skin_config

        for name, data in _BUILTIN_SKINS.items():
            skin = _build_skin_config(data)
            assert len(skin.colors_light) > 0, f"Skin '{name}' has empty colors_light"

@@ -261,30 +261,6 @@ class TestFTS5Search:
        # The word "C" appears in the content, so FTS5 should find it
        assert isinstance(results, list)

    def test_search_hyphenated_term_does_not_crash(self, db):
        """Hyphenated terms like 'chat-send' must not crash FTS5."""
        db.create_session(session_id="s1", source="cli")
        db.append_message("s1", role="user", content="Run the chat-send command")

        results = db.search_messages("chat-send")
        assert isinstance(results, list)
        assert len(results) >= 1
        assert any("chat-send" in (r.get("snippet") or r.get("content", "")).lower()
                   for r in results)

    def test_search_quoted_phrase_preserved(self, db):
        """User-provided quoted phrases should be preserved for exact matching."""
        db.create_session(session_id="s1", source="cli")
        db.append_message("s1", role="user", content="docker networking is complex")
        db.append_message("s1", role="assistant", content="networking docker tips")

        # Quoted phrase should match only the exact order
        results = db.search_messages('"docker networking"')
        assert isinstance(results, list)
        # Should find the user message (exact phrase) but may or may not find
        # the assistant message depending on FTS5 phrase matching
        assert len(results) >= 1

    def test_sanitize_fts5_query_strips_dangerous_chars(self):
        """Unit test for _sanitize_fts5_query static method."""
        from hermes_state import SessionDB

@@ -302,43 +278,6 @@ class TestFTS5Search:
        # Valid prefix kept
        assert s('deploy*') == 'deploy*'

    def test_sanitize_fts5_preserves_quoted_phrases(self):
        """Properly paired double-quoted phrases should be preserved."""
        from hermes_state import SessionDB
        s = SessionDB._sanitize_fts5_query
        # Simple quoted phrase
        assert s('"exact phrase"') == '"exact phrase"'
        # Quoted phrase alongside unquoted terms
        assert '"docker networking"' in s('"docker networking" setup')
        # Multiple quoted phrases
        result = s('"hello world" OR "foo bar"')
        assert '"hello world"' in result
        assert '"foo bar"' in result
        # Unmatched quote still stripped
        assert '"' not in s('"unterminated')

    def test_sanitize_fts5_quotes_hyphenated_terms(self):
        """Hyphenated terms should be wrapped in quotes for exact matching."""
        from hermes_state import SessionDB
        s = SessionDB._sanitize_fts5_query
        # Simple hyphenated term
        assert s('chat-send') == '"chat-send"'
        # Multiple hyphens
        assert s('docker-compose-up') == '"docker-compose-up"'
        # Hyphenated term with other words
        result = s('fix chat-send bug')
        assert '"chat-send"' in result
        assert 'fix' in result
        assert 'bug' in result
        # Multiple hyphenated terms with OR
        result = s('chat-send OR deploy-prod')
        assert '"chat-send"' in result
        assert '"deploy-prod"' in result
        # Already-quoted hyphenated term — no double quoting
        assert s('"chat-send"') == '"chat-send"'
        # Hyphenated inside a quoted phrase stays as-is
        assert s('"my chat-send thing"') == '"my chat-send thing"'


# =========================================================================
# Session search and listing

@@ -249,49 +249,6 @@ class TestDelegateTask(unittest.TestCase):
        self.assertEqual(kwargs["api_mode"], parent.api_mode)


class TestToolNamePreservation(unittest.TestCase):
    """Verify _last_resolved_tool_names is restored after subagent runs."""

    def test_global_tool_names_restored_after_delegation(self):
        """The process-global _last_resolved_tool_names must be restored
        after a subagent completes so the parent's execute_code sandbox
        generates correct imports."""
        import model_tools

        parent = _make_mock_parent(depth=0)
        original_tools = ["terminal", "read_file", "web_search", "execute_code", "delegate_task"]
        model_tools._last_resolved_tool_names = list(original_tools)

        with patch("run_agent.AIAgent") as MockAgent:
            mock_child = MagicMock()
            mock_child.run_conversation.return_value = {
                "final_response": "done", "completed": True, "api_calls": 1,
            }
            MockAgent.return_value = mock_child

            delegate_task(goal="Test tool preservation", parent_agent=parent)

        self.assertEqual(model_tools._last_resolved_tool_names, original_tools)

    def test_global_tool_names_restored_after_child_failure(self):
        """Even when the child agent raises, the global must be restored."""
        import model_tools

        parent = _make_mock_parent(depth=0)
        original_tools = ["terminal", "read_file", "web_search"]
        model_tools._last_resolved_tool_names = list(original_tools)

        with patch("run_agent.AIAgent") as MockAgent:
            mock_child = MagicMock()
            mock_child.run_conversation.side_effect = RuntimeError("boom")
            MockAgent.return_value = mock_child

            result = json.loads(delegate_task(goal="Crash test", parent_agent=parent))
            self.assertEqual(result["results"][0]["status"], "error")

        self.assertEqual(model_tools._last_resolved_tool_names, original_tools)


class TestDelegateObservability(unittest.TestCase):
    """Tests for enriched metadata returned by _run_single_child."""

@@ -398,25 +398,6 @@ class TestSendToPlatformChunking:
# ---------------------------------------------------------------------------


class TestSendToPlatformWhatsapp:
    def test_whatsapp_routes_via_local_bridge_sender(self):
        chat_id = "test-user@lid"
        async_mock = AsyncMock(return_value={"success": True, "platform": "whatsapp", "chat_id": chat_id, "message_id": "abc123"})

        with patch("tools.send_message_tool._send_whatsapp", async_mock):
            result = asyncio.run(
                _send_to_platform(
                    Platform.WHATSAPP,
                    SimpleNamespace(enabled=True, token=None, extra={"bridge_port": 3000}),
                    chat_id,
                    "hello from hermes",
                )
            )

        assert result["success"] is True
        async_mock.assert_awaited_once_with({"bridge_port": 3000}, chat_id, "hello from hermes")


class TestSendTelegramHtmlDetection:
    """Verify that messages containing HTML tags are sent with parse_mode=HTML
    and that plain / markdown messages use MarkdownV2."""

@@ -26,14 +26,13 @@ class TestGetProvider:
        from tools.transcription_tools import _get_provider
        assert _get_provider({"provider": "local"}) == "local"

    def test_explicit_local_no_cloud_fallback(self, monkeypatch):
        """Explicit local provider must not silently fall back to cloud."""
    def test_local_fallback_to_openai(self, monkeypatch):
        monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test")
        monkeypatch.delenv("GROQ_API_KEY", raising=False)
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "local"}) == "none"
            assert _get_provider({"provider": "local"}) == "openai"

    def test_local_nothing_available(self, monkeypatch):
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)

@@ -48,13 +47,12 @@ class TestGetProvider:
        from tools.transcription_tools import _get_provider
        assert _get_provider({"provider": "openai"}) == "openai"

    def test_explicit_openai_no_key_returns_none(self, monkeypatch):
        """Explicit openai without key returns none — no cross-provider fallback."""
    def test_openai_fallback_to_local(self, monkeypatch):
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "openai"}) == "none"
            assert _get_provider({"provider": "openai"}) == "local"

    def test_default_provider_is_local(self):
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True):

@@ -66,12 +66,19 @@ class TestGetProviderGroq:
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "groq"}) == "groq"

    def test_groq_explicit_no_fallback(self, monkeypatch):
        """Explicit groq with no key returns none — no cross-provider fallback."""
    def test_groq_fallback_to_local(self, monkeypatch):
        monkeypatch.delenv("GROQ_API_KEY", raising=False)
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "groq"}) == "none"
            assert _get_provider({"provider": "groq"}) == "local"

    def test_groq_fallback_to_openai(self, monkeypatch):
        monkeypatch.delenv("GROQ_API_KEY", raising=False)
        monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test")
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "groq"}) == "openai"

    def test_groq_nothing_available(self, monkeypatch):
        monkeypatch.delenv("GROQ_API_KEY", raising=False)
@@ -83,25 +90,36 @@ class TestGetProviderGroq:


class TestGetProviderFallbackPriority:
    """Auto-detect fallback priority and explicit provider behaviour."""
    """Cross-provider fallback priority tests."""

    def test_auto_detect_prefers_local(self):
        """Auto-detect prefers local over any cloud provider."""
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({}) == "local"

    def test_auto_detect_prefers_groq_over_openai(self, monkeypatch):
        """Auto-detect: groq (free) is preferred over openai (paid)."""
    def test_local_fallback_prefers_groq_over_openai(self, monkeypatch):
        """When local unavailable, groq (free) is preferred over openai (paid)."""
        monkeypatch.setenv("GROQ_API_KEY", "gsk-test")
        monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test")
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({}) == "groq"
            assert _get_provider({"provider": "local"}) == "groq"

    def test_explicit_openai_no_key_returns_none(self, monkeypatch):
        """Explicit openai with no key returns none — no cross-provider fallback."""
    def test_local_fallback_to_groq_only(self, monkeypatch):
        """When only groq key available, falls back to groq."""
        monkeypatch.setenv("GROQ_API_KEY", "gsk-test")
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "local"}) == "groq"

    def test_openai_fallback_to_groq(self, monkeypatch):
        """When openai key missing but groq available, falls back to groq."""
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
        monkeypatch.setenv("GROQ_API_KEY", "gsk-test")
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "openai"}) == "groq"

    def test_openai_nothing_available(self, monkeypatch):
        """When no openai key and no local, returns none."""
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
        monkeypatch.delenv("GROQ_API_KEY", raising=False)
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \

@@ -118,83 +136,18 @@ class TestGetProviderFallbackPriority:
            from tools.transcription_tools import _get_provider
            assert _get_provider({}) == "local"


# ============================================================================
# Explicit provider config respected (GH-1774)
# ============================================================================

class TestExplicitProviderRespected:
    """When stt.provider is explicitly set, that choice is authoritative.
    No silent fallback to a different cloud provider."""

    def test_explicit_local_no_fallback_to_openai(self, monkeypatch):
        """GH-1774: provider=local must not silently fall back to openai
        even when an OpenAI API key is set."""
        monkeypatch.setenv("OPENAI_API_KEY", "sk-real-key-here")
    def test_openai_fallback_to_local_command(self, monkeypatch):
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        monkeypatch.delenv("GROQ_API_KEY", raising=False)
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            result = _get_provider({"provider": "local"})
            assert result == "none", f"Expected 'none' but got {result!r}"

    def test_explicit_local_no_fallback_to_groq(self, monkeypatch):
        monkeypatch.setenv("GROQ_API_KEY", "gsk-test")
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            result = _get_provider({"provider": "local"})
            assert result == "none"

    def test_explicit_local_uses_local_command_fallback(self, monkeypatch):
        """Local-to-local_command fallback is fine — both are local."""
        monkeypatch.setenv(
            "HERMES_LOCAL_STT_COMMAND",
            "whisper {input_path} --output_dir {output_dir} --language {language}",
        )
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False):
            from tools.transcription_tools import _get_provider
            result = _get_provider({"provider": "local"})
            assert result == "local_command"

    def test_explicit_groq_no_fallback_to_openai(self, monkeypatch):
        monkeypatch.delenv("GROQ_API_KEY", raising=False)
        monkeypatch.setenv("OPENAI_API_KEY", "sk-real-key")
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            result = _get_provider({"provider": "groq"})
            assert result == "none"

    def test_explicit_openai_no_fallback_to_groq(self, monkeypatch):
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
        monkeypatch.setenv("GROQ_API_KEY", "gsk-test")
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            result = _get_provider({"provider": "openai"})
            assert result == "none"

    def test_auto_detect_still_falls_back_to_cloud(self, monkeypatch):
        """When no provider is explicitly set, auto-detect cloud fallback works."""
        monkeypatch.setenv("OPENAI_API_KEY", "sk-real-key")
        monkeypatch.delenv("GROQ_API_KEY", raising=False)
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            # Empty dict = no explicit provider, uses DEFAULT_PROVIDER auto-detect
            result = _get_provider({})
            assert result == "openai"

    def test_auto_detect_prefers_groq_over_openai(self, monkeypatch):
        monkeypatch.setenv("GROQ_API_KEY", "gsk-test")
        monkeypatch.setenv("OPENAI_API_KEY", "sk-real-key")
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            result = _get_provider({})
            assert result == "groq"
            assert _get_provider({"provider": "openai"}) == "local_command"


# ============================================================================
@@ -733,19 +686,28 @@ class TestTranscribeAudioDispatch:
        assert "faster-whisper" in result["error"]
        assert "GROQ_API_KEY" in result["error"]

    def test_explicit_openai_no_key_returns_error(self, monkeypatch, sample_ogg):
        """Explicit provider=openai with no key returns an error, not a fallback."""
    def test_openai_provider_falls_back_to_local_command(self, monkeypatch, sample_ogg):
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        monkeypatch.setenv(
            "HERMES_LOCAL_STT_COMMAND",
            "whisper {input_path} --model {model} --output_dir {output_dir} --language {language}",
        )

        with patch("tools.transcription_tools._load_stt_config", return_value={"provider": "openai"}), \
             patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
             patch("tools.transcription_tools._HAS_OPENAI", True), \
             patch("tools.transcription_tools._transcribe_local_command", return_value={
                 "success": True,
                 "transcript": "hello from fallback",
                 "provider": "local_command",
             }) as mock_local_command:
            from tools.transcription_tools import transcribe_audio
            result = transcribe_audio(sample_ogg)

        assert result["success"] is False
        assert "No STT provider" in result["error"]
        assert result["success"] is True
        assert result["transcript"] == "hello from fallback"
        mock_local_command.assert_called_once_with(sample_ogg, "base")

    def test_invalid_file_short_circuits(self):
        from tools.transcription_tools import transcribe_audio

@@ -171,11 +171,6 @@ def _build_child_agent(
    model on OpenRouter while the parent runs on Nous Portal).
    """
    from run_agent import AIAgent
    import model_tools

    # Save the parent's resolved tool names before the child agent can
    # overwrite the process-global via get_tool_definitions().
    _saved_tool_names = list(model_tools._last_resolved_tool_names)

    # When no explicit toolsets given, inherit from parent's enabled toolsets
    # so disabled tools (e.g. web) don't leak to subagents.

@@ -370,10 +365,6 @@ def _run_single_child(
        }

    finally:
        # Restore the parent's tool names so the process-global is correct
        # for any subsequent execute_code calls or other consumers.
        model_tools._last_resolved_tool_names = _saved_tool_names

        # Unregister child from interrupt propagation
        if hasattr(parent_agent, '_active_children'):
            try:

@@ -134,7 +134,7 @@ def _handle_send(args):

    pconfig = config.platforms.get(platform)
    if not pconfig or not pconfig.enabled:
        return json.dumps({"error": f"Platform '{platform_name}' is not configured. Set up credentials in ~/.hermes/config.yaml or environment variables."})
        return json.dumps({"error": f"Platform '{platform_name}' is not configured. Set up credentials in ~/.hermes/gateway.json or environment variables."})

    from gateway.platforms.base import BasePlatformAdapter

@@ -331,8 +331,6 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
        result = await _send_discord(pconfig.token, chat_id, chunk)
    elif platform == Platform.SLACK:
        result = await _send_slack(pconfig.token, chat_id, chunk)
    elif platform == Platform.WHATSAPP:
        result = await _send_whatsapp(pconfig.extra, chat_id, chunk)
    elif platform == Platform.SIGNAL:
        result = await _send_signal(pconfig.extra, chat_id, chunk)
    elif platform == Platform.EMAIL:

@@ -516,34 +514,6 @@ async def _send_slack(token, chat_id, message):
        return {"error": f"Slack send failed: {e}"}


async def _send_whatsapp(extra, chat_id, message):
    """Send via the local WhatsApp bridge HTTP API."""
    try:
        import aiohttp
    except ImportError:
        return {"error": "aiohttp not installed. Run: pip install aiohttp"}
    try:
        bridge_port = extra.get("bridge_port", 3000)
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"http://localhost:{bridge_port}/send",
                json={"chatId": chat_id, "message": message},
                timeout=aiohttp.ClientTimeout(total=30),
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    return {
                        "success": True,
                        "platform": "whatsapp",
                        "chat_id": chat_id,
                        "message_id": data.get("messageId"),
                    }
                body = await resp.text()
                return {"error": f"WhatsApp bridge error ({resp.status}): {body}"}
    except Exception as e:
        return {"error": f"WhatsApp send failed: {e}"}


async def _send_signal(extra, chat_id, message):
    """Send via signal-cli JSON-RPC API."""
    try:

@@ -164,72 +164,76 @@ def _normalize_local_command_model(model_name: Optional[str]) -> str:
def _get_provider(stt_config: dict) -> str:
    """Determine which STT provider to use.

    When ``stt.provider`` is explicitly set in config, that choice is
    honoured — no silent cloud fallback. When no provider is configured,
    auto-detect tries: local > groq (free) > openai (paid).
    Priority:
    1. Explicit config value (``stt.provider``)
    2. Auto-detect: local > groq (free) > openai (paid)
    3. Disabled (returns "none")
    """
    if not is_stt_enabled(stt_config):
        return "none"

    explicit = "provider" in stt_config
    provider = stt_config.get("provider", DEFAULT_PROVIDER)

    # --- Explicit provider: respect the user's choice ----------------------
    if provider == "local":
        if _HAS_FASTER_WHISPER:
            return "local"
        if _has_local_command():
            logger.info("faster-whisper not installed, falling back to local STT command")
            return "local_command"
        # Local requested but not available — fall back to groq, then openai
        if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
            logger.info("faster-whisper not installed, falling back to Groq Whisper API")
            return "groq"
        if _HAS_OPENAI and _resolve_openai_api_key():
            logger.info("faster-whisper not installed, falling back to OpenAI Whisper API")
            return "openai"
        return "none"

    if explicit:
        if provider == "local":
            if _HAS_FASTER_WHISPER:
                return "local"
            if _has_local_command():
                return "local_command"
            logger.warning(
                "STT provider 'local' configured but unavailable "
                "(install faster-whisper or set HERMES_LOCAL_STT_COMMAND)"
            )
            return "none"
    if provider == "local_command":
        if _has_local_command():
            return "local_command"
        if _HAS_FASTER_WHISPER:
            logger.info("Local STT command unavailable, falling back to local faster-whisper")
            return "local"
        if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
            logger.info("Local STT command unavailable, falling back to Groq Whisper API")
            return "groq"
        if _HAS_OPENAI and _resolve_openai_api_key():
            logger.info("Local STT command unavailable, falling back to OpenAI Whisper API")
            return "openai"
        return "none"

        if provider == "local_command":
            if _has_local_command():
                return "local_command"
            if _HAS_FASTER_WHISPER:
                logger.info("Local STT command unavailable, using local faster-whisper")
                return "local"
            logger.warning(
                "STT provider 'local_command' configured but unavailable"
            )
            return "none"
    if provider == "groq":
        if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
            return "groq"
        # Groq requested but no key — fall back
        if _HAS_FASTER_WHISPER:
            logger.info("GROQ_API_KEY not set, falling back to local faster-whisper")
            return "local"
        if _has_local_command():
            logger.info("GROQ_API_KEY not set, falling back to local STT command")
            return "local_command"
        if _HAS_OPENAI and _resolve_openai_api_key():
            logger.info("GROQ_API_KEY not set, falling back to OpenAI Whisper API")
            return "openai"
        return "none"

        if provider == "groq":
            if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
                return "groq"
            logger.warning(
                "STT provider 'groq' configured but GROQ_API_KEY not set"
            )
            return "none"
    if provider == "openai":
        if _HAS_OPENAI and _resolve_openai_api_key():
            return "openai"
        # OpenAI requested but no key — fall back
        if _HAS_FASTER_WHISPER:
            logger.info("OpenAI STT key not set, falling back to local faster-whisper")
            return "local"
        if _has_local_command():
            logger.info("OpenAI STT key not set, falling back to local STT command")
            return "local_command"
        if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
            logger.info("OpenAI STT key not set, falling back to Groq Whisper API")
            return "groq"
        return "none"

        if provider == "openai":
            if _HAS_OPENAI and _resolve_openai_api_key():
                return "openai"
            logger.warning(
                "STT provider 'openai' configured but no API key available"
            )
            return "none"

        return provider  # Unknown — let it fail downstream

    # --- Auto-detect (no explicit provider): local > groq > openai ---------

    if _HAS_FASTER_WHISPER:
        return "local"
    if _has_local_command():
        return "local_command"
    if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
        logger.info("No local STT available, using Groq Whisper API")
        return "groq"
    if _HAS_OPENAI and _resolve_openai_api_key():
        logger.info("No local STT available, using OpenAI Whisper API")
        return "openai"
    return "none"
    return provider  # Unknown — let it fail downstream

# ---------------------------------------------------------------------------
# Shared validation

@@ -50,6 +50,7 @@ hermes setup # Or configure everything at once
| **MiniMax** | International MiniMax endpoint | Set `MINIMAX_API_KEY` |
| **MiniMax China** | China-region MiniMax endpoint | Set `MINIMAX_CN_API_KEY` |
| **Alibaba Cloud** | Qwen models via DashScope | Set `DASHSCOPE_API_KEY` |
| **Hugging Face** | 20+ open models via unified router (Qwen, DeepSeek, Kimi, etc.) | Set `HF_TOKEN` |
| **Kilo Code** | KiloCode-hosted models | Set `KILOCODE_API_KEY` |
| **Vercel AI Gateway** | Vercel AI Gateway routing | Set `AI_GATEWAY_API_KEY` |
| **Custom Endpoint** | vLLM, SGLang, or any OpenAI-compatible API | Set base URL + API key |

@@ -30,6 +30,8 @@ All variables go in `~/.hermes/.env`. You can also set them with `hermes config
| `MINIMAX_CN_BASE_URL` | Override MiniMax China base URL (default: `https://api.minimaxi.com/v1`) |
| `KILOCODE_API_KEY` | Kilo Code API key ([kilo.ai](https://kilo.ai)) |
| `KILOCODE_BASE_URL` | Override Kilo Code base URL (default: `https://api.kilo.ai/api/gateway`) |
| `HF_TOKEN` | Hugging Face token for Inference Providers ([huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)) |
| `HF_BASE_URL` | Override Hugging Face base URL (default: `https://router.huggingface.co/v1`) |
| `ANTHROPIC_API_KEY` | Anthropic Console API key ([console.anthropic.com](https://console.anthropic.com/)) |
| `ANTHROPIC_TOKEN` | Manual or legacy Anthropic OAuth/setup-token override |
| `DASHSCOPE_API_KEY` | Alibaba Cloud DashScope API key for Qwen models ([modelstudio.console.alibabacloud.com](https://modelstudio.console.alibabacloud.com/)) |

@@ -48,7 +50,7 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe

| Variable | Description |
|----------|-------------|
| `HERMES_INFERENCE_PROVIDER` | Override provider selection: `auto`, `openrouter`, `nous`, `openai-codex`, `anthropic`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `kilocode`, `alibaba` (default: `auto`) |
| `HERMES_INFERENCE_PROVIDER` | Override provider selection: `auto`, `openrouter`, `nous`, `openai-codex`, `anthropic`, `huggingface`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `kilocode`, `alibaba` (default: `auto`) |
| `HERMES_PORTAL_BASE_URL` | Override Nous Portal URL (for development/testing) |
| `NOUS_INFERENCE_BASE_URL` | Override Nous inference API URL |
| `HERMES_NOUS_MIN_KEY_TTL_SECONDS` | Min agent key TTL before re-mint (default: 1800 = 30min) |

@@ -192,10 +194,6 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe
| `MATRIX_ENCRYPTION` | Enable end-to-end encryption (`true`/`false`, default: `false`) |
| `HASS_TOKEN` | Home Assistant Long-Lived Access Token (enables HA platform + tools) |
| `HASS_URL` | Home Assistant URL (default: `http://homeassistant.local:8123`) |
| `API_SERVER_ENABLED` | Enable the OpenAI-compatible API server (`true`/`false`). Runs alongside other platforms. |
| `API_SERVER_KEY` | Bearer token for API server authentication. If empty, all requests are allowed (local-only use). |
| `API_SERVER_PORT` | Port for the API server (default: `8642`) |
| `API_SERVER_HOST` | Host/bind address for the API server (default: `127.0.0.1`). Use `0.0.0.0` for network access — set `API_SERVER_KEY` for security. |
| `MESSAGING_CWD` | Working directory for terminal commands in messaging mode (default: `~`) |
| `GATEWAY_ALLOWED_USERS` | Comma-separated user IDs allowed across all platforms |
| `GATEWAY_ALLOW_ALL_USERS` | Allow all users without allowlists (`true`/`false`, default: `false`) |

@@ -72,7 +72,7 @@ You need at least one way to connect to an LLM. Use `hermes model` to switch pro
| **MiniMax China** | `MINIMAX_CN_API_KEY` in `~/.hermes/.env` (provider: `minimax-cn`) |
| **Alibaba Cloud** | `DASHSCOPE_API_KEY` in `~/.hermes/.env` (provider: `alibaba`, aliases: `dashscope`, `qwen`) |
| **Kilo Code** | `KILOCODE_API_KEY` in `~/.hermes/.env` (provider: `kilocode`) |
| **Alibaba Cloud** | `DASHSCOPE_API_KEY` in `~/.hermes/.env` (provider: `alibaba`) |
| **Hugging Face** | `HF_TOKEN` in `~/.hermes/.env` (provider: `huggingface`, aliases: `hf`) |
| **Custom Endpoint** | `hermes model` (saved in `config.yaml`) or `OPENAI_BASE_URL` + `OPENAI_API_KEY` in `~/.hermes/.env` |

:::info Codex Note
@@ -152,6 +152,32 @@ model:

Base URLs can be overridden with `GLM_BASE_URL`, `KIMI_BASE_URL`, `MINIMAX_BASE_URL`, `MINIMAX_CN_BASE_URL`, or `DASHSCOPE_BASE_URL` environment variables.

### Hugging Face Inference Providers

[Hugging Face Inference Providers](https://huggingface.co/docs/inference-providers) routes to 20+ open models through a unified OpenAI-compatible endpoint (`router.huggingface.co/v1`). Requests are automatically routed to the fastest available backend (Groq, Together, SambaNova, etc.) with automatic failover.

```bash
# Use any available model
hermes chat --provider huggingface --model Qwen/Qwen3-235B-A22B-Thinking-2507
# Requires: HF_TOKEN in ~/.hermes/.env

# Short alias
hermes chat --provider hf --model deepseek-ai/DeepSeek-V3.2
```

Or set it permanently in `config.yaml`:
```yaml
model:
  provider: "huggingface"
  default: "Qwen/Qwen3-235B-A22B-Thinking-2507"
```

Get your token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) — make sure to enable the "Make calls to Inference Providers" permission. Free tier included ($0.10/month credit, no markup on provider rates).

You can append routing suffixes to model names: `:fastest` (default), `:cheapest`, or `:provider_name` to force a specific backend.

The base URL can be overridden with `HF_BASE_URL`.
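
Because the router is plain OpenAI-compatible, the same endpoint also works outside Hermes. A minimal sketch with the `openai` Python SDK — the model name and the `:cheapest` suffix are illustrative, not required:

```python
import os
from openai import OpenAI  # pip install openai

# Talk to the Hugging Face router directly. HF_TOKEN needs the
# "Make calls to Inference Providers" permission.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

# ":cheapest" is a routing suffix — the router picks the lowest-cost backend.
resp = client.chat.completions.create(
    model="Qwen/Qwen3-235B-A22B-Thinking-2507:cheapest",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(resp.choices[0].message.content)
```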

## Custom & Self-Hosted LLM Providers

Hermes Agent works with **any OpenAI-compatible API endpoint**. If a server implements `/v1/chat/completions`, you can point Hermes at it. This means you can use local models, GPU inference servers, multi-provider routers, or any third-party API.

@@ -443,7 +469,7 @@ fallback_model:

When activated, the fallback swaps the model and provider mid-session without losing your conversation. It fires **at most once** per session.

Supported providers: `openrouter`, `nous`, `openai-codex`, `anthropic`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `custom`.
Supported providers: `openrouter`, `nous`, `openai-codex`, `anthropic`, `huggingface`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`, `custom`.

:::tip
Fallback is configured exclusively through `config.yaml` — there are no environment variables for it. For full details on when it triggers, supported providers, and how it interacts with auxiliary tasks and delegation, see [Fallback Providers](/docs/user-guide/features/fallback-providers).

@@ -1,223 +0,0 @@
---
sidebar_position: 14
title: "API Server"
description: "Expose hermes-agent as an OpenAI-compatible API for any frontend"
---

# API Server

The API server exposes hermes-agent as an OpenAI-compatible HTTP endpoint. Any frontend that speaks the OpenAI format — Open WebUI, LobeChat, LibreChat, NextChat, ChatBox, and hundreds more — can connect to hermes-agent and use it as a backend.

Your agent handles requests with its full toolset (terminal, file operations, web search, memory, skills) and returns the final response. Tool calls execute invisibly server-side.

## Quick Start

### 1. Enable the API server

Add to `~/.hermes/.env`:

```bash
API_SERVER_ENABLED=true
```

### 2. Start the gateway

```bash
hermes gateway
```

You'll see:

```
[API Server] API server listening on http://127.0.0.1:8642
```

### 3. Connect a frontend

Point any OpenAI-compatible client at `http://localhost:8642/v1`:

```bash
# Test with curl
curl http://localhost:8642/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "hermes-agent", "messages": [{"role": "user", "content": "Hello!"}]}'
```

Or connect Open WebUI, LobeChat, or any other frontend — see the [Open WebUI integration guide](/docs/user-guide/messaging/open-webui) for step-by-step instructions.

## Endpoints

### POST /v1/chat/completions

Standard OpenAI Chat Completions format. Stateless — the full conversation is included in each request via the `messages` array.

**Request:**
```json
{
  "model": "hermes-agent",
  "messages": [
    {"role": "system", "content": "You are a Python expert."},
    {"role": "user", "content": "Write a fibonacci function"}
  ],
  "stream": false
}
```

**Response:**
```json
{
  "id": "chatcmpl-abc123",
  "object": "chat.completion",
  "created": 1710000000,
  "model": "hermes-agent",
  "choices": [{
    "index": 0,
    "message": {"role": "assistant", "content": "Here's a fibonacci function..."},
    "finish_reason": "stop"
  }],
  "usage": {"prompt_tokens": 50, "completion_tokens": 200, "total_tokens": 250}
}
```

**Streaming** (`"stream": true`): Returns Server-Sent Events (SSE) with token-by-token response chunks. When streaming is enabled in config, tokens are emitted live as the LLM generates them. When disabled, the full response is sent as a single SSE chunk.
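
To consume the stream from a script, here is a minimal sketch using the `openai` SDK against the local endpoint — the `api_key` value is a placeholder, since any non-empty string works when no `API_SERVER_KEY` is set:

```python
from openai import OpenAI  # pip install openai

client = OpenAI(base_url="http://localhost:8642/v1", api_key="not-needed")

# Iterate over SSE chunks as the agent produces them.
stream = client.chat.completions.create(
    model="hermes-agent",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```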

### POST /v1/responses

OpenAI Responses API format. Supports server-side conversation state via `previous_response_id` — the server stores full conversation history (including tool calls and results) so multi-turn context is preserved without the client managing it.

**Request:**
```json
{
  "model": "hermes-agent",
  "input": "What files are in my project?",
  "instructions": "You are a helpful coding assistant.",
  "store": true
}
```

**Response:**
```json
{
  "id": "resp_abc123",
  "object": "response",
  "status": "completed",
  "model": "hermes-agent",
  "output": [
    {"type": "function_call", "name": "terminal", "arguments": "{\"command\": \"ls\"}", "call_id": "call_1"},
    {"type": "function_call_output", "call_id": "call_1", "output": "README.md src/ tests/"},
    {"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "Your project has..."}]}
  ],
  "usage": {"input_tokens": 50, "output_tokens": 200, "total_tokens": 250}
}
```

#### Multi-turn with previous_response_id

Chain responses to maintain full context (including tool calls) across turns:

```json
{
  "input": "Now show me the README",
  "previous_response_id": "resp_abc123"
}
```

The server reconstructs the full conversation from the stored response chain — all previous tool calls and results are preserved.

#### Named conversations

Use the `conversation` parameter instead of tracking response IDs:

```json
{"input": "Hello", "conversation": "my-project"}
{"input": "What's in src/?", "conversation": "my-project"}
{"input": "Run the tests", "conversation": "my-project"}
```

The server automatically chains each request to the latest response in that conversation, much like the `/title` command does for gateway sessions.
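
The same flow from Python might look like the sketch below, using `requests`; the `ask` helper is hypothetical, and the output parsing follows the documented response shape above:

```python
import requests

API = "http://localhost:8642/v1/responses"

def ask(text: str) -> str:
    """Send one turn in the 'my-project' conversation and return the reply text."""
    r = requests.post(API, json={"input": text, "conversation": "my-project"})
    r.raise_for_status()
    # The final assistant message is the last "message" item in "output".
    for item in reversed(r.json()["output"]):
        if item["type"] == "message":
            return item["content"][0]["text"]
    return ""

print(ask("Hello"))
print(ask("What's in src/?"))
```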

### GET /v1/responses/\{id\}

Retrieve a previously stored response by ID.

### DELETE /v1/responses/\{id\}

Delete a stored response.

### GET /v1/models

Lists `hermes-agent` as an available model. Required by most frontends for model discovery.

### GET /health

Health check. Returns `{"status": "ok"}`.

## System Prompt Handling

When a frontend sends a `system` message (Chat Completions) or `instructions` field (Responses API), hermes-agent **layers it on top** of its core system prompt. Your agent keeps all its tools, memory, and skills — the frontend's system prompt adds extra instructions.

This means you can customize behavior per-frontend without losing capabilities, as the sketch below illustrates:

- Open WebUI system prompt: "You are a Python expert. Always include type hints."
- The agent still has terminal, file tools, web search, memory, etc.
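
A client-side request that layers a custom system prompt (a sketch with the `openai` SDK; the prompt text is illustrative):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8642/v1", api_key="not-needed")

# The system message is layered on top of the agent's core prompt,
# so tools, memory, and skills all remain available.
resp = client.chat.completions.create(
    model="hermes-agent",
    messages=[
        {"role": "system", "content": "You are a Python expert. Always include type hints."},
        {"role": "user", "content": "Write a fibonacci function"},
    ],
)
print(resp.choices[0].message.content)
```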

## Authentication

Bearer token auth via the `Authorization` header:

```
Authorization: Bearer ***
```

Configure the key via the `API_SERVER_KEY` env var. If no key is set, all requests are allowed (for local-only use).
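
With a key configured, clients pass it as the bearer token — for the OpenAI SDK that is simply the `api_key` argument (sketch; `your-secret-key` is a placeholder that must match `API_SERVER_KEY`):

```python
from openai import OpenAI

# The SDK sends this as "Authorization: Bearer your-secret-key".
client = OpenAI(base_url="http://localhost:8642/v1", api_key="your-secret-key")
```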

:::warning Security
The API server gives full access to hermes-agent's toolset, **including terminal commands**. If you change the bind address to `0.0.0.0` (network-accessible), **always set `API_SERVER_KEY`** — without it, anyone on your network can execute arbitrary commands on your machine.

The default bind address (`127.0.0.1`) is safe for local-only use.
:::

## Configuration

### Environment Variables

| Variable | Default | Description |
|----------|---------|-------------|
| `API_SERVER_ENABLED` | `false` | Enable the API server |
| `API_SERVER_PORT` | `8642` | HTTP server port |
| `API_SERVER_HOST` | `127.0.0.1` | Bind address (localhost only by default) |
| `API_SERVER_KEY` | _(none)_ | Bearer token for auth |

### config.yaml

```yaml
# Not yet supported — use environment variables.
# config.yaml support coming in a future release.
```

## CORS

The API server includes CORS headers on all responses (`Access-Control-Allow-Origin: *`), so browser-based frontends can connect directly.

## Compatible Frontends

Any frontend that supports the OpenAI API format works. Tested/documented integrations:

| Frontend | Stars | Connection |
|----------|-------|------------|
| [Open WebUI](/docs/user-guide/messaging/open-webui) | 126k | Full guide available |
| LobeChat | 73k | Custom provider endpoint |
| LibreChat | 34k | Custom endpoint in librechat.yaml |
| AnythingLLM | 56k | Generic OpenAI provider |
| NextChat | 87k | BASE_URL env var |
| ChatBox | 39k | API Host setting |
| Jan | 26k | Remote model config |
| HF Chat-UI | 8k | OPENAI_BASE_URL |
| big-AGI | 7k | Custom endpoint |
| OpenAI Python SDK | — | `OpenAI(base_url="http://localhost:8642/v1")` |
| curl | — | Direct HTTP requests |
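
A quick smoke test of the discovery endpoints from Python (a sketch using `requests`, assuming the standard OpenAI list shape for `/v1/models`):

```python
import requests

BASE = "http://localhost:8642"

# Health check — should print {"status": "ok"}
print(requests.get(f"{BASE}/health").json())

# Model discovery — most frontends call this first
models = requests.get(f"{BASE}/v1/models").json()
print([m["id"] for m in models["data"]])  # expect ["hermes-agent"]
```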

## Limitations

- **Response storage is in-memory** — stored responses (for `previous_response_id`) are lost on gateway restart. Max 100 stored responses (LRU eviction).
- **No file upload** — vision/document analysis via uploaded files is not yet supported through the API.
- **Model field is cosmetic** — the `model` field in requests is accepted but the actual LLM model used is configured server-side in config.yaml.

@@ -44,6 +44,7 @@ Both `provider` and `model` are **required**. If either is missing, the fallback
| MiniMax | `minimax` | `MINIMAX_API_KEY` |
| MiniMax (China) | `minimax-cn` | `MINIMAX_CN_API_KEY` |
| Kilo Code | `kilocode` | `KILOCODE_API_KEY` |
| Hugging Face | `huggingface` | `HF_TOKEN` |
| Custom endpoint | `custom` | `base_url` + `api_key_env` (see below) |

### Custom Endpoint Fallback

@@ -161,7 +162,7 @@ When a task's provider is set to `"auto"` (the default), Hermes tries providers

```text
OpenRouter → Nous Portal → Custom endpoint → Codex OAuth →
API-key providers (z.ai, Kimi, MiniMax, Anthropic) → give up
API-key providers (z.ai, Kimi, MiniMax, Hugging Face, Anthropic) → give up
```

**For vision tasks:**

@@ -1,7 +1,7 @@
---
sidebar_position: 1
title: "Messaging Gateway"
description: "Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, SMS, Email, Home Assistant, Mattermost, Matrix, DingTalk, or any OpenAI-compatible frontend via the API server — architecture and setup overview"
description: "Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, SMS, Email, Home Assistant, Mattermost, Matrix, DingTalk, or your browser — architecture and setup overview"
---

# Messaging Gateway

@@ -27,7 +27,6 @@ flowchart TB
    mm[Mattermost]
    mx[Matrix]
    dt[DingTalk]
    api["API Server<br/>(OpenAI-compatible)"]
  end

  store["Session store<br/>per chat"]

@@ -46,7 +45,6 @@ flowchart TB
  mm --> store
  mx --> store
  dt --> store
  api --> store
  store --> agent
  cron --> store
```

@@ -308,7 +306,6 @@ Each platform has its own toolset:
| Mattermost | `hermes-mattermost` | Full tools including terminal |
| Matrix | `hermes-matrix` | Full tools including terminal |
| DingTalk | `hermes-dingtalk` | Full tools including terminal |
| API Server | `hermes` (default) | Full tools including terminal |

## Next Steps

@@ -323,4 +320,3 @@ Each platform has its own toolset:
- [Mattermost Setup](mattermost.md)
- [Matrix Setup](matrix.md)
- [DingTalk Setup](dingtalk.md)
- [Open WebUI + API Server](open-webui.md)

@@ -1,213 +0,0 @@
---
sidebar_position: 8
title: "Open WebUI"
description: "Connect Open WebUI to Hermes Agent via the OpenAI-compatible API server"
---

# Open WebUI Integration

[Open WebUI](https://github.com/open-webui/open-webui) (126k★) is the most popular self-hosted chat interface for AI. With Hermes Agent's built-in API server, you can use Open WebUI as a polished web frontend for your agent — complete with conversation management, user accounts, and a modern chat interface.

## Architecture

```
┌──────────────────┐  POST /v1/chat/completions   ┌──────────────────────┐
│   Open WebUI     │ ───────────────────────────► │   hermes-agent       │
│  (browser UI)    │    SSE streaming response    │  gateway API server  │
│   port 3000      │ ◄─────────────────────────── │   port 8642          │
└──────────────────┘                              └──────────────────────┘
```

Open WebUI connects to Hermes Agent's API server just like it would connect to OpenAI. Your agent handles the requests with its full toolset — terminal, file operations, web search, memory, skills — and returns the final response.

## Quick Setup

### 1. Enable the API server

Add to `~/.hermes/.env`:

```bash
API_SERVER_ENABLED=true
# Optional: set a key for auth (recommended if accessible beyond localhost)
# API_SERVER_KEY=your-secret-key
```

### 2. Start Hermes Agent gateway

```bash
hermes gateway
```

You should see:

```
[API Server] API server listening on http://127.0.0.1:8642
```

### 3. Start Open WebUI

```bash
docker run -d -p 3000:8080 \
  -e OPENAI_API_BASE_URL=http://host.docker.internal:8642/v1 \
  -e OPENAI_API_KEY=not-needed \
  --add-host=host.docker.internal:host-gateway \
  -v open-webui:/app/backend/data \
  --name open-webui \
  --restart always \
  ghcr.io/open-webui/open-webui:main
```

If you set an `API_SERVER_KEY`, use it instead of `not-needed`:

```bash
  -e OPENAI_API_KEY=your-secret-key
```

### 4. Open the UI

Go to **http://localhost:3000**. Create your admin account (the first user becomes admin). You should see **hermes-agent** in the model dropdown. Start chatting!

## Docker Compose Setup

For a more permanent setup, create a `docker-compose.yml`:

```yaml
services:
  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    ports:
      - "3000:8080"
    volumes:
      - open-webui:/app/backend/data
    environment:
      - OPENAI_API_BASE_URL=http://host.docker.internal:8642/v1
      - OPENAI_API_KEY=not-needed
    extra_hosts:
      - "host.docker.internal:host-gateway"
    restart: always

volumes:
  open-webui:
```

Then:

```bash
docker compose up -d
```

## Configuring via the Admin UI

If you prefer to configure the connection through the UI instead of environment variables:

1. Log in to Open WebUI at **http://localhost:3000**
2. Click your **profile avatar** → **Admin Settings**
3. Go to **Connections**
4. Under **OpenAI API**, click the **wrench icon** (Manage)
5. Click **+ Add New Connection**
6. Enter:
   - **URL**: `http://host.docker.internal:8642/v1`
   - **API Key**: your key or any non-empty value (e.g., `not-needed`)
7. Click the **checkmark** to verify the connection
8. **Save**

The **hermes-agent** model should now appear in the model dropdown.

:::warning
Environment variables only take effect on Open WebUI's **first launch**. After that, connection settings are stored in its internal database. To change them later, use the Admin UI or delete the Docker volume and start fresh.
:::

## API Type: Chat Completions vs Responses

Open WebUI supports two API modes when connecting to a backend:

| Mode | Format | When to use |
|------|--------|-------------|
| **Chat Completions** (default) | `/v1/chat/completions` | Recommended. Works out of the box. |
| **Responses** (experimental) | `/v1/responses` | For server-side conversation state via `previous_response_id`. |

### Using Chat Completions (recommended)

This is the default and requires no extra configuration. Open WebUI sends standard OpenAI-format requests and Hermes Agent responds accordingly. Each request includes the full conversation history.

### Using Responses API

To use the Responses API mode:

1. Go to **Admin Settings** → **Connections** → **OpenAI** → **Manage**
2. Edit your hermes-agent connection
3. Change **API Type** from "Chat Completions" to **"Responses (Experimental)"**
4. Save

With the Responses API, Open WebUI sends requests in the Responses format (`input` array + `instructions`), and Hermes Agent can preserve full tool call history across turns via `previous_response_id`.

:::note
Open WebUI currently manages conversation history client-side even in Responses mode — it sends the full message history in each request rather than using `previous_response_id`. The Responses API mode is mainly useful for future compatibility as frontends evolve.
:::

## How It Works

When you send a message in Open WebUI:

1. Open WebUI sends a `POST /v1/chat/completions` request with your message and conversation history
2. Hermes Agent creates an AIAgent instance with its full toolset
3. The agent processes your request — it may call tools (terminal, file operations, web search, etc.)
4. Tool calls happen invisibly server-side
5. The agent's final text response is returned to Open WebUI
6. Open WebUI displays the response in its chat interface

Your agent has access to all the same tools and capabilities as when using the CLI or Telegram — the only difference is the frontend.

## Configuration Reference

### Hermes Agent (API server)

| Variable | Default | Description |
|----------|---------|-------------|
| `API_SERVER_ENABLED` | `false` | Enable the API server |
| `API_SERVER_PORT` | `8642` | HTTP server port |
| `API_SERVER_HOST` | `127.0.0.1` | Bind address |
| `API_SERVER_KEY` | _(none)_ | Bearer token for auth. No key = allow all. |

### Open WebUI

| Variable | Description |
|----------|-------------|
| `OPENAI_API_BASE_URL` | Hermes Agent's API URL (include `/v1`) |
| `OPENAI_API_KEY` | Must be non-empty. Match your `API_SERVER_KEY`. |

## Troubleshooting

### No models appear in the dropdown

- **Check the URL has `/v1` suffix**: `http://host.docker.internal:8642/v1` (not just `:8642`)
- **Verify the gateway is running**: `curl http://localhost:8642/health` should return `{"status": "ok"}`
- **Check model listing**: `curl http://localhost:8642/v1/models` should return a list with `hermes-agent`
- **Docker networking**: From inside Docker, `localhost` means the container, not your host. Use `host.docker.internal` or `--network=host`.

### Connection test passes but no models load

This is almost always the missing `/v1` suffix. Open WebUI's connection test is a basic connectivity check — it doesn't verify model listing works.

### Response takes a long time

Hermes Agent may be executing multiple tool calls (reading files, running commands, searching the web) before producing its final response. This is normal for complex queries. The response appears all at once when the agent finishes.

### "Invalid API key" errors

Make sure your `OPENAI_API_KEY` in Open WebUI matches the `API_SERVER_KEY` in Hermes Agent. If no key is configured on the Hermes side, any non-empty value works.

## Linux Docker (no Docker Desktop)

On Linux without Docker Desktop, `host.docker.internal` doesn't resolve by default. Options:

```bash
# Option 1: Add host mapping
docker run --add-host=host.docker.internal:host-gateway ...

# Option 2: Use host networking
docker run --network=host -e OPENAI_API_BASE_URL=http://localhost:8642/v1 ...

# Option 3: Use Docker bridge IP
docker run -e OPENAI_API_BASE_URL=http://172.17.0.1:8642/v1 ...
```
@@ -140,14 +140,7 @@ Hermes supports voice on WhatsApp:

- **Incoming:** Voice messages (`.ogg` opus) are automatically transcribed using the configured STT provider: local `faster-whisper`, Groq Whisper (`GROQ_API_KEY`), or OpenAI Whisper (`VOICE_TOOLS_OPENAI_KEY`)
- **Outgoing:** TTS responses are sent as MP3 audio file attachments
- Agent responses are prefixed with "⚕ **Hermes Agent**" by default. You can customize or disable this in `config.yaml`:

  ```yaml
  # ~/.hermes/config.yaml
  whatsapp:
    reply_prefix: ""  # Empty string disables the header
    # reply_prefix: "🤖 *My Bot*\n──────\n"  # Custom prefix (supports \n for newlines)
  ```
- Agent responses are prefixed with "⚕ **Hermes Agent**" for easy identification

---

@@ -51,7 +51,6 @@ const sidebars: SidebarsConfig = {
        'user-guide/messaging/mattermost',
        'user-guide/messaging/matrix',
        'user-guide/messaging/dingtalk',
        'user-guide/messaging/open-webui',
      ],
    },
    {

@@ -91,7 +90,6 @@ const sidebars: SidebarsConfig = {
      type: 'category',
      label: 'Integrations',
      items: [
        'user-guide/features/api-server',
        'user-guide/features/acp',
        'user-guide/features/mcp',
        'user-guide/features/honcho',