From b84f9e410c011d399425056b82e4eb9c0db7a7a8 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 7 Mar 2026 10:14:19 -0800 Subject: [PATCH] feat: lower default reasoning effort from xhigh to medium Reduces token usage and latency for most tasks by defaulting to medium reasoning effort instead of xhigh. Users can still override via config or CLI flag. Also appends a sentence to the default agent identity prompt asking the agent to be targeted and efficient in its exploration and investigations. Updates code, tests, example config, and docs. --- agent/prompt_builder.py | 3 ++- batch_runner.py | 4 ++-- cli-config.yaml.example | 2 +- cli.py | 4 ++-- gateway/run.py | 4 ++-- run_agent.py | 12 ++++++------ tests/test_provider_parity.py | 12 ++++++------ tests/test_run_agent.py | 4 ++-- website/docs/user-guide/configuration.md | 4 ++-- 9 files changed, 25 insertions(+), 24 deletions(-) diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py index c576b55c1f..b86be15a49 100644 --- a/agent/prompt_builder.py +++ b/agent/prompt_builder.py @@ -66,7 +66,8 @@ DEFAULT_AGENT_IDENTITY = ( "range of tasks including answering questions, writing and editing code, " "analyzing information, creative work, and executing actions via your tools. " "You communicate clearly, admit uncertainty when appropriate, and prioritize " - "being genuinely useful over being verbose unless otherwise directed below." + "being genuinely useful over being verbose unless otherwise directed below. " + "Be targeted and efficient in your exploration and investigations." ) MEMORY_GUIDANCE = ( diff --git a/batch_runner.py b/batch_runner.py index 1bd6745b96..a4c402ffdc 100644 --- a/batch_runner.py +++ b/batch_runner.py @@ -1155,7 +1155,7 @@ def main( providers_order (str): Comma-separated list of OpenRouter providers to try in order (e.g. 
"anthropic,openai,google") provider_sort (str): Sort providers by "price", "throughput", or "latency" (OpenRouter only) max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set) - reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "xhigh") + reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "medium") reasoning_disabled (bool): Completely disable reasoning/thinking tokens (default: False) prefill_messages_file (str): Path to JSON file containing prefill messages (list of {role, content} dicts) max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set) @@ -1216,7 +1216,7 @@ def main( providers_order_list = [p.strip() for p in providers_order.split(",")] if providers_order else None # Build reasoning_config from CLI flags - # --reasoning_disabled takes priority, then --reasoning_effort, then default (xhigh) + # --reasoning_disabled takes priority, then --reasoning_effort, then default (medium) reasoning_config = None if reasoning_disabled: # Completely disable reasoning/thinking tokens diff --git a/cli-config.yaml.example b/cli-config.yaml.example index d8489d95b2..f0d5a95bd4 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -285,7 +285,7 @@ agent: # Reasoning effort level (OpenRouter and Nous Portal) # Controls how much "thinking" the model does before responding. # Options: "xhigh" (max), "high", "medium", "low", "minimal", "none" (disable) - reasoning_effort: "xhigh" + reasoning_effort: "medium" # Predefined personalities (use with /personality command) personalities: diff --git a/cli.py b/cli.py index 4d1941f818..7dd74b0b21 100755 --- a/cli.py +++ b/cli.py @@ -108,7 +108,7 @@ def _parse_reasoning_config(effort: str) -> dict | None: """Parse a reasoning effort level into an OpenRouter reasoning config dict. 
Valid levels: "xhigh", "high", "medium", "low", "minimal", "none". - Returns None to use the default (xhigh), or a config dict to override. + Returns None to use the default (medium), or a config dict to override. """ if not effort or not effort.strip(): return None @@ -118,7 +118,7 @@ def _parse_reasoning_config(effort: str) -> dict | None: valid = ("xhigh", "high", "medium", "low", "minimal") if effort in valid: return {"enabled": True, "effort": effort} - logger.warning("Unknown reasoning_effort '%s', using default (xhigh)", effort) + logger.warning("Unknown reasoning_effort '%s', using default (medium)", effort) return None diff --git a/gateway/run.py b/gateway/run.py index 99fd2443f5..3ed81379a8 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -330,7 +330,7 @@ class GatewayRunner: Checks HERMES_REASONING_EFFORT env var first, then agent.reasoning_effort in config.yaml. Valid: "xhigh", "high", "medium", "low", "minimal", "none". - Returns None to use default (xhigh). + Returns None to use default (medium). """ effort = os.getenv("HERMES_REASONING_EFFORT", "") if not effort: @@ -351,7 +351,7 @@ class GatewayRunner: valid = ("xhigh", "high", "medium", "low", "minimal") if effort in valid: return {"enabled": True, "effort": effort} - logger.warning("Unknown reasoning_effort '%s', using default (xhigh)", effort) + logger.warning("Unknown reasoning_effort '%s', using default (medium)", effort) return None @staticmethod diff --git a/run_agent.py b/run_agent.py index 00c43657b2..2fce80a9ab 100644 --- a/run_agent.py +++ b/run_agent.py @@ -213,7 +213,7 @@ class AIAgent: Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error. max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set) reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking). - If None, defaults to {"enabled": True, "effort": "xhigh"} for OpenRouter. 
Set to disable/customize reasoning. + If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning. prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context. Useful for injecting a few-shot example or priming the model's response style. Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}] @@ -287,7 +287,7 @@ class AIAgent: # Model response configuration self.max_tokens = max_tokens # None = use model default - self.reasoning_config = reasoning_config # None = use default (xhigh for OpenRouter) + self.reasoning_config = reasoning_config # None = use default (medium for OpenRouter) self.prefill_messages = prefill_messages or [] # Prefilled conversation turns # Anthropic prompt caching: auto-enabled for Claude models via OpenRouter. @@ -2157,8 +2157,8 @@ class AIAgent: if not instructions: instructions = DEFAULT_AGENT_IDENTITY - # Resolve reasoning effort: config > default (xhigh) - reasoning_effort = "xhigh" + # Resolve reasoning effort: config > default (medium) + reasoning_effort = "medium" reasoning_enabled = True if self.reasoning_config and isinstance(self.reasoning_config, dict): if self.reasoning_config.get("enabled") is False: @@ -2224,7 +2224,7 @@ class AIAgent: else: extra_body["reasoning"] = { "enabled": True, - "effort": "xhigh" + "effort": "medium" } # Nous Portal product attribution @@ -2767,7 +2767,7 @@ class AIAgent: else: summary_extra_body["reasoning"] = { "enabled": True, - "effort": "xhigh" + "effort": "medium" } if _is_nous: summary_extra_body["tags"] = ["product=hermes-agent"] diff --git a/tests/test_provider_parity.py b/tests/test_provider_parity.py index 00fc4dd9b7..2ee3131449 100644 --- a/tests/test_provider_parity.py +++ b/tests/test_provider_parity.py @@ -145,7 +145,7 @@ class TestBuildApiKwargsCodex: messages = [{"role": "user", "content": "hi"}] kwargs = agent._build_api_kwargs(messages) assert "reasoning" in kwargs 
- assert kwargs["reasoning"]["effort"] == "xhigh" + assert kwargs["reasoning"]["effort"] == "medium" def test_includes_encrypted_content_in_include(self, monkeypatch): agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses", @@ -596,19 +596,19 @@ class TestCodexReasoningPreflight: # ── Reasoning effort consistency tests ─────────────────────────────────────── class TestReasoningEffortDefaults: - """Verify reasoning effort defaults to xhigh across all provider paths.""" + """Verify reasoning effort defaults to medium across all provider paths.""" - def test_openrouter_default_xhigh(self, monkeypatch): + def test_openrouter_default_medium(self, monkeypatch): agent = _make_agent(monkeypatch, "openrouter") kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}]) reasoning = kwargs["extra_body"]["reasoning"] - assert reasoning["effort"] == "xhigh" + assert reasoning["effort"] == "medium" - def test_codex_default_xhigh(self, monkeypatch): + def test_codex_default_medium(self, monkeypatch): agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses", base_url="https://chatgpt.com/backend-api/codex") kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}]) - assert kwargs["reasoning"]["effort"] == "xhigh" + assert kwargs["reasoning"]["effort"] == "medium" def test_codex_reasoning_disabled(self, monkeypatch): agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses", diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py index ae7924d458..226b29a6d6 100644 --- a/tests/test_run_agent.py +++ b/tests/test_run_agent.py @@ -498,12 +498,12 @@ class TestBuildApiKwargs: assert kwargs["extra_body"]["provider"]["only"] == ["Anthropic"] def test_reasoning_config_default_openrouter(self, agent): - """Default reasoning config for OpenRouter should be xhigh.""" + """Default reasoning config for OpenRouter should be medium.""" messages = [{"role": "user", "content": "hi"}] kwargs = 
agent._build_api_kwargs(messages) reasoning = kwargs["extra_body"]["reasoning"] assert reasoning["enabled"] is True - assert reasoning["effort"] == "xhigh" + assert reasoning["effort"] == "medium" def test_reasoning_config_custom(self, agent): agent.reasoning_config = {"enabled": False} diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index 6d6897794b..33193619c4 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -421,10 +421,10 @@ Control how much "thinking" the model does before responding: ```yaml agent: - reasoning_effort: "" # empty = use model default. Options: xhigh (max), high, medium, low, minimal, none + reasoning_effort: "" # empty = medium (default). Options: xhigh (max), high, medium, low, minimal, none ``` -When unset (default), the model's own default reasoning level is used. Setting a value overrides it — higher reasoning effort gives better results on complex tasks at the cost of more tokens and latency. +When unset (default), reasoning effort defaults to "medium" — a balanced level that works well for most tasks. Setting a value overrides it — higher reasoning effort gives better results on complex tasks at the cost of more tokens and latency. ## TTS Configuration