mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
feat(providers): enforce request_timeout_seconds on OpenAI-wire primary calls
Live test with timeout_seconds: 0.5 on claude-sonnet-4.6 proved the initial wiring was insufficient: run_agent.py was overriding the client-level timeout on every call via hardcoded per-request kwargs.

Root cause: run_agent.py had two sites that pass an explicit timeout= kwarg into chat.completions.create():
- api_kwargs['timeout'] at line 7075 (HERMES_API_TIMEOUT, default 1800s)
- the streaming path's _httpx.Timeout(..., read=HERMES_STREAM_READ_TIMEOUT, ...) at line 5760 (default 120s)
Both override the per-provider config value the client was constructed with, so a 0.5s config timeout was silently not enforced.

This commit:
- Adds AIAgent._resolved_api_call_timeout() — config > HERMES_API_TIMEOUT env > 1800s default.
- Uses it for the non-streaming api_kwargs['timeout'] field.
- Uses it for the streaming path's httpx.Timeout(connect, read, write, pool) so both connect and read respect the configured value when set. The local-provider auto-bump (Ollama/vLLM cold start) only applies when no explicit config value is set.
- New test: test_resolved_api_call_timeout_priority covers all three precedence cases (config, env, default).

Live verified: a 0.5s config on claude-sonnet-4.6 now triggers APITimeoutError at ~3s per retry and exhausts 3 retries in ~15s total (previously: success after 29-47s with the timeout ignored). The positive case (60s config + gpt-4o-mini) still succeeds in 1.3s.
This commit is contained in:
@@ -69,8 +69,12 @@ model:
|
|||||||
# Use this for per-provider request timeouts and per-model exceptions.
|
# Use this for per-provider request timeouts and per-model exceptions.
|
||||||
# Applies to the primary turn client on every api_mode (OpenAI-wire, native
|
# Applies to the primary turn client on every api_mode (OpenAI-wire, native
|
||||||
# Anthropic, and Anthropic-compatible providers), the fallback chain, and
|
# Anthropic, and Anthropic-compatible providers), the fallback chain, and
|
||||||
# client rebuilds during credential rotation. Leaving these unset keeps the
|
# client rebuilds during credential rotation. For OpenAI-wire chat
|
||||||
# SDK defaults (OpenAI ≈ 600s, native Anthropic 900s).
|
# completions (streaming and non-streaming) the configured value is also
|
||||||
|
# used as the per-request ``timeout=`` kwarg so it wins over the legacy
|
||||||
|
# HERMES_API_TIMEOUT env var (which still applies when no config is set).
|
||||||
|
# Leaving these unset keeps the legacy defaults (HERMES_API_TIMEOUT=1800s,
|
||||||
|
# native Anthropic 900s).
|
||||||
#
|
#
|
||||||
# providers:
|
# providers:
|
||||||
# ollama-local:
|
# ollama-local:
|
||||||
|
|||||||
58
run_agent.py
58
run_agent.py
@@ -2102,6 +2102,26 @@ class AIAgent:
|
|||||||
url = (base_url or self._base_url_lower).lower()
|
url = (base_url or self._base_url_lower).lower()
|
||||||
return "api.openai.com" in url and "openrouter" not in url
|
return "api.openai.com" in url and "openrouter" not in url
|
||||||
|
|
||||||
|
def _resolved_api_call_timeout(self) -> float:
    """Resolve the effective per-call request timeout in seconds.

    Priority:
      1. ``providers.<id>.models.<model>.timeout_seconds`` (per-model override)
      2. ``providers.<id>.request_timeout_seconds`` (provider-wide)
      3. ``HERMES_API_TIMEOUT`` env var (legacy escape hatch)
      4. 1800.0s default

    Used by OpenAI-wire chat completions (streaming and non-streaming) so
    the per-provider config knob wins over the 1800s default. Without this
    helper, the hardcoded ``HERMES_API_TIMEOUT`` fallback would always be
    passed as a per-call ``timeout=`` kwarg, overriding the client-level
    timeout the AIAgent.__init__ path configured.

    An unset or blank ``HERMES_API_TIMEOUT`` yields the 1800.0s default.
    A non-numeric value is ignored with a warning and also yields the
    default, rather than raising ``ValueError`` on every API call.

    Returns:
        The timeout, in seconds, to pass as the per-call ``timeout=`` kwarg.
    """
    default_timeout = 1800.0

    # ``None`` means no per-model / provider-wide value is configured.
    # NOTE(review): the helper presumably applies priorities 1 and 2 itself;
    # confirm against its definition.
    cfg = get_provider_request_timeout(self.provider, self.model)
    if cfg is not None:
        return cfg

    # ``or ""`` also covers a variable that is exported but empty
    # (``HERMES_API_TIMEOUT=``), which ``float()`` would reject.
    raw = (os.getenv("HERMES_API_TIMEOUT") or "").strip()
    if not raw:
        return default_timeout
    try:
        return float(raw)
    except ValueError:
        logger.warning(
            "Ignoring invalid HERMES_API_TIMEOUT=%r; using %.0fs default",
            raw, default_timeout,
        )
        return default_timeout
|
||||||
|
|
||||||
def _is_openrouter_url(self) -> bool:
|
def _is_openrouter_url(self) -> bool:
|
||||||
"""Return True when the base URL targets OpenRouter."""
|
"""Return True when the base URL targets OpenRouter."""
|
||||||
return "openrouter" in self._base_url_lower
|
return "openrouter" in self._base_url_lower
|
||||||
@@ -5754,18 +5774,30 @@ class AIAgent:
|
|||||||
def _call_chat_completions():
|
def _call_chat_completions():
|
||||||
"""Stream a chat completions response."""
|
"""Stream a chat completions response."""
|
||||||
import httpx as _httpx
|
import httpx as _httpx
|
||||||
_base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0))
|
# Per-provider / per-model request_timeout_seconds (from config.yaml)
|
||||||
_stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0))
|
# wins over the HERMES_API_TIMEOUT env default if the user set it.
|
||||||
# Local providers (Ollama, llama.cpp, vLLM) can take minutes for
|
_provider_timeout_cfg = get_provider_request_timeout(self.provider, self.model)
|
||||||
# prefill on large contexts before producing the first token.
|
_base_timeout = (
|
||||||
# Auto-increase the httpx read timeout unless the user explicitly
|
_provider_timeout_cfg
|
||||||
# overrode HERMES_STREAM_READ_TIMEOUT.
|
if _provider_timeout_cfg is not None
|
||||||
if _stream_read_timeout == 120.0 and self.base_url and is_local_endpoint(self.base_url):
|
else float(os.getenv("HERMES_API_TIMEOUT", 1800.0))
|
||||||
_stream_read_timeout = _base_timeout
|
)
|
||||||
logger.debug(
|
# Read timeout: config wins here too. Otherwise use
|
||||||
"Local provider detected (%s) — stream read timeout raised to %.0fs",
|
# HERMES_STREAM_READ_TIMEOUT (default 120s) for cloud providers.
|
||||||
self.base_url, _stream_read_timeout,
|
if _provider_timeout_cfg is not None:
|
||||||
)
|
_stream_read_timeout = _provider_timeout_cfg
|
||||||
|
else:
|
||||||
|
_stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0))
|
||||||
|
# Local providers (Ollama, llama.cpp, vLLM) can take minutes for
|
||||||
|
# prefill on large contexts before producing the first token.
|
||||||
|
# Auto-increase the httpx read timeout unless the user explicitly
|
||||||
|
# overrode HERMES_STREAM_READ_TIMEOUT.
|
||||||
|
if _stream_read_timeout == 120.0 and self.base_url and is_local_endpoint(self.base_url):
|
||||||
|
_stream_read_timeout = _base_timeout
|
||||||
|
logger.debug(
|
||||||
|
"Local provider detected (%s) — stream read timeout raised to %.0fs",
|
||||||
|
self.base_url, _stream_read_timeout,
|
||||||
|
)
|
||||||
stream_kwargs = {
|
stream_kwargs = {
|
||||||
**api_kwargs,
|
**api_kwargs,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
@@ -7081,7 +7113,7 @@ class AIAgent:
|
|||||||
api_kwargs = {
|
api_kwargs = {
|
||||||
"model": self.model,
|
"model": self.model,
|
||||||
"messages": sanitized_messages,
|
"messages": sanitized_messages,
|
||||||
"timeout": float(os.getenv("HERMES_API_TIMEOUT", 1800.0)),
|
"timeout": self._resolved_api_call_timeout(),
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
from agent.auxiliary_client import _fixed_temperature_for_model
|
from agent.auxiliary_client import _fixed_temperature_for_model
|
||||||
|
|||||||
@@ -95,3 +95,66 @@ def test_anthropic_adapter_honors_timeout_kwarg():
|
|||||||
# Connect timeout always stays at 10s regardless
|
# Connect timeout always stays at 10s regardless
|
||||||
assert c_default.timeout.connect == 10.0
|
assert c_default.timeout.connect == 10.0
|
||||||
assert c_custom.timeout.connect == 10.0
|
assert c_custom.timeout.connect == 10.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolved_api_call_timeout_priority(monkeypatch, tmp_path):
|
||||||
|
"""AIAgent._resolved_api_call_timeout() honors config > env > default priority."""
|
||||||
|
# Isolate HERMES_HOME
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
(tmp_path / ".env").write_text("", encoding="utf-8")
|
||||||
|
|
||||||
|
# Case A: config wins over env var
|
||||||
|
_write_config(tmp_path, """\
|
||||||
|
providers:
|
||||||
|
openrouter:
|
||||||
|
request_timeout_seconds: 77
|
||||||
|
models:
|
||||||
|
openai/gpt-4o-mini:
|
||||||
|
timeout_seconds: 42
|
||||||
|
""")
|
||||||
|
monkeypatch.setenv("HERMES_API_TIMEOUT", "999")
|
||||||
|
|
||||||
|
from run_agent import AIAgent
|
||||||
|
agent = AIAgent(
|
||||||
|
model="openai/gpt-4o-mini",
|
||||||
|
provider="openrouter",
|
||||||
|
api_key="sk-dummy",
|
||||||
|
base_url="https://openrouter.ai/api/v1",
|
||||||
|
quiet_mode=True,
|
||||||
|
skip_context_files=True,
|
||||||
|
skip_memory=True,
|
||||||
|
platform="cli",
|
||||||
|
)
|
||||||
|
# Per-model override wins
|
||||||
|
assert agent._resolved_api_call_timeout() == 42.0
|
||||||
|
|
||||||
|
# Provider-level (different model, no per-model override)
|
||||||
|
agent.model = "some/other-model"
|
||||||
|
assert agent._resolved_api_call_timeout() == 77.0
|
||||||
|
|
||||||
|
# Case B: no config → env wins
|
||||||
|
_write_config(tmp_path, "")
|
||||||
|
# Clear the cached config load
|
||||||
|
import importlib
|
||||||
|
from hermes_cli import config as cfg_mod
|
||||||
|
importlib.reload(cfg_mod)
|
||||||
|
from hermes_cli import timeouts as to_mod
|
||||||
|
importlib.reload(to_mod)
|
||||||
|
import run_agent as ra_mod
|
||||||
|
importlib.reload(ra_mod)
|
||||||
|
|
||||||
|
agent2 = ra_mod.AIAgent(
|
||||||
|
model="some/model",
|
||||||
|
provider="openrouter",
|
||||||
|
api_key="sk-dummy",
|
||||||
|
base_url="https://openrouter.ai/api/v1",
|
||||||
|
quiet_mode=True,
|
||||||
|
skip_context_files=True,
|
||||||
|
skip_memory=True,
|
||||||
|
platform="cli",
|
||||||
|
)
|
||||||
|
assert agent2._resolved_api_call_timeout() == 999.0
|
||||||
|
|
||||||
|
# Case C: no config, no env → 1800.0 default
|
||||||
|
monkeypatch.delenv("HERMES_API_TIMEOUT", raising=False)
|
||||||
|
assert agent2._resolved_api_call_timeout() == 1800.0
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ For AI provider setup (OpenRouter, Anthropic, Copilot, custom endpoints, self-ho
|
|||||||
|
|
||||||
### Provider Request Timeouts
|
### Provider Request Timeouts
|
||||||
|
|
||||||
You can set `providers.<id>.request_timeout_seconds` for a provider-wide timeout, plus `providers.<id>.models.<model>.timeout_seconds` for a model-specific override. Applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, and rebuilds after credential rotation. Leaving these unset keeps SDK defaults (OpenAI ≈ 600s, native Anthropic 900s). See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example).
|
You can set `providers.<id>.request_timeout_seconds` for a provider-wide timeout, plus `providers.<id>.models.<model>.timeout_seconds` for a model-specific override. The configured value applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, and client rebuilds after credential rotation. For OpenAI-wire chat completions it is also passed as the per-request `timeout` kwarg, so it takes precedence over the legacy `HERMES_API_TIMEOUT` env var. Leaving these unset keeps the legacy defaults (`HERMES_API_TIMEOUT`, which defaults to 1800s; native Anthropic 900s). See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example).
|
||||||
|
|
||||||
## Terminal Backend Configuration
|
## Terminal Backend Configuration
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user