mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
fix(nous): don't trip cross-session rate breaker on upstream-capacity 429s (#15898)
Nous Portal multiplexes multiple upstream providers (DeepSeek, Kimi, MiMo, Hermes) behind one endpoint. Before this fix, any 429 on any of those models recorded a cross-session file breaker that blocked EVERY model on Nous for the cooldown window -- even though the caller's own RPM/RPH/TPM/TPH buckets were healthy. Users hit a DeepSeek V4 Pro capacity error, restarted, switched to Kimi 2.6, and still got 'Nous Portal rate limit active -- resets in 46m 53s'. Nous already emits the full x-ratelimit-* header suite on every response (captured by rate_limit_tracker into agent._rate_limit_state). We now gate the breaker on that data: trip it only when either the 429's own headers or the last-known-good state show a bucket with remaining == 0 AND a reset window >= 60s. Upstream-capacity 429s (healthy buckets everywhere, but upstream out of capacity) fall through to normal retry/fallback and the breaker is never written. Note: the in-memory 'restart TUI/gateway to clear' workaround circulated in Discord does NOT work -- the breaker is file-backed at ~/.hermes/rate_limits/nous.json. The workaround for users still affected by a bad state file is to delete it. Reported in Discord by CrazyDok1 and KYSIV (Apr 2026).
This commit is contained in:
@@ -180,3 +180,145 @@ def format_remaining(seconds: float) -> str:
|
|||||||
h, remainder = divmod(s, 3600)
|
h, remainder = divmod(s, 3600)
|
||||||
m = remainder // 60
|
m = remainder // 60
|
||||||
return f"{h}h {m}m" if m else f"{h}h"
|
return f"{h}h {m}m" if m else f"{h}h"
|
||||||
|
|
||||||
|
|
||||||
|
# Buckets with reset windows shorter than this are treated as transient
|
||||||
|
# (upstream jitter, secondary throttling) rather than a genuine quota
|
||||||
|
# exhaustion worth a cross-session breaker trip.
|
||||||
|
_MIN_RESET_FOR_BREAKER_SECONDS = 60.0
|
||||||
|
|
||||||
|
|
||||||
|
def is_genuine_nous_rate_limit(
    *,
    headers: Optional[Mapping[str, str]] = None,
    last_known_state: Optional[Any] = None,
) -> bool:
    """Decide whether a 429 from Nous Portal is a real account rate limit.

    Nous Portal fronts several upstream providers (DeepSeek, Kimi, MiMo,
    Hermes, ...) behind a single endpoint, so a 429 is ambiguous:

    (a) The caller's own RPM / RPH / TPM / TPH bucket on Nous is
        exhausted — a genuine rate limit that persists until the bucket
        resets.
    (b) An upstream provider is out of capacity for one specific model —
        transient, clears within seconds, and unrelated to the caller's
        quota on Nous.

    Tripping the cross-session breaker on (b) would block every Nous
    request (all models share one provider key) for minutes even though
    the account is healthy and another model would have worked — the bug
    where a DeepSeek V4 Pro 429 ends up blocking Kimi 2.6 and MiMo V2.5
    Pro.

    Two signals distinguish the cases:

    1. The 429 response's own ``x-ratelimit-*`` headers (Nous emits the
       full suite on every response, 429s included). A bucket with
       ``remaining == 0`` and a reset window >= 60s proves (a).
    2. The last-known-good state captured by ``_capture_rate_limits()``
       on the previous successful response. A near-exhausted bucket with
       a substantial reset window there means the current 429 is almost
       certainly (a) continuing.

    When neither signal fires we treat the 429 as (b): fail just this
    request, let retry/model-switch proceed, and never write the
    cross-session breaker file.

    Returns True when the evidence points at (a).
    """
    # Signal 1: an exhausted bucket in the 429's own headers is direct
    # proof of a real account-level limit.
    if _has_exhausted_bucket(_parse_buckets_from_headers(headers)):
        return True

    # Signal 2: last-known-good state from a recent successful response.
    # Accepts either a RateLimitState (dataclass from rate_limit_tracker)
    # or a dict of bucket snapshots.
    if last_known_state is None:
        return False
    return _has_exhausted_bucket_in_object(last_known_state)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_buckets_from_headers(
|
||||||
|
headers: Optional[Mapping[str, str]],
|
||||||
|
) -> dict[str, tuple[Optional[int], Optional[float]]]:
|
||||||
|
"""Extract (remaining, reset_seconds) per bucket from x-ratelimit-* headers.
|
||||||
|
|
||||||
|
Returns empty dict when no rate-limit headers are present.
|
||||||
|
"""
|
||||||
|
if not headers:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
lowered = {k.lower(): v for k, v in headers.items()}
|
||||||
|
if not any(k.startswith("x-ratelimit-") for k in lowered):
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def _maybe_int(raw: Optional[str]) -> Optional[int]:
|
||||||
|
if raw is None:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return int(float(raw))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _maybe_float(raw: Optional[str]) -> Optional[float]:
|
||||||
|
if raw is None:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return float(raw)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
result: dict[str, tuple[Optional[int], Optional[float]]] = {}
|
||||||
|
for tag in ("requests", "requests-1h", "tokens", "tokens-1h"):
|
||||||
|
remaining = _maybe_int(lowered.get(f"x-ratelimit-remaining-{tag}"))
|
||||||
|
reset = _maybe_float(lowered.get(f"x-ratelimit-reset-{tag}"))
|
||||||
|
if remaining is not None or reset is not None:
|
||||||
|
result[tag] = (remaining, reset)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _has_exhausted_bucket(
    buckets: Mapping[str, tuple[Optional[int], Optional[float]]],
) -> bool:
    """Return True when any bucket has remaining == 0 AND a meaningful reset window."""
    # Both fields must be known: an unknown remaining or reset proves
    # nothing, and a short reset window is transient throttling, not a
    # genuine quota exhaustion.
    return any(
        left is not None
        and left <= 0
        and window is not None
        and window >= _MIN_RESET_FOR_BREAKER_SECONDS
        for left, window in buckets.values()
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _has_exhausted_bucket_in_object(state: Any) -> bool:
    """Check a RateLimitState-like object for an exhausted bucket.

    Accepts the dataclass from ``agent.rate_limit_tracker`` (buckets
    exposed as attributes ``requests_min``, ``requests_hour``,
    ``tokens_min``, ``tokens_hour``) and falls back gracefully for any
    object missing those attributes.
    """
    for bucket_name in ("requests_min", "requests_hour", "tokens_min", "tokens_hour"):
        snapshot = getattr(state, bucket_name, None)
        if snapshot is None:
            continue

        capacity = getattr(snapshot, "limit", 0) or 0
        left = getattr(snapshot, "remaining", 0) or 0
        # Prefer the wall-clock-adjusted "remaining_seconds_now" property
        # when the object exposes it; otherwise use raw reset_seconds.
        window = getattr(snapshot, "remaining_seconds_now", None)
        if window is None:
            window = getattr(snapshot, "reset_seconds", 0.0) or 0.0

        # A bucket with no known limit, or with headroom left, is not
        # evidence of exhaustion.
        if capacity <= 0 or left > 0:
            continue
        if window >= _MIN_RESET_FOR_BREAKER_SECONDS:
            return True
    return False
|
||||||
|
|||||||
61
run_agent.py
61
run_agent.py
@@ -11007,36 +11007,69 @@ class AIAgent:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# ── Nous Portal: record rate limit & skip retries ─────
|
# ── Nous Portal: record rate limit & skip retries ─────
|
||||||
# When Nous returns a 429, record the reset time to a
|
# When Nous returns a 429 that is a genuine account-
|
||||||
# shared file so ALL sessions (cron, gateway, auxiliary)
|
# level rate limit, record the reset time to a shared
|
||||||
# know not to pile on. Then skip further retries —
|
# file so ALL sessions (cron, gateway, auxiliary) know
|
||||||
# each one burns another RPH request and deepens the
|
# not to pile on, then skip further retries -- each
|
||||||
# rate limit hole. The retry loop's top-of-iteration
|
# one burns another RPH request and deepens the hole.
|
||||||
# guard will catch this on the next pass and try
|
# The retry loop's top-of-iteration guard will catch
|
||||||
# fallback or bail with a clear message.
|
# this on the next pass and try fallback or bail.
|
||||||
|
#
|
||||||
|
# IMPORTANT: Nous Portal multiplexes multiple upstream
|
||||||
|
# providers (DeepSeek, Kimi, MiMo, Hermes). A 429 can
|
||||||
|
# also mean an UPSTREAM provider is out of capacity
|
||||||
|
# for one specific model -- transient, clears in
|
||||||
|
# seconds, nothing to do with the caller's quota.
|
||||||
|
# Tripping the cross-session breaker on that would
|
||||||
|
# block every Nous model for minutes. We use
|
||||||
|
# ``is_genuine_nous_rate_limit`` to tell the two
|
||||||
|
# apart via the 429's own x-ratelimit-* headers and
|
||||||
|
# the last-known-good state captured on the previous
|
||||||
|
# successful response.
|
||||||
if (
|
if (
|
||||||
is_rate_limited
|
is_rate_limited
|
||||||
and self.provider == "nous"
|
and self.provider == "nous"
|
||||||
and classified.reason == FailoverReason.rate_limit
|
and classified.reason == FailoverReason.rate_limit
|
||||||
and not recovered_with_pool
|
and not recovered_with_pool
|
||||||
):
|
):
|
||||||
|
_genuine_nous_rate_limit = False
|
||||||
try:
|
try:
|
||||||
from agent.nous_rate_guard import record_nous_rate_limit
|
from agent.nous_rate_guard import (
|
||||||
|
is_genuine_nous_rate_limit,
|
||||||
|
record_nous_rate_limit,
|
||||||
|
)
|
||||||
_err_resp = getattr(api_error, "response", None)
|
_err_resp = getattr(api_error, "response", None)
|
||||||
_err_hdrs = (
|
_err_hdrs = (
|
||||||
getattr(_err_resp, "headers", None)
|
getattr(_err_resp, "headers", None)
|
||||||
if _err_resp else None
|
if _err_resp else None
|
||||||
)
|
)
|
||||||
record_nous_rate_limit(
|
_genuine_nous_rate_limit = is_genuine_nous_rate_limit(
|
||||||
headers=_err_hdrs,
|
headers=_err_hdrs,
|
||||||
error_context=error_context,
|
last_known_state=self._rate_limit_state,
|
||||||
)
|
)
|
||||||
|
if _genuine_nous_rate_limit:
|
||||||
|
record_nous_rate_limit(
|
||||||
|
headers=_err_hdrs,
|
||||||
|
error_context=error_context,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logging.info(
|
||||||
|
"Nous 429 looks like upstream capacity "
|
||||||
|
"(no exhausted bucket in headers or "
|
||||||
|
"last-known state) -- not tripping "
|
||||||
|
"cross-session breaker."
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
# Skip straight to max_retries — the top-of-loop
|
if _genuine_nous_rate_limit:
|
||||||
# guard will handle fallback or bail cleanly.
|
# Skip straight to max_retries -- the
|
||||||
retry_count = max_retries
|
# top-of-loop guard will handle fallback or
|
||||||
continue
|
# bail cleanly.
|
||||||
|
retry_count = max_retries
|
||||||
|
continue
|
||||||
|
# Upstream capacity 429: fall through to normal
|
||||||
|
# retry logic. A different model (or the same
|
||||||
|
# model a moment later) will typically succeed.
|
||||||
|
|
||||||
is_payload_too_large = (
|
is_payload_too_large = (
|
||||||
classified.reason == FailoverReason.payload_too_large
|
classified.reason == FailoverReason.payload_too_large
|
||||||
|
|||||||
@@ -251,3 +251,141 @@ class TestAuxiliaryClientIntegration:
|
|||||||
monkeypatch.setattr(aux, "_read_nous_auth", lambda: None)
|
monkeypatch.setattr(aux, "_read_nous_auth", lambda: None)
|
||||||
result = aux._try_nous()
|
result = aux._try_nous()
|
||||||
assert result == (None, None)
|
assert result == (None, None)
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsGenuineNousRateLimit:
    """Tell a real account-level 429 apart from an upstream-capacity 429.

    Nous Portal multiplexes upstreams (DeepSeek, Kimi, MiMo, Hermes).
    A 429 from an upstream out of capacity should NOT trip the
    cross-session breaker; a real user-quota 429 should.
    """

    def test_exhausted_hourly_bucket_in_429_headers_is_genuine(self):
        from agent.nous_rate_guard import is_genuine_nous_rate_limit

        # Hourly request bucket drained with a long reset window: genuine.
        headers = {
            "x-ratelimit-limit-requests-1h": "800",
            "x-ratelimit-remaining-requests-1h": "0",
            "x-ratelimit-reset-requests-1h": "3100",
            "x-ratelimit-limit-requests": "200",
            "x-ratelimit-remaining-requests": "198",
            "x-ratelimit-reset-requests": "40",
        }
        assert is_genuine_nous_rate_limit(headers=headers) is True

    def test_exhausted_tokens_bucket_is_genuine(self):
        from agent.nous_rate_guard import is_genuine_nous_rate_limit

        headers = {
            "x-ratelimit-limit-tokens": "800000",
            "x-ratelimit-remaining-tokens": "0",
            "x-ratelimit-reset-tokens": "45",  # < 60s threshold -> not genuine
            "x-ratelimit-limit-tokens-1h": "8000000",
            "x-ratelimit-remaining-tokens-1h": "0",
            "x-ratelimit-reset-tokens-1h": "1800",  # >= 60s threshold -> genuine
        }
        assert is_genuine_nous_rate_limit(headers=headers) is True

    def test_healthy_headers_on_429_are_upstream_capacity(self):
        # Classic upstream-capacity symptom: Nous edge reports plenty of
        # headroom on every bucket, but returns 429 anyway because
        # upstream (DeepSeek / Kimi / ...) is out of capacity.
        from agent.nous_rate_guard import is_genuine_nous_rate_limit

        headers = {
            "x-ratelimit-limit-requests": "200",
            "x-ratelimit-remaining-requests": "198",
            "x-ratelimit-reset-requests": "40",
            "x-ratelimit-limit-requests-1h": "800",
            "x-ratelimit-remaining-requests-1h": "750",
            "x-ratelimit-reset-requests-1h": "3100",
            "x-ratelimit-limit-tokens": "800000",
            "x-ratelimit-remaining-tokens": "790000",
            "x-ratelimit-reset-tokens": "40",
            "x-ratelimit-limit-tokens-1h": "8000000",
            "x-ratelimit-remaining-tokens-1h": "7800000",
            "x-ratelimit-reset-tokens-1h": "3100",
        }
        assert is_genuine_nous_rate_limit(headers=headers) is False

    def test_bare_429_with_no_headers_is_upstream(self):
        from agent.nous_rate_guard import is_genuine_nous_rate_limit

        # No rate-limit evidence at all -> never trip the breaker.
        assert is_genuine_nous_rate_limit(headers=None) is False
        assert is_genuine_nous_rate_limit(headers={}) is False
        assert is_genuine_nous_rate_limit(
            headers={"content-type": "application/json"}
        ) is False

    def test_exhausted_bucket_with_short_reset_is_not_genuine(self):
        # remaining == 0 but reset in < 60s: almost certainly a
        # secondary per-minute throttle that will clear immediately --
        # not worth tripping the cross-session breaker.
        from agent.nous_rate_guard import is_genuine_nous_rate_limit

        headers = {
            "x-ratelimit-limit-requests": "200",
            "x-ratelimit-remaining-requests": "0",
            "x-ratelimit-reset-requests": "30",
        }
        assert is_genuine_nous_rate_limit(headers=headers) is False

    def test_last_known_state_with_exhausted_bucket_triggers_genuine(self):
        # Headers on the 429 lack rate-limit info, but the previous
        # successful response already showed the hourly bucket
        # exhausted -- the 429 is almost certainly that limit
        # continuing.
        from agent.nous_rate_guard import is_genuine_nous_rate_limit
        from agent.rate_limit_tracker import parse_rate_limit_headers

        prior_headers = {
            "x-ratelimit-limit-requests-1h": "800",
            "x-ratelimit-remaining-requests-1h": "0",
            "x-ratelimit-reset-requests-1h": "2000",
            "x-ratelimit-limit-requests": "200",
            "x-ratelimit-remaining-requests": "100",
            "x-ratelimit-reset-requests": "30",
            "x-ratelimit-limit-tokens": "800000",
            "x-ratelimit-remaining-tokens": "700000",
            "x-ratelimit-reset-tokens": "30",
            "x-ratelimit-limit-tokens-1h": "8000000",
            "x-ratelimit-remaining-tokens-1h": "7000000",
            "x-ratelimit-reset-tokens-1h": "2000",
        }
        last_state = parse_rate_limit_headers(prior_headers, provider="nous")
        assert is_genuine_nous_rate_limit(
            headers=None, last_known_state=last_state
        ) is True

    def test_last_known_state_all_healthy_stays_upstream(self):
        # Prior state was healthy; bare 429 arrives; should be treated
        # as upstream capacity.
        from agent.nous_rate_guard import is_genuine_nous_rate_limit
        from agent.rate_limit_tracker import parse_rate_limit_headers

        prior_headers = {
            "x-ratelimit-limit-requests-1h": "800",
            "x-ratelimit-remaining-requests-1h": "750",
            "x-ratelimit-reset-requests-1h": "2000",
            "x-ratelimit-limit-requests": "200",
            "x-ratelimit-remaining-requests": "180",
            "x-ratelimit-reset-requests": "30",
            "x-ratelimit-limit-tokens": "800000",
            "x-ratelimit-remaining-tokens": "790000",
            "x-ratelimit-reset-tokens": "30",
            "x-ratelimit-limit-tokens-1h": "8000000",
            "x-ratelimit-remaining-tokens-1h": "7900000",
            "x-ratelimit-reset-tokens-1h": "2000",
        }
        last_state = parse_rate_limit_headers(prior_headers, provider="nous")
        assert is_genuine_nous_rate_limit(
            headers=None, last_known_state=last_state
        ) is False

    def test_none_last_state_and_no_headers_is_upstream(self):
        from agent.nous_rate_guard import is_genuine_nous_rate_limit

        assert is_genuine_nous_rate_limit(
            headers=None, last_known_state=None
        ) is False
|
||||||
|
|||||||
Reference in New Issue
Block a user