From 192e7eb21f5e2c4b8ef7b332e4423ea69a979754 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sun, 26 Apr 2026 04:53:42 -0700
Subject: [PATCH] fix(nous): don't trip cross-session rate breaker on
 upstream-capacity 429s (#15898)

Nous Portal multiplexes multiple upstream providers (DeepSeek, Kimi,
MiMo, Hermes) behind one endpoint. Before this fix, a 429 from any of
those models tripped a cross-session, file-backed breaker that blocked
EVERY model on Nous for the cooldown window -- even when the caller's
own RPM/RPH/TPM/TPH buckets were healthy. Users hit a DeepSeek V4 Pro
capacity error, restarted, switched to Kimi 2.6, and still got 'Nous
Portal rate limit active -- resets in 46m 53s'.

Nous already emits the full x-ratelimit-* header suite on every
response (captured by rate_limit_tracker into agent._rate_limit_state).
We now gate the breaker on that data: trip it only when either the
429's own headers or the last-known-good state show a bucket with
remaining == 0 AND a reset window >= 60s. Upstream-capacity 429s
(healthy buckets everywhere, but the upstream provider out of
capacity) fall through to normal retry/fallback, and the breaker is
never written.

Note: the in-memory 'restart TUI/gateway to clear' workaround
circulated in Discord does NOT work -- the breaker is file-backed at
~/.hermes/rate_limits/nous.json. Users still stuck behind a stale
state file should delete that file instead.

Reported in Discord by CrazyDok1 and KYSIV (Apr 2026).
---
 agent/nous_rate_guard.py            | 142 ++++++++++++++++++++++++++++
 run_agent.py                        |  61 +++++++++---
 tests/agent/test_nous_rate_guard.py | 138 +++++++++++++++++++++++++++
 3 files changed, 327 insertions(+), 14 deletions(-)
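
Note for reviewers (git am drops everything between the diffstat and
the first diff): a minimal sketch of the new gate, using only the
helper and header names added below -- illustrative, not part of the
change itself:

    from agent.nous_rate_guard import is_genuine_nous_rate_limit

    # Upstream-capacity 429: every bucket still has headroom, so the
    # cross-session breaker must stay untouched.
    assert not is_genuine_nous_rate_limit(headers={
        "x-ratelimit-remaining-requests": "198",
        "x-ratelimit-reset-requests": "40",
    })

    # Genuine account-level 429: hourly request bucket exhausted with
    # a ~52-minute reset window, so the breaker should trip.
    assert is_genuine_nous_rate_limit(headers={
        "x-ratelimit-remaining-requests-1h": "0",
        "x-ratelimit-reset-requests-1h": "3100",
    })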

diff --git a/agent/nous_rate_guard.py b/agent/nous_rate_guard.py
index 712d8a0f1f..ea866f2e08 100644
--- a/agent/nous_rate_guard.py
+++ b/agent/nous_rate_guard.py
@@ -180,3 +180,145 @@ def format_remaining(seconds: float) -> str:
     h, remainder = divmod(s, 3600)
     m = remainder // 60
     return f"{h}h {m}m" if m else f"{h}h"
+
+
+# Buckets with reset windows shorter than this are treated as transient
+# (upstream jitter, secondary throttling) rather than a genuine quota
+# exhaustion worth a cross-session breaker trip.
+_MIN_RESET_FOR_BREAKER_SECONDS = 60.0
+
+
+def is_genuine_nous_rate_limit(
+    *,
+    headers: Optional[Mapping[str, str]] = None,
+    last_known_state: Optional[Any] = None,
+) -> bool:
+    """Decide whether a 429 from Nous Portal is a real account rate limit.
+
+    Nous Portal multiplexes multiple upstream providers (DeepSeek, Kimi,
+    MiMo, Hermes, ...) behind one endpoint. A 429 can mean either:
+
+    (a) The caller's own RPM / RPH / TPM / TPH bucket on Nous is
+        exhausted -- a genuine rate limit that will last until the
+        bucket resets.
+    (b) The upstream provider is out of capacity for a specific model
+        -- transient, clears in seconds, and has nothing to do with
+        the caller's quota on Nous.
+
+    Tripping the cross-session breaker on (b) blocks ALL Nous requests
+    (and all models, since Nous is one provider key) for minutes even
+    though the caller's account is healthy and a different model would
+    have worked. That's the bug users hit when DeepSeek V4 Pro 429s
+    trigger a breaker that then blocks Kimi 2.6 and MiMo V2.5 Pro.
+
+    We tell the two apart by looking at:
+
+    1. The 429 response's own ``x-ratelimit-*`` headers. Nous emits
+       the full suite on every response, including 429s. An exhausted
+       bucket (``remaining == 0`` with a reset window >= 60s) is
+       proof of (a).
+    2. The last-known-good rate-limit state captured by
+       ``_capture_rate_limits()`` on the previous successful
+       response. If any bucket there was already exhausted with
+       a substantial reset window, the current 429 is almost
+       certainly (a) continuing from that condition.
+
+    If neither signal fires, we treat the 429 as (b): fail the single
+    request, let the retry loop or model-switch proceed, and do NOT
+    write the cross-session breaker file.
+
+    Returns True when the evidence points at (a).
+    """
+    # Signal 1: current 429 response headers.
+    buckets = _parse_buckets_from_headers(headers)
+    if _has_exhausted_bucket(buckets):
+        return True
+
+    # Signal 2: last-known-good state from a recent successful response.
+    # Accepts a RateLimitState (dataclass from rate_limit_tracker);
+    # objects without the expected bucket attributes read as healthy.
+    if last_known_state is not None and _has_exhausted_bucket_in_object(last_known_state):
+        return True
+
+    return False
+
+
+def _parse_buckets_from_headers(
+    headers: Optional[Mapping[str, str]],
+) -> dict[str, tuple[Optional[int], Optional[float]]]:
+    """Extract (remaining, reset_seconds) per bucket from x-ratelimit-* headers.
+
+    Returns an empty dict when no rate-limit headers are present.
+    """
+    if not headers:
+        return {}
+
+    lowered = {k.lower(): v for k, v in headers.items()}
+    if not any(k.startswith("x-ratelimit-") for k in lowered):
+        return {}
+
+    def _maybe_int(raw: Optional[str]) -> Optional[int]:
+        if raw is None:
+            return None
+        try:
+            return int(float(raw))
+        except (TypeError, ValueError):
+            return None
+
+    def _maybe_float(raw: Optional[str]) -> Optional[float]:
+        if raw is None:
+            return None
+        try:
+            return float(raw)
+        except (TypeError, ValueError):
+            return None
+
+    result: dict[str, tuple[Optional[int], Optional[float]]] = {}
+    for tag in ("requests", "requests-1h", "tokens", "tokens-1h"):
+        remaining = _maybe_int(lowered.get(f"x-ratelimit-remaining-{tag}"))
+        reset = _maybe_float(lowered.get(f"x-ratelimit-reset-{tag}"))
+        if remaining is not None or reset is not None:
+            result[tag] = (remaining, reset)
+    return result
+
+
+def _has_exhausted_bucket(
+    buckets: Mapping[str, tuple[Optional[int], Optional[float]]],
+) -> bool:
+    """Return True when any bucket has remaining == 0 AND a meaningful reset window."""
+    for remaining, reset in buckets.values():
+        if remaining is None or remaining > 0:
+            continue
+        if reset is None:
+            continue
+        if reset >= _MIN_RESET_FOR_BREAKER_SECONDS:
+            return True
+    return False
+
+
+def _has_exhausted_bucket_in_object(state: Any) -> bool:
+    """Check a RateLimitState-like object for an exhausted bucket.
+
+    Accepts the dataclass from ``agent.rate_limit_tracker`` (buckets
+    exposed as attributes ``requests_min``, ``requests_hour``,
+    ``tokens_min``, ``tokens_hour``) and falls back gracefully for any
+    object missing those attributes.
+    """
+    for attr in ("requests_min", "requests_hour", "tokens_min", "tokens_hour"):
+        bucket = getattr(state, attr, None)
+        if bucket is None:
+            continue
+        limit = getattr(bucket, "limit", 0) or 0
+        remaining = getattr(bucket, "remaining", 0) or 0
+        # Prefer the adjusted "remaining_seconds_now" property when present;
+        # fall back to raw reset_seconds.
+        reset = getattr(bucket, "remaining_seconds_now", None)
+        if reset is None:
+            reset = getattr(bucket, "reset_seconds", 0.0) or 0.0
+        if limit <= 0:
+            continue
+        if remaining > 0:
+            continue
+        if reset >= _MIN_RESET_FOR_BREAKER_SECONDS:
+            return True
+    return False
diff --git a/run_agent.py b/run_agent.py
index 1f2a062127..c0dd76596d 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -11007,36 +11007,69 @@ class AIAgent:
                        continue
 
                    # ── Nous Portal: record rate limit & skip retries ─────
-                    # When Nous returns a 429, record the reset time to a
-                    # shared file so ALL sessions (cron, gateway, auxiliary)
-                    # know not to pile on. Then skip further retries —
-                    # each one burns another RPH request and deepens the
-                    # rate limit hole. The retry loop's top-of-iteration
-                    # guard will catch this on the next pass and try
-                    # fallback or bail with a clear message.
+                    # When Nous returns a 429 that is a genuine account-
+                    # level rate limit, record the reset time to a shared
+                    # file so ALL sessions (cron, gateway, auxiliary) know
+                    # not to pile on, then skip further retries -- each
+                    # one burns another RPH request and deepens the hole.
+                    # The retry loop's top-of-iteration guard will catch
+                    # this on the next pass and try fallback or bail.
+                    #
+                    # IMPORTANT: Nous Portal multiplexes multiple upstream
+                    # providers (DeepSeek, Kimi, MiMo, Hermes). A 429 can
+                    # also mean an UPSTREAM provider is out of capacity
+                    # for one specific model -- transient, clears in
+                    # seconds, nothing to do with the caller's quota.
+                    # Tripping the cross-session breaker on that would
+                    # block every Nous model for minutes. We use
+                    # ``is_genuine_nous_rate_limit`` to tell the two
+                    # apart via the 429's own x-ratelimit-* headers and
+                    # the last-known-good state captured on the previous
+                    # successful response.
                    if (
                        is_rate_limited
                        and self.provider == "nous"
                        and classified.reason == FailoverReason.rate_limit
                        and not recovered_with_pool
                    ):
+                        _genuine_nous_rate_limit = False
                        try:
-                            from agent.nous_rate_guard import record_nous_rate_limit
+                            from agent.nous_rate_guard import (
+                                is_genuine_nous_rate_limit,
+                                record_nous_rate_limit,
+                            )
                            _err_resp = getattr(api_error, "response", None)
                            _err_hdrs = (
                                getattr(_err_resp, "headers", None) if _err_resp else None
                            )
-                            record_nous_rate_limit(
+                            _genuine_nous_rate_limit = is_genuine_nous_rate_limit(
                                headers=_err_hdrs,
-                                error_context=error_context,
+                                last_known_state=self._rate_limit_state,
                            )
+                            if _genuine_nous_rate_limit:
+                                record_nous_rate_limit(
+                                    headers=_err_hdrs,
+                                    error_context=error_context,
+                                )
+                            else:
+                                logging.info(
+                                    "Nous 429 looks like upstream capacity "
+                                    "(no exhausted bucket in headers or "
+                                    "last-known state) -- not tripping "
+                                    "cross-session breaker."
+                                )
                        except Exception:
                            pass
-                        # Skip straight to max_retries — the top-of-loop
-                        # guard will handle fallback or bail cleanly.
-                        retry_count = max_retries
-                        continue
+                        if _genuine_nous_rate_limit:
+                            # Skip straight to max_retries -- the
+                            # top-of-loop guard will handle fallback or
+                            # bail cleanly.
+                            retry_count = max_retries
+                            continue
+                        # Upstream capacity 429: fall through to normal
+                        # retry logic. A different model (or the same
+                        # model a moment later) will typically succeed.
 
                    is_payload_too_large = (
                        classified.reason == FailoverReason.payload_too_large
diff --git a/tests/agent/test_nous_rate_guard.py b/tests/agent/test_nous_rate_guard.py
index 45d30f7246..4441aa6e44 100644
--- a/tests/agent/test_nous_rate_guard.py
+++ b/tests/agent/test_nous_rate_guard.py
@@ -251,3 +251,141 @@ class TestAuxiliaryClientIntegration:
        monkeypatch.setattr(aux, "_read_nous_auth", lambda: None)
        result = aux._try_nous()
        assert result == (None, None)
+
+
+class TestIsGenuineNousRateLimit:
+    """Tell a real account-level 429 apart from an upstream-capacity 429.
+
+    Nous Portal multiplexes upstreams (DeepSeek, Kimi, MiMo, Hermes).
+    A 429 from an upstream out of capacity should NOT trip the
+    cross-session breaker; a real user-quota 429 should.
+    """
+
+    def test_exhausted_hourly_bucket_in_429_headers_is_genuine(self):
+        from agent.nous_rate_guard import is_genuine_nous_rate_limit
+
+        headers = {
+            "x-ratelimit-limit-requests-1h": "800",
+            "x-ratelimit-remaining-requests-1h": "0",
+            "x-ratelimit-reset-requests-1h": "3100",
+            "x-ratelimit-limit-requests": "200",
+            "x-ratelimit-remaining-requests": "198",
+            "x-ratelimit-reset-requests": "40",
+        }
+        assert is_genuine_nous_rate_limit(headers=headers) is True
+
+    def test_exhausted_tokens_bucket_is_genuine(self):
+        from agent.nous_rate_guard import is_genuine_nous_rate_limit
+
+        headers = {
+            "x-ratelimit-limit-tokens": "800000",
+            "x-ratelimit-remaining-tokens": "0",
+            "x-ratelimit-reset-tokens": "45",  # < 60s threshold -> not genuine
+            "x-ratelimit-limit-tokens-1h": "8000000",
+            "x-ratelimit-remaining-tokens-1h": "0",
+            "x-ratelimit-reset-tokens-1h": "1800",  # >= 60s threshold -> genuine
+        }
+        assert is_genuine_nous_rate_limit(headers=headers) is True
+
+    def test_healthy_headers_on_429_are_upstream_capacity(self):
+        # Classic upstream-capacity symptom: Nous edge reports plenty of
+        # headroom on every bucket, but returns 429 anyway because
+        # upstream (DeepSeek / Kimi / ...) is out of capacity.
+        from agent.nous_rate_guard import is_genuine_nous_rate_limit
+
+        headers = {
+            "x-ratelimit-limit-requests": "200",
+            "x-ratelimit-remaining-requests": "198",
+            "x-ratelimit-reset-requests": "40",
+            "x-ratelimit-limit-requests-1h": "800",
+            "x-ratelimit-remaining-requests-1h": "750",
+            "x-ratelimit-reset-requests-1h": "3100",
+            "x-ratelimit-limit-tokens": "800000",
+            "x-ratelimit-remaining-tokens": "790000",
+            "x-ratelimit-reset-tokens": "40",
+            "x-ratelimit-limit-tokens-1h": "8000000",
+            "x-ratelimit-remaining-tokens-1h": "7800000",
+            "x-ratelimit-reset-tokens-1h": "3100",
+        }
+        assert is_genuine_nous_rate_limit(headers=headers) is False
+
+    def test_bare_429_with_no_headers_is_upstream(self):
+        from agent.nous_rate_guard import is_genuine_nous_rate_limit
+
+        assert is_genuine_nous_rate_limit(headers=None) is False
+        assert is_genuine_nous_rate_limit(headers={}) is False
+        assert is_genuine_nous_rate_limit(
+            headers={"content-type": "application/json"}
+        ) is False
+
+    def test_exhausted_bucket_with_short_reset_is_not_genuine(self):
+        # remaining == 0 but reset in < 60s: almost certainly a
+        # secondary per-minute throttle that will clear immediately --
+        # not worth tripping the cross-session breaker.
+        from agent.nous_rate_guard import is_genuine_nous_rate_limit
+
+        headers = {
+            "x-ratelimit-limit-requests": "200",
+            "x-ratelimit-remaining-requests": "0",
+            "x-ratelimit-reset-requests": "30",
+        }
+        assert is_genuine_nous_rate_limit(headers=headers) is False
+
+    def test_last_known_state_with_exhausted_bucket_triggers_genuine(self):
+        # Headers on the 429 lack rate-limit info, but the previous
+        # successful response already showed the hourly bucket
+        # exhausted -- the 429 is almost certainly that limit
+        # continuing.
+        from agent.nous_rate_guard import is_genuine_nous_rate_limit
+        from agent.rate_limit_tracker import parse_rate_limit_headers
+
+        prior_headers = {
+            "x-ratelimit-limit-requests-1h": "800",
+            "x-ratelimit-remaining-requests-1h": "0",
+            "x-ratelimit-reset-requests-1h": "2000",
+            "x-ratelimit-limit-requests": "200",
+            "x-ratelimit-remaining-requests": "100",
+            "x-ratelimit-reset-requests": "30",
+            "x-ratelimit-limit-tokens": "800000",
+            "x-ratelimit-remaining-tokens": "700000",
+            "x-ratelimit-reset-tokens": "30",
+            "x-ratelimit-limit-tokens-1h": "8000000",
+            "x-ratelimit-remaining-tokens-1h": "7000000",
+            "x-ratelimit-reset-tokens-1h": "2000",
+        }
+        last_state = parse_rate_limit_headers(prior_headers, provider="nous")
+        assert is_genuine_nous_rate_limit(
+            headers=None, last_known_state=last_state
+        ) is True
+
+    def test_last_known_state_all_healthy_stays_upstream(self):
+        # Prior state was healthy; bare 429 arrives; should be treated
+        # as upstream capacity.
+        from agent.nous_rate_guard import is_genuine_nous_rate_limit
+        from agent.rate_limit_tracker import parse_rate_limit_headers
+
+        prior_headers = {
+            "x-ratelimit-limit-requests-1h": "800",
+            "x-ratelimit-remaining-requests-1h": "750",
+            "x-ratelimit-reset-requests-1h": "2000",
+            "x-ratelimit-limit-requests": "200",
+            "x-ratelimit-remaining-requests": "180",
+            "x-ratelimit-reset-requests": "30",
+            "x-ratelimit-limit-tokens": "800000",
+            "x-ratelimit-remaining-tokens": "790000",
+            "x-ratelimit-reset-tokens": "30",
+            "x-ratelimit-limit-tokens-1h": "8000000",
+            "x-ratelimit-remaining-tokens-1h": "7900000",
+            "x-ratelimit-reset-tokens-1h": "2000",
+        }
+        last_state = parse_rate_limit_headers(prior_headers, provider="nous")
+        assert is_genuine_nous_rate_limit(
+            headers=None, last_known_state=last_state
+        ) is False
+
+    def test_none_last_state_and_no_headers_is_upstream(self):
+        from agent.nous_rate_guard import is_genuine_nous_rate_limit
+
+        assert is_genuine_nous_rate_limit(
+            headers=None, last_known_state=None
+        ) is False
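
Note for reviewers -- a matching sketch of the signal-2 path. The
SimpleNamespace stand-in below is hypothetical; it only mimics the
bucket attributes that _has_exhausted_bucket_in_object documents
(limit, remaining, remaining_seconds_now):

    from types import SimpleNamespace
    from agent.nous_rate_guard import is_genuine_nous_rate_limit

    # The previous successful response already showed the hourly
    # request bucket exhausted with a long reset window; a bare 429
    # arriving now is treated as that limit continuing.
    exhausted = SimpleNamespace(limit=800, remaining=0,
                                remaining_seconds_now=2000.0)
    state = SimpleNamespace(requests_min=None, requests_hour=exhausted,
                            tokens_min=None, tokens_hour=None)
    assert is_genuine_nous_rate_limit(headers=None, last_known_state=state)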