# agent/nous_rate_guard.py

"""Cross-session rate limit guard for Nous Portal.

Writes rate limit state to a shared file so all sessions (CLI, gateway,
cron, auxiliary) can check whether Nous Portal is currently rate-limited
before making requests.  Prevents retry amplification when RPH is tapped.

Each 429 from Nous triggers up to 9 API calls per conversation turn
(3 SDK retries x 3 Hermes retries), and every one of those calls counts
against RPH.  By recording the rate limit state on first 429 and checking
it before subsequent attempts, we eliminate the amplification effect.
"""

from __future__ import annotations

import json
import logging
import math
import os
import tempfile
import time
from typing import Any, Mapping, Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Path components of the shared state file; _state_path() joins them onto the
# Hermes home directory, giving <hermes home>/rate_limits/nous.json.
_STATE_SUBDIR = "rate_limits"
_STATE_FILENAME = "nous.json"


def _state_path() -> str:
    """Build the location of the shared Nous rate limit state file.

    The Hermes home directory is taken from
    ``hermes_constants.get_hermes_home()``.  When that cannot be imported,
    ``~/.hermes`` is used instead.
    """
    try:
        from hermes_constants import get_hermes_home as _resolve_home
        hermes_home = _resolve_home()
    except ImportError:
        # hermes_constants is unavailable: fall back to the default location.
        user_home = os.path.expanduser("~")
        hermes_home = os.path.join(user_home, ".hermes")
    return os.path.join(hermes_home, _STATE_SUBDIR, _STATE_FILENAME)


def _parse_reset_seconds(headers: Optional[Mapping[str, str]]) -> Optional[float]:
    """Extract the best available reset-time estimate from response headers.

    Header names are matched case-insensitively.  Priority:
      1. x-ratelimit-reset-requests-1h  (hourly RPH window — most useful)
      2. x-ratelimit-reset-requests     (per-minute RPM window)
      3. retry-after                     (generic HTTP header)

    A header is skipped (and the next one tried) when its value is not a
    number, is not strictly positive, or is not finite.  Non-finite values
    such as ``inf`` must be rejected: they would otherwise produce a reset
    time that never arrives.  The HTTP-date form of ``Retry-After`` is not
    numeric and is therefore skipped as well.

    Args:
        headers: Response headers, or None.

    Returns:
        Seconds-from-now as a positive finite float, or None if no usable
        header is found.
    """
    if not headers:
        return None

    lowered = {k.lower(): v for k, v in headers.items()}

    for key in (
        "x-ratelimit-reset-requests-1h",
        "x-ratelimit-reset-requests",
        "retry-after",
    ):
        raw = lowered.get(key)
        if raw is None:
            continue
        try:
            val = float(raw)
        except (TypeError, ValueError):
            continue
        # float() happily parses "inf"/"nan"; neither is a usable duration.
        if val > 0 and math.isfinite(val):
            return val

    return None


def record_nous_rate_limit(
    *,
    headers: Optional[Mapping[str, str]] = None,
    error_context: Optional[dict[str, Any]] = None,
    default_cooldown: float = 300.0,
) -> None:
    """Record that Nous Portal is rate-limited.

    The reset time comes from the first source that yields a usable value:

      1. Reset headers on the 429 response (see ``_parse_reset_seconds``).
      2. ``error_context["reset_at"]`` when it is a finite number later
         than the current time.
      3. ``now + default_cooldown`` (5 minutes by default).

    Non-finite values (``inf`` / ``nan``) from the headers or the error
    context are ignored so that the recorded reset time can actually expire.

    The state is written as JSON to the path returned by ``_state_path()``
    via a temp file in the same directory followed by ``os.replace``.
    Failures while writing are logged at debug level and not raised.

    Args:
        headers: HTTP response headers from the 429 error.
        error_context: Structured error context from _extract_api_error_context().
        default_cooldown: Fallback cooldown in seconds when no reset info
            is available.
    """
    now = time.time()
    reset_at: Optional[float] = None

    # Try headers first (most accurate)
    header_seconds = _parse_reset_seconds(headers)
    if header_seconds is not None and math.isfinite(header_seconds):
        reset_at = now + header_seconds

    # Try error_context reset_at (from body parsing)
    if reset_at is None and isinstance(error_context, dict):
        ctx_reset = error_context.get("reset_at")
        if isinstance(ctx_reset, (int, float)):
            try:
                candidate: Optional[float] = float(ctx_reset)
            except OverflowError:
                # An int too large for a float is not a plausible timestamp.
                candidate = None
            if candidate is not None and math.isfinite(candidate) and candidate > now:
                reset_at = candidate

    # Default cooldown
    if reset_at is None:
        reset_at = now + default_cooldown

    path = _state_path()
    try:
        state_dir = os.path.dirname(path)
        os.makedirs(state_dir, exist_ok=True)

        state = {
            "reset_at": reset_at,
            "recorded_at": now,
            "reset_seconds": reset_at - now,
        }

        # Atomic write: write to temp file + rename
        fd, tmp_path = tempfile.mkstemp(dir=state_dir, suffix=".tmp")
        try:
            with os.fdopen(fd, "w") as f:
                json.dump(state, f)
            os.replace(tmp_path, path)
        except BaseException:
            # Clean up the temp file on any failure, including interrupts,
            # so aborted writes do not leave stray *.tmp files behind.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

        logger.info(
            "Nous rate limit recorded: resets in %.0fs (at %.0f)",
            reset_at - now, reset_at,
        )
    except Exception as exc:
        # Best effort: failing to record the limit must not break the caller.
        logger.debug("Failed to write Nous rate limit state: %s", exc)


def nous_rate_limit_remaining() -> Optional[float]:
    """Check if Nous Portal is currently rate-limited.

    Reads the shared state file and compares its ``reset_at`` timestamp
    with the current time.  An expired state file is removed.

    Any problem with the state file is treated as "not rate-limited"
    rather than raised: the file being missing or unreadable, containing
    invalid JSON, holding something other than a JSON object, or carrying
    a non-numeric or non-finite ``reset_at``.

    Returns:
        Seconds remaining until reset, or None if not rate-limited.
    """
    path = _state_path()
    try:
        with open(path) as f:
            state = json.load(f)
    except (OSError, ValueError):
        # OSError covers missing/unreadable files; JSONDecodeError and
        # UnicodeDecodeError are both ValueError subclasses.
        return None

    if not isinstance(state, dict):
        return None
    reset_at = state.get("reset_at", 0)
    if not isinstance(reset_at, (int, float)):
        return None

    try:
        remaining = reset_at - time.time()
    except OverflowError:
        # reset_at is an int too large to combine with a float timestamp.
        return None
    if not math.isfinite(remaining):
        # A stored Infinity/NaN would otherwise never expire.
        return None

    if remaining > 0:
        return remaining

    # Expired — clean up
    try:
        os.unlink(path)
    except OSError:
        pass
    return None


def clear_nous_rate_limit() -> None:
    """Remove the shared state file (e.g., after a successful Nous request).

    A file that is already absent is not an error; any other OS-level
    failure is logged at debug level and not raised.
    """
    try:
        target = _state_path()
        os.remove(target)
    except FileNotFoundError:
        # Nothing recorded, so nothing to clear.
        return
    except OSError as exc:
        logger.debug("Failed to clear Nous rate limit state: %s", exc)


def format_remaining(seconds: float) -> str:
    """Render a duration in seconds as a short human-readable string.

    The value is truncated to whole seconds and negative values are clamped
    to zero.  Under a minute the result is ``"Ns"``; under an hour it is
    ``"Nm"`` or ``"Nm Ns"``; from an hour up it is ``"Nh"`` or ``"Nh Nm"``
    (leftover seconds are dropped).
    """
    total = max(0, int(seconds))
    hours, leftover = divmod(total, 3600)
    minutes, secs = divmod(leftover, 60)

    if hours:
        return f"{hours}h {minutes}m" if minutes else f"{hours}h"
    if minutes:
        return f"{minutes}m {secs}s" if secs else f"{minutes}m"
    return f"{secs}s"


# Minimum reset window, in seconds, for an exhausted bucket to count as a
# genuine quota exhaustion.  Buckets with reset windows shorter than this are
# treated as transient (upstream jitter, secondary throttling) rather than
# something worth a cross-session breaker trip.
_MIN_RESET_FOR_BREAKER_SECONDS = 60.0


def is_genuine_nous_rate_limit(
    *,
    headers: Optional[Mapping[str, str]] = None,
    last_known_state: Optional[Any] = None,
) -> bool:
    """Tell an account-level Nous Portal rate limit from a transient 429.

    Nous Portal fronts several upstream providers (DeepSeek, Kimi, MiMo,
    Hermes, ...) behind one endpoint, so a 429 has two possible causes:

      (a) One of the caller's own RPM / RPH / TPM / TPH buckets on Nous
          is exhausted.  This lasts until the bucket resets.
      (b) An upstream provider is out of capacity for one model.  This
          is transient and unrelated to the caller's quota.

    Only (a) should trip the cross-session breaker.  Tripping it on (b)
    would block every Nous model for minutes even though the account is
    healthy and a different model would have worked.

    Two signals are consulted, in order:

      1. The ``x-ratelimit-*`` headers of the 429 itself: a bucket with
         ``remaining == 0`` and a reset window of at least
         ``_MIN_RESET_FOR_BREAKER_SECONDS``.
      2. ``last_known_state``, the rate-limit state captured from an
         earlier response, checked for an exhausted bucket with the same
         minimum reset window.

    Args:
        headers: Response headers of the 429, or None.
        last_known_state: Previously captured rate-limit state, or None.

    Returns:
        True when either signal points at (a); False otherwise, in which
        case the caller should not write the breaker file.
    """
    # Signal 1: the 429 response's own headers.
    header_buckets = _parse_buckets_from_headers(headers)
    if _has_exhausted_bucket(header_buckets):
        return True

    # Signal 2: state remembered from an earlier response, if any.
    if last_known_state is None:
        return False
    return bool(_has_exhausted_bucket_in_object(last_known_state))


def _parse_buckets_from_headers(
    headers: Optional[Mapping[str, str]],
) -> dict[str, tuple[Optional[int], Optional[float]]]:
    """Extract (remaining, reset_seconds) per bucket from x-ratelimit-* headers.

    Header names are matched case-insensitively.  The buckets looked up are
    ``requests``, ``requests-1h``, ``tokens`` and ``tokens-1h``, read from
    ``x-ratelimit-remaining-<tag>`` and ``x-ratelimit-reset-<tag>``.

    A value that is missing, not numeric, or not finite (``inf`` / ``nan``)
    is reported as None.  A bucket appears in the result only when at least
    one of its two values parsed.

    Args:
        headers: Response headers, or None.

    Returns:
        Mapping of bucket tag to ``(remaining, reset_seconds)``; an empty
        dict when no rate-limit headers are present.
    """
    if not headers:
        return {}

    lowered = {k.lower(): v for k, v in headers.items()}
    if not any(k.startswith("x-ratelimit-") for k in lowered):
        return {}

    def _maybe_float(raw: Optional[str]) -> Optional[float]:
        if raw is None:
            return None
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return None
        # float() accepts "inf"/"nan"; treat those as unparseable.
        return value if math.isfinite(value) else None

    def _maybe_int(raw: Optional[str]) -> Optional[int]:
        # Going through the finite-float parser means int() can never be
        # handed an infinity (which would raise OverflowError).
        value = _maybe_float(raw)
        return None if value is None else int(value)

    result: dict[str, tuple[Optional[int], Optional[float]]] = {}
    for tag in ("requests", "requests-1h", "tokens", "tokens-1h"):
        remaining = _maybe_int(lowered.get(f"x-ratelimit-remaining-{tag}"))
        reset = _maybe_float(lowered.get(f"x-ratelimit-reset-{tag}"))
        if remaining is not None or reset is not None:
            result[tag] = (remaining, reset)
    return result


def _has_exhausted_bucket(
    buckets: Mapping[str, tuple[Optional[int], Optional[float]]],
) -> bool:
    """Report whether any parsed bucket is exhausted with a long reset.

    A bucket qualifies when its ``remaining`` is known and not positive
    and its ``reset`` is known and at least
    ``_MIN_RESET_FOR_BREAKER_SECONDS``.  Buckets with an unknown
    ``remaining`` or ``reset`` never qualify.
    """
    return any(
        left is not None
        and not left > 0
        and window is not None
        and window >= _MIN_RESET_FOR_BREAKER_SECONDS
        for left, window in buckets.values()
    )


def _has_exhausted_bucket_in_object(state: Any) -> bool:
    """Check a RateLimitState-like object for an exhausted bucket.

    ``state`` exposes its buckets under the names ``requests_min``,
    ``requests_hour``, ``tokens_min`` and ``tokens_hour``, either as
    attributes (the dataclass from ``agent.rate_limit_tracker``) or as
    mapping keys.  Each bucket is read the same way, by attribute or by
    key, for ``limit``, ``remaining``, ``remaining_seconds_now`` and
    ``reset_seconds``.

    A bucket counts as exhausted only when its ``limit`` is positive, its
    ``remaining`` is known and not positive, and its reset window is at
    least ``_MIN_RESET_FOR_BREAKER_SECONDS``.  A missing or None
    ``remaining`` is unknown, not exhausted, which matches how
    ``_has_exhausted_bucket`` treats header data.  Missing buckets and
    missing fields are skipped rather than raising.

    Args:
        state: Object or mapping holding the bucket snapshots.

    Returns:
        True when at least one bucket is exhausted as defined above.
    """

    def _field(container: Any, name: str) -> Any:
        # Read by key for mappings and by attribute for everything else.
        if isinstance(container, Mapping):
            return container.get(name)
        return getattr(container, name, None)

    for attr in ("requests_min", "requests_hour", "tokens_min", "tokens_hour"):
        bucket = _field(state, attr)
        if bucket is None:
            continue
        limit = _field(bucket, "limit") or 0
        remaining = _field(bucket, "remaining")
        # Prefer the adjusted "remaining_seconds_now" value when present;
        # fall back to raw reset_seconds.
        reset = _field(bucket, "remaining_seconds_now")
        if reset is None:
            reset = _field(bucket, "reset_seconds") or 0.0
        if limit <= 0:
            continue
        # Unknown remaining is not evidence of exhaustion.
        if remaining is None or remaining > 0:
            continue
        if reset >= _MIN_RESET_FOR_BREAKER_SECONDS:
            return True
    return False
fix: Nous Portal rate limit guard — prevent retry amplification (#10568) When Nous returns a 429, the retry amplification chain burns up to 9 API requests per conversation turn (3 SDK retries × 3 Hermes retries), each counting against RPH and deepening the rate limit. With multiple concurrent sessions (cron + gateway + auxiliary), this creates a spiral where retries keep the limit tapped indefinitely. New module: agent/nous_rate_guard.py - Shared file-based rate limit state (~/.hermes/rate_limits/nous.json) - Parses reset time from x-ratelimit-reset-requests-1h, x-ratelimit- reset-requests, retry-after headers, or error context - Falls back to 5-minute default cooldown if no header data - Atomic writes (tempfile + rename) for cross-process safety - Auto-cleanup of expired state files run_agent.py changes: - Top-of-retry-loop guard: when another session already recorded Nous as rate-limited, skip the API call entirely. Try fallback provider first, then return a clear message with the reset time. - On 429 from Nous: record rate limit state and skip further retries (sets retry_count = max_retries to trigger fallback path) - On success from Nous: clear the rate limit state so other sessions know they can resume auxiliary_client.py changes: - _try_nous() checks rate guard before attempting Nous in the auxiliary fallback chain. When rate-limited, returns (None, None) so the chain skips to the next provider instead of piling more requests onto Nous. This eliminates three sources of amplification: 1. Hermes-level retries (saves 6 of 9 calls per turn) 2. Cross-session retries (cron + gateway all skip Nous) 3. Auxiliary fallback to Nous (compression/session_search skip too) Includes 24 tests covering the rate guard module, header parsing, state lifecycle, and auxiliary client integration. 2026-04-15 16:31:48 -07:00			`"""Cross-session rate limit guard for Nous Portal.`

			`Writes rate limit state to a shared file so all sessions (CLI, gateway,`
			`cron, auxiliary) can check whether Nous Portal is currently rate-limited`
			`before making requests. Prevents retry amplification when RPH is tapped.`

			`Each 429 from Nous triggers up to 9 API calls per conversation turn`
			`(3 SDK retries x 3 Hermes retries), and every one of those calls counts`
			`against RPH. By recording the rate limit state on first 429 and checking`
			`it before subsequent attempts, we eliminate the amplification effect.`
			`"""`

			`from __future__ import annotations`

			`import json`
			`import logging`
			`import os`
			`import tempfile`
			`import time`
			`from typing import Any, Mapping, Optional`

			`logger = logging.getLogger(__name__)`

			`_STATE_SUBDIR = "rate_limits"`
			`_STATE_FILENAME = "nous.json"`


			`def _state_path() -> str:`
			`"""Return the path to the Nous rate limit state file."""`
			`try:`
			`from hermes_constants import get_hermes_home`
			`base = get_hermes_home()`
			`except ImportError:`
			`base = os.path.join(os.path.expanduser("~"), ".hermes")`
			`return os.path.join(base, _STATE_SUBDIR, _STATE_FILENAME)`


			`def _parse_reset_seconds(headers: Optional[Mapping[str, str]]) -> Optional[float]:`
			`"""Extract the best available reset-time estimate from response headers.`

			`Priority:`
			`1. x-ratelimit-reset-requests-1h (hourly RPH window — most useful)`
			`2. x-ratelimit-reset-requests (per-minute RPM window)`
			`3. retry-after (generic HTTP header)`

			`Returns seconds-from-now, or None if no usable header found.`
			`"""`
			`if not headers:`
			`return None`

			`lowered = {k.lower(): v for k, v in headers.items()}`

			`for key in (`
			`"x-ratelimit-reset-requests-1h",`
			`"x-ratelimit-reset-requests",`
			`"retry-after",`
			`):`
			`raw = lowered.get(key)`
			`if raw is not None:`
			`try:`
			`val = float(raw)`
			`if val > 0:`
			`return val`
			`except (TypeError, ValueError):`
			`pass`

			`return None`


			`def record_nous_rate_limit(`
			`*,`
			`headers: Optional[Mapping[str, str]] = None,`
			`error_context: Optional[dict[str, Any]] = None,`
			`default_cooldown: float = 300.0,`
			`) -> None:`
			`"""Record that Nous Portal is rate-limited.`

			`Parses the reset time from response headers or error context.`
			Falls back to ``default_cooldown`` (5 minutes) if no reset info
			`is available. Writes to a shared file that all sessions can read.`

			`Args:`
			`headers: HTTP response headers from the 429 error.`
			`error_context: Structured error context from _extract_api_error_context().`
			`default_cooldown: Fallback cooldown in seconds when no header data.`
			`"""`
			`now = time.time()`
			`reset_at = None`

			`# Try headers first (most accurate)`
			`header_seconds = _parse_reset_seconds(headers)`
			`if header_seconds is not None:`
			`reset_at = now + header_seconds`

			`# Try error_context reset_at (from body parsing)`
			`if reset_at is None and isinstance(error_context, dict):`
			`ctx_reset = error_context.get("reset_at")`
			`if isinstance(ctx_reset, (int, float)) and ctx_reset > now:`
			`reset_at = float(ctx_reset)`

			`# Default cooldown`
			`if reset_at is None:`
			`reset_at = now + default_cooldown`

			`path = _state_path()`
			`try:`
			`state_dir = os.path.dirname(path)`
			`os.makedirs(state_dir, exist_ok=True)`

			`state = {`
			`"reset_at": reset_at,`
			`"recorded_at": now,`
			`"reset_seconds": reset_at - now,`
			`}`

			`# Atomic write: write to temp file + rename`
			`fd, tmp_path = tempfile.mkstemp(dir=state_dir, suffix=".tmp")`
			`try:`
			`with os.fdopen(fd, "w") as f:`
			`json.dump(state, f)`
			`os.replace(tmp_path, path)`
			`except Exception:`
			`# Clean up temp file on failure`
			`try:`
			`os.unlink(tmp_path)`
			`except OSError:`
			`pass`
			`raise`

			`logger.info(`
			`"Nous rate limit recorded: resets in %.0fs (at %.0f)",`
			`reset_at - now, reset_at,`
			`)`
			`except Exception as exc:`
			`logger.debug("Failed to write Nous rate limit state: %s", exc)`


			`def nous_rate_limit_remaining() -> Optional[float]:`
			`"""Check if Nous Portal is currently rate-limited.`

			`Returns:`
			`Seconds remaining until reset, or None if not rate-limited.`
			`"""`
			`path = _state_path()`
			`try:`
			`with open(path) as f:`
			`state = json.load(f)`
			`reset_at = state.get("reset_at", 0)`
			`remaining = reset_at - time.time()`
			`if remaining > 0:`
			`return remaining`
			`# Expired — clean up`
			`try:`
			`os.unlink(path)`
			`except OSError:`
			`pass`
			`return None`
			`except (FileNotFoundError, json.JSONDecodeError, KeyError, TypeError):`
			`return None`


			`def clear_nous_rate_limit() -> None:`
			`"""Clear the rate limit state (e.g., after a successful Nous request)."""`
			`try:`
			`os.unlink(_state_path())`
			`except FileNotFoundError:`
			`pass`
			`except OSError as exc:`
			`logger.debug("Failed to clear Nous rate limit state: %s", exc)`


			`def format_remaining(seconds: float) -> str:`
			`"""Format seconds remaining into human-readable duration."""`
			`s = max(0, int(seconds))`
			`if s < 60:`
			`return f"{s}s"`
			`if s < 3600:`
			`m, sec = divmod(s, 60)`
			`return f"{m}m {sec}s" if sec else f"{m}m"`
			`h, remainder = divmod(s, 3600)`
			`m = remainder // 60`
			`return f"{h}h {m}m" if m else f"{h}h"`
fix(nous): don't trip cross-session rate breaker on upstream-capacity 429s (#15898) Nous Portal multiplexes multiple upstream providers (DeepSeek, Kimi, MiMo, Hermes) behind one endpoint. Before this fix, any 429 on any of those models recorded a cross-session file breaker that blocked EVERY model on Nous for the cooldown window -- even though the caller's own RPM/RPH/TPM/TPH buckets were healthy. Users hit a DeepSeek V4 Pro capacity error, restarted, switched to Kimi 2.6, and still got 'Nous Portal rate limit active -- resets in 46m 53s'. Nous already emits the full x-ratelimit-* header suite on every response (captured by rate_limit_tracker into agent._rate_limit_state). We now gate the breaker on that data: trip it only when either the 429's own headers or the last-known-good state show a bucket with remaining == 0 AND a reset window >= 60s. Upstream-capacity 429s (healthy buckets everywhere, but upstream out of capacity) fall through to normal retry/fallback and the breaker is never written. Note: the in-memory 'restart TUI/gateway to clear' workaround circulated in Discord does NOT work -- the breaker is file-backed at ~/.hermes/rate_limits/nous.json. The workaround for users still affected by a bad state file is to delete it. Reported in Discord by CrazyDok1 and KYSIV (Apr 2026). 2026-04-26 04:53:42 -07:00

			`# Buckets with reset windows shorter than this are treated as transient`
			`# (upstream jitter, secondary throttling) rather than a genuine quota`
			`# exhaustion worth a cross-session breaker trip.`
			`_MIN_RESET_FOR_BREAKER_SECONDS = 60.0`


			`def is_genuine_nous_rate_limit(`
			`*,`
			`headers: Optional[Mapping[str, str]] = None,`
			`last_known_state: Optional[Any] = None,`
			`) -> bool:`
			`"""Decide whether a 429 from Nous Portal is a real account rate limit.`

			`Nous Portal multiplexes multiple upstream providers (DeepSeek, Kimi,`
			`MiMo, Hermes, ...) behind one endpoint. A 429 can mean either:`

			`(a) The caller's own RPM / RPH / TPM / TPH bucket on Nous is`
			`exhausted — a genuine rate limit that will last until the`
			`bucket resets.`
			`(b) The upstream provider is out of capacity for a specific model`
			`— transient, clears in seconds, and has nothing to do with`
			`the caller's quota on Nous.`

			`Tripping the cross-session breaker on (b) blocks ALL Nous requests`
			`(and all models, since Nous is one provider key) for minutes even`
			`though the caller's account is healthy and a different model would`
			`have worked. That's the bug users hit when DeepSeek V4 Pro 429s`
			`trigger a breaker that then blocks Kimi 2.6 and MiMo V2.5 Pro.`

			`We tell the two apart by looking at:`

			1. The 429 response's own ``x-ratelimit-*`` headers. Nous emits
			`the full suite on every response including 429s. An exhausted`
			bucket (``remaining == 0`` with a reset window >= 60s) is
			`proof of (a).`
			`2. The last-known-good rate-limit state captured by`
			``_capture_rate_limits()`` on the previous successful
			`response. If any bucket there was already near-exhausted with`
			`a substantial reset window, the current 429 is almost`
			`certainly (a) continuing from that condition.`

			`If neither signal fires, we treat the 429 as (b): fail the single`
			`request, let the retry loop or model-switch proceed, and do NOT`
			`write the cross-session breaker file.`

			`Returns True when the evidence points at (a).`
			`"""`
			`# Signal 1: current 429 response headers.`
			`state = _parse_buckets_from_headers(headers)`
			`if _has_exhausted_bucket(state):`
			`return True`

			`# Signal 2: last-known-good state from a recent successful response.`
			`# Accepts either a RateLimitState (dataclass from rate_limit_tracker)`
			`# or a dict of bucket snapshots.`
			`if last_known_state is not None and _has_exhausted_bucket_in_object(last_known_state):`
			`return True`

			`return False`


			`def _parse_buckets_from_headers(`
			`headers: Optional[Mapping[str, str]],`
			`) -> dict[str, tuple[Optional[int], Optional[float]]]:`
			`"""Extract (remaining, reset_seconds) per bucket from x-ratelimit-* headers.`

			`Returns empty dict when no rate-limit headers are present.`
			`"""`
			`if not headers:`
			`return {}`

			`lowered = {k.lower(): v for k, v in headers.items()}`
			`if not any(k.startswith("x-ratelimit-") for k in lowered):`
			`return {}`

			`def _maybe_int(raw: Optional[str]) -> Optional[int]:`
			`if raw is None:`
			`return None`
			`try:`
			`return int(float(raw))`
			`except (TypeError, ValueError):`
			`return None`

			`def _maybe_float(raw: Optional[str]) -> Optional[float]:`
			`if raw is None:`
			`return None`
			`try:`
			`return float(raw)`
			`except (TypeError, ValueError):`
			`return None`

			`result: dict[str, tuple[Optional[int], Optional[float]]] = {}`
			`for tag in ("requests", "requests-1h", "tokens", "tokens-1h"):`
			`remaining = _maybe_int(lowered.get(f"x-ratelimit-remaining-{tag}"))`
			`reset = _maybe_float(lowered.get(f"x-ratelimit-reset-{tag}"))`
			`if remaining is not None or reset is not None:`
			`result[tag] = (remaining, reset)`
			`return result`


			`def _has_exhausted_bucket(`
			`buckets: Mapping[str, tuple[Optional[int], Optional[float]]],`
			`) -> bool:`
			`"""Return True when any bucket has remaining == 0 AND a meaningful reset window."""`
			`for remaining, reset in buckets.values():`
			`if remaining is None or remaining > 0:`
			`continue`
			`if reset is None:`
			`continue`
			`if reset >= _MIN_RESET_FOR_BREAKER_SECONDS:`
			`return True`
			`return False`


			`def _has_exhausted_bucket_in_object(state: Any) -> bool:`
			`"""Check a RateLimitState-like object for an exhausted bucket.`

			Accepts the dataclass from ``agent.rate_limit_tracker`` (buckets
			exposed as attributes ``requests_min``, ``requests_hour``,
			``tokens_min``, ``tokens_hour``) and falls back gracefully for any
			`object missing those attributes.`
			`"""`
			`for attr in ("requests_min", "requests_hour", "tokens_min", "tokens_hour"):`
			`bucket = getattr(state, attr, None)`
			`if bucket is None:`
			`continue`
			`limit = getattr(bucket, "limit", 0) or 0`
			`remaining = getattr(bucket, "remaining", 0) or 0`
			`# Prefer the adjusted "remaining_seconds_now" property when present;`
			`# fall back to raw reset_seconds.`
			`reset = getattr(bucket, "remaining_seconds_now", None)`
			`if reset is None:`
			`reset = getattr(bucket, "reset_seconds", 0.0) or 0.0`
			`if limit <= 0:`
			`continue`
			`if remaining > 0:`
			`continue`
			`if reset >= _MIN_RESET_FOR_BREAKER_SECONDS:`
			`return True`
			`return False`