Compare commits

...

1 Commit

Author SHA1 Message Date
helix4u
67e086dc4e fix(run-agent): rotate credential pool on billing-classified 400s 2026-04-10 03:26:21 -07:00
2 changed files with 71 additions and 15 deletions

View File

@@ -4219,49 +4219,80 @@ class AIAgent:
*, *,
status_code: Optional[int], status_code: Optional[int],
has_retried_429: bool, has_retried_429: bool,
classified_reason: Optional[FailoverReason] = None,
error_context: Optional[Dict[str, Any]] = None, error_context: Optional[Dict[str, Any]] = None,
) -> tuple[bool, bool]: ) -> tuple[bool, bool]:
"""Attempt credential recovery via pool rotation. """Attempt credential recovery via pool rotation.
Returns (recovered, has_retried_429). Returns (recovered, has_retried_429).
On 429: first occurrence retries same credential (sets flag True). On rate limits: first occurrence retries same credential (sets flag True).
second consecutive 429 rotates to next credential (resets flag). second consecutive failure rotates to next credential.
On 402: immediately rotates (billing exhaustion won't resolve with retry). On billing exhaustion: immediately rotates.
On 401: attempts token refresh before rotating. On auth failures: attempts token refresh before rotating.
`classified_reason` lets the recovery path honor the structured error
classifier instead of relying only on raw HTTP codes. This matters for
providers that surface billing/rate-limit/auth conditions under a
different status code, such as Anthropic returning HTTP 400 for
"out of extra usage".
""" """
pool = self._credential_pool pool = self._credential_pool
if pool is None or status_code is None: if pool is None:
return False, has_retried_429 return False, has_retried_429
effective_reason = classified_reason
if effective_reason is None:
if status_code == 402: if status_code == 402:
next_entry = pool.mark_exhausted_and_rotate(status_code=402, error_context=error_context) effective_reason = FailoverReason.billing
elif status_code == 429:
effective_reason = FailoverReason.rate_limit
elif status_code == 401:
effective_reason = FailoverReason.auth
if effective_reason == FailoverReason.billing:
rotate_status = status_code if status_code is not None else 402
next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
if next_entry is not None: if next_entry is not None:
logger.info(f"Credential 402 (billing) — rotated to pool entry {getattr(next_entry, 'id', '?')}") logger.info(
"Credential %s (billing) — rotated to pool entry %s",
rotate_status,
getattr(next_entry, "id", "?"),
)
self._swap_credential(next_entry) self._swap_credential(next_entry)
return True, False return True, False
return False, has_retried_429 return False, has_retried_429
if status_code == 429: if effective_reason == FailoverReason.rate_limit:
if not has_retried_429: if not has_retried_429:
return False, True return False, True
next_entry = pool.mark_exhausted_and_rotate(status_code=429, error_context=error_context) rotate_status = status_code if status_code is not None else 429
next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
if next_entry is not None: if next_entry is not None:
logger.info(f"Credential 429 (rate limit) — rotated to pool entry {getattr(next_entry, 'id', '?')}") logger.info(
"Credential %s (rate limit) — rotated to pool entry %s",
rotate_status,
getattr(next_entry, "id", "?"),
)
self._swap_credential(next_entry) self._swap_credential(next_entry)
return True, False return True, False
return False, True return False, True
if status_code == 401: if effective_reason == FailoverReason.auth:
refreshed = pool.try_refresh_current() refreshed = pool.try_refresh_current()
if refreshed is not None: if refreshed is not None:
logger.info(f"Credential 401 — refreshed pool entry {getattr(refreshed, 'id', '?')}") logger.info(f"Credential auth failure — refreshed pool entry {getattr(refreshed, 'id', '?')}")
self._swap_credential(refreshed) self._swap_credential(refreshed)
return True, has_retried_429 return True, has_retried_429
# Refresh failed — rotate to next credential instead of giving up. # Refresh failed — rotate to next credential instead of giving up.
# The failed entry is already marked exhausted by try_refresh_current(). # The failed entry is already marked exhausted by try_refresh_current().
next_entry = pool.mark_exhausted_and_rotate(status_code=401, error_context=error_context) rotate_status = status_code if status_code is not None else 401
next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
if next_entry is not None: if next_entry is not None:
logger.info(f"Credential 401 (refresh failed) — rotated to pool entry {getattr(next_entry, 'id', '?')}") logger.info(
"Credential %s (auth refresh failed) — rotated to pool entry %s",
rotate_status,
getattr(next_entry, "id", "?"),
)
self._swap_credential(next_entry) self._swap_credential(next_entry)
return True, False return True, False
@@ -8157,6 +8188,7 @@ class AIAgent:
recovered_with_pool, has_retried_429 = self._recover_with_credential_pool( recovered_with_pool, has_retried_429 = self._recover_with_credential_pool(
status_code=status_code, status_code=status_code,
has_retried_429=has_retried_429, has_retried_429=has_retried_429,
classified_reason=classified.reason,
error_context=error_context, error_context=error_context,
) )
if recovered_with_pool: if recovered_with_pool:

View File

@@ -19,6 +19,7 @@ import pytest
import run_agent import run_agent
from run_agent import AIAgent from run_agent import AIAgent
from agent.error_classifier import FailoverReason
from agent.prompt_builder import DEFAULT_AGENT_IDENTITY from agent.prompt_builder import DEFAULT_AGENT_IDENTITY
@@ -2242,6 +2243,29 @@ class TestCredentialPoolRecovery:
assert retry_same is False assert retry_same is False
agent._swap_credential.assert_called_once_with(next_entry) agent._swap_credential.assert_called_once_with(next_entry)
def test_recover_with_pool_rotates_on_billing_reason_even_with_http_400(self, agent):
    """A billing-classified failure rotates the pool even though the HTTP status is 400."""
    rotated_to = SimpleNamespace(label="secondary")

    class _StubPool:
        """Pool double exposing only the rotation hook this scenario exercises."""

        def mark_exhausted_and_rotate(self, *, status_code, error_context=None):
            # The raw HTTP status (400) and the caller's context must reach
            # the pool exactly as they were passed to the recovery method.
            assert status_code == 400
            assert error_context == {"reason": "out_of_extra_usage"}
            return rotated_to

    swap_spy = MagicMock()
    agent._credential_pool = _StubPool()
    agent._swap_credential = swap_spy

    outcome = agent._recover_with_credential_pool(
        status_code=400,
        has_retried_429=False,
        classified_reason=FailoverReason.billing,
        error_context={"reason": "out_of_extra_usage"},
    )

    # The method returns a pair: (recovered, has_retried_429).
    recovered, has_retried_429 = outcome
    assert recovered is True
    assert has_retried_429 is False
    swap_spy.assert_called_once_with(rotated_to)
def test_recover_with_pool_retries_first_429_then_rotates(self, agent): def test_recover_with_pool_retries_first_429_then_rotates(self, agent):
next_entry = SimpleNamespace(label="secondary") next_entry = SimpleNamespace(label="secondary")