fix(auth): write rotated Codex/xAI pool grant through to global root (#48415 )

CredentialPool._sync_device_code_entry_to_auth_store rotated single-use OAuth refresh tokens but wrote the new chain only into the active profile store. When a profile resolves a grant from the global-root fallback (read_credential_pool, #18594) and the pool then refreshes it, root was left holding a now-revoked refresh token — every other profile reading the stale root grant subsequently died with refresh_token_reused / invalid_grant once its access token expired. This is the credential-pool analog of #43589 (which fixed the non-pool xAI refresh path in _save_xai_oauth_tokens). Detect the read-from-root case (profile lacks its own providers.<id> block) BEFORE the profile save and, after it, write the rotated chain back to the global root via a best-effort, seat-belted write-through. A profile that genuinely shadows root (owns the block) is untouched; classic mode (profile == root) is a no-op; a failed root write never breaks the profile's own save. Covers openai-codex (reported), xai-oauth, and nous through the shared sync path.
2026-06-26 20:03:22 +08:00 · 2026-06-24 00:22:11 -07:00
2 changed files with 274 additions and 0 deletions
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@@ -11,6 +11,7 @@ import uuid
 import re
 from dataclasses import dataclass, fields, replace
 from datetime import datetime, timezone
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple

 from hermes_constants import OPENROUTER_BASE_URL
@@ -447,6 +448,63 @@ def get_pool_strategy(provider: str) -> str:
 DEFAULT_MAX_CONCURRENT_PER_CREDENTIAL = 1


+def _write_through_provider_state_to_global_root(
+    provider_id: str, state: Dict[str, Any]
+) -> None:
+    """Persist a rotated OAuth ``state`` into the global-root auth.json.
+
+    Best-effort write-through for the multi-profile rotation hazard
+    (#48415 / #43589): nous, openai-codex, and xai-oauth rotate the
+    refresh_token on refresh, so when a profile pool refresh rotates a grant
+    it resolved from the root fallback, the rotated chain must land back in
+    root. Otherwise root keeps a now-revoked refresh token and every other
+    profile reading the stale root grant dies with ``refresh_token_reused`` /
+    ``invalid_grant`` once its access token expires.
+
+    Only updates ``providers.<provider_id>`` in the root store; never touches
+    the profile store (the caller already saved that). Swallows all errors — a
+    failed write-through degrades to the pre-existing behavior (root stale), it
+    must never break the profile's own successful save. Mirrors
+    ``hermes_cli.auth._write_through_xai_oauth_to_global_root`` (which covers
+    the non-pool xAI refresh path) for the credential-pool refresh path.
+    """
+    try:
+        global_path = auth_mod._global_auth_file_path()
+    except Exception:
+        return
+    if global_path is None:
+        # Classic mode (profile == root); the profile save already hit root.
+        return
+    # Seat belt: under pytest, refuse to write the real user's
+    # ~/.hermes/auth.json even when HERMES_HOME points at a profile path
+    # (mirrors the read-side guard in _load_global_auth_store). Uses the
+    # unmodified HOME env, not Path.home() which fixtures may monkeypatch.
+    if os.environ.get("PYTEST_CURRENT_TEST"):
+        real_home_env = os.environ.get("HOME", "")
+        if real_home_env:
+            real_root = Path(real_home_env) / ".hermes" / "auth.json"
+            try:
+                if global_path.resolve(strict=False) == real_root.resolve(strict=False):
+                    return
+            except Exception:
+                return
+    try:
+        if global_path.exists():
+            global_store = _load_auth_store(global_path)
+        else:
+            global_store = {}
+        if not isinstance(global_store, dict):
+            return
+        _store_provider_state(global_store, provider_id, dict(state), set_active=False)
+        auth_mod._save_auth_store(global_store, global_path)
+    except Exception as exc:  # pragma: no cover - best effort
+        logger.debug(
+            "%s pool refresh: write-through to global root failed: %s",
+            provider_id,
+            exc,
+        )
+
+
 class CredentialPool:
    def __init__(self, provider: str, entries: List[PooledCredential]):
        self.provider = provider
@@ -800,6 +858,28 @@ class CredentialPool:
        try:
            with _auth_store_lock():
                auth_store = _load_auth_store()
+                # Decide BEFORE writing whether this profile is reading the
+                # grant from the global root (no own providers.<id> block) vs.
+                # genuinely shadowing it. A pool refresh rotates single-use
+                # OAuth refresh tokens, so a profile that resolved the grant
+                # from root MUST write the rotated chain back to root too —
+                # otherwise root keeps a revoked refresh token and every other
+                # profile reading the stale root grant dies with
+                # refresh_token_reused / invalid_grant once its access token
+                # expires. This mirrors the xAI write-through in
+                # hermes_cli.auth._save_xai_oauth_tokens (#43589); the pool
+                # refresh path is the Codex/xAI analog reported in #48415.
+                _wt_provider_id = {
+                    "nous": "nous",
+                    "openai-codex": "openai-codex",
+                    "xai-oauth": "xai-oauth",
+                }.get(self.provider)
+                write_through_to_root = bool(_wt_provider_id) and not (
+                    isinstance(auth_store.get("providers"), dict)
+                    and isinstance(
+                        auth_store["providers"].get(_wt_provider_id), dict
+                    )
+                )
                if self.provider == "nous":
                    state = _load_provider_state(auth_store, "nous")
                    if state is None:
@@ -855,6 +935,10 @@ class CredentialPool:
                    return

                _save_auth_store(auth_store)
+                if write_through_to_root and _wt_provider_id:
+                    _write_through_provider_state_to_global_root(
+                        _wt_provider_id, state
+                    )
        except Exception as exc:
            logger.debug("Failed to sync %s pool entry back to auth store: %s", self.provider, exc)

--- a/tests/agent/test_credential_pool_oauth_writethrough.py
+++ b/tests/agent/test_credential_pool_oauth_writethrough.py
@@ -0,0 +1,190 @@
+"""Regression tests for credential-pool OAuth refresh write-through to root.
+
+Companion to ``tests/hermes_cli/test_xai_oauth_writethrough.py``. That file
+covers the *non-pool* xAI refresh path (``_save_xai_oauth_tokens``). These
+cover the **credential-pool** refresh path
+(``CredentialPool._sync_device_code_entry_to_auth_store``): when a profile
+that has no own ``providers.<id>`` block refreshes — via the pool — a rotating
+OAuth grant it resolved from the global-root fallback, the rotated chain must
+be written back to the global root too. Otherwise root keeps a revoked refresh
+token and every other profile reading root's stale grant dies with
+``refresh_token_reused`` / ``invalid_grant`` once its access token expires
+(issue #48415, the Codex/xAI analog of #43589).
+
+The tests drive the real ``_sync_device_code_entry_to_auth_store`` against
+real on-disk auth stores (profile + root under ``tmp_path``) rather than
+mocking the save boundary, so they exercise the actual atomic write path.
+"""
+
+import json
+
+import pytest
+
+from agent import credential_pool as CP
+from agent.credential_pool import (
+    AUTH_TYPE_OAUTH,
+    CredentialPool,
+    PooledCredential,
+)
+from hermes_cli import auth as A
+
+
+def _write_store(path, store):
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(store), encoding="utf-8")
+
+
+def _read_store(path):
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _entry(provider: str, *, id: str, access_token: str, refresh_token: str):
+    return PooledCredential(
+        provider=provider,
+        id=id,
+        label="cred",
+        auth_type=AUTH_TYPE_OAUTH,
+        priority=0,
+        source="device_code",
+        access_token=access_token,
+        refresh_token=refresh_token,
+    )
+
+
+@pytest.fixture
+def profile_and_root(tmp_path, monkeypatch):
+    """Wire a profile auth store + a distinct global-root auth store on disk.
+
+    The pytest seat belt in ``_write_through_provider_state_to_global_root``
+    only refuses the *real* user's ``$HOME/.hermes/auth.json``; a tmp_path
+    root is allowed, so point HOME away from the tmp root to keep the guard
+    from tripping on these fixtures.
+    """
+    profile_path = tmp_path / "profiles" / "work" / "auth.json"
+    root_path = tmp_path / "root" / "auth.json"
+
+    monkeypatch.setattr(A, "_auth_file_path", lambda: profile_path)
+    monkeypatch.setattr(A, "_global_auth_file_path", lambda: root_path)
+    monkeypatch.setenv("HOME", str(tmp_path / "not-the-root"))
+    return profile_path, root_path
+
+
+@pytest.mark.parametrize(
+    "provider",
+    ["openai-codex", "xai-oauth"],
+)
+def test_pool_refresh_writes_through_to_root_when_profile_reads_root(
+    profile_and_root, provider
+):
+    """A profile reading root's grant must push rotated tokens back to root."""
+    profile_path, root_path = profile_and_root
+    # Profile has NO own provider block (reads root via fallback).
+    _write_store(profile_path, {"version": 1, "providers": {}})
+    _write_store(
+        root_path,
+        {
+            "version": 1,
+            "providers": {
+                provider: {
+                    "tokens": {
+                        "access_token": "old-access",
+                        "refresh_token": "old-refresh",
+                    }
+                }
+            },
+        },
+    )
+
+    pool = CredentialPool(provider, [])
+    pool._sync_device_code_entry_to_auth_store(
+        _entry(provider, id="e1", access_token="new-access", refresh_token="new-refresh")
+    )
+
+    # Profile got the rotated chain (existing behavior).
+    profile = _read_store(profile_path)
+    assert (
+        profile["providers"][provider]["tokens"]["refresh_token"] == "new-refresh"
+    )
+
+    # AND the global root no longer holds the revoked refresh token (#48415).
+    root = _read_store(root_path)
+    assert root["providers"][provider]["tokens"]["access_token"] == "new-access"
+    assert root["providers"][provider]["tokens"]["refresh_token"] == "new-refresh"
+
+
+@pytest.mark.parametrize(
+    "provider",
+    ["openai-codex", "xai-oauth"],
+)
+def test_pool_refresh_does_not_touch_root_when_profile_shadows(
+    profile_and_root, provider
+):
+    """A profile that genuinely shadows root must NOT clobber the root grant."""
+    profile_path, root_path = profile_and_root
+    # Profile has its OWN provider block: it shadows root legitimately.
+    _write_store(
+        profile_path,
+        {
+            "version": 1,
+            "providers": {
+                provider: {
+                    "tokens": {
+                        "access_token": "profile-old",
+                        "refresh_token": "profile-old-refresh",
+                    }
+                }
+            },
+        },
+    )
+    _write_store(
+        root_path,
+        {
+            "version": 1,
+            "providers": {
+                provider: {
+                    "tokens": {
+                        "access_token": "root-untouched",
+                        "refresh_token": "root-untouched-refresh",
+                    }
+                }
+            },
+        },
+    )
+
+    pool = CredentialPool(provider, [])
+    pool._sync_device_code_entry_to_auth_store(
+        _entry(
+            provider,
+            id="e2",
+            access_token="profile-new",
+            refresh_token="profile-new-refresh",
+        )
+    )
+
+    profile = _read_store(profile_path)
+    assert (
+        profile["providers"][provider]["tokens"]["refresh_token"]
+        == "profile-new-refresh"
+    )
+
+    # Root keeps its own grant — write-through must not run when the profile
+    # owns the block.
+    root = _read_store(root_path)
+    assert (
+        root["providers"][provider]["tokens"]["refresh_token"]
+        == "root-untouched-refresh"
+    )
+
+
+def test_write_through_helper_is_noop_in_classic_mode(monkeypatch, tmp_path):
+    """When profile == root (classic mode), the helper must be a no-op.
+
+    ``_global_auth_file_path`` returns None in classic mode; the profile save
+    already wrote to root, so a second write would be redundant (and the
+    helper has nothing to target).
+    """
+    monkeypatch.setattr(A, "_global_auth_file_path", lambda: None)
+    # Must not raise and must not attempt any write.
+    CP._write_through_provider_state_to_global_root(
+        "openai-codex", {"tokens": {"access_token": "a", "refresh_token": "r"}}
+    )