tests/run_agent/test_context_token_tracking.py

"""Tests for context token tracking in run_agent.py's usage extraction.

The context counter (status bar) must show the TOTAL prompt tokens including
Anthropic's cached portions. This is an integration test for the token
extraction in run_conversation(), not the ContextCompressor itself (which
is tested in tests/agent/test_context_compressor.py).
"""

import sys
import types
from types import SimpleNamespace

sys.modules.setdefault("fire", types.SimpleNamespace(Fire=lambda *a, **k: None))
sys.modules.setdefault("firecrawl", types.SimpleNamespace(Firecrawl=object))
sys.modules.setdefault("fal_client", types.SimpleNamespace())

import run_agent


def _patch_bootstrap(monkeypatch):
    monkeypatch.setattr(run_agent, "get_tool_definitions", lambda **kwargs: [{
        "type": "function",
        "function": {"name": "t", "description": "t", "parameters": {"type": "object", "properties": {}}},
    }])
    monkeypatch.setattr(run_agent, "check_toolset_requirements", lambda: {})


class _FakeAnthropicClient:
    def close(self):
        pass


class _FakeOpenAIClient:
    """Fake OpenAI client returned by mocked resolve_provider_client."""
    api_key = "fake-codex-key"
    base_url = "https://api.openai.com/v1"
    _default_headers = None


def _make_agent(monkeypatch, api_mode, provider, response_fn):
    _patch_bootstrap(monkeypatch)
    if api_mode == "anthropic_messages":
        monkeypatch.setattr("agent.anthropic_adapter.build_anthropic_client", lambda k, b=None: _FakeAnthropicClient())
    if provider == "openai-codex":
        monkeypatch.setattr(
            "agent.auxiliary_client.resolve_provider_client",
            lambda *a, **kw: (_FakeOpenAIClient(), "test-model"),
        )

    class _A(run_agent.AIAgent):
        def __init__(self, *a, **kw):
            kw.update(skip_context_files=True, skip_memory=True, max_iterations=4)
            super().__init__(*a, **kw)
            self._cleanup_task_resources = self._persist_session = lambda *a, **k: None
            self._save_trajectory = self._save_session_log = lambda *a, **k: None

        def run_conversation(self, msg, conversation_history=None, task_id=None):
            self._interruptible_api_call = lambda kw: response_fn()
            self._disable_streaming = True
            return super().run_conversation(msg, conversation_history=conversation_history, task_id=task_id)

    return _A(model="test-model", api_key="test-key", base_url="http://localhost:1234/v1", provider=provider, api_mode=api_mode)


def _anthropic_resp(input_tok, output_tok, cache_read=0, cache_creation=0):
    usage_fields = {"input_tokens": input_tok, "output_tokens": output_tok}
    if cache_read:
        usage_fields["cache_read_input_tokens"] = cache_read
    if cache_creation:
        usage_fields["cache_creation_input_tokens"] = cache_creation
    return SimpleNamespace(
        content=[SimpleNamespace(type="text", text="ok")],
        stop_reason="end_turn",
        usage=SimpleNamespace(**usage_fields),
        model="claude-sonnet-4-6",
    )


# -- Anthropic: cached tokens must be included --

def test_anthropic_cache_read_and_creation_added(monkeypatch):
    agent = _make_agent(monkeypatch, "anthropic_messages", "anthropic",
                        lambda: _anthropic_resp(3, 10, cache_read=15000, cache_creation=2000))
    agent.run_conversation("hi")
    assert agent.context_compressor.last_prompt_tokens == 17003  # 3+15000+2000
    assert agent.session_prompt_tokens == 17003


def test_anthropic_no_cache_fields(monkeypatch):
    agent = _make_agent(monkeypatch, "anthropic_messages", "anthropic",
                        lambda: _anthropic_resp(500, 20))
    agent.run_conversation("hi")
    assert agent.context_compressor.last_prompt_tokens == 500


def test_anthropic_cache_read_only(monkeypatch):
    agent = _make_agent(monkeypatch, "anthropic_messages", "anthropic",
                        lambda: _anthropic_resp(5, 15, cache_read=17666, cache_creation=15))
    agent.run_conversation("hi")
    assert agent.context_compressor.last_prompt_tokens == 17686  # 5+17666+15


# -- OpenAI: prompt_tokens already total --

def test_openai_prompt_tokens_unchanged(monkeypatch):
    resp = lambda: SimpleNamespace(
        choices=[SimpleNamespace(index=0, message=SimpleNamespace(
            role="assistant", content="ok", tool_calls=None, reasoning_content=None,
        ), finish_reason="stop")],
        usage=SimpleNamespace(prompt_tokens=5000, completion_tokens=100, total_tokens=5100),
        model="gpt-4o",
    )
    agent = _make_agent(monkeypatch, "chat_completions", "openrouter", resp)
    agent.run_conversation("hi")
    assert agent.context_compressor.last_prompt_tokens == 5000


# -- Codex: no cache fields, getattr returns 0 --

def test_codex_no_cache_fields(monkeypatch):
    resp = lambda: SimpleNamespace(
        output=[SimpleNamespace(type="message", content=[SimpleNamespace(type="output_text", text="ok")])],
        usage=SimpleNamespace(input_tokens=3000, output_tokens=50, total_tokens=3050),
        status="completed", model="gpt-5-codex",
    )
    agent = _make_agent(monkeypatch, "codex_responses", "openai-codex", resp)
    agent.run_conversation("hi")
    assert agent.context_compressor.last_prompt_tokens == 3000
fix: context counter shows cached token count in status bar Anthropic prompt caching splits input into cache_read_input_tokens, cache_creation_input_tokens, and non-cached input_tokens. The context counter only read input_tokens (non-cached portion), showing ~3 tokens instead of the real ~18K total. Now includes cached portions for Anthropic native provider only — other providers (OpenAI, OpenRouter, Codex) already include cached tokens in their prompt_tokens field. Before: 3/200K \| 0% After: 17.7K/200K \| 9% 2026-03-17 04:39:11 +03:00			`"""Tests for context token tracking in run_agent.py's usage extraction.`

			`The context counter (status bar) must show the TOTAL prompt tokens including`
			`Anthropic's cached portions. This is an integration test for the token`
			`extraction in run_conversation(), not the ContextCompressor itself (which`
			`is tested in tests/agent/test_context_compressor.py).`
			`"""`

			`import sys`
			`import types`
			`from types import SimpleNamespace`

			`sys.modules.setdefault("fire", types.SimpleNamespace(Fire=lambda a, *k: None))`
			`sys.modules.setdefault("firecrawl", types.SimpleNamespace(Firecrawl=object))`
			`sys.modules.setdefault("fal_client", types.SimpleNamespace())`

			`import run_agent`


			`def _patch_bootstrap(monkeypatch):`
			`monkeypatch.setattr(run_agent, "get_tool_definitions", lambda **kwargs: [{`
			`"type": "function",`
			`"function": {"name": "t", "description": "t", "parameters": {"type": "object", "properties": {}}},`
			`}])`
			`monkeypatch.setattr(run_agent, "check_toolset_requirements", lambda: {})`


			`class _FakeAnthropicClient:`
			`def close(self):`
			`pass`


fix(tests): resolve all consistently failing tests - test_plugins.py: remove tests for unimplemented plugin command API (get_plugin_command_handler, register_command never existed) - test_redact.py: add autouse fixture to clear HERMES_REDACT_SECRETS env var leaked by cli.py import in other tests - test_signal.py: same HERMES_REDACT_SECRETS fix for phone redaction - test_mattermost.py: add @bot_user_id to test messages after the mention-only filter was added in #2443 - test_context_token_tracking.py: mock resolve_provider_client for openai-codex provider that requires real OAuth credentials Full suite: 5893 passed, 0 failed. 2026-03-22 05:58:26 -07:00			`class _FakeOpenAIClient:`
			`"""Fake OpenAI client returned by mocked resolve_provider_client."""`
			`api_key = "fake-codex-key"`
			`base_url = "https://api.openai.com/v1"`
			`_default_headers = None`


fix: context counter shows cached token count in status bar Anthropic prompt caching splits input into cache_read_input_tokens, cache_creation_input_tokens, and non-cached input_tokens. The context counter only read input_tokens (non-cached portion), showing ~3 tokens instead of the real ~18K total. Now includes cached portions for Anthropic native provider only — other providers (OpenAI, OpenRouter, Codex) already include cached tokens in their prompt_tokens field. Before: 3/200K \| 0% After: 17.7K/200K \| 9% 2026-03-17 04:39:11 +03:00			`def _make_agent(monkeypatch, api_mode, provider, response_fn):`
			`_patch_bootstrap(monkeypatch)`
			`if api_mode == "anthropic_messages":`
			`monkeypatch.setattr("agent.anthropic_adapter.build_anthropic_client", lambda k, b=None: _FakeAnthropicClient())`
fix(tests): resolve all consistently failing tests - test_plugins.py: remove tests for unimplemented plugin command API (get_plugin_command_handler, register_command never existed) - test_redact.py: add autouse fixture to clear HERMES_REDACT_SECRETS env var leaked by cli.py import in other tests - test_signal.py: same HERMES_REDACT_SECRETS fix for phone redaction - test_mattermost.py: add @bot_user_id to test messages after the mention-only filter was added in #2443 - test_context_token_tracking.py: mock resolve_provider_client for openai-codex provider that requires real OAuth credentials Full suite: 5893 passed, 0 failed. 2026-03-22 05:58:26 -07:00			`if provider == "openai-codex":`
			`monkeypatch.setattr(`
			`"agent.auxiliary_client.resolve_provider_client",`
			`lambda a, *kw: (_FakeOpenAIClient(), "test-model"),`
			`)`
fix: context counter shows cached token count in status bar Anthropic prompt caching splits input into cache_read_input_tokens, cache_creation_input_tokens, and non-cached input_tokens. The context counter only read input_tokens (non-cached portion), showing ~3 tokens instead of the real ~18K total. Now includes cached portions for Anthropic native provider only — other providers (OpenAI, OpenRouter, Codex) already include cached tokens in their prompt_tokens field. Before: 3/200K \| 0% After: 17.7K/200K \| 9% 2026-03-17 04:39:11 +03:00
			`class _A(run_agent.AIAgent):`
			`def __init__(self, a, *kw):`
			`kw.update(skip_context_files=True, skip_memory=True, max_iterations=4)`
			`super().__init__(a, *kw)`
			`self._cleanup_task_resources = self._persist_session = lambda a, *k: None`
			`self._save_trajectory = self._save_session_log = lambda a, *k: None`

			`def run_conversation(self, msg, conversation_history=None, task_id=None):`
			`self._interruptible_api_call = lambda kw: response_fn()`
fix(tests): fix 78 CI test failures and remove dead test (#9036) Production fixes: - voice_mode.py: add is_recording property to AudioRecorder (parity with TermuxAudioRecorder) - cronjob_tools.py: add sms example to deliver description Test fixes: - test_real_interrupt_subagent: add missing _execution_thread_id (fixes 19 cascading failures from leaked _build_system_prompt patch) - test_anthropic_error_handling: add _FakeMessages, override _interruptible_streaming_api_call (6 fixes) - test_ctx_halving_fix: add missing request_overrides attribute (4 fixes) - test_context_token_tracking: set _disable_streaming=True for non-streaming test path (4 fixes) - test_dict_tool_call_args: set _disable_streaming=True (1 fix) - test_provider_parity: add model='gpt-4o' for AIGateway tests to meet 64K minimum context (4 fixes) - test_session_race_guard: add user_id to SessionSource (5 fixes) - test_restart_drain/helpers: add user_id to SessionSource (2 fixes) - test_telegram_photo_interrupts: add user_id to SessionSource - test_interrupt: target thread_id for per-thread interrupt system (2 fixes) - test_zombie_process_cleanup: rewrite with object.__new__ for refactored GatewayRunner.stop() (1 fix) - test_browser_camofox_state: update config version 15->17 (1 fix) - test_trajectory_compressor_async: widen lookback window 10->20 for line-shifted AsyncOpenAI (1 fix) - test_voice_mode: fixed by production is_recording addition (5 fixes) - test_voice_cli_integration: add _attached_images to CLI stub (2 fixes) - test_hermes_logging: explicit propagation/level reset for cross-test pollution defense (1 fix) - test_run_agent: add base_url for OpenRouter detection tests (2 fixes) Deleted: - test_inline_think_blocks_reasoning_only_accepted: tested unimplemented inline <think> handling 2026-04-13 10:50:24 -07:00			`self._disable_streaming = True`
fix: context counter shows cached token count in status bar Anthropic prompt caching splits input into cache_read_input_tokens, cache_creation_input_tokens, and non-cached input_tokens. The context counter only read input_tokens (non-cached portion), showing ~3 tokens instead of the real ~18K total. Now includes cached portions for Anthropic native provider only — other providers (OpenAI, OpenRouter, Codex) already include cached tokens in their prompt_tokens field. Before: 3/200K \| 0% After: 17.7K/200K \| 9% 2026-03-17 04:39:11 +03:00			`return super().run_conversation(msg, conversation_history=conversation_history, task_id=task_id)`

fix(tests): resolve 12 CI failures + 10 errors across 6 root causes (#11040) Group A (3 tests): 'No LLM provider configured' RuntimeError - test_user_message_surrogates_sanitized, test_counters_initialized_in_init, test_openai_prompt_tokens_unchanged - Root cause: AIAgent.__init__ now requires base_url alongside api_key to skip resolve_provider_client() (which returns None when API keys are blanked in CI). Added base_url='http://localhost:1234/v1' to test agent construction. Group B (5 tests): Discord slash command auto-registration - test_auto_registers_missing_gateway_commands, test_auto_registered_command_, test_register_skill_group_ - Root cause: xdist workers that loaded a discord mock WITHOUT app_commands.Command/Group caused _register_slash_commands() to fail silently. Added comprehensive shared discord mock in tests/gateway/conftest.py (same pattern as existing telegram mock). Group C (5 errors): Discord reply mode 'NoneType has no DMChannel' - All TestReplyToText tests - Root cause: FakeDMChannel was not a subclass of real discord.DMChannel, so isinstance() checks in _handle_message failed when running in full suite (real discord installed). Made FakeDMChannel inherit from discord.DMChannel when available. Removed fragile monkeypatch approach. Group D (2 tests): detect_provider_for_model wrong provider - test_openrouter_slug_match (got 'ai-gateway'), test_bare_name_gets_ openrouter_slug (got 'copilot') - Root cause: ai-gateway, copilot, and kilocode are multi-vendor aggregators that list other providers' models (OpenRouter-style slugs). They were being matched in Step 1 before OpenRouter. Added all three to _AGGREGATORS set so they're skipped like nous/openrouter. Group E (1 test): model_flow_custom StopIteration - test_model_flow_custom_saves_verified_v1_base_url - Root cause: 'Display name' prompt was added after the test was written. The input iterator had 5 answers but the flow now asks 6 questions. Added 6th empty string answer. Group F (1 test): Telegram proxy env assertion - test_uses_proxy_env_for_primary_and_fallback_transports - Root cause: _resolve_proxy_url() now checks TELEGRAM_PROXY first (via resolve_proxy_url('TELEGRAM_PROXY')). Test didn't clear this env var, allowing potential leakage from other tests in xdist workers. Added TELEGRAM_PROXY to the cleanup list. 2026-04-16 06:49:36 -07:00			`return _A(model="test-model", api_key="test-key", base_url="http://localhost:1234/v1", provider=provider, api_mode=api_mode)`
fix: context counter shows cached token count in status bar Anthropic prompt caching splits input into cache_read_input_tokens, cache_creation_input_tokens, and non-cached input_tokens. The context counter only read input_tokens (non-cached portion), showing ~3 tokens instead of the real ~18K total. Now includes cached portions for Anthropic native provider only — other providers (OpenAI, OpenRouter, Codex) already include cached tokens in their prompt_tokens field. Before: 3/200K \| 0% After: 17.7K/200K \| 9% 2026-03-17 04:39:11 +03:00

			`def _anthropic_resp(input_tok, output_tok, cache_read=0, cache_creation=0):`
			`usage_fields = {"input_tokens": input_tok, "output_tokens": output_tok}`
			`if cache_read:`
			`usage_fields["cache_read_input_tokens"] = cache_read`
			`if cache_creation:`
			`usage_fields["cache_creation_input_tokens"] = cache_creation`
			`return SimpleNamespace(`
			`content=[SimpleNamespace(type="text", text="ok")],`
			`stop_reason="end_turn",`
			`usage=SimpleNamespace(**usage_fields),`
			`model="claude-sonnet-4-6",`
			`)`


			`# -- Anthropic: cached tokens must be included --`

			`def test_anthropic_cache_read_and_creation_added(monkeypatch):`
			`agent = _make_agent(monkeypatch, "anthropic_messages", "anthropic",`
			`lambda: _anthropic_resp(3, 10, cache_read=15000, cache_creation=2000))`
			`agent.run_conversation("hi")`
			`assert agent.context_compressor.last_prompt_tokens == 17003 # 3+15000+2000`
			`assert agent.session_prompt_tokens == 17003`


			`def test_anthropic_no_cache_fields(monkeypatch):`
			`agent = _make_agent(monkeypatch, "anthropic_messages", "anthropic",`
			`lambda: _anthropic_resp(500, 20))`
			`agent.run_conversation("hi")`
			`assert agent.context_compressor.last_prompt_tokens == 500`


			`def test_anthropic_cache_read_only(monkeypatch):`
			`agent = _make_agent(monkeypatch, "anthropic_messages", "anthropic",`
			`lambda: _anthropic_resp(5, 15, cache_read=17666, cache_creation=15))`
			`agent.run_conversation("hi")`
			`assert agent.context_compressor.last_prompt_tokens == 17686 # 5+17666+15`


			`# -- OpenAI: prompt_tokens already total --`

			`def test_openai_prompt_tokens_unchanged(monkeypatch):`
			`resp = lambda: SimpleNamespace(`
			`choices=[SimpleNamespace(index=0, message=SimpleNamespace(`
			`role="assistant", content="ok", tool_calls=None, reasoning_content=None,`
			`), finish_reason="stop")],`
			`usage=SimpleNamespace(prompt_tokens=5000, completion_tokens=100, total_tokens=5100),`
			`model="gpt-4o",`
			`)`
			`agent = _make_agent(monkeypatch, "chat_completions", "openrouter", resp)`
			`agent.run_conversation("hi")`
			`assert agent.context_compressor.last_prompt_tokens == 5000`


			`# -- Codex: no cache fields, getattr returns 0 --`

			`def test_codex_no_cache_fields(monkeypatch):`
			`resp = lambda: SimpleNamespace(`
			`output=[SimpleNamespace(type="message", content=[SimpleNamespace(type="output_text", text="ok")])],`
			`usage=SimpleNamespace(input_tokens=3000, output_tokens=50, total_tokens=3050),`
			`status="completed", model="gpt-5-codex",`
			`)`
			`agent = _make_agent(monkeypatch, "codex_responses", "openai-codex", resp)`
			`agent.run_conversation("hi")`
			`assert agent.context_compressor.last_prompt_tokens == 3000`