Compare commits

..

4 Commits

Author SHA1 Message Date
Teknium
6c56d60d82 fix(tests): also add api_key where missing (AIAgent needs BOTH for direct path)
My last fix added base_url but not api_key. AIAgent.__init__ takes the
direct-construction path only when BOTH are set — with only base_url
it still calls resolve_provider_client and fails in hermetic CI.

Same 31 call sites, now with both kwargs.
2026-04-17 07:41:49 -07:00
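(Illustrative reconstruction of the dispatch the two fixes above describe; this is not the real run_agent.py source. resolve_provider_client is named in the commit message, while the stub client and exact signature are assumptions.)

# Hypothetical sketch, assuming a dispatch shaped like the one described:
class _StubClient:
    def __init__(self, api_key: str, base_url: str) -> None:
        self.api_key, self.base_url = api_key, base_url

def resolve_provider_client(model: str) -> _StubClient:
    # Hermetic CI: no env vars and no config, so the resolver has nothing to use.
    raise RuntimeError("No LLM provider configured")

class AIAgent:
    def __init__(self, model: str, api_key: str | None = None,
                 base_url: str | None = None) -> None:
        if api_key and base_url:
            # Direct-construction path: taken only when BOTH kwargs are set.
            self.client = _StubClient(api_key=api_key, base_url=base_url)
        else:
            # One (or neither) set: still falls through to the resolver.
            self.client = resolve_provider_client(model)

AIAgent("m", api_key="k", base_url="https://openrouter.ai/api/v1")  # direct path
try:
    AIAgent("m", base_url="https://openrouter.ai/api/v1")  # api_key missing
except RuntimeError as exc:
    assert "No LLM provider configured" in str(exc)
else:
    raise AssertionError("expected resolver failure")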
Teknium
5d179b9777 fix(tests): pass base_url to 31 more AIAgent() calls across run_agent tests
Same root cause as previous commit — tests that construct AIAgent()
without base_url rely on provider-resolver fallback state that doesn't
exist in hermetic CI / shard-split runs. Previously hidden because
other tests in the same xdist worker happened to prime module state.

The previous fix covered calls that passed api_key but not base_url;
this one covers calls that pass NEITHER (model=... only) — test_streaming.py
especially (24 call sites). Plus test_860_dedup, test_compression_persistence,
test_create_openai_client_*, test_provider_parity.

One call site (test_none_base_url_passed_as_none) remains explicitly
unmodified — it asserts None/empty base_url behavior, so adding base_url
would defeat the test's intent.

Validation:
- tests/run_agent/: 760 passed, 0 failed (local)
- Matrix shard 3 subset: 3098 passed, 0 failed, 1m49s (local)
2026-04-17 07:17:54 -07:00
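(Illustrative before/after of the call-site change; the model name and key below are placeholders, not the tests' real arguments.)

# Before: model=... only; works only if another test in the same xdist
# worker already primed the provider-resolver module state.
agent = AIAgent(model="moonshotai/kimi-k2.5")

# After: self-contained; both kwargs force the direct-construction path.
agent = AIAgent(
    model="moonshotai/kimi-k2.5",
    api_key="test-key",  # placeholder
    base_url="https://openrouter.ai/api/v1",
)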
Teknium
c2559b80fa fix(tests): pass base_url explicitly in AIAgent constructor calls
Tests that construct AIAgent(api_key=..., ...) without base_url were
relying on provider-resolver fallback state from other tests in the
same xdist worker. When matrix-split distributed them to different
shards, the resolver found no env vars and no config and raised
'No LLM provider configured'.

Fix: add base_url='https://openrouter.ai/api/v1' to every AIAgent
construction that passes api_key. AIAgent.__init__ with both args set
takes the direct-construction path (line 960 in run_agent.py) and
skips resolver fallback entirely, making these tests self-contained.

7 files, 16 call sites updated via AST-based fixup. One call site
(test_none_base_url_passed_as_none) left alone — that test's
intent is to verify base_url=None behavior, so adding base_url
defeats the test.

Validation:
- tests/run_agent/ full run: 760 passed, 0 failed (was 1 failure
  under the AST script's over-application, now clean)
- Matrix shard 3 local run: 3083 passed, 0 failed, 1m44s
2026-04-17 06:59:35 -07:00
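(The fixup script itself is not part of this diff. A minimal sketch of its detection pass using the stdlib ast module might look like the following; the paths and report format are assumptions.)

import ast
import pathlib

BASE_URL = "https://openrouter.ai/api/v1"

def calls_missing_base_url(path: pathlib.Path) -> list[int]:
    """Line numbers of AIAgent() calls that pass api_key= but no base_url=."""
    tree = ast.parse(path.read_text())
    hits = []
    for node in ast.walk(tree):
        if (isinstance(node, ast.Call)
                and isinstance(node.func, ast.Name)
                and node.func.id == "AIAgent"):
            kwargs = {kw.arg for kw in node.keywords}
            if "api_key" in kwargs and "base_url" not in kwargs:
                hits.append(node.lineno)
    return hits

for test_file in pathlib.Path("tests/run_agent").rglob("test_*.py"):
    for lineno in calls_missing_base_url(test_file):
        print(f"{test_file}:{lineno}: add base_url={BASE_URL!r}")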
Teknium
50f23ea522 ci: split Tests workflow into 4 parallel shards via pytest-split
Target: <2min CI test wall time.

Runs the Tests workflow as a 4-way matrix instead of one job. Each
shard runs ~3,000 tests on its own ubuntu-latest runner (4 cores) with
-n auto xdist inside. Total effective parallelism: 16 workers across
4 machines (vs 4 workers on 1 machine today).

Was previously tried in #11566 and closed — shard 3 hung at 97% complete
for 100+ seconds with dozens of E/F markers. Root cause was cross-test
pollution exposed by splitting test files across shards (e.g. the three
test files that mutated sys.modules['dotenv'] at import time poisoned
whichever shard they landed in). That's now fixed by #11453 and #11577:
conftest is hermetic, the dotenv stub bombs are removed, and tests no
longer depend on each other's env-var side effects.

Changes:
- pyproject.toml: add pytest-split>=0.9,<1 to dev extras
- .github/workflows/tests.yml: 'test' job becomes matrix-split into 4
  groups with fail-fast: false. Runs 'pytest --splits 4 --group N'.
  pytest-split composes with -n auto from pyproject addopts.

e2e job is unchanged (already small, 20s).

Expected timing:
  Before: ~4m total (243s test step + ~25s setup)
  After:  ~90-115s total (shard wall time ~60-90s + ~25s setup)

Hash-based split is deterministic; no .test_durations file needed yet.
Can add one later via --store-durations for better shard balance.
2026-04-17 06:59:35 -07:00
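(For intuition only: a toy model of a deterministic hash-based split. This is not pytest-split's actual algorithm; it just shows why no shared durations file is needed when the shard for a test depends only on its node ID, so every runner computes the same partition independently.)

import zlib

def shard_of(test_id: str, splits: int = 4) -> int:
    """Stable 1-based group number for a test node ID, same on every runner."""
    return zlib.crc32(test_id.encode()) % splits + 1

assert shard_of("tests/run_agent/test_streaming.py::test_basic") in range(1, 5)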
11 changed files with 35 additions and 178 deletions

View File

@@ -16,8 +16,13 @@ concurrency:
 jobs:
   test:
+    name: test (${{ matrix.group }}/4)
     runs-on: ubuntu-latest
     timeout-minutes: 10
+    strategy:
+      fail-fast: false
+      matrix:
+        group: [1, 2, 3, 4]
     steps:
       - name: Checkout code
         uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
@@ -37,10 +42,11 @@ jobs:
           source .venv/bin/activate
           uv pip install -e ".[all,dev]"
-      - name: Run tests
+      - name: Run tests (shard ${{ matrix.group }}/4)
         run: |
           source .venv/bin/activate
-          python -m pytest tests/ -q --ignore=tests/integration --ignore=tests/e2e --tb=short -n auto
+          python -m pytest tests/ -q --ignore=tests/integration --ignore=tests/e2e --tb=short \
+            --splits 4 --group ${{ matrix.group }}
         env:
           # Ensure tests don't accidentally call real APIs
           OPENROUTER_API_KEY: ""

View File

@@ -2472,10 +2472,10 @@ def _model_flow_kimi(config, current_model=""):
     # Step 3: Model selection — show appropriate models for the endpoint
     if is_coding_plan:
-        # Coding Plan models (kimi-k2.5 first)
+        # Coding Plan models (kimi-for-coding first)
         model_list = [
-            "kimi-k2.5",
             "kimi-for-coding",
+            "kimi-k2.5",
             "kimi-k2-thinking",
             "kimi-k2-thinking-turbo",
         ]

View File

@@ -26,8 +26,7 @@ COPILOT_REASONING_EFFORTS_O_SERIES = ["low", "medium", "high"]
 # Fallback OpenRouter snapshot used when the live catalog is unavailable.
 # (model_id, display description shown in menus)
 OPENROUTER_MODELS: list[tuple[str, str]] = [
-    ("moonshotai/kimi-k2.5", "recommended"),
-    ("anthropic/claude-opus-4.7", ""),
+    ("anthropic/claude-opus-4.7", "recommended"),
     ("anthropic/claude-opus-4.6", ""),
     ("anthropic/claude-sonnet-4.6", ""),
     ("qwen/qwen3.6-plus", ""),
@@ -50,6 +49,7 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
     ("z-ai/glm-5.1", ""),
     ("z-ai/glm-5v-turbo", ""),
     ("z-ai/glm-5-turbo", ""),
+    ("moonshotai/kimi-k2.5", ""),
     ("x-ai/grok-4.20", ""),
     ("nvidia/nemotron-3-super-120b-a12b", ""),
     ("nvidia/nemotron-3-super-120b-a12b:free", "free"),
@@ -75,7 +75,6 @@ def _codex_curated_models() -> list[str]:
 _PROVIDER_MODELS: dict[str, list[str]] = {
     "nous": [
-        "moonshotai/kimi-k2.5",
         "xiaomi/mimo-v2-pro",
         "anthropic/claude-opus-4.7",
         "anthropic/claude-opus-4.6",
@@ -97,6 +96,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
         "z-ai/glm-5.1",
         "z-ai/glm-5v-turbo",
         "z-ai/glm-5-turbo",
+        "moonshotai/kimi-k2.5",
         "x-ai/grok-4.20-beta",
         "nvidia/nemotron-3-super-120b-a12b",
         "nvidia/nemotron-3-super-120b-a12b:free",
@@ -156,8 +156,8 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
         "grok-4-1-fast-reasoning",
     ],
     "kimi-coding": [
-        "kimi-k2.5",
         "kimi-for-coding",
+        "kimi-k2.5",
         "kimi-k2-thinking",
         "kimi-k2-thinking-turbo",
         "kimi-k2-turbo-preview",
@@ -212,7 +212,6 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
         "trinity-mini",
     ],
     "opencode-zen": [
-        "kimi-k2.5",
         "gpt-5.4-pro",
         "gpt-5.4",
         "gpt-5.3-codex",
@@ -244,15 +243,16 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
         "glm-5",
         "glm-4.7",
         "glm-4.6",
+        "kimi-k2.5",
         "kimi-k2-thinking",
         "kimi-k2",
         "qwen3-coder",
         "big-pickle",
     ],
     "opencode-go": [
-        "kimi-k2.5",
         "glm-5.1",
         "glm-5",
+        "kimi-k2.5",
         "mimo-v2-pro",
         "mimo-v2-omni",
         "minimax-m2.7",
@@ -285,21 +285,21 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
     # to https://dashscope-intl.aliyuncs.com/compatible-mode/v1 (OpenAI-compat)
     # or https://dashscope-intl.aliyuncs.com/apps/anthropic (Anthropic-compat).
     "alibaba": [
-        "kimi-k2.5",
         "qwen3.5-plus",
         "qwen3-coder-plus",
         "qwen3-coder-next",
         # Third-party models available on coding-intl
         "glm-5",
         "glm-4.7",
+        "kimi-k2.5",
         "MiniMax-M2.5",
     ],
     # Curated HF model list — only agentic models that map to OpenRouter defaults.
     "huggingface": [
-        "moonshotai/Kimi-K2.5",
         "Qwen/Qwen3.5-397B-A17B",
         "Qwen/Qwen3.5-35B-A3B",
         "deepseek-ai/DeepSeek-V3.2",
+        "moonshotai/Kimi-K2.5",
         "MiniMaxAI/MiniMax-M2.5",
         "zai-org/GLM-5",
         "XiaomiMiMo/MiMo-V2-Flash",

View File

@@ -39,7 +39,7 @@ dependencies = [
 [project.optional-dependencies]
 modal = ["modal>=1.0.0,<2"]
 daytona = ["daytona>=0.148.0,<1"]
-dev = ["debugpy>=1.8.0,<2", "pytest>=9.0.2,<10", "pytest-asyncio>=1.3.0,<2", "pytest-xdist>=3.0,<4", "mcp>=1.2.0,<2"]
+dev = ["debugpy>=1.8.0,<2", "pytest>=9.0.2,<10", "pytest-asyncio>=1.3.0,<2", "pytest-xdist>=3.0,<4", "pytest-split>=0.9,<1", "mcp>=1.2.0,<2"]
 messaging = ["python-telegram-bot[webhooks]>=22.6,<23", "discord.py[voice]>=2.7.1,<3", "aiohttp>=3.13.3,<4", "slack-bolt>=1.18.0,<2", "slack-sdk>=3.27.0,<4", "qrcode>=7.0,<8"]
 cron = ["croniter>=6.0.0,<7"]
 slack = ["slack-bolt>=1.18.0,<2", "slack-sdk>=3.27.0,<4"]

View File

@@ -15,7 +15,7 @@ def test_opencode_go_appears_when_api_key_set():
     opencode_go = next((p for p in providers if p["slug"] == "opencode-go"), None)
     assert opencode_go is not None, "opencode-go should appear when OPENCODE_GO_API_KEY is set"
-    assert opencode_go["models"] == ["kimi-k2.5", "glm-5.1", "glm-5", "mimo-v2-pro", "mimo-v2-omni", "minimax-m2.7", "minimax-m2.5"]
+    assert opencode_go["models"] == ["glm-5.1", "glm-5", "kimi-k2.5", "mimo-v2-pro", "mimo-v2-omni", "minimax-m2.7", "minimax-m2.5"]
     # opencode-go can appear as "built-in" (from PROVIDER_TO_MODELS_DEV when
     # models.dev is reachable) or "hermes" (from HERMES_OVERLAYS fallback when
     # the API is unavailable, e.g. in CI).

View File

@@ -31,31 +31,6 @@ def _isolate_env(tmp_path, monkeypatch):
     monkeypatch.delenv("RETAINDB_PROJECT", raising=False)
-@pytest.fixture(autouse=True)
-def _cap_retaindb_sleeps(monkeypatch):
-    """Cap production-code sleeps so background-thread tests run fast.
-    The retaindb ``_WriteQueue._flush_row`` does ``time.sleep(2)`` after
-    errors. Across multiple tests that trigger the retry path, that adds
-    up. Cap the module's bound ``time.sleep`` to 0.05s — tests don't care
-    about the exact retry delay, only that it happens. The test file's
-    own ``time.sleep`` stays real since it uses a different reference.
-    """
-    try:
-        from plugins.memory import retaindb as _retaindb
-    except ImportError:
-        return
-    real_sleep = _retaindb.time.sleep
-    def _capped_sleep(seconds):
-        return real_sleep(min(float(seconds), 0.05))
-    import types as _types
-    fake_time = _types.SimpleNamespace(sleep=_capped_sleep, time=_retaindb.time.time)
-    monkeypatch.setattr(_retaindb, "time", fake_time)
 # We need the repo root on sys.path so the plugin can import agent.memory_provider
 import sys
 _repo_root = str(Path(__file__).resolve().parents[2])
@@ -155,18 +130,16 @@ class TestWriteQueue:
     def test_enqueue_creates_row(self, tmp_path):
         q, client, db_path = self._make_queue(tmp_path)
         q.enqueue("user1", "sess1", [{"role": "user", "content": "hi"}])
-        # shutdown() blocks until the writer thread drains the queue — no need
-        # to pre-sleep (the old 1s sleep was a just-in-case wait, but shutdown
-        # does the right thing).
+        # Give the writer thread a moment to process
+        time.sleep(1)
         q.shutdown()
         # If ingest succeeded, the row should be deleted
         client.ingest_session.assert_called_once()
     def test_enqueue_persists_to_sqlite(self, tmp_path):
         client = MagicMock()
-        # Make ingest slow so the row is still in SQLite when we peek.
-        # 0.5s is plenty — the test just needs the flush to still be in-flight.
-        client.ingest_session = MagicMock(side_effect=lambda *a, **kw: time.sleep(0.5))
+        # Make ingest hang so the row stays in SQLite
+        client.ingest_session = MagicMock(side_effect=lambda *a, **kw: time.sleep(5))
         db_path = tmp_path / "test_queue.db"
         q = _WriteQueue(client, db_path)
         q.enqueue("user1", "sess1", [{"role": "user", "content": "test"}])
@@ -181,7 +154,8 @@ class TestWriteQueue:
     def test_flush_deletes_row_on_success(self, tmp_path):
         q, client, db_path = self._make_queue(tmp_path)
         q.enqueue("user1", "sess1", [{"role": "user", "content": "hi"}])
-        q.shutdown()  # blocks until drain
+        time.sleep(1)
+        q.shutdown()
         # Row should be gone
         conn = sqlite3.connect(str(db_path))
         rows = conn.execute("SELECT COUNT(*) FROM pending").fetchone()[0]
@@ -194,20 +168,14 @@ class TestWriteQueue:
         db_path = tmp_path / "test_queue.db"
         q = _WriteQueue(client, db_path)
         q.enqueue("user1", "sess1", [{"role": "user", "content": "hi"}])
-        # Poll for the error to be recorded (max 2s), instead of a fixed 3s wait.
-        deadline = time.time() + 2.0
-        last_error = None
-        while time.time() < deadline:
-            conn = sqlite3.connect(str(db_path))
-            row = conn.execute("SELECT last_error FROM pending").fetchone()
-            conn.close()
-            if row and row[0]:
-                last_error = row[0]
-                break
-            time.sleep(0.05)
+        time.sleep(3)  # Allow retry + sleep(2) in _flush_row
         q.shutdown()
-        assert last_error is not None
-        assert "API down" in last_error
+        # Row should still exist with error recorded
+        conn = sqlite3.connect(str(db_path))
+        row = conn.execute("SELECT last_error FROM pending").fetchone()
+        conn.close()
+        assert row is not None
+        assert "API down" in row[0]
     def test_thread_local_connection_reuse(self, tmp_path):
         q, _, _ = self._make_queue(tmp_path)
@@ -225,27 +193,14 @@ class TestWriteQueue:
         client1.ingest_session = MagicMock(side_effect=RuntimeError("fail"))
         q1 = _WriteQueue(client1, db_path)
         q1.enqueue("user1", "sess1", [{"role": "user", "content": "lost turn"}])
-        # Wait until the error is recorded (poll with short interval).
-        deadline = time.time() + 2.0
-        while time.time() < deadline:
-            conn = sqlite3.connect(str(db_path))
-            row = conn.execute("SELECT last_error FROM pending").fetchone()
-            conn.close()
-            if row and row[0]:
-                break
-            time.sleep(0.05)
+        time.sleep(3)
         q1.shutdown()
         # Now create a new queue — it should replay the pending rows
         client2 = MagicMock()
         client2.ingest_session = MagicMock(return_value={"status": "ok"})
         q2 = _WriteQueue(client2, db_path)
-        # Poll for the replay to happen.
-        deadline = time.time() + 2.0
-        while time.time() < deadline:
-            if client2.ingest_session.called:
-                break
-            time.sleep(0.05)
+        time.sleep(2)
         q2.shutdown()
         # The replayed row should have been ingested via client2

View File

@@ -1,34 +0,0 @@
-"""Fast-path fixtures shared across tests/run_agent/.
-Many tests in this directory exercise the retry/backoff paths in the
-agent loop. Production code uses ``jittered_backoff(base_delay=5.0)``
-with a ``while time.time() < sleep_end`` loop — a single retry test
-spends 5+ seconds of real wall-clock time on backoff waits.
-Mocking ``jittered_backoff`` to return 0.0 collapses the while-loop
-to a no-op (``time.time() < time.time() + 0`` is false immediately),
-which handles the most common case without touching ``time.sleep``.
-We deliberately DO NOT mock ``time.sleep`` here — some tests
-(test_interrupt_propagation, test_primary_runtime_restore, etc.) use
-the real ``time.sleep`` for threading coordination or assert that it
-was called with specific values. Tests that want to additionally
-fast-path direct ``time.sleep(N)`` calls in production code should
-monkeypatch ``run_agent.time.sleep`` locally (see
-``test_anthropic_error_handling.py`` for the pattern).
-"""
-from __future__ import annotations
-import pytest
-@pytest.fixture(autouse=True)
-def _fast_retry_backoff(monkeypatch):
-    """Short-circuit retry backoff for all tests in this directory."""
-    try:
-        import run_agent
-    except ImportError:
-        return
-    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)

View File

@@ -19,24 +19,6 @@ import pytest
 from agent.context_compressor import SUMMARY_PREFIX
 from run_agent import AIAgent
 import run_agent
-# ---------------------------------------------------------------------------
-# Fast backoff for compression retry tests
-# ---------------------------------------------------------------------------
-@pytest.fixture(autouse=True)
-def _no_compression_sleep(monkeypatch):
-    """Short-circuit the 2s time.sleep between compression retries.
-    Production code has ``time.sleep(2)`` in multiple places after a 413/context
-    compression, for rate-limit smoothing. Tests assert behavior, not timing.
-    """
-    import time as _time
-    monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)
-    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)
 # ---------------------------------------------------------------------------

View File

@@ -27,39 +27,6 @@ from gateway.config import Platform
 from gateway.session import SessionSource
-# ---------------------------------------------------------------------------
-# Fast backoff for tests that exercise the retry loop
-# ---------------------------------------------------------------------------
-@pytest.fixture(autouse=True)
-def _no_backoff_wait(monkeypatch):
-    """Short-circuit retry backoff so tests don't block on real wall-clock waits.
-    The production code uses jittered_backoff() with a 5s base delay plus a
-    tight time.sleep(0.2) loop. Without this patch, each 429/500/529 retry
-    test burns ~10s of real time on CI — across six tests that's ~60s for
-    behavior we're not asserting against timing.
-    Tests assert retry counts and final results, never wait durations.
-    """
-    import asyncio as _asyncio
-    import time as _time
-    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)
-    monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)
-    # Also fast-path asyncio.sleep — the gateway's _run_agent path has
-    # several await asyncio.sleep(...) calls that add real wall-clock time.
-    _real_asyncio_sleep = _asyncio.sleep
-    async def _fast_sleep(delay=0, *args, **kwargs):
-        # Yield to the event loop but skip the actual delay.
-        await _real_asyncio_sleep(0)
-    monkeypatch.setattr(_asyncio, "sleep", _fast_sleep)
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------

View File

@@ -11,16 +11,6 @@ from unittest.mock import MagicMock, patch
 import pytest
 from run_agent import AIAgent
 import run_agent
-@pytest.fixture(autouse=True)
-def _no_fallback_wait(monkeypatch):
-    """Short-circuit time.sleep in fallback/recovery paths so tests don't
-    block on the ``min(3 + retry_count, 8)`` wait before a primary retry."""
-    import time as _time
-    monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)
-    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)
 def _make_tool_defs(*names: str) -> list:

View File

@@ -12,15 +12,6 @@ sys.modules.setdefault("fal_client", types.SimpleNamespace())
 import run_agent
-@pytest.fixture(autouse=True)
-def _no_codex_backoff(monkeypatch):
-    """Short-circuit retry backoff so Codex retry tests don't block on real
-    wall-clock waits (5s jittered_backoff base delay + tight time.sleep loop)."""
-    import time as _time
-    monkeypatch.setattr(run_agent, "jittered_backoff", lambda *a, **k: 0.0)
-    monkeypatch.setattr(_time, "sleep", lambda *_a, **_k: None)
 def _patch_agent_bootstrap(monkeypatch):
     monkeypatch.setattr(
         run_agent,