mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-02 00:41:43 +08:00
Three narrow fixes targeting the remaining red checks after #17828: 1. ui-tui/src/app/slash/commands/ops.ts (Docker Build): /reload-mcp's local params type annotated session_id: string while ctx.sid is string | null. Widen to string | null — matches every other rpc call site and the test harness which passes { session_id: null }. Fixes TS2322 on line 86. The rpc signature itself is Record<string, unknown>, so this is purely a local typing fix, no behavioral change. 2. tests/plugins/test_achievements_plugin.py (13 cascading test failures): _install_fake_session_db did a raw sys.modules['hermes_state'] = fake_module without restoration, leaking the fake across xdist worker boundaries. Downstream tests doing from hermes_state import SessionDB got a module whose SessionDB was lambda: fake_db — 6 test_hermes_state.py tests failed with AttributeError: 'function' object has no attribute '_sanitize_fts5_query' / _contains_cjk, and 7 test_860_dedup.py tests failed with TypeError: got unexpected keyword argument 'db_path' (real code calls SessionDB(db_path=...)). Fix: stash monkeypatch on the plugin_api module object in the fixture, and have the helper do monkeypatch.setitem(sys.modules, 'hermes_state', fake_module) for auto-restoration at test teardown. 3. tests/hermes_cli/test_web_server.py (WS race): TestPtyWebSocket::test_pub_broadcasts_to_events_subscribers hit the 30s test timeout on CI. websocket_connect returns after ws.accept() — but /api/events registers the subscriber in _event_channels on the NEXT await (inside _event_lock). A publish immediately after connect could race ahead of registration and be dropped, and the subsequent receive_text() blocked until SIGALRM killed the test. Fix: poll _event_channels after the subscriber connects, before publishing. 
Validation:
- scripts/run_tests.sh tests/plugins/test_achievements_plugin.py tests/run_agent/test_860_dedup.py tests/test_hermes_state.py tests/hermes_cli/test_web_server.py — 338 passed
- cd ui-tui && npm run type-check — clean
- cd ui-tui && npm run build — clean
Remaining red checks are pure infra (Nix ubuntu hits TwirpErrorResponse ResourceExhausted on the GH Actions cache API; Nix macos bounces between npm-build openssl-legacy failures and cache rate-limits) and cannot be fixed in this codebase.
378 lines
14 KiB
Python
378 lines
14 KiB
Python
"""Tests for the bundled hermes-achievements dashboard plugin.
|
|
|
|
These target the two behaviors that matter for official integration:
|
|
|
|
* The 200-session scan cap is removed — the plugin now walks the entire
|
|
session history by default. Lifetime badges (tens of thousands of
|
|
tool calls) were unreachable before this fix on long-running installs.
|
|
* First-ever scans run in a background thread so the dashboard request
|
|
path never blocks, even on 8000+ session databases where a cold scan
|
|
takes minutes.
|
|
|
|
The upstream repo ships its own unittest suite under
|
|
``plugins/hermes-achievements/tests/`` covering the achievement engine
|
|
internals (tier math, secret-state handling, catalog invariants). These
|
|
tests live at the hermes-agent level and focus on the integration
|
|
contract: the plugin scans ALL of your sessions, not the first 200.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
import sys
|
|
import threading
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import pytest
|
|
|
|
# Absolute path to the dashboard plugin module under test, resolved
# relative to this test file so the suite works from any working directory.
PLUGIN_MODULE_PATH = Path(__file__).resolve().parents[2].joinpath(
    "plugins", "hermes-achievements", "dashboard", "plugin_api.py"
)
|
|
|
|
|
|
@pytest.fixture
def plugin_api(tmp_path, monkeypatch):
    """Load a fresh plugin_api module with an isolated ~/.hermes.

    The plugin keeps module-level caches (``_SNAPSHOT_CACHE``,
    ``_SCAN_STATUS``, the background-thread handle), so each test loads
    its own module instance to start from a clean world.
    """
    # Redirect Path.home() so state/snapshot files land under tmp_path
    # instead of colliding in the real home directory.
    monkeypatch.setattr(Path, "home", lambda: tmp_path)

    module_name = f"plugin_api_test_{id(tmp_path)}"
    spec = importlib.util.spec_from_file_location(module_name, PLUGIN_MODULE_PATH)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # Expose the monkeypatch instance so ``_install_fake_session_db`` can
    # swap ``sys.modules['hermes_state']`` with automatic restoration at
    # teardown. A raw ``sys.modules[...] = fake`` assignment would leak
    # the fake into later tests in the same xdist worker — breaking every
    # test that does ``from hermes_state import SessionDB``.
    module._test_monkeypatch = monkeypatch
    yield module
|
|
|
|
|
|
class _FakeSessionDB:
|
|
"""Stand-in for hermes_state.SessionDB that records scan calls."""
|
|
|
|
def __init__(self, session_count: int):
|
|
self.session_count = session_count
|
|
self.last_limit: Optional[int] = None
|
|
self.last_include_children: Optional[bool] = None
|
|
self.list_calls = 0
|
|
self.messages_calls = 0
|
|
|
|
def list_sessions_rich(
|
|
self,
|
|
source: Optional[str] = None,
|
|
exclude_sources: Optional[List[str]] = None,
|
|
limit: int = 20,
|
|
offset: int = 0,
|
|
include_children: bool = False,
|
|
project_compression_tips: bool = True,
|
|
) -> List[Dict[str, Any]]:
|
|
self.last_limit = limit
|
|
self.last_include_children = include_children
|
|
self.list_calls += 1
|
|
# SQLite semantics: LIMIT -1 = unlimited. Honor that here.
|
|
effective = self.session_count if limit == -1 else min(self.session_count, limit)
|
|
now = int(time.time())
|
|
return [
|
|
{
|
|
"id": f"sess-{i}",
|
|
"title": f"Session {i}",
|
|
"preview": f"preview {i}",
|
|
"started_at": now - (self.session_count - i) * 60,
|
|
"last_active": now - (self.session_count - i) * 60 + 30,
|
|
"source": "cli",
|
|
"model": "test-model",
|
|
}
|
|
for i in range(effective)
|
|
]
|
|
|
|
def get_messages(self, session_id: str) -> List[Dict[str, Any]]:
|
|
self.messages_calls += 1
|
|
return [
|
|
{"role": "user", "content": f"ask {session_id}"},
|
|
{
|
|
"role": "assistant",
|
|
"tool_calls": [{"function": {"name": "terminal"}}],
|
|
},
|
|
{"role": "tool", "tool_name": "terminal", "content": "ok"},
|
|
]
|
|
|
|
def close(self) -> None:
|
|
pass
|
|
|
|
|
|
def _install_fake_session_db(plugin_api, fake_db):
|
|
"""Inject a fake SessionDB so ``scan_sessions`` finds it via its local import.
|
|
|
|
Uses the monkeypatch stashed on ``plugin_api`` by the fixture, so the
|
|
``sys.modules['hermes_state']`` swap is auto-restored at test teardown
|
|
and cannot leak into unrelated tests in the same xdist worker.
|
|
"""
|
|
fake_module = type(sys)("hermes_state")
|
|
fake_module.SessionDB = lambda: fake_db
|
|
plugin_api._test_monkeypatch.setitem(sys.modules, "hermes_state", fake_module)
|
|
|
|
|
|
def test_scan_sessions_default_scans_all_history_not_first_200(plugin_api):
    """Bug regression: ``scan_sessions()`` used to cap at limit=200.

    A user with 8000+ sessions would only see ~2% of their history in
    achievement totals, making lifetime badges unreachable. The default
    now passes ``LIMIT -1`` (SQLite "unlimited") to ``list_sessions_rich``.
    """
    db = _FakeSessionDB(session_count=500)  # larger than the old 200 cap
    _install_fake_session_db(plugin_api, db)

    scan = plugin_api.scan_sessions()

    assert db.last_limit == -1, (
        "scan_sessions() must pass LIMIT=-1 (unlimited) to list_sessions_rich "
        f"by default, got {db.last_limit}"
    )
    assert db.last_include_children is True, (
        "scan_sessions() must include subagent/compression child sessions so "
        "tool calls made in delegated agents still count toward achievements"
    )
    assert len(scan["sessions"]) == 500
    assert scan["scan_meta"]["sessions_total"] == 500
|
|
|
|
|
|
def test_scan_sessions_explicit_positive_limit_is_honored(plugin_api):
    """Callers can still pass a small limit for smoke tests."""
    db = _FakeSessionDB(session_count=500)
    _install_fake_session_db(plugin_api, db)

    scan = plugin_api.scan_sessions(limit=10)

    # The explicit cap is forwarded verbatim and bounds the result set.
    assert db.last_limit == 10
    assert len(scan["sessions"]) == 10
|
|
|
|
|
|
def test_scan_sessions_zero_or_negative_limit_means_unlimited(plugin_api):
    """``limit=0`` and ``limit=-1`` both map to the unlimited path."""
    db = _FakeSessionDB(session_count=300)
    _install_fake_session_db(plugin_api, db)

    # Both spellings of "no limit" must reach the DB as LIMIT -1.
    for requested in (0, -1):
        plugin_api.scan_sessions(limit=requested)
        assert db.last_limit == -1
|
|
|
|
|
|
def test_evaluate_all_first_run_returns_pending_and_starts_background_scan(plugin_api):
    """First-ever evaluate_all with no cache returns a pending placeholder
    immediately and kicks off a background scan thread. Cold scans on
    large DBs take minutes — blocking the dashboard request path is not
    acceptable.
    """
    db = _FakeSessionDB(session_count=50)
    _install_fake_session_db(plugin_api, db)

    # Gate _run_scan_and_update_cache behind events so the test controls
    # when the simulated slow cold scan starts and finishes.
    started = threading.Event()
    may_finish = threading.Event()
    real_run = plugin_api._run_scan_and_update_cache

    def slow_run(*args, **kwargs):
        started.set()
        may_finish.wait(timeout=5)
        real_run(*args, **kwargs)

    plugin_api._run_scan_and_update_cache = slow_run

    start = time.time()
    first = plugin_api.evaluate_all()
    took = time.time() - start

    # Immediate return — must not block waiting for the scan.
    assert took < 1.0, f"evaluate_all blocked for {took:.2f}s on first run"
    assert first["scan_meta"]["mode"] == "pending"
    assert first["unlocked_count"] == 0
    # Catalog still rendered so the UI has something to draw.
    assert first["total_count"] >= 60

    # The background scan thread actually started.
    assert started.wait(timeout=2), "background scan did not start"

    # Release the scan, then wait for the worker thread to drain.
    may_finish.set()
    worker = plugin_api._BACKGROUND_SCAN_THREAD
    assert worker is not None
    worker.join(timeout=5)
    assert not worker.is_alive()

    # A second call now serves real data.
    second = plugin_api.evaluate_all()
    assert second["scan_meta"]["mode"] != "pending"
    assert second["scan_meta"].get("sessions_total") == 50
|
|
|
|
|
|
def test_evaluate_all_stale_cache_serves_stale_and_refreshes_in_background(plugin_api):
    """When the snapshot is on-disk but older than TTL, evaluate_all returns
    the stale data immediately and kicks a background refresh. Users don't
    stare at a loading spinner every time TTL expires.
    """
    db = _FakeSessionDB(session_count=10)
    _install_fake_session_db(plugin_api, db)

    # Write a snapshot stamped one minute past the TTL.
    expired_at = int(time.time()) - plugin_api.SNAPSHOT_TTL_SECONDS - 60
    snapshot = {
        "achievements": [],
        "sessions": [],
        "aggregate": {},
        "scan_meta": {"mode": "full", "sessions_total": 1, "sessions_rescanned": 1, "sessions_reused": 0},
        "error": None,
        "unlocked_count": 0,
        "discovered_count": 0,
        "secret_count": 0,
        "total_count": 0,
        "generated_at": expired_at,
    }
    plugin_api.save_snapshot(snapshot)

    start = time.time()
    served = plugin_api.evaluate_all()
    took = time.time() - start

    # Served immediately, and it is the stale payload we seeded.
    assert took < 1.0, f"evaluate_all blocked for {took:.2f}s serving stale data"
    assert served["generated_at"] == expired_at

    # A background refresh should be running (or already finished).
    refresher = plugin_api._BACKGROUND_SCAN_THREAD
    assert refresher is not None
    refresher.join(timeout=5)

    refreshed = plugin_api.evaluate_all()
    assert refreshed["generated_at"] >= expired_at
|
|
|
|
|
|
def test_evaluate_all_force_runs_synchronously(plugin_api):
    """Manual /rescan (force=True) blocks the caller — users clicking
    the rescan button expect up-to-date data when the call returns.
    """
    db = _FakeSessionDB(session_count=25)
    _install_fake_session_db(plugin_api, db)

    snapshot = plugin_api.evaluate_all(force=True)

    # Synchronous — all sessions are reflected by the time we return.
    assert snapshot["scan_meta"].get("sessions_total") == 25
    assert snapshot["scan_meta"]["mode"] in ("full", "incremental")
|
|
|
|
|
|
def test_start_background_scan_is_idempotent_while_running(plugin_api):
    """Multiple concurrent dashboard requests must not spawn duplicate scans."""
    db = _FakeSessionDB(session_count=5)
    _install_fake_session_db(plugin_api, db)

    # Hold the scan open so the thread stays alive across repeated starts.
    gate = threading.Event()
    real_run = plugin_api._run_scan_and_update_cache

    def held_run(*args, **kwargs):
        gate.wait(timeout=5)
        real_run(*args, **kwargs)

    plugin_api._run_scan_and_update_cache = held_run

    plugin_api._start_background_scan()
    worker = plugin_api._BACKGROUND_SCAN_THREAD
    assert worker is not None and worker.is_alive()

    # Repeated starts while a scan is in flight must reuse the same thread.
    plugin_api._start_background_scan()
    plugin_api._start_background_scan()
    assert plugin_api._BACKGROUND_SCAN_THREAD is worker

    gate.set()
    worker.join(timeout=5)
|
|
|
|
|
|
def test_background_scan_publishes_partial_snapshots(plugin_api):
    """The background scanner publishes intermediate snapshots to the cache
    every ~N sessions. Each dashboard refresh during a long cold scan sees
    more badges unlocked instead of staring at zeros for minutes and then
    having everything pop at the end.
    """
    db = _FakeSessionDB(session_count=750)
    _install_fake_session_db(plugin_api, db)

    # Capture every partial snapshot the scanner publishes.
    partials: List[Dict[str, Any]] = []
    real_compute = plugin_api._compute_from_scan

    def spying_compute(scan, *, is_partial=False):
        snapshot = real_compute(scan, is_partial=is_partial)
        if is_partial:
            partials.append(snapshot)
        return snapshot

    plugin_api._compute_from_scan = spying_compute

    # 750 sessions with progress_every=250 → intermediate publications at
    # 250 and 500; the final 750 call takes the finished, non-partial path.
    plugin_api._run_scan_and_update_cache(publish_partial_snapshots=True)

    assert len(partials) >= 2, (
        f"expected at least 2 partial publications on a 750-session scan with "
        f"progress_every=250, got {len(partials)}"
    )
    # Session counts in the partials must grow monotonically...
    counts = [snap["scan_meta"].get("sessions_scanned_so_far") for snap in partials]
    assert counts == sorted(counts), f"partial session counts not monotonic: {counts}"
    # ...and stay below the final total.
    assert counts[0] < 750 and counts[-1] < 750, (
        f"partial counts should be less than the final total; got {counts}"
    )
    # Every partial carries the expected end-state total so the UI can
    # render an accurate progress bar.
    for snap in partials:
        assert snap["scan_meta"].get("sessions_expected_total") == 750

    # The cached snapshot ends up as the real (non-partial) result.
    final = plugin_api._SNAPSHOT_CACHE
    assert final is not None
    assert final["scan_meta"].get("mode") != "in_progress"
    assert final["scan_meta"].get("sessions_total") == 750
|
|
|
|
|
|
def test_partial_snapshots_do_not_persist_unlock_timestamps(plugin_api):
    """Intermediate snapshots must not write to state.json — an unlock
    that appears at 30% scan progress could disappear when a later session
    rebalances the aggregate. Only the final snapshot records ``unlocked_at``.
    """
    db = _FakeSessionDB(session_count=10)
    _install_fake_session_db(plugin_api, db)

    # Start from empty persisted state, then feed a partial scan whose
    # aggregate would trivially unlock several achievements.
    plugin_api.save_state({"unlocks": {}})
    in_progress_scan = {
        "sessions": [{"session_id": "x", "tool_call_count": 99999, "tool_names": set()}],
        "aggregate": {"max_tool_calls_in_session": 99999, "total_tool_calls": 99999},
        "scan_meta": {"mode": "in_progress"},
    }
    snapshot = plugin_api._compute_from_scan(in_progress_scan, is_partial=True)

    # The in-memory result reports unlocks for this aggregate...
    assert any(entry["unlocked"] for entry in snapshot["achievements"])

    # ...but state.json on disk stays empty (no timestamps were recorded).
    persisted = plugin_api.load_state()
    assert persisted.get("unlocks", {}) == {}, (
        "partial scans must not record unlock timestamps — a later session "
        "could change whether the badge deserves to be unlocked yet"
    )
|