# hermes-agent/tests/agent/test_curator.py

"""Tests for agent/curator.py — orchestrator, idle gating, state transitions.

LLM spawning is never exercised here — `_run_llm_review` is monkeypatched so
tests run fully offline and the curator module doesn't need real credentials.
"""

from __future__ import annotations

import importlib
import json
from datetime import datetime, timedelta, timezone
from pathlib import Path

import pytest


@pytest.fixture
def curator_env(tmp_path, monkeypatch):
    """Sandboxed HERMES_HOME plus freshly reloaded curator/skill_usage modules.

    ``Path.home`` is redirected to ``tmp_path`` and ``HERMES_HOME`` points at
    ``tmp_path/.hermes`` (with an empty ``skills`` directory). Both project
    modules are reloaded after the environment is patched. The returned dict
    exposes ``home``, ``curator`` and ``usage``.
    """
    hermes_home = tmp_path / ".hermes"
    hermes_home.joinpath("skills").mkdir(parents=True)
    monkeypatch.setattr(Path, "home", lambda: tmp_path)
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))

    # Reload order is kept: usage first, then curator.
    import tools.skill_usage as usage_module
    importlib.reload(usage_module)
    import agent.curator as curator_module
    importlib.reload(curator_module)

    def _fake_llm_review(prompt):
        # Stand-in for the real LLM pass; individual tests may replace it.
        return "llm-stub"

    def _empty_config():
        # No config file → curator falls back to its defaults.
        return {}

    monkeypatch.setattr(curator_module, "_run_llm_review", _fake_llm_review)
    monkeypatch.setattr(curator_module, "_load_config", _empty_config)

    return {
        "home": hermes_home,
        "curator": curator_module,
        "usage": usage_module,
    }


def _write_skill(skills_dir: Path, name: str):
    """Create ``skills_dir/name/SKILL.md`` with minimal front matter.

    Parent directories are created as needed and an existing directory is
    reused. Returns the skill directory path.
    """
    skill_dir = skills_dir / name
    skill_dir.mkdir(parents=True, exist_ok=True)
    front_matter = f"---\nname: {name}\ndescription: x\n---\n"
    skill_dir.joinpath("SKILL.md").write_text(front_matter, encoding="utf-8")
    return skill_dir


# ---------------------------------------------------------------------------
# Config gates
# ---------------------------------------------------------------------------

def test_curator_enabled_default_true(curator_env):
    """With the fixture's empty config the curator reports itself enabled."""
    curator = curator_env["curator"]
    assert curator.is_enabled() is True


def test_curator_disabled_via_config(curator_env, monkeypatch):
    """``enabled: False`` turns off both ``is_enabled`` and ``should_run_now``."""
    curator = curator_env["curator"]

    def _disabled_config():
        return {"enabled": False}

    monkeypatch.setattr(curator, "_load_config", _disabled_config)

    assert curator.is_enabled() is False
    assert curator.should_run_now() is False


def test_curator_defaults(curator_env):
    """Pin the default interval, idle, stale and archive thresholds."""
    curator = curator_env["curator"]
    one_week_in_hours = 24 * 7  # 7 days

    assert curator.get_interval_hours() == one_week_in_hours
    assert curator.get_min_idle_hours() == 2
    assert curator.get_stale_after_days() == 30
    assert curator.get_archive_after_days() == 90


def test_curator_config_overrides(curator_env, monkeypatch):
    """Values supplied by ``_load_config`` replace each default threshold."""
    curator = curator_env["curator"]

    def _overridden_config():
        # A fresh dict per call, matching how the fixture stub behaves.
        return {
            "interval_hours": 12,
            "min_idle_hours": 0.5,
            "stale_after_days": 7,
            "archive_after_days": 60,
        }

    monkeypatch.setattr(curator, "_load_config", _overridden_config)

    assert curator.get_interval_hours() == 12
    assert curator.get_min_idle_hours() == 0.5
    assert curator.get_stale_after_days() == 7
    assert curator.get_archive_after_days() == 60


# ---------------------------------------------------------------------------
# should_run_now
# ---------------------------------------------------------------------------

def test_first_run_always_eligible(curator_env):
    """With no state saved yet, the curator is eligible to run."""
    curator = curator_env["curator"]
    assert curator.should_run_now() is True


def test_recent_run_blocks(curator_env):
    """A run recorded just now prevents another run from being scheduled."""
    curator = curator_env["curator"]
    just_now = datetime.now(timezone.utc).isoformat()
    curator.save_state({"last_run_at": just_now, "paused": False})

    assert curator.should_run_now() is False


def test_old_run_eligible(curator_env):
    """A last run older than the configured interval makes the curator eligible.

    The timestamp is placed two intervals in the past, so the test follows
    whatever interval is configured instead of hard-coding the default.
    """
    curator = curator_env["curator"]
    cushion = timedelta(hours=curator.get_interval_hours() * 2)
    last_run = datetime.now(timezone.utc) - cushion
    curator.save_state({"last_run_at": last_run.isoformat(), "paused": False})

    assert curator.should_run_now() is True


def test_paused_blocks_even_if_stale(curator_env):
    """The paused flag wins even when the last run was 30 days ago."""
    curator = curator_env["curator"]
    thirty_days_back = datetime.now(timezone.utc) - timedelta(days=30)
    paused_state = {
        "last_run_at": thirty_days_back.isoformat(),
        "paused": True,
    }
    curator.save_state(paused_state)

    assert curator.should_run_now() is False


def test_set_paused_roundtrip(curator_env):
    """``set_paused`` is reflected by ``is_paused`` for both True and False."""
    curator = curator_env["curator"]
    for flag in (True, False):
        curator.set_paused(flag)
        assert curator.is_paused() is flag


# ---------------------------------------------------------------------------
# Automatic state transitions
# ---------------------------------------------------------------------------

def test_unused_skill_transitions_to_stale(curator_env):
    """A skill last used 45 days ago is marked stale by the automatic pass."""
    curator = curator_env["curator"]
    usage = curator_env["usage"]
    _write_skill(curator_env["home"] / "skills", "old-skill")

    # 45 days is well past stale_after_days (30 default).
    stamp = (datetime.now(timezone.utc) - timedelta(days=45)).isoformat()
    usage_data = usage.load_usage()
    record = usage._empty_record()
    record["last_used_at"] = stamp
    record["created_at"] = stamp
    usage_data["old-skill"] = record
    usage.save_usage(usage_data)

    tally = curator.apply_automatic_transitions()

    assert tally["marked_stale"] == 1
    assert usage.get_record("old-skill")["state"] == "stale"


def test_very_old_skill_gets_archived(curator_env):
    """A skill idle for 120 days is moved under ``.archive`` and marked archived."""
    curator = curator_env["curator"]
    usage = curator_env["usage"]
    skills_root = curator_env["home"] / "skills"
    live_dir = _write_skill(skills_root, "ancient")

    stamp = (datetime.now(timezone.utc) - timedelta(days=120)).isoformat()
    usage_data = usage.load_usage()
    record = usage._empty_record()
    record["last_used_at"] = stamp
    record["created_at"] = stamp
    usage_data["ancient"] = record
    usage.save_usage(usage_data)

    tally = curator.apply_automatic_transitions()

    assert tally["archived"] == 1
    # The directory must have been relocated, not copied.
    assert not live_dir.exists()
    archived_manifest = skills_root / ".archive" / "ancient" / "SKILL.md"
    assert archived_manifest.exists()
    assert usage.get_record("ancient")["state"] == "archived"


def test_pinned_skill_is_never_touched(curator_env):
    """A pinned skill stays active even after a year without use."""
    curator = curator_env["curator"]
    usage = curator_env["usage"]
    _write_skill(curator_env["home"] / "skills", "precious")

    stamp = (datetime.now(timezone.utc) - timedelta(days=365)).isoformat()
    usage_data = usage.load_usage()
    record = usage._empty_record()
    record["last_used_at"] = stamp
    record["created_at"] = stamp
    record["pinned"] = True
    usage_data["precious"] = record
    usage.save_usage(usage_data)

    tally = curator.apply_automatic_transitions()

    assert tally["archived"] == 0
    assert tally["marked_stale"] == 0
    after = usage.get_record("precious")
    assert after["state"] == "active"  # untouched
    assert after["pinned"] is True


def test_stale_skill_reactivates_on_recent_use(curator_env):
    """A stale record with a fresh ``last_used_at`` flips back to active."""
    curator = curator_env["curator"]
    usage = curator_env["usage"]
    _write_skill(curator_env["home"] / "skills", "revived")

    stamp = datetime.now(timezone.utc).isoformat()
    usage_data = usage.load_usage()
    record = usage._empty_record()
    record["state"] = "stale"
    record["last_used_at"] = stamp
    record["created_at"] = stamp
    usage_data["revived"] = record
    usage.save_usage(usage_data)

    tally = curator.apply_automatic_transitions()

    assert tally["reactivated"] == 1
    assert usage.get_record("revived")["state"] == "active"


def test_new_skill_without_last_used_not_immediately_archived(curator_env):
    """A freshly-created skill with no use history should not get archived
    just because last_used_at is None."""
    c = curator_env["curator"]
    skills_dir = curator_env["home"] / "skills"
    _write_skill(skills_dir, "fresh")

    # No usage record is written on purpose: the skill exists on disk only.
    # The curator is expected to treat it as new (falling back to a
    # created_at of roughly now) rather than as infinitely old.
    counts = c.apply_automatic_transitions()
    assert counts["archived"] == 0
    assert counts["marked_stale"] == 0
    assert (skills_dir / "fresh").exists()


def test_bundled_skill_not_touched_by_transitions(curator_env):
    """A skill listed in ``.bundled_manifest`` is skipped by the automatic pass."""
    curator = curator_env["curator"]
    usage = curator_env["usage"]
    skills_root = curator_env["home"] / "skills"
    _write_skill(skills_root, "bundled")
    manifest = skills_root / ".bundled_manifest"
    manifest.write_text("bundled:abc\n", encoding="utf-8")

    stamp = (datetime.now(timezone.utc) - timedelta(days=500)).isoformat()
    usage_data = usage.load_usage()
    record = usage._empty_record()
    record["last_used_at"] = stamp
    usage_data["bundled"] = record
    usage.save_usage(usage_data)

    tally = curator.apply_automatic_transitions()

    # Bundled skills are excluded from the agent-created list entirely,
    # so nothing is even checked.
    assert tally["checked"] == 0
    assert (skills_root / "bundled").exists()  # never moved


# ---------------------------------------------------------------------------
# run_curator_review orchestration
# ---------------------------------------------------------------------------

def test_run_review_records_state(curator_env):
    """A synchronous review returns ``started_at`` and persists run metadata."""
    curator = curator_env["curator"]
    _write_skill(curator_env["home"] / "skills", "a")

    outcome = curator.run_curator_review(synchronous=True)
    assert "started_at" in outcome

    persisted = curator.load_state()
    assert persisted["last_run_at"] is not None
    assert persisted["run_count"] >= 1
    assert persisted["last_run_summary"] is not None


def test_run_review_synchronous_invokes_llm_stub(curator_env, monkeypatch):
    """With one candidate skill, a synchronous review calls the LLM hook
    exactly once and forwards its summary to ``on_summary``."""
    c = curator_env["curator"]
    skills_dir = curator_env["home"] / "skills"
    _write_skill(skills_dir, "a")

    calls = []

    def _stub(prompt):
        # Record the prompt so the test can inspect what was sent.
        calls.append(prompt)
        return {
            "final": "stubbed-summary",
            "summary": "stubbed-summary",
            "model": "stub-model",
            "provider": "stub-provider",
            "tool_calls": [],
            "error": None,
        }

    monkeypatch.setattr(c, "_run_llm_review", _stub)

    captured = []
    c.run_curator_review(on_summary=lambda s: captured.append(s), synchronous=True)

    assert len(calls) == 1
    # The former check was `"skill CURATOR" in p or "CURATOR" in p`; the
    # first disjunct implies the second, so this is the same condition.
    assert "CURATOR" in calls[0]
    assert captured  # on_summary was called
    assert any("stubbed-summary" in s for s in captured)


def test_run_review_skips_llm_when_no_candidates(curator_env, monkeypatch):
    """With an empty skills directory the LLM hook is never invoked."""
    curator = curator_env["curator"]
    seen_prompts = []

    def _recording_stub(prompt):
        seen_prompts.append(prompt)
        return "never-called"

    # No skills in the dir → no candidates.
    monkeypatch.setattr(curator, "_run_llm_review", _recording_stub)

    summaries = []
    curator.run_curator_review(
        on_summary=lambda s: summaries.append(s),
        synchronous=True,
    )

    assert seen_prompts == []  # LLM not invoked
    assert any("skipped" in text for text in summaries)


def test_maybe_run_curator_respects_disabled(curator_env, monkeypatch):
    """``maybe_run_curator`` returns None when the config disables the curator."""
    curator = curator_env["curator"]

    def _disabled_config():
        return {"enabled": False}

    monkeypatch.setattr(curator, "_load_config", _disabled_config)

    assert curator.maybe_run_curator() is None


def test_maybe_run_curator_enforces_idle_gate(curator_env, monkeypatch):
    """Sixty seconds of idle time is below a 2-hour minimum, so nothing runs."""
    curator = curator_env["curator"]

    def _two_hour_idle_config():
        return {"min_idle_hours": 2}

    monkeypatch.setattr(curator, "_load_config", _two_hour_idle_config)

    # Idle for less than the threshold.
    outcome = curator.maybe_run_curator(idle_for_seconds=60.0)
    assert outcome is None


def test_maybe_run_curator_runs_when_eligible(curator_env, monkeypatch):
    """With ample idle time and no prior run, a review result is returned."""
    curator = curator_env["curator"]
    _write_skill(curator_env["home"] / "skills", "a")

    # 99999 seconds of idle time clears the idle threshold.
    outcome = curator.maybe_run_curator(idle_for_seconds=99999.0)

    assert outcome is not None
    assert "started_at" in outcome


def test_maybe_run_curator_swallows_exceptions(curator_env, monkeypatch):
    """An exception raised by ``should_run_now`` yields None instead of propagating."""
    curator = curator_env["curator"]

    def _always_raises():
        raise RuntimeError("boom")

    monkeypatch.setattr(curator, "should_run_now", _always_raises)

    # Must not raise.
    outcome = curator.maybe_run_curator()
    assert outcome is None


# ---------------------------------------------------------------------------
# Persistence
# ---------------------------------------------------------------------------

def test_state_file_survives_corrupt_read(curator_env):
    """A state file containing invalid JSON loads as the default state."""
    curator = curator_env["curator"]
    state_path = curator._state_file()
    state_path.write_text("not json", encoding="utf-8")

    # Must fall back to the default, not raise.
    loaded = curator.load_state()
    assert loaded == curator._default_state()


def test_state_atomic_write_no_tmp_leftovers(curator_env):
    """Saving state leaves no ``.curator_state_*`` file next to the state file."""
    curator = curator_env["curator"]
    curator.save_state({"paused": True})

    state_dir = curator._state_file().parent
    for entry in state_dir.iterdir():
        is_leftover = entry.name.startswith(".curator_state_")
        assert not is_leftover, f"tmp leftover: {entry.name}"


def test_state_preserves_last_report_path(curator_env):
    """``last_report_path`` survives a save/load round trip."""
    curator = curator_env["curator"]
    snapshot = {
        "last_run_at": "2026-04-30T12:00:00+00:00",
        "last_run_summary": "ok",
        "last_report_path": "/tmp/curator-report",
        "paused": False,
        "run_count": 1,
    }
    curator.save_state(snapshot)

    reloaded = curator.load_state()
    assert reloaded["last_report_path"] == "/tmp/curator-report"


def test_curator_review_prompt_has_invariants():
    """The review prompt must carry its core guard-rail vocabulary."""
    from agent.curator import CURATOR_REVIEW_PROMPT

    lowered = CURATOR_REVIEW_PROMPT.lower()

    # A hard prohibition must appear in upper case, in either spelling.
    assert "MUST NOT" in CURATOR_REVIEW_PROMPT or "DO NOT" in CURATOR_REVIEW_PROMPT
    for keyword in ("bundled", "delete", "pinned"):
        assert keyword in lowered

    # The prompt must describe the actions the reviewer can take. 'keep' is
    # deliberately not required: the umbrella-first prompt drops it as a
    # first-class decision verb, since passive keep-everything is the
    # failure mode the prompt is trying to avoid. Patch and archive must
    # remain.
    for verb in ("patch", "archive"):
        assert verb in lowered

    # Consolidation may be worded as "consolidat..." or "merge".
    assert any(term in lowered for term in ("consolidat", "merge"))


def test_curator_review_prompt_points_at_existing_tools_only():
    """The prompt names the existing skill tools plus terminal, and nothing bespoke.

    It must not mention curator-specific tool names that are not registered
    model tools.
    """
    from agent.curator import CURATOR_REVIEW_PROMPT

    for tool_name in ("skill_manage", "skills_list", "skill_view"):
        assert tool_name in CURATOR_REVIEW_PROMPT
    assert "terminal" in CURATOR_REVIEW_PROMPT.lower()

    # These would be nice but aren't actually registered as tools — the
    # curator uses skill_manage + terminal mv instead.
    for bespoke_tool in ("archive_skill", "pin_skill"):
        assert bespoke_tool not in CURATOR_REVIEW_PROMPT


def test_curator_does_not_instruct_model_to_pin():
    """No prompt line may start with a ``pin `` action.

    Pinning is a user opt-out, not a model decision. The word "pinned" may
    still appear in the invariants (e.g. "skip pinned skills").
    """
    from agent.curator import CURATOR_REVIEW_PROMPT

    prompt_lines = CURATOR_REVIEW_PROMPT.split("\n")

    # Action-looking lines, collected only to make a failure readable.
    action_prefixes = ("keep", "patch", "archive", "consolidate", "pin ")
    decision_block = "\n".join(
        line for line in prompt_lines
        if line.strip().startswith(action_prefixes)
    )

    pin_lines = [
        line for line in prompt_lines if line.strip().startswith("pin ")
    ]
    assert not pin_lines, (
        f"Found a pin action line in:\n{decision_block}"
    )


def test_curator_review_prompt_is_umbrella_first():
    """The prompt must push class-level umbrella building over pairwise
    'are these two the same?' comparison."""
    from agent.curator import CURATOR_REVIEW_PROMPT

    prompt = CURATOR_REVIEW_PROMPT
    lowered = prompt.lower()

    # Active umbrella-building framing, not a passive audit.
    assert "umbrella" in lowered, (
        "must use UMBRELLA framing — the class-first abstraction the curator "
        "is designed to produce"
    )
    # The reviewer must not stop at pair-level distinctness.
    assert "class" in lowered, "must reference class-level thinking"

    # references/ is the demotion target for session-specific content.
    assert "references/" in prompt, (
        "must name references/ as a demotion target for session-specific content"
    )
    # templates/ and scripts/ make the umbrella a real class-level skill.
    for support_dir in ("templates/", "scripts/"):
        assert support_dir in prompt

    # Zero usage counters are not a reason to skip a skill.
    mentions_counters = "use_count" in prompt or "counter" in lowered
    assert mentions_counters, (
        "must pre-empt the 'usage counters are zero, I can't judge' bailout"
    )


def test_curator_review_prompt_offers_support_file_actions():
    """The prompt must offer support-file demotion and new-umbrella creation.

    Demotion into references/templates/scripts is one of the three
    consolidation methods, next to merge-into-existing and
    create-new-umbrella.
    """
    from agent.curator import CURATOR_REVIEW_PROMPT

    prompt = CURATOR_REVIEW_PROMPT

    # skill_manage action=write_file is how references/ are added to an
    # existing skill, letting narrow siblings be demoted without touching
    # their SKILL.md.
    assert "write_file" in prompt

    # A brand-new umbrella must be possible when no existing one fits.
    offers_create = (
        "action=create" in prompt
        or "create a new umbrella" in prompt.lower()
    )
    assert offers_create


def test_cli_unpin_refuses_bundled_skill(curator_env, capsys):
    """``hermes curator unpin`` must refuse bundled/hub skills, like ``pin`` does."""
    from hermes_cli import curator as cli

    skills_root = curator_env["home"] / "skills"
    _write_skill(skills_root, "ship-skill")
    manifest = skills_root / ".bundled_manifest"
    manifest.write_text("ship-skill:abc\n", encoding="utf-8")

    class _Args:
        # Minimal stand-in for the parsed CLI arguments.
        skill = "ship-skill"

    exit_code = cli._cmd_unpin(_Args())
    printed = capsys.readouterr().out.lower()

    assert exit_code == 1
    assert "bundled" in printed or "hub" in printed


def test_cli_pin_refuses_bundled_skill(curator_env, capsys):
    """``hermes curator pin`` exits 1 and names bundled/hub for a bundled skill."""
    from hermes_cli import curator as cli

    skills_root = curator_env["home"] / "skills"
    _write_skill(skills_root, "ship-skill")
    manifest = skills_root / ".bundled_manifest"
    manifest.write_text("ship-skill:abc\n", encoding="utf-8")

    class _Args:
        # Minimal stand-in for the parsed CLI arguments.
        skill = "ship-skill"

    exit_code = cli._cmd_pin(_Args())
    printed = capsys.readouterr().out.lower()

    assert exit_code == 1
    assert "bundled" in printed or "hub" in printed


# ---------------------------------------------------------------------------
# curator review-model resolution (canonical auxiliary.curator slot)
#
# Curator was unified with the rest of the aux task system in Apr 2026 so
# `hermes model` → auxiliary picker, the dashboard Models tab, and the full
# per-task config (timeout, base_url, api_key, extra_body) all work for it.
# Voscko report: curator.auxiliary.{provider,model} was advertised but never
# read. Fix wires curator through auxiliary.curator with a legacy fallback.
# ---------------------------------------------------------------------------


def test_review_model_defaults_to_main_when_slot_is_auto(curator_env):
    """An absent or auto/empty ``auxiliary.curator`` resolves to the main model."""
    resolve = curator_env["curator"]._resolve_review_model
    main_pair = ("openrouter", "openai/gpt-5.5")
    config = {
        "model": {"provider": "openrouter", "default": "openai/gpt-5.5"},
    }

    # Slot absent.
    assert resolve(config) == main_pair

    # Slot present but explicitly auto/empty — still the main model.
    config["auxiliary"] = {"curator": {"provider": "auto", "model": ""}}
    assert resolve(config) == main_pair


def test_review_model_honors_auxiliary_curator_slot(curator_env):
    """A fully populated ``auxiliary.curator`` slot overrides the main model."""
    curator = curator_env["curator"]
    slot = {"provider": "openrouter", "model": "openai/gpt-5.4-mini"}
    config = {
        "model": {"provider": "openrouter", "default": "openai/gpt-5.5"},
        "auxiliary": {"curator": slot},
    }

    resolved = curator._resolve_review_model(config)

    assert resolved == ("openrouter", "openai/gpt-5.4-mini")


def test_review_model_auxiliary_curator_partial_override_falls_back(curator_env):
    """A slot with only provider or only model set resolves to the main pair.

    This keeps a half-configured override from sending an empty side to
    resolve_runtime_provider.
    """
    curator = curator_env["curator"]
    main_section = {"provider": "openrouter", "default": "openai/gpt-5.5"}
    main_pair = ("openrouter", "openai/gpt-5.5")

    # Provider given, model left empty.
    provider_only = {
        "model": dict(main_section),
        "auxiliary": {"curator": {"provider": "openrouter", "model": ""}},
    }
    assert curator._resolve_review_model(provider_only) == main_pair

    # Model given, provider left on "auto".
    model_only = {
        "model": dict(main_section),
        "auxiliary": {"curator": {"provider": "auto", "model": "gpt-5.4-mini"}},
    }
    assert curator._resolve_review_model(model_only) == main_pair


def test_review_model_legacy_curator_auxiliary_still_works(curator_env, caplog):
    """The legacy ``curator.auxiliary.{provider,model}`` setting is still honored.

    The legacy pair is returned and a deprecation message is logged on the
    ``agent.curator`` logger.
    """
    import logging

    curator = curator_env["curator"]
    legacy_slot = {"provider": "openrouter", "model": "openai/gpt-5.4-mini"}
    config = {
        "model": {"provider": "openrouter", "default": "openai/gpt-5.5"},
        "curator": {"auxiliary": legacy_slot},
    }

    with caplog.at_level(logging.INFO, logger="agent.curator"):
        resolved = curator._resolve_review_model(config)

    assert resolved == ("openrouter", "openai/gpt-5.4-mini")
    logged_messages = [record.message for record in caplog.records]
    assert any(
        "deprecated curator.auxiliary" in message for message in logged_messages
    ), "expected deprecation warning when legacy curator.auxiliary is used"


def test_review_model_new_slot_wins_over_legacy(curator_env):
    """With both locations set, ``auxiliary.curator`` beats ``curator.auxiliary``."""
    curator = curator_env["curator"]
    canonical_slot = {"provider": "nous", "model": "new-winner"}
    legacy_slot = {"provider": "openrouter", "model": "legacy-loser"}
    config = {
        "model": {"provider": "openrouter", "default": "openai/gpt-5.5"},
        "auxiliary": {"curator": canonical_slot},
        "curator": {"auxiliary": legacy_slot},
    }

    resolved = curator._resolve_review_model(config)

    assert resolved == ("nous", "new-winner")


def test_review_model_handles_missing_sections(curator_env):
    """Configs lacking the auxiliary/curator sections resolve without raising."""
    resolve = curator_env["curator"]._resolve_review_model

    # Only a main section, and it uses the "model" key rather than "default".
    main_only = {"model": {"provider": "anthropic", "model": "claude-sonnet-4-6"}}
    assert resolve(main_only) == ("anthropic", "claude-sonnet-4-6")

    # Completely empty config → ("auto", "") — resolve_runtime_provider
    # handles the auto-detection chain from there.
    assert resolve({}) == ("auto", "")


def test_curator_slot_is_canonical_aux_task():
    """Pin ``curator`` as a first-class slot in every aux-task registry.

    There are four sources of truth. The shared registry test
    (test_aux_config.py) covers the main tasks; this test checks ``curator``
    specifically so the unification doesn't silently regress.
    """
    from hermes_cli.config import DEFAULT_CONFIG
    from hermes_cli.main import _AUX_TASKS
    from hermes_cli.web_server import _AUX_TASK_SLOTS

    # 1. DEFAULT_CONFIG.auxiliary — schema source
    aux_defaults = DEFAULT_CONFIG["auxiliary"]
    assert "curator" in aux_defaults, (
        "curator missing from DEFAULT_CONFIG['auxiliary']"
    )
    curator_slot = aux_defaults["curator"]
    assert curator_slot["provider"] == "auto"
    assert curator_slot["model"] == ""
    assert curator_slot["timeout"] > 0, (
        "curator timeout should be set (reviews run long)"
    )

    # 2. hermes_cli/main.py _AUX_TASKS — CLI picker
    picker_keys = {key for key, _name, _desc in _AUX_TASKS}
    assert "curator" in picker_keys, (
        "curator missing from _AUX_TASKS (CLI picker)"
    )

    # 3. hermes_cli/web_server.py _AUX_TASK_SLOTS — REST API allowlist
    assert "curator" in _AUX_TASK_SLOTS, (
        "curator missing from _AUX_TASK_SLOTS (dashboard REST API)"
    )

    # 4. web/src/pages/ModelsPage.tsx is checked at build time; the tsx
    #    array and this tuple share a ``Must match _AUX_TASK_SLOTS`` comment.