Files
hermes-agent/tests/agent/test_curator_classification.py

551 lines
20 KiB
Python
Raw Normal View History

feat(curator): split archived into consolidated vs pruned with model + heuristic classification (#17941) * fix(curator): split 'archived' into consolidated vs pruned in run reports Users who watched a curator run saw skills like 'anthropic-api' listed under 'Skills archived' and interpreted that as pruning — but the curator had actually absorbed those skills into a new umbrella (e.g. 'llm-providers') during the same run. The directory gets archived for safety (all removals are recoverable), but the content still lives under a different name. Users then 'restored' what they thought were deleted skills and ended up with confusingly duplicated skillsets (old-name + absorbed-inside-umbrella). Classify removed skills using this run's skill_manage tool calls: - consolidated: content absorbed into a surviving/newly-created skill (evidenced by a skill_manage write_file/patch/create/edit whose target is a different skill AND whose file_path/content references the removed skill's name) - pruned: archived without consolidation evidence (truly stale) REPORT.md now shows two distinct sections: - 'Consolidated into umbrella skills' — with `removed → merged into umbrella` - 'Pruned — archived for staleness' — pure staleness archives run.json schema additions (backward compatible): - counts.consolidated_this_run, counts.pruned_this_run - consolidated: [{name, into, evidence}, ...] - pruned: [names] - archived: retained as the union for backward compat Also: relabel the auto-transitions 'archived' counter to 'archived (no LLM, pure time-based staleness)' so it's clearly distinct from LLM-pass archives. Tests: 9 new tests in test_curator_classification.py covering consolidation evidence parsing (write_file/patch/create), hyphen/underscore name variants, self-reference rejection, destination-must-exist, mixed runs, and malformed-JSON fallback safety. Existing test_report_md_is_human_readable updated to cover the new section names. 
E2E: isolated HERMES_HOME, realistic 3-skill run, REPORT.md verified end-to-end. * feat(curator): hybrid model-declared + heuristic classification Extend the consolidated-vs-pruned split with LLM-authored intent: 1. Curator prompt now requires a structured YAML block at the end of the final response (consolidations / prunings with short rationale). 2. _parse_structured_summary() extracts it tolerantly — missing block, malformed YAML, partial lists all fall back to heuristic cleanly. 3. _reconcile_classification() merges model intent with the tool-call heuristic: - Model wins on rationale when its umbrella exists post-run - Model hallucination (umbrella doesn't exist) is downgraded to the heuristic's finding, or pruned if there's no evidence either - Heuristic catches model omission — consolidations the model enumerated tools for but forgot to list get surfaced with a '(detected via tool-call audit)' tag 4. REPORT.md now shows per-row rationale alongside 'removed → umbrella' and flags audit-only rows so the user knows why no reason is shown. Backward compat: run.json's 'archived' field (union) is preserved. 'pruned' is now a list of dicts with {name, source, reason}; 'pruned_names' is the flat-name list for legacy consumers. Tests: 15 new covering YAML parse edge cases (malformed, empty lists, bare-string entries, missing fields), reconciler rules (model wins, hallucination fallback, heuristic catches omission, prune with reason), and an end-to-end report-render test with all four paths exercised.
2026-04-30 10:31:23 -07:00
"""Tests for the curator consolidated-vs-pruned classifier.
The classifier splits skills that disappeared between the before/after
snapshots into two buckets:
- "consolidated" absorbed into an umbrella; content still lives
under another skill's files
- "pruned" archived for staleness; content not preserved elsewhere
Without the split the report lumped everything under "Skills archived",
which misled users into thinking consolidated skills had been pruned.
"""
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
import pytest
@pytest.fixture
def curator_env(tmp_path, monkeypatch):
    """Yield a freshly reloaded ``agent.curator`` module rooted in an isolated HERMES_HOME.

    Creates a throwaway ``<tmp>/.hermes`` layout (with ``skills/`` and
    ``logs/`` subdirectories), points both the ``HERMES_HOME`` env var and
    ``Path.home()`` at the temp tree, then reloads the modules so they
    re-resolve their paths against the sandbox instead of the real home.
    """
    home = tmp_path / ".hermes"
    home.mkdir()
    (home / "skills").mkdir()
    (home / "logs").mkdir()
    monkeypatch.setenv("HERMES_HOME", str(home))
    # Patch Path.home() too, in case path resolution bypasses the env var.
    monkeypatch.setattr(Path, "home", lambda: tmp_path)
    import importlib
    import hermes_constants
    # Reload order matters: hermes_constants must re-resolve its paths first
    # so that the subsequent curator reload binds against the patched values.
    importlib.reload(hermes_constants)
    from agent import curator
    importlib.reload(curator)
    yield curator
def test_classify_consolidated_via_write_file_evidence(curator_env):
    """A skill_manage write_file targeting umbrella references/<removed>.md means consolidation."""
    write_args = {
        "action": "write_file",
        "name": "training-platforms",
        "file_path": "references/axolotl-training.md",
        "file_content": "# Axolotl\n...",
    }
    result = curator_env._classify_removed_skills(
        removed=["axolotl-training"],
        added=[],
        after_names={"training-platforms", "keeper"},
        tool_calls=[{"name": "skill_manage", "arguments": json.dumps(write_args)}],
    )
    consolidated = result["consolidated"]
    assert len(consolidated) == 1
    entry = consolidated[0]
    assert entry["name"] == "axolotl-training"
    assert entry["into"] == "training-platforms"
    assert result["pruned"] == []
def test_classify_pruned_when_no_destination_reference(curator_env):
    """A removed skill that no tool call references lands in the pruned bucket."""
    calls = [
        {"name": "skills_list", "arguments": "{}"},
        {
            "name": "skill_manage",
            "arguments": json.dumps({
                "action": "patch", "name": "keeper",
                "old_string": "foo", "new_string": "bar",
            }),
        },
    ]
    result = curator_env._classify_removed_skills(
        removed=["old-stale-thing"],
        added=[],
        after_names={"keeper"},
        tool_calls=calls,
    )
    assert result["consolidated"] == []
    pruned = result["pruned"]
    assert len(pruned) == 1
    assert pruned[0]["name"] == "old-stale-thing"
def test_classify_consolidated_into_newly_created_umbrella(curator_env):
    """Absorption into a skill that was created during this same run still counts."""
    create_args = {
        "action": "create",
        "name": "llm-providers",
        "content": "# LLM Providers\n\n## anthropic-api\nMerged from the old anthropic-api skill.\n",
    }
    result = curator_env._classify_removed_skills(
        removed=["anthropic-api"],
        added=["llm-providers"],  # the umbrella was born this run
        after_names={"llm-providers"},
        tool_calls=[{"name": "skill_manage", "arguments": json.dumps(create_args)}],
    )
    consolidated = result["consolidated"]
    assert len(consolidated) == 1
    assert consolidated[0]["name"] == "anthropic-api"
    assert consolidated[0]["into"] == "llm-providers"
def test_classify_handles_underscore_hyphen_variants(curator_env):
    """Hyphenated removed names must match underscore spellings in paths/content, and vice versa."""
    underscore_write = json.dumps({
        "action": "write_file",
        "name": "webui",
        "file_path": "references/open_webui_setup.md",
        "file_content": "...",
    })
    result = curator_env._classify_removed_skills(
        removed=["open-webui-setup"],
        added=[],
        after_names={"webui"},
        tool_calls=[{"name": "skill_manage", "arguments": underscore_write}],
    )
    consolidated = result["consolidated"]
    assert len(consolidated) == 1
    assert consolidated[0]["into"] == "webui"
def test_classify_self_reference_does_not_count(curator_env):
    """Edits targeting the removed skill itself are not consolidation evidence."""
    # Scenario: the curator patched the skill earlier in the run, then archived it.
    self_patch = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "patch",
            "name": "doomed",  # same as removed
            "old_string": "x",
            "new_string": "y",
        }),
    }
    result = curator_env._classify_removed_skills(
        removed=["doomed"],
        added=[],
        after_names={"keeper"},
        tool_calls=[self_patch],
    )
    assert result["consolidated"] == []
    assert result["pruned"][0]["name"] == "doomed"
def test_classify_destination_must_exist_after_run(curator_env):
    """A reference whose target skill is absent from after_names cannot be the umbrella."""
    ghost_write = json.dumps({
        "action": "write_file",
        "name": "ghost",  # not in after_names
        "file_path": "references/thing.md",
        "file_content": "...",
    })
    result = curator_env._classify_removed_skills(
        removed=["thing"],
        added=[],
        after_names={"keeper"},  # "ghost" not in here
        tool_calls=[{"name": "skill_manage", "arguments": ghost_write}],
    )
    assert result["consolidated"] == []
    assert result["pruned"][0]["name"] == "thing"
def test_classify_mixed_run_produces_both_buckets(curator_env):
    """One removed skill is consolidated and another pruned within the same run."""
    absorb_write = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "write_file",
            "name": "umbrella",
            "file_path": "references/absorbed-skill.md",
            "file_content": "...",
        }),
    }
    result = curator_env._classify_removed_skills(
        removed=["absorbed-skill", "dead-skill"],
        added=["umbrella"],
        after_names={"umbrella", "keeper"},
        tool_calls=[absorb_write],
    )
    consolidated = result["consolidated"]
    pruned = result["pruned"]
    assert len(consolidated) == 1
    assert consolidated[0]["name"] == "absorbed-skill"
    assert consolidated[0]["into"] == "umbrella"
    assert len(pruned) == 1
    assert pruned[0]["name"] == "dead-skill"
def test_classify_handles_malformed_arguments_string(curator_env):
    """Truncated, non-JSON arguments must fail safe toward the pruned bucket."""
    # Argument payloads truncated to a length cap stop being valid JSON.
    truncated_raw = (
        '{"action":"write_file","name":"umbrella","file_path":"references/'
        'absorbed-skill.md","file_content":"long content that was cut off mid'
    )
    result = curator_env._classify_removed_skills(
        removed=["absorbed-skill"],
        added=[],
        after_names={"umbrella"},
        tool_calls=[{"name": "skill_manage", "arguments": truncated_raw}],
    )
    # The raw-substring fallback does spot "absorbed-skill" inside the broken
    # string, but with no parseable dict there is no args.get("name") to
    # identify a target umbrella — and the classifier only promotes to
    # "consolidated" once it knows the target. We assert the safe floor here:
    # it is better to mislabel as pruned than to hallucinate an umbrella
    # that was never actually written to.
    assert result["consolidated"] == []
    assert len(result["pruned"]) == 1
def test_report_md_splits_consolidated_and_pruned_sections(curator_env):
    """Full pipeline: run.json and REPORT.md carry two distinct buckets."""
    started = datetime.now(timezone.utc)
    before = [
        {"name": "absorbed-skill", "state": "active", "pinned": False},
        {"name": "dead-skill", "state": "stale", "pinned": False},
        {"name": "keeper", "state": "active", "pinned": False},
    ]
    after = [
        {"name": "keeper", "state": "active", "pinned": False},
        {"name": "umbrella", "state": "active", "pinned": False},
    ]
    create_call = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "create",
            "name": "umbrella",
            "content": "# umbrella\n\nAbsorbed absorbed-skill.",
        }),
    }
    run_dir = curator_env._write_run_report(
        started_at=started,
        elapsed_seconds=60.0,
        auto_counts={"checked": 3, "marked_stale": 0, "archived": 0, "reactivated": 0},
        auto_summary="no auto changes",
        before_report=before,
        before_names={entry["name"] for entry in before},
        after_report=after,
        llm_meta={
            "final": "Consolidated absorbed-skill into umbrella. Pruned dead-skill.",
            "summary": "1 consolidated, 1 pruned",
            "model": "m",
            "provider": "p",
            "error": None,
            "tool_calls": [create_call],
        },
    )
    payload = json.loads((run_dir / "run.json").read_text())
    # Both lists exist and are disjoint.
    consolidated_names = {entry["name"] for entry in payload["consolidated"]}
    assert consolidated_names == {"absorbed-skill"}
    # "pruned" carries full {name, source, reason} dicts; "pruned_names" is
    # the flat list kept for quick scans / legacy consumers.
    pruned_names = payload["pruned_names"]
    assert pruned_names == ["dead-skill"]
    assert all(isinstance(entry, dict) and "name" in entry for entry in payload["pruned"])
    # Legacy "archived" remains the union of both buckets.
    assert set(payload["archived"]) == consolidated_names | set(pruned_names)
    # Per-run counts are exposed.
    assert payload["counts"]["consolidated_this_run"] == 1
    assert payload["counts"]["pruned_this_run"] == 1
    md = (run_dir / "REPORT.md").read_text()
    # Two separate sections, not a single "Skills archived" lump.
    assert "Consolidated into umbrella skills" in md
    assert "Pruned — archived for staleness" in md
    assert "`absorbed-skill` → merged into `umbrella`" in md
    assert "`dead-skill`" in md
    # The pre-split single-lump section must be gone.
    assert "### Skills archived" not in md
# ---------------------------------------------------------------------------
# _parse_structured_summary — extracting the model's required YAML block
# ---------------------------------------------------------------------------
def test_parse_structured_summary_happy_path(curator_env):
    """A well-formed YAML block yields both lists fully populated."""
    text = "\n".join([
        "Long human summary here. I processed clusters X, Y, Z.",
        "",
        "## Structured summary (required)",
        "```yaml",
        "consolidations:",
        " - from: anthropic-api",
        "   into: llm-providers",
        "   reason: duplicate of the generic llm-providers skill",
        " - from: openai-api",
        "   into: llm-providers",
        "   reason: same — merged with sibling",
        "prunings:",
        " - name: random-old-notes",
        "   reason: pre-curator garbage, no overlap",
        "```",
    ]) + "\n"
    parsed = curator_env._parse_structured_summary(text)
    assert len(parsed["consolidations"]) == 2
    assert parsed["consolidations"][0] == {
        "from": "anthropic-api",
        "into": "llm-providers",
        "reason": "duplicate of the generic llm-providers skill",
    }
    assert len(parsed["prunings"]) == 1
    assert parsed["prunings"][0]["reason"] == "pre-curator garbage, no overlap"
def test_parse_structured_summary_missing_block(curator_env):
    """Plain prose with no YAML block parses to empty buckets."""
    parsed = curator_env._parse_structured_summary("No block in this text.")
    assert parsed == {"consolidations": [], "prunings": []}
def test_parse_structured_summary_malformed_yaml(curator_env):
    """Unparseable YAML inside the fence degrades cleanly to empty buckets."""
    broken = "```yaml\nthis: is\n not: [valid yaml\n```"
    assert curator_env._parse_structured_summary(broken) == {
        "consolidations": [],
        "prunings": [],
    }
def test_parse_structured_summary_empty_lists(curator_env):
    """Explicitly empty YAML lists round-trip to empty buckets."""
    block = "```yaml\nconsolidations: []\nprunings: []\n```"
    assert curator_env._parse_structured_summary(block) == {
        "consolidations": [],
        "prunings": [],
    }
def test_parse_structured_summary_ignores_bare_strings(curator_env):
    """Non-dict list entries (e.g. a model wrote bare names) are skipped, not crashed on."""
    text = "\n".join([
        "```yaml",
        "consolidations:",
        " - just-a-bare-string",
        " - from: real-entry",
        "   into: umbrella",
        "   reason: valid",
        "prunings: []",
        "```",
    ])
    parsed = curator_env._parse_structured_summary(text)
    assert len(parsed["consolidations"]) == 1
    assert parsed["consolidations"][0]["from"] == "real-entry"
def test_parse_structured_summary_missing_required_fields(curator_env):
    """Consolidation entries lacking either from or into are dropped."""
    text = "\n".join([
        "```yaml",
        "consolidations:",
        " - from: only-from",
        "   reason: no into",
        " - into: only-into",
        " - from: good",
        "   into: umbrella",
        "prunings: []",
        "```",
    ])
    parsed = curator_env._parse_structured_summary(text)
    assert len(parsed["consolidations"]) == 1
    assert parsed["consolidations"][0]["from"] == "good"
# ---------------------------------------------------------------------------
# _reconcile_classification — merging model block with heuristic
# ---------------------------------------------------------------------------
def test_reconcile_model_wins_when_umbrella_exists(curator_env):
    """When the claimed umbrella survives the run, the model's account (and reason) wins."""
    model_block = {
        "consolidations": [{
            "from": "anthropic-api",
            "into": "llm-providers",
            "reason": "duplicate",
        }],
        "prunings": [],
    }
    out = curator_env._reconcile_classification(
        removed=["anthropic-api"],
        heuristic={"consolidated": [], "pruned": [{"name": "anthropic-api"}]},
        model_block=model_block,
        destinations={"llm-providers"},
    )
    assert out["pruned"] == []
    assert len(out["consolidated"]) == 1
    row = out["consolidated"][0]
    assert row["name"] == "anthropic-api"
    assert row["into"] == "llm-providers"
    assert row["reason"] == "duplicate"
    assert row["source"] == "model"
def test_reconcile_model_hallucinates_umbrella(curator_env):
    """A made-up umbrella is discarded; the heuristic's real target is preferred."""
    heuristic = {
        "consolidated": [{"name": "thing", "into": "real-umbrella", "evidence": "..."}],
        "pruned": [],
    }
    model_block = {
        "consolidations": [{
            "from": "thing",
            "into": "nonexistent-umbrella",
            "reason": "confused",
        }],
        "prunings": [],
    }
    out = curator_env._reconcile_classification(
        removed=["thing"],
        heuristic=heuristic,
        model_block=model_block,
        destinations={"real-umbrella"},
    )
    assert len(out["consolidated"]) == 1
    row = out["consolidated"][0]
    assert row["into"] == "real-umbrella"
    assert "tool-call audit" in row["source"]
    assert row["model_claimed_into"] == "nonexistent-umbrella"
def test_reconcile_model_hallucinates_with_no_heuristic_evidence(curator_env):
    """A bad umbrella claim plus zero audit evidence demotes the skill to pruned."""
    model_block = {
        "consolidations": [{
            "from": "ghost",
            "into": "nonexistent",
            "reason": "wrong",
        }],
        "prunings": [],
    }
    out = curator_env._reconcile_classification(
        removed=["ghost"],
        heuristic={"consolidated": [], "pruned": [{"name": "ghost"}]},
        model_block=model_block,
        destinations={"real-umbrella"},
    )
    assert out["consolidated"] == []
    pruned = out["pruned"]
    assert len(pruned) == 1
    assert "fallback" in pruned[0]["source"]
def test_reconcile_heuristic_catches_model_omission(curator_env):
    """Tool-call audit evidence surfaces a consolidation the model failed to declare."""
    heuristic = {
        "consolidated": [{
            "name": "forgotten",
            "into": "umbrella",
            "evidence": "write_file on umbrella referenced forgotten.md",
        }],
        "pruned": [],
    }
    out = curator_env._reconcile_classification(
        removed=["forgotten"],
        heuristic=heuristic,
        model_block={"consolidations": [], "prunings": []},
        destinations={"umbrella"},
    )
    assert len(out["consolidated"]) == 1
    row = out["consolidated"][0]
    assert row["into"] == "umbrella"
    assert "model omitted" in row["source"]
def test_reconcile_model_prunes_with_reason(curator_env):
    """When model and heuristic agree on pruning, the model's rationale is surfaced."""
    out = curator_env._reconcile_classification(
        removed=["stale-skill"],
        heuristic={"consolidated": [], "pruned": [{"name": "stale-skill"}]},
        model_block={
            "consolidations": [],
            "prunings": [{"name": "stale-skill", "reason": "superseded by bundled skill"}],
        },
        destinations=set(),
    )
    pruned = out["pruned"]
    assert len(pruned) == 1
    assert pruned[0]["reason"] == "superseded by bundled skill"
    assert pruned[0]["source"] == "model"
def test_reconcile_model_block_visible_in_full_report(curator_env):
    """End-to-end: LLM final response with the YAML block → reasons in REPORT.md.

    Uses the module-level ``json`` / ``datetime`` / ``timezone`` imports; the
    previous function-local ``import json as _json`` and
    ``from datetime import datetime as _dt, timezone as _tz`` merely shadowed
    them with aliases and have been removed.
    """
    start = datetime.now(timezone.utc)
    before = [
        {"name": "anthropic-api", "state": "active", "pinned": False},
        {"name": "stale-thing", "state": "stale", "pinned": False},
    ]
    after = [{"name": "llm-providers", "state": "active", "pinned": False}]
    # Final response carries the required structured-summary YAML block.
    llm_final_text = (
        "Processed 3 clusters. Absorbed anthropic-api into llm-providers.\n\n"
        "## Structured summary (required)\n"
        "```yaml\n"
        "consolidations:\n"
        " - from: anthropic-api\n"
        "   into: llm-providers\n"
        "   reason: duplicate content, now a subsection\n"
        "prunings:\n"
        " - name: stale-thing\n"
        "   reason: pre-curator junk, no overlap with anything\n"
        "```\n"
    )
    run_dir = curator_env._write_run_report(
        started_at=start,
        elapsed_seconds=30.0,
        auto_counts={"checked": 2, "marked_stale": 0, "archived": 0, "reactivated": 0},
        auto_summary="none",
        before_report=before,
        before_names={r["name"] for r in before},
        after_report=after,
        llm_meta={
            "final": llm_final_text,
            "summary": "1 consolidated, 1 pruned",
            "model": "m",
            "provider": "p",
            "error": None,
            # Tool-call evidence that corroborates the model's consolidation claim.
            "tool_calls": [
                {"name": "skill_manage", "arguments": json.dumps({
                    "action": "create",
                    "name": "llm-providers",
                    "content": "# llm-providers\nIncludes anthropic-api",
                })},
            ],
        },
    )
    payload = json.loads((run_dir / "run.json").read_text())
    cons = payload["consolidated"][0]
    assert cons["name"] == "anthropic-api"
    assert cons["into"] == "llm-providers"
    assert cons["reason"] == "duplicate content, now a subsection"
    assert cons["source"] == "model+audit"  # model AND heuristic both had it
    pruned = payload["pruned"][0]
    assert pruned["name"] == "stale-thing"
    assert pruned["reason"] == "pre-curator junk, no overlap with anything"
    # The reasons flow all the way through to the human-readable report.
    md = (run_dir / "REPORT.md").read_text()
    assert "duplicate content, now a subsection" in md
    assert "pre-curator junk" in md