Files
hermes-agent/tests/agent/test_curator_classification.py

814 lines
30 KiB
Python
Raw Normal View History

feat(curator): split archived into consolidated vs pruned with model + heuristic classification (#17941) * fix(curator): split 'archived' into consolidated vs pruned in run reports Users who watched a curator run saw skills like 'anthropic-api' listed under 'Skills archived' and interpreted that as pruning — but the curator had actually absorbed those skills into a new umbrella (e.g. 'llm-providers') during the same run. The directory gets archived for safety (all removals are recoverable), but the content still lives under a different name. Users then 'restored' what they thought were deleted skills and ended up with confusingly duplicated skillsets (old-name + absorbed-inside-umbrella). Classify removed skills using this run's skill_manage tool calls: - consolidated: content absorbed into a surviving/newly-created skill (evidenced by a skill_manage write_file/patch/create/edit whose target is a different skill AND whose file_path/content references the removed skill's name) - pruned: archived without consolidation evidence (truly stale) REPORT.md now shows two distinct sections: - 'Consolidated into umbrella skills' — with `removed → merged into umbrella` - 'Pruned — archived for staleness' — pure staleness archives run.json schema additions (backward compatible): - counts.consolidated_this_run, counts.pruned_this_run - consolidated: [{name, into, evidence}, ...] - pruned: [names] - archived: retained as the union for backward compat Also: relabel the auto-transitions 'archived' counter to 'archived (no LLM, pure time-based staleness)' so it's clearly distinct from LLM-pass archives. Tests: 9 new tests in test_curator_classification.py covering consolidation evidence parsing (write_file/patch/create), hyphen/underscore name variants, self-reference rejection, destination-must-exist, mixed runs, and malformed-JSON fallback safety. Existing test_report_md_is_human_readable updated to cover the new section names. 
E2E: isolated HERMES_HOME, realistic 3-skill run, REPORT.md verified end-to-end. * feat(curator): hybrid model-declared + heuristic classification Extend the consolidated-vs-pruned split with LLM-authored intent: 1. Curator prompt now requires a structured YAML block at the end of the final response (consolidations / prunings with short rationale). 2. _parse_structured_summary() extracts it tolerantly — missing block, malformed YAML, partial lists all fall back to heuristic cleanly. 3. _reconcile_classification() merges model intent with the tool-call heuristic: - Model wins on rationale when its umbrella exists post-run - Model hallucination (umbrella doesn't exist) is downgraded to the heuristic's finding, or pruned if there's no evidence either - Heuristic catches model omission — consolidations the model enumerated tools for but forgot to list get surfaced with a '(detected via tool-call audit)' tag 4. REPORT.md now shows per-row rationale alongside 'removed → umbrella' and flags audit-only rows so the user knows why no reason is shown. Backward compat: run.json's 'archived' field (union) is preserved. 'pruned' is now a list of dicts with {name, source, reason}; 'pruned_names' is the flat-name list for legacy consumers. Tests: 15 new covering YAML parse edge cases (malformed, empty lists, bare-string entries, missing fields), reconciler rules (model wins, hallucination fallback, heuristic catches omission, prune with reason), and an end-to-end report-render test with all four paths exercised.
2026-04-30 10:31:23 -07:00
"""Tests for the curator consolidated-vs-pruned classifier.
The classifier splits skills that disappeared between the before/after
snapshots into two buckets:
- "consolidated" absorbed into an umbrella; content still lives
under another skill's files
- "pruned" archived for staleness; content not preserved elsewhere
Without the split the report lumped everything under "Skills archived",
which misled users into thinking consolidated skills had been pruned.
"""
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
import pytest
@pytest.fixture
def curator_env(tmp_path, monkeypatch):
    """Yield a freshly reloaded ``curator`` module rooted at an isolated HERMES_HOME."""
    home = tmp_path / ".hermes"
    home.mkdir()
    for subdir in ("skills", "logs"):
        (home / subdir).mkdir()
    monkeypatch.setenv("HERMES_HOME", str(home))
    monkeypatch.setattr(Path, "home", lambda: tmp_path)
    # Reload constants first so the curator module picks up the patched paths.
    import importlib
    import hermes_constants
    importlib.reload(hermes_constants)
    from agent import curator
    importlib.reload(curator)
    yield curator
def test_classify_consolidated_via_write_file_evidence(curator_env):
    """skill_manage write_file on umbrella references/<removed>.md = consolidated."""
    write_call = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "write_file",
            "name": "training-platforms",
            "file_path": "references/axolotl-training.md",
            "file_content": "# Axolotl\n...",
        }),
    }
    verdict = curator_env._classify_removed_skills(
        removed=["axolotl-training"],
        added=[],
        after_names={"training-platforms", "keeper"},
        tool_calls=[write_call],
    )
    consolidated = verdict["consolidated"]
    assert len(consolidated) == 1
    entry = consolidated[0]
    assert entry["name"] == "axolotl-training"
    assert entry["into"] == "training-platforms"
    assert verdict["pruned"] == []
def test_classify_pruned_when_no_destination_reference(curator_env):
    """Removed skill with no referencing tool call = pruned."""
    keeper_patch = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "patch", "name": "keeper",
            "old_string": "foo", "new_string": "bar",
        }),
    }
    verdict = curator_env._classify_removed_skills(
        removed=["old-stale-thing"],
        added=[],
        after_names={"keeper"},
        tool_calls=[{"name": "skills_list", "arguments": "{}"}, keeper_patch],
    )
    assert verdict["consolidated"] == []
    pruned = verdict["pruned"]
    assert len(pruned) == 1
    assert pruned[0]["name"] == "old-stale-thing"
def test_classify_consolidated_into_newly_created_umbrella(curator_env):
    """Removed skill absorbed into a skill that was created THIS run."""
    create_args = {
        "action": "create",
        "name": "llm-providers",
        "content": "# LLM Providers\n\n## anthropic-api\nMerged from the old anthropic-api skill.\n",
    }
    verdict = curator_env._classify_removed_skills(
        removed=["anthropic-api"],
        added=["llm-providers"],  # umbrella born during this run
        after_names={"llm-providers"},
        tool_calls=[{"name": "skill_manage", "arguments": json.dumps(create_args)}],
    )
    consolidated = verdict["consolidated"]
    assert len(consolidated) == 1
    assert consolidated[0]["name"] == "anthropic-api"
    assert consolidated[0]["into"] == "llm-providers"
def test_classify_handles_underscore_hyphen_variants(curator_env):
    """Names with hyphens match underscore forms in paths/content and vice versa."""
    underscore_write = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "write_file",
            "name": "webui",
            "file_path": "references/open_webui_setup.md",
            "file_content": "...",
        }),
    }
    verdict = curator_env._classify_removed_skills(
        removed=["open-webui-setup"],
        added=[],
        after_names={"webui"},
        tool_calls=[underscore_write],
    )
    consolidated = verdict["consolidated"]
    assert len(consolidated) == 1
    assert consolidated[0]["into"] == "webui"
def test_classify_self_reference_does_not_count(curator_env):
    """A tool call that targets the removed skill itself is NOT consolidation."""
    # The curator may patch a skill earlier in the run and archive it later;
    # a self-targeted call is no evidence of absorption elsewhere.
    self_patch = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "patch",
            "name": "doomed",  # identical to the removed skill
            "old_string": "x",
            "new_string": "y",
        }),
    }
    verdict = curator_env._classify_removed_skills(
        removed=["doomed"],
        added=[],
        after_names={"keeper"},
        tool_calls=[self_patch],
    )
    assert verdict["consolidated"] == []
    assert verdict["pruned"][0]["name"] == "doomed"
def test_classify_destination_must_exist_after_run(curator_env):
    """A reference to a skill that doesn't exist after the run can't be the umbrella."""
    ghost_write = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "write_file",
            "name": "ghost",  # absent from after_names
            "file_path": "references/thing.md",
            "file_content": "...",
        }),
    }
    verdict = curator_env._classify_removed_skills(
        removed=["thing"],
        added=[],
        after_names={"keeper"},  # no "ghost" in the survivors
        tool_calls=[ghost_write],
    )
    assert verdict["consolidated"] == []
    assert verdict["pruned"][0]["name"] == "thing"
def test_classify_mixed_run_produces_both_buckets(curator_env):
    """A realistic run: one skill consolidated, one skill pruned."""
    umbrella_write = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "write_file",
            "name": "umbrella",
            "file_path": "references/absorbed-skill.md",
            "file_content": "...",
        }),
    }
    verdict = curator_env._classify_removed_skills(
        removed=["absorbed-skill", "dead-skill"],
        added=["umbrella"],
        after_names={"umbrella", "keeper"},
        tool_calls=[umbrella_write],
    )
    consolidated = verdict["consolidated"]
    pruned = verdict["pruned"]
    assert len(consolidated) == 1
    assert consolidated[0]["name"] == "absorbed-skill"
    assert consolidated[0]["into"] == "umbrella"
    assert len(pruned) == 1
    assert pruned[0]["name"] == "dead-skill"
def test_classify_handles_malformed_arguments_string(curator_env):
    """Truncated/malformed JSON in arguments falls back to substring match."""
    # Providers may truncate `arguments` (e.g. to 400 chars), leaving JSON
    # that no longer parses.
    truncated_raw = (
        '{"action":"write_file","name":"umbrella","file_path":"references/'
        'absorbed-skill.md","file_content":"long content that was cut off mid'
    )
    verdict = curator_env._classify_removed_skills(
        removed=["absorbed-skill"],
        added=[],
        after_names={"umbrella"},
        tool_calls=[{"name": "skill_manage", "arguments": truncated_raw}],
    )
    # The raw-string fallback does see "absorbed-skill" in the truncated text,
    # but with json.loads failing there is no dict to read a target skill from
    # (args.get("name")), so the classifier must NOT promote to "consolidated".
    # Failing safe means a prune label rather than a hallucinated umbrella —
    # a deliberate correctness floor.
    assert verdict["consolidated"] == []
    assert len(verdict["pruned"]) == 1
def test_report_md_splits_consolidated_and_pruned_sections(curator_env):
    """End-to-end: REPORT.md shows both sections distinctly."""
    curator = curator_env
    started = datetime.now(timezone.utc)
    before_rows = [
        {"name": "absorbed-skill", "state": "active", "pinned": False},
        {"name": "dead-skill", "state": "stale", "pinned": False},
        {"name": "keeper", "state": "active", "pinned": False},
    ]
    after_rows = [
        {"name": "keeper", "state": "active", "pinned": False},
        {"name": "umbrella", "state": "active", "pinned": False},
    ]
    create_call = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "create",
            "name": "umbrella",
            "content": "# umbrella\n\nAbsorbed absorbed-skill.",
        }),
    }
    run_dir = curator._write_run_report(
        started_at=started,
        elapsed_seconds=60.0,
        auto_counts={"checked": 3, "marked_stale": 0, "archived": 0, "reactivated": 0},
        auto_summary="no auto changes",
        before_report=before_rows,
        before_names={row["name"] for row in before_rows},
        after_report=after_rows,
        llm_meta={
            "final": "Consolidated absorbed-skill into umbrella. Pruned dead-skill.",
            "summary": "1 consolidated, 1 pruned",
            "model": "m",
            "provider": "p",
            "error": None,
            "tool_calls": [create_call],
        },
    )
    payload = json.loads((run_dir / "run.json").read_text())
    # Both buckets exist and are disjoint.
    consolidated_names = {entry["name"] for entry in payload["consolidated"]}
    assert consolidated_names == {"absorbed-skill"}
    # `pruned` carries full {name, source, reason} dicts; `pruned_names` is
    # the flat list kept for quick scans / legacy consumers.
    pruned_names = payload["pruned_names"]
    assert pruned_names == ["dead-skill"]
    assert all(isinstance(entry, dict) and "name" in entry for entry in payload["pruned"])
    # Legacy "archived" stays the union of both buckets for backward compat.
    assert set(payload["archived"]) == consolidated_names | set(pruned_names)
    # Per-bucket counts are exposed.
    assert payload["counts"]["consolidated_this_run"] == 1
    assert payload["counts"]["pruned_this_run"] == 1
    report = (run_dir / "REPORT.md").read_text()
    # Two distinct sections replace the old single "Skills archived" lump.
    assert "Consolidated into umbrella skills" in report
    assert "Pruned — archived for staleness" in report
    assert "`absorbed-skill` → merged into `umbrella`" in report
    assert "`dead-skill`" in report
    assert "### Skills archived" not in report
# ---------------------------------------------------------------------------
# _parse_structured_summary — extracting the model's required YAML block
# ---------------------------------------------------------------------------
def test_parse_structured_summary_happy_path(curator_env):
    """A well-formed YAML block yields both consolidations and prunings."""
    prologue = "Long human summary here. I processed clusters X, Y, Z.\n\n"
    yaml_lines = [
        "## Structured summary (required)",
        "```yaml",
        "consolidations:",
        "  - from: anthropic-api",
        "    into: llm-providers",
        "    reason: duplicate of the generic llm-providers skill",
        "  - from: openai-api",
        "    into: llm-providers",
        "    reason: same — merged with sibling",
        "prunings:",
        "  - name: random-old-notes",
        "    reason: pre-curator garbage, no overlap",
        "```",
    ]
    parsed = curator_env._parse_structured_summary(prologue + "\n".join(yaml_lines) + "\n")
    assert len(parsed["consolidations"]) == 2
    first = parsed["consolidations"][0]
    assert first == {
        "from": "anthropic-api",
        "into": "llm-providers",
        "reason": "duplicate of the generic llm-providers skill",
    }
    assert len(parsed["prunings"]) == 1
    assert parsed["prunings"][0]["reason"] == "pre-curator garbage, no overlap"
def test_parse_structured_summary_missing_block(curator_env):
    """Text without a structured block parses to empty buckets."""
    parsed = curator_env._parse_structured_summary("No block in this text.")
    assert parsed == {"consolidations": [], "prunings": []}
def test_parse_structured_summary_malformed_yaml(curator_env):
    """Unparseable YAML inside the block degrades to empty buckets."""
    broken = "```yaml\nthis: is\n not: [valid yaml\n```"
    assert curator_env._parse_structured_summary(broken) == {
        "consolidations": [],
        "prunings": [],
    }
def test_parse_structured_summary_empty_lists(curator_env):
    """Explicitly empty YAML lists parse to the same empty shape."""
    block = "```yaml\nconsolidations: []\nprunings: []\n```"
    assert curator_env._parse_structured_summary(block) == {
        "consolidations": [],
        "prunings": [],
    }
def test_parse_structured_summary_ignores_bare_strings(curator_env):
    """Entries that aren't dicts (e.g. a model wrote bare names) are skipped."""
    block_lines = [
        "```yaml",
        "consolidations:",
        "  - just-a-bare-string",
        "  - from: real-entry",
        "    into: umbrella",
        "    reason: valid",
        "prunings: []",
        "```",
    ]
    parsed = curator_env._parse_structured_summary("\n".join(block_lines))
    assert len(parsed["consolidations"]) == 1
    assert parsed["consolidations"][0]["from"] == "real-entry"
def test_parse_structured_summary_missing_required_fields(curator_env):
    """Consolidation entries without from+into are skipped."""
    block_lines = [
        "```yaml",
        "consolidations:",
        "  - from: only-from",
        "    reason: no into",
        "  - into: only-into",
        "  - from: good",
        "    into: umbrella",
        "prunings: []",
        "```",
    ]
    parsed = curator_env._parse_structured_summary("\n".join(block_lines))
    assert len(parsed["consolidations"]) == 1
    assert parsed["consolidations"][0]["from"] == "good"
# ---------------------------------------------------------------------------
# _reconcile_classification — merging model block with heuristic
# ---------------------------------------------------------------------------
def test_reconcile_model_wins_when_umbrella_exists(curator_env):
    """Model claim + umbrella in destinations → model authority (with reason)."""
    model_claim = {
        "from": "anthropic-api",
        "into": "llm-providers",
        "reason": "duplicate",
    }
    merged = curator_env._reconcile_classification(
        removed=["anthropic-api"],
        heuristic={"consolidated": [], "pruned": [{"name": "anthropic-api"}]},
        model_block={"consolidations": [model_claim], "prunings": []},
        destinations={"llm-providers"},
    )
    assert merged["pruned"] == []
    assert len(merged["consolidated"]) == 1
    entry = merged["consolidated"][0]
    assert entry["name"] == "anthropic-api"
    assert entry["into"] == "llm-providers"
    assert entry["reason"] == "duplicate"
    assert entry["source"] == "model"
def test_reconcile_model_hallucinates_umbrella(curator_env):
    """Model names a non-existent umbrella — downgrade, prefer heuristic if any."""
    audit_finding = {"name": "thing", "into": "real-umbrella", "evidence": "..."}
    merged = curator_env._reconcile_classification(
        removed=["thing"],
        heuristic={"consolidated": [audit_finding], "pruned": []},
        model_block={
            "consolidations": [
                {"from": "thing", "into": "nonexistent-umbrella", "reason": "confused"},
            ],
            "prunings": [],
        },
        destinations={"real-umbrella"},
    )
    assert len(merged["consolidated"]) == 1
    entry = merged["consolidated"][0]
    assert entry["into"] == "real-umbrella"
    assert "tool-call audit" in entry["source"]
    assert entry["model_claimed_into"] == "nonexistent-umbrella"
def test_reconcile_model_hallucinates_with_no_heuristic_evidence(curator_env):
    """Model names a non-existent umbrella AND no tool-call evidence → prune."""
    merged = curator_env._reconcile_classification(
        removed=["ghost"],
        heuristic={"consolidated": [], "pruned": [{"name": "ghost"}]},
        model_block={
            "consolidations": [
                {"from": "ghost", "into": "nonexistent", "reason": "wrong"},
            ],
            "prunings": [],
        },
        destinations={"real-umbrella"},
    )
    assert merged["consolidated"] == []
    pruned = merged["pruned"]
    assert len(pruned) == 1
    assert "fallback" in pruned[0]["source"]
def test_reconcile_heuristic_catches_model_omission(curator_env):
    """Model forgot to list a consolidation, heuristic found it."""
    audit_hit = {
        "name": "forgotten",
        "into": "umbrella",
        "evidence": "write_file on umbrella referenced forgotten.md",
    }
    merged = curator_env._reconcile_classification(
        removed=["forgotten"],
        heuristic={"consolidated": [audit_hit], "pruned": []},
        model_block={"consolidations": [], "prunings": []},
        destinations={"umbrella"},
    )
    assert len(merged["consolidated"]) == 1
    entry = merged["consolidated"][0]
    assert entry["into"] == "umbrella"
    assert "model omitted" in entry["source"]
def test_reconcile_model_prunes_with_reason(curator_env):
    """Model says pruned, heuristic agrees, we surface the reason."""
    merged = curator_env._reconcile_classification(
        removed=["stale-skill"],
        heuristic={"consolidated": [], "pruned": [{"name": "stale-skill"}]},
        model_block={
            "consolidations": [],
            "prunings": [{"name": "stale-skill", "reason": "superseded by bundled skill"}],
        },
        destinations=set(),
    )
    pruned = merged["pruned"]
    assert len(pruned) == 1
    assert pruned[0]["reason"] == "superseded by bundled skill"
    assert pruned[0]["source"] == "model"
def test_reconcile_model_block_visible_in_full_report(curator_env):
    """End-to-end: LLM final response with the YAML block → reasons in REPORT.md."""
    # Uses the module-level json/datetime/timezone imports directly; the
    # previous function-local aliased re-imports (_json/_dt/_tz) duplicated
    # them for no reason.
    start = datetime.now(timezone.utc)
    before = [
        {"name": "anthropic-api", "state": "active", "pinned": False},
        {"name": "stale-thing", "state": "stale", "pinned": False},
    ]
    after = [{"name": "llm-providers", "state": "active", "pinned": False}]
    llm_final_text = (
        "Processed 3 clusters. Absorbed anthropic-api into llm-providers.\n\n"
        "## Structured summary (required)\n"
        "```yaml\n"
        "consolidations:\n"
        "  - from: anthropic-api\n"
        "    into: llm-providers\n"
        "    reason: duplicate content, now a subsection\n"
        "prunings:\n"
        "  - name: stale-thing\n"
        "    reason: pre-curator junk, no overlap with anything\n"
        "```\n"
    )
    run_dir = curator_env._write_run_report(
        started_at=start,
        elapsed_seconds=30.0,
        auto_counts={"checked": 2, "marked_stale": 0, "archived": 0, "reactivated": 0},
        auto_summary="none",
        before_report=before,
        before_names={r["name"] for r in before},
        after_report=after,
        llm_meta={
            "final": llm_final_text,
            "summary": "1 consolidated, 1 pruned",
            "model": "m",
            "provider": "p",
            "error": None,
            "tool_calls": [
                {"name": "skill_manage", "arguments": json.dumps({
                    "action": "create",
                    "name": "llm-providers",
                    "content": "# llm-providers\nIncludes anthropic-api",
                })},
            ],
        },
    )
    payload = json.loads((run_dir / "run.json").read_text())
    cons = payload["consolidated"][0]
    assert cons["name"] == "anthropic-api"
    assert cons["into"] == "llm-providers"
    assert cons["reason"] == "duplicate content, now a subsection"
    assert cons["source"] == "model+audit"  # model AND heuristic both had it
    pruned = payload["pruned"][0]
    assert pruned["name"] == "stale-thing"
    assert pruned["reason"] == "pre-curator junk, no overlap with anything"
    md = (run_dir / "REPORT.md").read_text()
    # The YAML rationales flow through to the human-readable report.
    assert "duplicate content, now a subsection" in md
    assert "pre-curator junk" in md
fix(curator): authoritative absorbed_into on delete + restore cron skill links on rollback (#18671) (#18731) * fix(curator): authoritative absorbed_into declarations on skill delete Closes #18671. The classification pipeline that feeds cron-ref rewriting used to infer consolidation vs pruning from two brittle signals: the curator model's post-hoc YAML summary block, and a substring heuristic scanning other tool calls for the removed skill's name. Both miss in real consolidations — the model forgets the YAML under reasoning pressure, and the heuristic misses when the umbrella's patch content describes the absorbed behavior abstractly instead of naming the old slug. When both miss, the skill falls through to 'no-evidence fallback' pruned, and #18253's cron rewriter drops the cron ref entirely instead of mapping it to the umbrella. Same observable symptom as pre-#18253: 'Skill(s) not found and skipped' at the next cron run. The fix makes the model declare intent at the moment of deletion. skill_manage(action='delete') now accepts absorbed_into: - absorbed_into='<umbrella>' -> consolidated, target must exist on disk - absorbed_into='' -> explicit prune, no forwarding target - missing -> legacy path, falls through to heuristic/YAML The curator reconciler reads these declarations off llm_meta.tool_calls BEFORE either the YAML block or the substring heuristic. Declaration wins. Fallback logic stays intact for backward compat with any caller (human or older curator conversation) that doesn't populate the arg. Changes - tools/skill_manager_tool.py: add absorbed_into param to skill_manage + _delete_skill. Validate target exists when non-empty. Reject absorbed_into=<self>. Wire through dispatcher + registry + schema. - agent/curator.py: new _extract_absorbed_into_declarations() walks tool calls for skill_manage(delete) with the arg. _reconcile_classification accepts absorbed_declarations= and treats them as authoritative. 
Curator prompt updated to require the arg on every delete. - Tests: 7 new skill_manager tests covering the tool contract (valid target, empty string, nonexistent target, self-reference, whitespace, backward compat, dispatcher plumbing). 11 new curator tests covering the extractor + authoritative reconciler path + mixed-legacy-and- declared runs. Validation - 307/307 targeted tests pass (curator + cron + skill_manager suites). - E2E #18671 repro: 3 narrow skills, 1 umbrella, cron job referencing all 3. Model emits NO YAML block. Heuristic misses (patch prose doesn't name old slugs). Delete calls carry absorbed_into. Result: both PR skills correctly classified 'consolidated' + cron rewritten ['pr-review-format', 'pr-review-checklist', 'stale-junk'] -> ['hermes-agent-dev']; stale-junk pruned via absorbed_into=''. - E2E backward-compat: delete without absorbed_into, model emits YAML -> routed via existing 'model' source, cron still rewritten correctly. * feat(curator): capture + restore cron skill links across snapshot/rollback Before this, rolling back a curator run restored the skills tree but cron jobs still pointed at the umbrella skills the curator had rewritten them to. The user would see their old narrow skills back on disk but their cron jobs still configured with the merged umbrella — not actually 'back to how it was'. Snapshot side: snapshot_skills() now captures ~/.hermes/cron/jobs.json alongside the skills tarball, as cron-jobs.json. The manifest gets a new 'cron_jobs' block with {backed_up, jobs_count} so rollback (and the CLI confirm dialog) can surface what's in the snapshot. If jobs.json is missing/unreadable/malformed, snapshot proceeds without cron data — the skills backup is the core guarantee; cron is additive. Rollback side: after the skills extract succeeds, the new _restore_cron_skill_links() reconciles the backed-up jobs into the live jobs.json SURGICALLY. Only 'skills' and 'skill' fields are restored, and only on jobs matched by id. 
Everything else about a cron job — schedule, last_run_at, next_run_at, enabled, prompt, workdir, hooks — is live state the user or scheduler has modified since the snapshot; overwriting it would regress unrelated activity. Reconciliation rules: - Job in backup AND live, skills differ → skills restored. - Job in backup AND live, skills match → no-op. - Job in backup, NOT in live → skipped (user deleted it after snapshot; their choice is later than the snapshot). - Job in live, NOT in backup → untouched (user created it after snapshot). - Snapshot missing cron-jobs.json at all → rollback still succeeds, reports 'not captured' (older pre-feature snapshots keep working). Writes go through cron.jobs.save_jobs under the same _jobs_file_lock the scheduler uses, so rollback doesn't race tick(). Also: - hermes_cli/curator.py: rollback confirm dialog now shows 'cron jobs: N (will be restored for skill-link fields only)' when the snapshot has cron data, or 'not in snapshot (<reason>)' otherwise. - rollback()'s message string includes a 'cron links: ...' clause summarizing the reconciliation outcome. Tests - 9 new cases: snapshot-with-cron, snapshot-without-cron, malformed-json captured-as-raw, full rollback-restores-skills-and-cron, rollback touches only skill fields, rollback skips user-deleted jobs, rollback leaves user-created jobs untouched, rollback still works with pre-feature snapshot that has no cron-jobs.json, standalone unit test on _restore_cron_skill_links exercising the full report shape. Validation - 484/484 targeted tests pass (curator + cron + skill_manager suites). - E2E: real snapshot_skills, real cron rewrite, real rollback. Before: ['pr-review-format', 'pr-review-checklist', 'pr-triage-salvage']. After curator: ['hermes-agent-dev']. After rollback: ['pr-review-format', 'pr-review-checklist', 'pr-triage-salvage']. Non-skill fields (id, name, prompt) preserved across the round trip.
2026-05-02 01:29:57 -07:00
# ---------------------------------------------------------------------------
# _extract_absorbed_into_declarations — authoritative signal from delete calls
# ---------------------------------------------------------------------------
def test_extract_absorbed_into_picks_up_consolidation(curator_env):
    """Delete call with absorbed_into=<umbrella> yields a declaration."""
    delete_call = {
        "name": "skill_manage",
        "arguments": json.dumps({
            "action": "delete",
            "name": "narrow-skill",
            "absorbed_into": "umbrella",
        }),
    }
    declarations = curator_env._extract_absorbed_into_declarations([delete_call])
    assert declarations == {"narrow-skill": {"into": "umbrella", "declared": True}}
def test_extract_absorbed_into_empty_string_is_explicit_prune(curator_env):
    """absorbed_into='' is recorded as an explicit prune declaration."""
    delete_args = {"action": "delete", "name": "stale", "absorbed_into": ""}
    declarations = curator_env._extract_absorbed_into_declarations([
        {"name": "skill_manage", "arguments": json.dumps(delete_args)},
    ])
    assert declarations == {"stale": {"into": "", "declared": True}}
def test_extract_absorbed_into_missing_arg_ignored(curator_env):
    """Delete call without absorbed_into is skipped — fallback to heuristic."""
    legacy_delete = {
        "name": "skill_manage",
        "arguments": json.dumps({"action": "delete", "name": "legacy-skill"}),
    }
    assert curator_env._extract_absorbed_into_declarations([legacy_delete]) == {}
def test_extract_absorbed_into_ignores_non_delete_actions(curator_env):
    """Patch, create, write_file etc. must not leak into declarations."""
    patch_args = {
        "action": "patch",
        "name": "umbrella",
        "old_string": "...",
        "new_string": "...",
        "absorbed_into": "something",  # meaningless on a non-delete; must be ignored
    }
    declarations = curator_env._extract_absorbed_into_declarations([
        {"name": "skill_manage", "arguments": json.dumps(patch_args)},
    ])
    assert declarations == {}
def test_extract_absorbed_into_accepts_dict_arguments(curator_env):
    """arguments can arrive as a dict (defensive path) — still works."""
    dict_call = {
        "name": "skill_manage",
        "arguments": {
            "action": "delete",
            "name": "narrow",
            "absorbed_into": "umbrella",
        },
    }
    declarations = curator_env._extract_absorbed_into_declarations([dict_call])
    assert declarations == {"narrow": {"into": "umbrella", "declared": True}}
def test_extract_absorbed_into_strips_whitespace(curator_env):
    """Surrounding whitespace on name/target is stripped before recording."""
    padded_args = {
        "action": "delete",
        "name": " narrow ",
        "absorbed_into": " umbrella ",
    }
    declarations = curator_env._extract_absorbed_into_declarations([
        {"name": "skill_manage", "arguments": json.dumps(padded_args)},
    ])
    assert declarations == {"narrow": {"into": "umbrella", "declared": True}}
def test_extract_absorbed_into_ignores_non_skill_manage_calls(curator_env):
    """Unrelated tools never contribute declarations."""
    unrelated_calls = [
        {"name": "terminal", "arguments": json.dumps({"command": "ls"})},
        {"name": "read_file", "arguments": json.dumps({"path": "/tmp/x"})},
    ]
    assert curator_env._extract_absorbed_into_declarations(unrelated_calls) == {}
def test_extract_absorbed_into_handles_malformed_arguments(curator_env):
    """Garbage JSON in arguments must not crash the extractor."""
    junk_calls = [
        {"name": "skill_manage", "arguments": "{not json"},
        {"name": "skill_manage", "arguments": None},
        {"name": "skill_manage"},  # arguments key missing entirely
    ]
    assert curator_env._extract_absorbed_into_declarations(junk_calls) == {}
# ---------------------------------------------------------------------------
# _reconcile_classification with absorbed_into declarations (authoritative)
# ---------------------------------------------------------------------------
def test_reconcile_absorbed_into_beats_everything_else(curator_env):
    """Model declared absorbed_into at delete; YAML/heuristic disagree — declaration wins.

    This is the exact #18671 regression: the model skips the YAML summary
    block, and the substring heuristic misses because the umbrella's patch
    content never literally contains the old skill's slug. That combination
    used to fall through to the 'no-evidence fallback' prune, which dropped
    the cron ref instead of rewriting it. The delete-time absorbed_into
    declaration gives us the intent directly.
    """
    merged = curator_env._reconcile_classification(
        removed=["pr-review-format"],
        heuristic={"consolidated": [], "pruned": [{"name": "pr-review-format"}]},
        model_block={"consolidations": [], "prunings": []},  # no YAML emitted
        destinations={"hermes-agent-dev"},
        absorbed_declarations={
            "pr-review-format": {"into": "hermes-agent-dev", "declared": True},
        },
    )
    assert merged["pruned"] == []
    assert len(merged["consolidated"]) == 1
    entry = merged["consolidated"][0]
    assert entry["name"] == "pr-review-format"
    assert entry["into"] == "hermes-agent-dev"
    assert "absorbed_into" in entry["source"]
def test_reconcile_absorbed_into_empty_is_explicit_prune(curator_env):
    """absorbed_into='' takes precedence and routes to pruned, not fallback."""
    merged = curator_env._reconcile_classification(
        removed=["stale"],
        heuristic={"consolidated": [], "pruned": [{"name": "stale"}]},
        model_block={"consolidations": [], "prunings": []},
        destinations=set(),
        absorbed_declarations={"stale": {"into": "", "declared": True}},
    )
    assert merged["consolidated"] == []
    pruned = merged["pruned"]
    assert len(pruned) == 1
    assert "model-declared prune" in pruned[0]["source"]
def test_reconcile_absorbed_into_nonexistent_target_falls_through(curator_env):
    """A declared umbrella absent from destinations falls through to the
    heuristic/YAML logic. The tool validates targets at delete time, so this
    shouldn't occur in practice — the reconciler is simply defensive."""
    audit_hit = {"name": "thing", "into": "real-umbrella", "evidence": "..."}
    merged = curator_env._reconcile_classification(
        removed=["thing"],
        heuristic={"consolidated": [audit_hit], "pruned": []},
        model_block={"consolidations": [], "prunings": []},
        destinations={"real-umbrella"},
        absorbed_declarations={"thing": {"into": "ghost-umbrella", "declared": True}},
    )
    assert len(merged["consolidated"]) == 1
    entry = merged["consolidated"][0]
    assert entry["into"] == "real-umbrella"
    assert "tool-call audit" in entry["source"]
def test_reconcile_declaration_preserves_yaml_reason(curator_env):
    """A delete-time declaration plus a YAML entry with a reason keeps the
    reason, so REPORT.md can still show it."""
    yaml_entry = {
        "from": "narrow",
        "into": "umbrella",
        "reason": "duplicate of umbrella's main content",
    }
    merged = curator_env._reconcile_classification(
        removed=["narrow"],
        heuristic={"consolidated": [], "pruned": []},
        model_block={"consolidations": [yaml_entry], "prunings": []},
        destinations={"umbrella"},
        absorbed_declarations={"narrow": {"into": "umbrella", "declared": True}},
    )
    assert len(merged["consolidated"]) == 1
    entry = merged["consolidated"][0]
    assert entry["into"] == "umbrella"
    assert "absorbed_into" in entry["source"]
    assert entry["reason"] == "duplicate of umbrella's main content"
def test_reconcile_without_declarations_preserves_legacy_behavior(curator_env):
    """Backward compat: no absorbed_declarations arg → all existing logic intact."""
    merged = curator_env._reconcile_classification(
        removed=["thing"],
        heuristic={
            "consolidated": [{"name": "thing", "into": "umbrella", "evidence": "..."}],
            "pruned": [],
        },
        model_block={"consolidations": [], "prunings": []},
        destinations={"umbrella"},
        # absorbed_declarations intentionally omitted — defaults to None and
        # must behave exactly as before the feature landed.
    )
    assert len(merged["consolidated"]) == 1
    assert merged["consolidated"][0]["into"] == "umbrella"
def test_reconcile_mixed_declarations_and_legacy_calls(curator_env):
    """Real-world run: some deletes declared absorbed_into, some didn't.

    Declared skills take the authoritative path; the others fall back to the
    YAML/heuristic logic.
    """
    merged = curator_env._reconcile_classification(
        removed=["declared-cons", "declared-prune", "legacy-cons", "legacy-prune"],
        heuristic={
            "consolidated": [
                {"name": "legacy-cons", "into": "umbrella-a", "evidence": "..."},
            ],
            "pruned": [{"name": "legacy-prune"}],
        },
        model_block={"consolidations": [], "prunings": []},
        destinations={"umbrella-a", "umbrella-b"},
        absorbed_declarations={
            "declared-cons": {"into": "umbrella-b", "declared": True},
            "declared-prune": {"into": "", "declared": True},
        },
    )
    consolidated = {entry["name"]: entry for entry in merged["consolidated"]}
    pruned = {entry["name"]: entry for entry in merged["pruned"]}
    assert "declared-cons" in consolidated
    assert consolidated["declared-cons"]["into"] == "umbrella-b"
    assert "absorbed_into" in consolidated["declared-cons"]["source"]
    assert "legacy-cons" in consolidated
    assert consolidated["legacy-cons"]["into"] == "umbrella-a"
    assert "tool-call audit" in consolidated["legacy-cons"]["source"]
    assert "declared-prune" in pruned
    assert "model-declared prune" in pruned["declared-prune"]["source"]
    assert "legacy-prune" in pruned
    assert "no-evidence fallback" in pruned["legacy-prune"]["source"]