mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-29 15:31:38 +08:00
Ships a complete offline eval harness at scripts/compression_eval/. Runs a real conversation fixture through ContextCompressor.compress(), asks the compressor model to answer probe questions from the compressed state, then has a judge model score each answer 0-5 on six dimensions (accuracy, context_awareness, artifact_trail, completeness, continuity, instruction_following). Methodology adapted from Factory's Dec 2025 write-up (https://factory.ai/news/evaluating-compression); the scoreboard framing is not adopted. Motivation: we edit context_compressor.py prompts and _template_sections by hand and ship with no automated check that compression still preserves file paths, error codes, or the active task. Until now there has been no signal between 'test suite green' and 'a user hits a bad summary in production.' What's shipped - DESIGN.md — full architecture, fixture/probe format, scrubber pipeline, grading rubric, open follow-ups - README.md — usage, cost expectations, when to run it - scrub_fixtures.py — reproducible pipeline that converts real sessions from ~/.hermes/sessions/*.jsonl into public-safe JSON fixtures. 
Applies agent.redact.redact_sensitive_text + username path normalisation + personal handle scrubbing + email/git-author normalisation + reasoning scratchpad stripping + platform-mention scrubbing + first-user paraphrase + system-prompt placeholder + orphan-message pruning + 2KB tool-output truncation - fixtures/ — three scrubbed session snapshots covering three session shapes: feature-impl-context-priority (75 msgs / ~17k tokens) debug-session-feishu-id-model (59 msgs / ~13k tokens) config-build-competitive-scouts (61 msgs / ~23k tokens) - probes/ — three probe banks (10-11 probes each) covering all four types (recall/artifact/continuation/decision) with expected_facts anchors (PR numbers, file paths, error codes, commands) - rubric.py — six-dimension grading rubric, judge-prompt builder, JSON-with-fallback response parser - compressor_driver.py — thin wrapper around ContextCompressor for forced single-shot compression (fixtures are below the default 100k threshold so we force compress() to attribute score deltas to prompt changes, not threshold-fire variance) - grader.py — two-phase continuation + grading calls via the OpenAI SDK directly against the resolved provider endpoint - report.py — markdown report renderer (paste-ready for PR bodies), --compare-to delta mode, per-run JSON dumper - run_eval.py — fire-style CLI (--fixtures, --runs, --judge-model, --compressor-model, --label, --focus-topic, --compare-to, --verbose) - tests/scripts/test_compression_eval.py — 33 hermetic unit tests covering rubric parsing edge cases, judge-prompt building, report rendering, summariser medians, per-run JSON roundtrip, fixture and probe loading, and a PII smoke check on the checked-in fixtures Non-LLM paths are covered by the 33-test suite that runs in CI. The LLM paths (continuation + grading) require credentials and real API calls, so they're exercised by running the eval itself — not by CI. 
Validation - 33/33 unit tests pass in 0.33s via scripts/run_tests.sh - 50/50 adjacent tests (tests/agent/test_context_compressor.py) still pass — no regression introduced - End-to-end dry run against debug-session-feishu-id-model with openai/gpt-5.4-mini via Nous Portal: Compression: 13081 -> 3055 tokens (76.6% ratio), 59 -> 10 messages Overall score: 3.25 (artifact_trail 1.50 is the weak spot, matching Factory's published observation) Specific probe misses surfaced with concrete judge notes Noise floor (one empirical data point) Same inputs re-run: overall 3.25 -> 3.17 (delta -0.08). Individual dimensions varied up to ±0.5 between two single-run medians. Confirms the DESIGN.md < 0.3 noise guidance is the right order of magnitude for single-run comparisons. Tighter noise measurement (N=10) is tracked as an open follow-up in DESIGN.md. Why scripts/ and not tests/ Requires API credentials, costs ~$0.50-1.50 per run, minutes to execute, LLM-graded (non-deterministic). Incompatible with scripts/run_tests.sh which is hermetic, parallel, credential-free. scripts/sample_and_compress.py is the existing precedent for offline credentialed tooling. Open follow-ups (tracked in DESIGN.md, not blocking this PR) 1. Iterative-merge fixture (two chained compressions on one session) 2. Precise noise-floor measurement at N=10 3. Scripted scrubber helpers to lower the cost of fixture #4+ 4. Judge model selection policy (pin vs. per-user)
450 lines
14 KiB
Python
450 lines
14 KiB
Python
"""Unit tests for scripts/compression_eval/ non-LLM paths.
|
|
|
|
These exercise rubric parsing, report rendering, and fixture/probe
|
|
loading — everything that does NOT require API credentials. The eval
|
|
harness itself (run_eval.py) is not hermetic and is not tested here.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Put the eval harness directory at the front of sys.path so its modules can
# be imported by bare name further down in this file.
_SCRIPTS_DIR = Path(__file__).resolve().parents[2].joinpath("scripts", "compression_eval")
if sys.path.count(str(_SCRIPTS_DIR)) == 0:
    sys.path.insert(0, str(_SCRIPTS_DIR))
|
|
|
|
from rubric import ( # noqa: E402
|
|
DIMENSIONS,
|
|
SCORE_SCALE,
|
|
build_judge_prompt,
|
|
parse_judge_response,
|
|
)
|
|
from report import ( # noqa: E402
|
|
render_report,
|
|
summarize_fixture_runs,
|
|
write_run_json,
|
|
)
|
|
|
|
|
|
# ---------- rubric.parse_judge_response ----------
|
|
|
|
|
|
def test_parse_judge_response_accepts_clean_json():
    """A clean JSON object yields per-dimension scores, notes, and an overall mean."""
    dim_scores = {
        "accuracy": 4,
        "context_awareness": 3,
        "artifact_trail": 2,
        "completeness": 5,
        "continuity": 4,
        "instruction_following": 5,
    }
    payload = {**dim_scores, "notes": "missed redis_client.py"}

    parsed = parse_judge_response(json.dumps(payload))

    scores = parsed["scores"]
    assert scores["accuracy"] == 4
    assert scores["artifact_trail"] == 2
    assert parsed["notes"] == "missed redis_client.py"
    assert 0 <= parsed["overall"] <= 5
    # The overall score must equal the arithmetic mean of the six dimensions.
    mean = sum(dim_scores.values()) / len(dim_scores)
    assert abs(parsed["overall"] - mean) < 1e-9
|
|
|
|
|
|
def test_parse_judge_response_strips_code_fences():
    """A JSON object wrapped in a ```json fenced block is still parsed."""
    body = '{"accuracy":5,"context_awareness":5,"artifact_trail":5,"completeness":5,"continuity":5,"instruction_following":5,"notes":""}'
    fenced = "```json\n" + body + "\n```"

    parsed = parse_judge_response(fenced)

    for value in parsed["scores"].values():
        assert value == 5
|
|
|
|
|
|
def test_parse_judge_response_tolerates_surrounding_prose():
    """Prose before and after the JSON object does not prevent parsing."""
    preamble = "Here is my grading:\n\n"
    verdict = (
        '{"accuracy": 3, "context_awareness": 4, "artifact_trail": 3, '
        '"completeness": 4, "continuity": 3, "instruction_following": 5, '
        '"notes": "ok"}'
    )
    postscript = "\n\nLet me know if you need more detail."

    parsed = parse_judge_response(preamble + verdict + postscript)

    assert parsed["scores"]["accuracy"] == 3
|
|
|
|
|
|
def test_parse_judge_response_rounds_floats_to_ints():
    """Fractional scores come back as the nearest integer (3.4 -> 3, 3.6 -> 4)."""
    payload = {"accuracy": 3.4, "context_awareness": 3.6}
    payload.update(
        dict.fromkeys(
            ("artifact_trail", "completeness", "continuity", "instruction_following"),
            3,
        )
    )
    payload["notes"] = ""

    scores = parse_judge_response(json.dumps(payload))["scores"]

    assert scores["accuracy"] == 3
    assert scores["context_awareness"] == 4
|
|
|
|
|
|
def test_parse_judge_response_rejects_out_of_range():
    """A score outside the allowed range raises ValueError mentioning 'out of range'."""
    payload = {
        "accuracy": 7,  # deliberately illegal value
        "context_awareness": 3,
        "artifact_trail": 3,
        "completeness": 3,
        "continuity": 3,
        "instruction_following": 3,
        "notes": "",
    }
    serialized = json.dumps(payload)

    with pytest.raises(ValueError, match="out of range"):
        parse_judge_response(serialized)
|
|
|
|
|
|
def test_parse_judge_response_rejects_missing_dimension():
    """Omitting one rubric dimension raises ValueError mentioning 'missing dimension'."""
    payload = {
        "accuracy": 3,
        "context_awareness": 3,
        "artifact_trail": 3,
        "completeness": 3,
        "continuity": 3,
        "instruction_following": 3,
        "notes": "",
    }
    # Drop exactly one dimension so the parser has to notice its absence.
    del payload["instruction_following"]
    serialized = json.dumps(payload)

    with pytest.raises(ValueError, match="missing dimension"):
        parse_judge_response(serialized)
|
|
|
|
|
|
def test_parse_judge_response_rejects_non_numeric():
    """A string where a score belongs raises ValueError mentioning 'not numeric'."""
    payload = {
        "accuracy": "high",
        "context_awareness": 3,
        "artifact_trail": 3,
        "completeness": 3,
        "continuity": 3,
        "instruction_following": 3,
        "notes": "",
    }
    serialized = json.dumps(payload)

    with pytest.raises(ValueError, match="not numeric"):
        parse_judge_response(serialized)
|
|
|
|
|
|
def test_parse_judge_response_rejects_booleans_as_numeric():
    """A JSON boolean score raises ValueError mentioning 'not numeric'.

    Booleans would otherwise coerce to integers, so the parser is expected
    to reject them explicitly.
    """
    payload = {
        "accuracy": True,
        "context_awareness": 3,
        "artifact_trail": 3,
        "completeness": 3,
        "continuity": 3,
        "instruction_following": 3,
        "notes": "",
    }
    serialized = json.dumps(payload)

    with pytest.raises(ValueError, match="not numeric"):
        parse_judge_response(serialized)
|
|
|
|
|
|
def test_parse_judge_response_rejects_empty():
    """An empty response string raises ValueError mentioning 'empty'."""
    blank_response = ""
    with pytest.raises(ValueError, match="empty"):
        parse_judge_response(blank_response)
|
|
|
|
|
|
def test_parse_judge_response_rejects_no_json():
    """Text without any braces raises ValueError mentioning 'no JSON object'."""
    prose_only = "just some prose with no braces at all"
    with pytest.raises(ValueError, match="no JSON object"):
        parse_judge_response(prose_only)
|
|
|
|
|
|
def test_parse_judge_response_rejects_malformed_json():
    """A brace-delimited but invalid payload raises ValueError mentioning 'not valid JSON'."""
    # Unquoted key plus a trailing comma: looks like an object, is not JSON.
    broken = "{accuracy: 3,}"
    with pytest.raises(ValueError, match="not valid JSON"):
        parse_judge_response(broken)
|
|
|
|
|
|
def test_parse_judge_response_truncates_long_notes():
    """A 500-character notes field comes back cut to exactly 200 characters."""
    payload = {
        "accuracy": 3,
        "context_awareness": 3,
        "artifact_trail": 3,
        "completeness": 3,
        "continuity": 3,
        "instruction_following": 3,
        "notes": "x" * 500,
    }

    parsed = parse_judge_response(json.dumps(payload))

    assert len(parsed["notes"]) == 200
|
|
|
|
|
|
# ---------- rubric.build_judge_prompt ----------
|
|
|
|
|
|
def test_build_judge_prompt_mentions_all_dimensions():
    """Every rubric dimension name appears somewhere in the judge prompt."""
    prompt_kwargs = {
        "probe_question": "What files were modified?",
        "probe_type": "artifact",
        "expected_facts": ["foo.py", "bar.py"],
        "assistant_answer": "I modified foo.py.",
    }

    rendered = build_judge_prompt(**prompt_kwargs)

    absent = [dim for dim in DIMENSIONS if dim not in rendered]
    assert not absent
|
|
|
|
|
|
def test_build_judge_prompt_includes_expected_facts():
    """Each expected fact is echoed verbatim in the judge prompt."""
    facts = ["specific_file.py", "another_file.py"]

    rendered = build_judge_prompt(
        probe_question="What files were modified?",
        probe_type="artifact",
        expected_facts=facts,
        assistant_answer="n/a",
    )

    assert "specific_file.py" in rendered
    assert "another_file.py" in rendered
|
|
|
|
|
|
def test_build_judge_prompt_handles_empty_expected_facts():
    """With no expected facts, the prompt shows the '(none provided)' placeholder."""
    rendered = build_judge_prompt(
        probe_question="anything?",
        probe_type="recall",
        expected_facts=[],
        assistant_answer="nope",
    )
    placeholder = "(none provided)"
    assert placeholder in rendered
|
|
|
|
|
|
def test_build_judge_prompt_includes_all_score_scale_levels():
    """Every level in SCORE_SCALE is listed in the prompt as ' <level>:'."""
    rendered = build_judge_prompt(
        probe_question="q",
        probe_type="recall",
        expected_facts=[],
        assistant_answer="a",
    )

    unlisted = [level for level in SCORE_SCALE if f" {level}:" not in rendered]
    assert unlisted == []
|
|
|
|
|
|
# ---------- report.summarize_fixture_runs ----------
|
|
|
|
|
|
def _fake_run(fixture_name: str, run_index: int, probe_scores: dict) -> dict:
|
|
"""Build a synthetic per-run payload for summariser tests."""
|
|
probes = []
|
|
for pid, per_dim in probe_scores.items():
|
|
overall = sum(per_dim.values()) / len(per_dim)
|
|
probes.append({
|
|
"id": pid,
|
|
"type": "recall",
|
|
"question": "q",
|
|
"expected_facts": [],
|
|
"answer": "a",
|
|
"scores": per_dim,
|
|
"overall": overall,
|
|
"notes": f"note-run{run_index}",
|
|
"parse_error": None,
|
|
"elapsed_seconds": 0.1,
|
|
})
|
|
return {
|
|
"fixture_name": fixture_name,
|
|
"run_index": run_index,
|
|
"compression": {
|
|
"pre_tokens": 10000,
|
|
"post_tokens": 5000,
|
|
"compression_ratio": 0.5,
|
|
"pre_message_count": 50,
|
|
"post_message_count": 25,
|
|
"summary_text": "## Active Task\n...",
|
|
},
|
|
"probes": probes,
|
|
}
|
|
|
|
|
|
def _all_dims(value: int) -> dict:
    """Return a score dict that assigns ``value`` to every rubric dimension."""
    return dict.fromkeys(DIMENSIONS, value)
|
|
|
|
|
|
def test_summarize_handles_single_run():
    """One run with two probes summarises to per-dimension medians and no misses."""
    probe_scores = {"p1": _all_dims(4), "p2": _all_dims(3)}
    summary = summarize_fixture_runs([_fake_run("fx1", 1, probe_scores)])

    assert summary["fixture_name"] == "fx1"
    assert summary["runs"] == 1
    # Median of the two probe scores {4, 3} is 3.5 on every dimension.
    medians = summary["dimension_medians"]
    for dim in DIMENSIONS:
        assert abs(medians[dim] - 3.5) < 1e-9
    # Both probes have overall >= 3.0, so nothing is flagged as a miss.
    assert summary["misses"] == []
|
|
|
|
|
|
def test_summarize_flags_misses_below_three():
    """A probe scoring 2 everywhere is reported as a miss; one scoring 4 is not."""
    probe_scores = {"p_good": _all_dims(4), "p_bad": _all_dims(2)}
    summary = summarize_fixture_runs([_fake_run("fx1", 1, probe_scores)])

    flagged_ids = {miss["id"] for miss in summary["misses"]}
    assert "p_bad" in flagged_ids
    assert "p_good" not in flagged_ids

    bad_entry = next(miss for miss in summary["misses"] if miss["id"] == "p_bad")
    assert bad_entry["overall_median"] == 2.0
    assert bad_entry["notes"] == "note-run1"
|
|
|
|
|
|
def test_summarize_medians_across_runs():
    """Three runs scoring 2, 4 and 5 on one probe give a median of 4 per dimension."""
    per_run_score = (2, 4, 5)
    runs = [
        _fake_run("fx1", index, {"p": _all_dims(score)})
        for index, score in enumerate(per_run_score, start=1)
    ]

    summary = summarize_fixture_runs(runs)

    for dim in DIMENSIONS:
        assert summary["dimension_medians"][dim] == 4.0
    assert summary["runs"] == 3
|
|
|
|
|
|
def test_summarize_empty_input():
    """Summarising an empty run list returns an empty dict."""
    no_runs = []
    summary = summarize_fixture_runs(no_runs)
    assert summary == {}
|
|
|
|
|
|
# ---------- report.render_report ----------
|
|
|
|
|
|
def test_render_report_renders_all_fixtures():
    """The report names the fixture, the model, every dimension, and the methodology footer."""
    summary = summarize_fixture_runs(
        [_fake_run("feature-impl", 1, {"p1": _all_dims(4)})]
    )

    report_md = render_report(
        label="test",
        compressor_model="modelA",
        judge_model="modelA",
        runs_per_fixture=1,
        summaries=[summary],
    )

    assert "feature-impl" in report_md
    assert "modelA" in report_md
    for dim in DIMENSIONS:
        assert dim in report_md
    # The methodology footer and its source link must be present.
    assert "Methodology" in report_md
    assert "factory.ai" in report_md
|
|
|
|
|
|
def test_render_report_shows_deltas_when_baseline_provided():
    """Passing baseline summaries makes the report show signed score deltas."""
    baseline_summary = summarize_fixture_runs(
        [_fake_run("fx", 1, {"p1": _all_dims(3)})]
    )
    current_summary = summarize_fixture_runs(
        [_fake_run("fx", 1, {"p1": _all_dims(4)})]
    )

    report_md = render_report(
        label="test",
        compressor_model="m",
        judge_model="m",
        runs_per_fixture=1,
        summaries=[current_summary],
        baseline_summaries=[baseline_summary],
    )

    # Every dimension improved from 3 to 4, i.e. by +1.
    assert "+1.00" in report_md
    assert "Deltas shown against baseline" in report_md
|
|
|
|
|
|
def test_render_report_lists_misses_section():
    """The misses section lists the low-scoring probe id and omits the passing one."""
    probe_scores = {"good": _all_dims(4), "bad": _all_dims(1)}
    summary = summarize_fixture_runs([_fake_run("fx", 1, probe_scores)])

    report_md = render_report(
        label="t",
        compressor_model="m",
        judge_model="m",
        runs_per_fixture=1,
        summaries=[summary],
    )

    assert "Probes scoring below 3.0" in report_md
    assert "`bad`" in report_md
    assert "`good`" not in report_md
|
|
|
|
|
|
def test_render_report_no_misses_section_when_all_pass():
    """When every probe passes, the misses heading is left out of the report."""
    summary = summarize_fixture_runs([_fake_run("fx", 1, {"p": _all_dims(5)})])

    report_md = render_report(
        label="t",
        compressor_model="m",
        judge_model="m",
        runs_per_fixture=1,
        summaries=[summary],
    )

    assert "Probes scoring below 3.0" not in report_md
|
|
|
|
|
|
def test_render_report_compression_table():
    """The report includes the compression table with token counts and a percent ratio."""
    summary = summarize_fixture_runs([_fake_run("fx", 1, {"p": _all_dims(4)})])

    report_md = render_report(
        label="t",
        compressor_model="m",
        judge_model="m",
        runs_per_fixture=1,
        summaries=[summary],
    )

    assert "Pre tokens" in report_md
    # 10000 is the pre_tokens value hard-coded in _fake_run's compression stats.
    assert "10000" in report_md
    # The 0.5 compression ratio is rendered as a percentage.
    assert "50.0%" in report_md
|
|
|
|
|
|
# ---------- report.write_run_json ----------
|
|
|
|
|
|
def test_write_run_json_roundtrip(tmp_path):
    """write_run_json creates '<fixture>-run-<n>.json' whose contents load back as JSON."""
    run_payload = _fake_run("fx1", 2, {"p": _all_dims(4)})

    written = write_run_json(
        results_dir=tmp_path,
        fixture_name="fx1",
        run_index=2,
        payload=run_payload,
    )

    assert written.exists()
    assert written.name == "fx1-run-2.json"

    reloaded = json.loads(written.read_text())
    assert reloaded["fixture_name"] == "fx1"
    assert reloaded["run_index"] == 2
|
|
|
|
|
|
# ---------- fixture + probe sanity ----------
|
|
|
|
|
|
# Root of the eval harness; the fixtures/ and probes/ directories are read from here.
_EVAL_DIR = Path(__file__).resolve().parents[2].joinpath("scripts", "compression_eval")
|
|
|
|
|
|
@pytest.mark.parametrize("fixture_name", [
    "feature-impl-context-priority",
    "debug-session-feishu-id-model",
    "config-build-competitive-scouts",
])
def test_fixture_loads_and_is_well_formed(fixture_name):
    """Each checked-in fixture exists, parses as JSON, and has a sane message list.

    The file is decoded explicitly as UTF-8 so the outcome does not depend on
    the platform's default text encoding.
    """
    path = _EVAL_DIR / "fixtures" / f"{fixture_name}.json"
    assert path.exists(), f"fixture missing: {path}"

    fx = json.loads(path.read_text(encoding="utf-8"))

    assert fx["name"] == fixture_name
    messages = fx["messages"]
    assert isinstance(messages, list) and len(messages) > 10
    assert messages[0]["role"] == "system"
    # At least one user message and one assistant message must be present.
    roles = {message["role"] for message in messages}
    assert "user" in roles and "assistant" in roles
|
|
|
|
|
|
@pytest.mark.parametrize("fixture_name", [
    "feature-impl-context-priority",
    "debug-session-feishu-id-model",
    "config-build-competitive-scouts",
])
def test_probes_have_all_four_types(fixture_name):
    """Each probe bank is structurally valid and covers all four probe types.

    The file is decoded explicitly as UTF-8. Per-probe structure is checked
    before the type coverage so that a probe missing its ``type`` key fails
    with an assertion rather than a KeyError.
    """
    path = _EVAL_DIR / "probes" / f"{fixture_name}.probes.json"
    assert path.exists(), f"probe bank missing: {path}"

    pb = json.loads(path.read_text(encoding="utf-8"))
    assert pb["fixture"] == fixture_name

    # Every probe carries expected_facts (possibly an empty list, but present).
    for probe in pb["probes"]:
        assert "id" in probe and "question" in probe and "type" in probe
        assert "expected_facts" in probe and isinstance(probe["expected_facts"], list)

    types = {probe["type"] for probe in pb["probes"]}
    assert types == {"recall", "artifact", "continuation", "decision"}, (
        f"{fixture_name} probe bank missing at least one probe type; got {types}"
    )
|
|
|
|
|
|
def test_fixtures_do_not_leak_maintainer_pii():
    """Smoke test that the scrubber actually ran on the checked-in fixtures.

    This is a belt-and-suspenders check intended to catch a maintainer handle
    or a personal email address in fixture messages before it lands. Only the
    ``messages`` of each fixture are scanned; addresses at the placeholder
    domain ``example.com`` are allowed.
    """
    import re

    # Compiled once, outside the per-fixture loop.
    personal_email_re = re.compile(
        r"[A-Za-z0-9._%+-]+@(?!example\.com)[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
    )

    # Sorted so that the first reported failure is deterministic.
    fixture_paths = sorted((_EVAL_DIR / "fixtures").glob("*.json"))
    # Without this guard the test would pass vacuously if no fixture matched.
    assert fixture_paths, f"no fixtures found under {_EVAL_DIR / 'fixtures'}"

    for fixture_path in fixture_paths:
        data = json.loads(fixture_path.read_text(encoding="utf-8"))
        # The scrubbing_passes metadata intentionally documents what was
        # replaced. Ignore the metadata block and only scan the messages.
        msg_text = json.dumps(data["messages"])

        assert "teknium" not in msg_text.lower(), (
            f"{fixture_path.name}: maintainer handle leaked into messages"
        )

        # No personal-email domains (placeholder @example.com is allowed).
        personal_emails = personal_email_re.findall(msg_text)
        assert personal_emails == [], (
            f"{fixture_path.name}: personal email(s) leaked: {personal_emails}"
        )
|