"""Rubric for probe-based compression eval grading. Six dimensions scored 0-5 by a judge model. The scoring anchors are spelled out so the judge interpretation is stable across runs and across judge models. Adapted from the methodology in https://factory.ai/news/evaluating-compression. Their scoreboard is not adopted; only the dimension definitions and the 0-5 scale. """ from __future__ import annotations from typing import Any, Dict, List # Canonical dimension order. All reports, parsers, and comparisons derive # from this list — do not hardcode the order elsewhere. DIMENSIONS: List[str] = [ "accuracy", "context_awareness", "artifact_trail", "completeness", "continuity", "instruction_following", ] DIMENSION_DESCRIPTIONS: Dict[str, str] = { "accuracy": ( "Are concrete facts correct — file paths, function names, PR/issue " "numbers, error codes, command outputs, line numbers? A single wrong " "path or error code should cost points. Vague but non-contradicting " "answers score mid-range." ), "context_awareness": ( "Does the answer reflect the CURRENT state of the session, not a " "mid-session snapshot? For example, if a file was modified then " "reverted, does the answer describe the reverted state? If three " "PRs were opened, does the answer know which was merged?" ), "artifact_trail": ( "Does the answer correctly enumerate the artifacts (files read, " "files modified, commands run, tools called, PRs opened, cron jobs " "created)? Missing artifacts cost more than extra unrelated ones." ), "completeness": ( "Does the answer address ALL parts of the probe question? If the " "probe asks for three things and only two are answered, that is " "incomplete regardless of accuracy on the two." ), "continuity": ( "Could the next assistant continue the work using only this answer, " "without having to re-fetch files or re-explore the codebase? An " "answer that lists files by name but doesn't mention the change is " "poor continuity even if accurate." ), "instruction_following": ( "Is the answer in the format the probe requested (list, number, " "short phrase, yes/no)? Ignore tone and length, only assess " "whether the requested form was honoured." ), } SCORE_SCALE: Dict[int, str] = { 0: "No useful information; wrong or hallucinated.", 1: "Major gaps or a key fact is wrong.", 2: "Partially correct but significant omissions.", 3: "Mostly correct with minor omissions or imprecision.", 4: "Correct and complete with only trivial imprecision.", 5: "Fully correct, complete, and in the requested format.", } _RUBRIC_HEADER = """You are an evaluator grading a single answer produced by an AI assistant \ that was given a COMPRESSED handoff summary of an earlier conversation and \ asked a probe question. You are NOT evaluating the compression summary \ directly — you are evaluating whether the answer the assistant produced \ from that summary is correct, complete, and useful. Grade on six dimensions, each 0-5: {dimension_block} 0-5 scale: {scale_block} Grade strictly. Fractional scores are NOT allowed — output integers only. \ If the answer is ambiguous, use the lower of the two candidate scores.""" def build_judge_prompt( *, probe_question: str, probe_type: str, expected_facts: List[str], assistant_answer: str, ) -> str: """Build the full judge prompt for one (probe, answer) pair. The judge is told the expected_facts up front so grading is anchored to concrete signal rather than judge taste. Expected facts are intentionally NOT shown to the assistant that produces the answer. """ dim_block = "\n".join( f"- {d}: {DIMENSION_DESCRIPTIONS[d]}" for d in DIMENSIONS ) scale_block = "\n".join( f" {score}: {desc}" for score, desc in sorted(SCORE_SCALE.items()) ) header = _RUBRIC_HEADER.format( dimension_block=dim_block, scale_block=scale_block, ) expected_block = ( "\n".join(f"- {f}" for f in expected_facts) if expected_facts else "(none provided)" ) output_schema = ( "Respond with ONLY a JSON object, no prose before or after, matching " "this schema exactly:\n" "{\n" ' "accuracy": ,\n' ' "context_awareness": ,\n' ' "artifact_trail": ,\n' ' "completeness": ,\n' ' "continuity": ,\n' ' "instruction_following": ,\n' ' "notes": ""\n' "}" ) return ( f"{header}\n\n" f"PROBE TYPE: {probe_type}\n\n" f"PROBE QUESTION:\n{probe_question}\n\n" f"EXPECTED FACTS (the answer should contain these concrete anchors; " f"missing any is a material defect in accuracy and/or completeness):\n" f"{expected_block}\n\n" f"ASSISTANT ANSWER TO GRADE:\n{assistant_answer}\n\n" f"{output_schema}" ) def parse_judge_response(raw: str) -> Dict[str, Any]: """Parse the judge model's JSON response into a score dict. Tolerates surrounding prose (judges ignore instructions sometimes) by extracting the first {...} block. Validates that every dimension is present as an integer 0-5. Returns dict with keys: scores (dim->int), notes (str), overall (float). Raises ValueError if the response cannot be parsed into a complete score set. """ import json import re if not raw or not raw.strip(): raise ValueError("empty judge response") # Strip code fences and any ```json prefix judges sometimes emit. stripped = raw.strip() fence_match = re.match(r"^```(?:json)?\s*(.*?)\s*```$", stripped, re.DOTALL) if fence_match: stripped = fence_match.group(1).strip() # Extract the first {...} block greedy-to-matching-brace. brace_match = re.search(r"\{.*\}", stripped, re.DOTALL) if not brace_match: raise ValueError(f"no JSON object found in judge response: {raw[:200]!r}") candidate = brace_match.group(0) try: parsed = json.loads(candidate) except json.JSONDecodeError as exc: raise ValueError(f"judge response not valid JSON: {exc}; raw={candidate[:200]!r}") scores: Dict[str, int] = {} for dim in DIMENSIONS: if dim not in parsed: raise ValueError(f"judge response missing dimension {dim!r}: {parsed}") value = parsed[dim] if isinstance(value, bool) or not isinstance(value, (int, float)): raise ValueError(f"dimension {dim} is not numeric: {value!r}") int_val = int(round(value)) if int_val < 0 or int_val > 5: raise ValueError(f"dimension {dim} out of range: {int_val}") scores[dim] = int_val notes_val = parsed.get("notes", "") notes = str(notes_val)[:200] if notes_val else "" overall = sum(scores.values()) / len(scores) return { "scores": scores, "notes": notes, "overall": overall, }