perf(tui): profile harness gains --loop, --save, --compare

Before: change code → build → run profile → manually compare to
mental model of last run.  After: `--loop` watches ui-tui/src and
packages/hermes-ink/src for .ts(x) changes, rebuilds on change,
re-runs the same scenario, prints a side-by-side A/B diff against
the previous iteration — so each edit's impact is quantified
instantly.  Ctrl+C to stop.

Also added:
  --save LABEL     saves metrics snapshot to /tmp/perf-<LABEL>.json
  --compare LABEL  diffs the current run vs that snapshot
  --extra-flag X   pass-through to node dist/entry.js (prepping for
                   --no-fullscreen below)
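
Note that argparse treats a bare option-like value as another flag, so
forwarded flags that themselves start with dashes need the `=` form,
e.g. (assuming the TUI accepts a --no-fullscreen flag):

  scripts/profile-tui.py --seconds 6 --extra-flag=--no-fullscreen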

key_metrics() flattens a full run into scalar numbers across
frames, React commits, and per-phase timings.  format_diff() prints
a table with ↑/↓ markers denoting regressions vs improvements based
on whether the metric is lower-is-better (p99, max, patches, drain)
or higher-is-better (fps, gaps_under_16ms).
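
For orientation, this is how the two compose in a manual A/B check
(a minimal sketch; run_a / run_b stand in for two summarize() outputs):

  before = key_metrics(run_a)
  after = key_metrics(run_b)
  print(format_diff(before, after))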

Run-to-run noise on unchanged code is ~5-15% on most metrics, so a
big signal (>30% change on renderer_p99 / fps) cuts through cleanly.
Useful both for validating a single fix and for detecting subtle
regressions during the wheel-accel port.
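
A sketch of how that noise floor could be applied mechanically (a
hypothetical helper, not part of this commit):

  def significant(before: dict[str, float], after: dict[str, float],
                  floor: float = 0.15) -> dict[str, tuple[float, float]]:
      # Keep only metrics whose relative change clears the noise floor.
      return {
          k: (before[k], after[k])
          for k in set(before) & set(after)
          if before[k] and abs(after[k] / before[k] - 1) > floor
      }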

Usage during the next perf session:

  # one-shot with a baseline for later comparison
  scripts/profile-tui.py --seconds 6 --hold wheel_up --save pre-accel

  # after porting the wheel handler
  scripts/profile-tui.py --seconds 6 --hold wheel_up --compare pre-accel

  # continuous iteration
  scripts/profile-tui.py --seconds 6 --hold wheel_up --loop

@@ -274,21 +274,125 @@ def format_report(data: dict[str, Any]) -> str:
return "\n".join(out)

def key_metrics(data: dict[str, Any]) -> dict[str, float]:
    """Flatten the report into a dict of scalar metrics for A/B diffing."""
    metrics: dict[str, float] = {}
    frames = data.get("frame") or []
    react = data.get("react") or []
    if frames:
        durs = [f["durationMs"] for f in frames]
        metrics["frames"] = len(frames)
        metrics["dur_p50"] = pct(durs, 0.50)
        metrics["dur_p95"] = pct(durs, 0.95)
        metrics["dur_p99"] = pct(durs, 0.99)
        metrics["dur_max"] = max(durs)
        ts = sorted(f["ts"] for f in frames)
        if len(ts) >= 2:
            elapsed = (ts[-1] - ts[0]) / 1000.0
            metrics["fps_throughput"] = len(frames) / elapsed if elapsed > 0 else 0.0
            # Interframe gaps distribution — complementary view to throughput:
            gaps = [ts[i] - ts[i - 1] for i in range(1, len(ts))]
            if gaps:
                metrics["gap_p50_ms"] = pct(gaps, 0.50)
                metrics["gap_p99_ms"] = pct(gaps, 0.99)
                metrics["gaps_under_16ms"] = sum(1 for g in gaps if g < 16)
                metrics["gaps_over_200ms"] = sum(1 for g in gaps if g >= 200)
        for phase in ("renderer", "yoga", "diff", "write"):
            vals = [f["phases"][phase] for f in frames if f.get("phases")]
            if vals:
                metrics[f"{phase}_p99"] = pct(vals, 0.99)
                metrics[f"{phase}_max"] = max(vals)
        patches = [f["phases"]["patches"] for f in frames if f.get("phases")]
        if patches:
            metrics["patches_total"] = sum(patches)
            metrics["patches_p99"] = pct(patches, 0.99)
        optimized = [
            f["phases"].get("optimizedPatches", 0) for f in frames if f.get("phases")
        ]
        if any(optimized):
            metrics["optimized_total"] = sum(optimized)
        bytes_list = [
            f["phases"].get("writeBytes", 0) for f in frames if f.get("phases")
        ]
        if any(bytes_list):
            metrics["writeBytes_total"] = sum(bytes_list)
        drains = [
            f["phases"].get("prevFrameDrainMs", 0)
            for f in frames if f.get("phases")
        ]
        drain_nonzero = [d for d in drains if d > 0]
        if drain_nonzero:
            metrics["drain_p99"] = pct(drain_nonzero, 0.99)
            metrics["drain_max"] = max(drain_nonzero)
        bp = sum(1 for f in frames if f.get("phases", {}).get("backpressure"))
        metrics["backpressure_frames"] = bp
    if react:
        for pid in set(e["id"] for e in react):
            ms = [e["actualMs"] for e in react if e["id"] == pid]
            metrics[f"react_{pid}_p99"] = pct(ms, 0.99)
            metrics[f"react_{pid}_max"] = max(ms)
    return metrics

def format_diff(before: dict[str, float], after: dict[str, float]) -> str:
    """Render a side-by-side A/B comparison table."""
    keys = sorted(set(before) | set(after))
    lines = [f"{'metric':<28} {'before':>12} {'after':>12} {'delta':>12} {'%':>6}"]
    lines.append("─" * 76)
    for k in keys:
        b = before.get(k, 0.0)
        a = after.get(k, 0.0)
        d = a - b
        pct_change = ((a / b) - 1) * 100 if b not in (0, 0.0) else float("inf") if a else 0
        # Flag improvements vs regressions. For _p99 / _max / _total / gaps_over /
        # patches / writeBytes / backpressure, LOWER is better. For fps / gaps_under,
        # HIGHER is better.
        lower_is_better = any(
            token in k
            for token in (
                "p50",
                "p95",
                "p99",
                "_max",
                "_total",
                "gaps_over",
                "backpressure",
                "drain",
            )
        )
        higher_is_better = "fps_" in k or "gaps_under" in k
        mark = ""
        if d and not (lower_is_better or higher_is_better):
            mark = "·"
        elif d < 0 and lower_is_better:
            mark = "↓"  # improvement
        elif d > 0 and higher_is_better:
            mark = "↑"  # improvement
        elif d > 0 and lower_is_better:
            mark = "↑"  # regression
        elif d < 0 and higher_is_better:
            mark = "↓"  # regression
        pct_str = "∞" if pct_change == float("inf") else f"{pct_change:+6.1f}%"
        lines.append(
            f"{k:<28} {b:>12.2f} {a:>12.2f} {d:>+12.2f} {pct_str} {mark}"
        )
    return "\n".join(lines)

def run_once(args: argparse.Namespace) -> dict[str, Any]:
    tui_dir = Path(args.tui_dir).resolve()
    entry = tui_dir / "dist" / "entry.js"
    if not entry.exists():
@@ -313,18 +417,17 @@ def main() -> int:
env["COLUMNS"] = str(args.cols)
env["LINES"] = str(args.rows)
env["TERM"] = env.get("TERM", "xterm-256color")
# Ensure bracketed-paste doesn't intercept our PageUp writes.
# Pass through extra flags the TUI wrapper recognizes (e.g. --no-fullscreen).
# Stored on args as `extra_flags` list.
node = os.environ.get("HERMES_PERF_NODE", "node")
node_args = [node, str(entry), *getattr(args, "extra_flags", [])]
# Fork under a PTY so the TUI enters alt-screen / raw-mode cleanly.
pid, fd = pty.fork()
if pid == 0:
# Child: exec node. PTY makes stdin/stdout/stderr all TTY.
        os.execvpe(node, node_args, env)
    try:
        # Set initial PTY size via ioctl (TIOCSWINSZ).
        import fcntl, struct, termios
        winsize = struct.pack("HHHH", args.rows, args.cols, 0, 0)
        fcntl.ioctl(fd, termios.TIOCSWINSZ, winsize)
@@ -337,10 +440,8 @@ def main() -> int:
        sent = hold_key(fd, KEYS[args.hold], args.seconds, args.rate)
        print(f" sent {sent} keystrokes")
        # Small cooldown so trailing frames get written to the log.
        drain(fd, 0.5)
    finally:
        # Kill TUI cleanly. SIGTERM first, SIGKILL if stubborn.
        try:
            os.kill(pid, signal.SIGTERM)
            for _ in range(10):
@@ -358,17 +459,157 @@ def main() -> int:
        except OSError:
            pass
    # Give the log a moment to flush.
    time.sleep(0.2)
    data = summarize(log, since_ms)
    return data

def main() -> int:
    p = argparse.ArgumentParser()
    p.add_argument("--session", help="session id to resume (default: longest in db)")
    p.add_argument("--hold", default="page_up", choices=sorted(KEYS.keys()), help="key to hold")
    p.add_argument("--seconds", type=float, default=8.0, help="how long to hold the key")
    p.add_argument("--rate", type=int, default=30, help="keystrokes per second")
    p.add_argument("--warmup", type=float, default=3.0, help="seconds to wait after launch before input")
    p.add_argument("--threshold-ms", type=float, default=0.0, help="HERMES_DEV_PERF_MS (0 = capture all)")
    p.add_argument("--cols", type=int, default=120)
    p.add_argument("--rows", type=int, default=40)
    p.add_argument("--keep-log", action="store_true", help="don't wipe perf.log before run")
    p.add_argument("--tui-dir", default=str(DEFAULT_TUI_DIR))
    p.add_argument("--log", default=str(DEFAULT_LOG))
    p.add_argument("--save", metavar="LABEL",
                   help="save the final metrics as /tmp/perf-<LABEL>.json for later --compare")
    p.add_argument("--compare", metavar="LABEL",
                   help="diff against /tmp/perf-<LABEL>.json after running")
    p.add_argument("--loop", action="store_true",
                   help="watch for source changes, rebuild, rerun, and diff vs previous run")
    p.add_argument("--extra-flag", dest="extra_flags", action="append", default=[],
                   help="pass through to node dist/entry.js (repeatable)")
    args = p.parse_args()
    if args.loop:
        return loop_mode(args)
    # Single-shot path.
    data = run_once(args)
    print()
    print(format_report(data))
    metrics = key_metrics(data)
    if args.save:
        path = Path(f"/tmp/perf-{args.save}.json")
        path.write_text(json.dumps(metrics, indent=2))
        print(f"\n• saved: {path}")
    if args.compare:
        path = Path(f"/tmp/perf-{args.compare}.json")
        if not path.exists():
            print(f"\n⚠ no baseline at {path} — run with --save {args.compare} first")
        else:
            before = json.loads(path.read_text())
            print(f"\n═══ A/B diff vs /tmp/perf-{args.compare}.json ═══")
            print(format_diff(before, metrics))
    if not data["react"] and not data["frame"]:
        return 2
    return 0

def loop_mode(args: argparse.Namespace) -> int:
    """Watch source files, rebuild, rerun, print A/B diff against previous run.

    Keeps a rolling 'previous run' baseline in memory so each iteration
    reports delta vs the last one — visibility into whether the last
    edit moved the needle. Press Ctrl+C to stop.
    """
    import subprocess

    tui_dir = Path(args.tui_dir).resolve()
    src_root = tui_dir / "src"
    pkg_root = tui_dir / "packages" / "hermes-ink" / "src"

    def collect_mtimes() -> dict[str, float]:
        mtimes: dict[str, float] = {}
        for root in (src_root, pkg_root):
            if not root.exists():
                continue
            for path in root.rglob("*"):
                if path.suffix in {".ts", ".tsx"} and "__tests__" not in str(path):
                    try:
                        mtimes[str(path)] = path.stat().st_mtime
                    except OSError:
                        pass
        return mtimes

    previous_metrics: dict[str, float] | None = None
    previous_mtimes = collect_mtimes()
    iteration = 0
    print(f"• loop mode — watching {src_root} + {pkg_root} for *.ts(x) changes")
    print("• edit any TS file, the harness rebuilds + reruns automatically")
    print("• Ctrl+C to stop\n")
    try:
        while True:
            iteration += 1
            print(f"\n{'═' * 76}")
            print(f"Iteration {iteration} @ {time.strftime('%H:%M:%S')}")
            print("═" * 76)
            if iteration > 1:
                print("• rebuilding…")
                result = subprocess.run(
                    ["npm", "run", "build"],
                    cwd=tui_dir,
                    capture_output=True,
                    text=True,
                )
                if result.returncode != 0:
                    print("✗ build failed:")
                    print(result.stdout[-2000:])
                    print(result.stderr[-2000:])
                    print("\n• waiting for source changes to retry…")
                    previous_mtimes = wait_for_change(previous_mtimes, collect_mtimes)
                    continue
                print("✓ build ok")
            data = run_once(args)
            metrics = key_metrics(data)
            print()
            print(format_report(data))
            if previous_metrics is not None:
                print(f"\n═══ A/B diff vs iteration {iteration - 1} ═══")
                print(format_diff(previous_metrics, metrics))
            previous_metrics = metrics
            print("\n• waiting for source changes…")
            previous_mtimes = wait_for_change(previous_mtimes, collect_mtimes)
    except KeyboardInterrupt:
        print("\n• loop stopped")
        return 0

def wait_for_change(prev: dict[str, float], collect) -> dict[str, float]:
    """Poll every 1s until a watched file's mtime changes. Debounced 500ms."""
    while True:
        time.sleep(1)
        current = collect()
        changed = [
            path for path, mtime in current.items() if prev.get(path) != mtime
        ]
        if changed:
            print(f"{len(changed)} file(s) changed:")
            for path in changed[:5]:
                print(f" {path}")
            # Debounce — editor save bursts can take ~500ms to settle
            time.sleep(0.5)
            return collect()


if __name__ == "__main__":
    sys.exit(main())