hermes-agent/scripts/profile-tui.py
Brooklyn Nicholson 82f842277e perf(tui): profile harness gains --loop, --save, --compare
Before: change code → build → run profile → manually compare to
mental model of last run.  After: `--loop` watches ui-tui/src and
packages/hermes-ink/src for .ts(x) changes, rebuilds on change,
re-runs the same scenario, prints a side-by-side A/B diff against
the previous iteration — so each edit's impact is quantified
instantly.  Ctrl+C to stop.

Also added:
  --save LABEL     saves metrics snapshot to /tmp/perf-<LABEL>.json
  --compare LABEL  diffs the current run vs that snapshot
  --extra-flag X   pass-through to node dist/entry.js (prepping for
                   --no-fullscreen below)

key_metrics() flattens a full run into scalar numbers across
frames, React commits, and per-phase timings.  format_diff() prints
a table with ↑/↓ markers denoting regressions vs improvements based
on whether the metric is lower-is-better (p99, max, patches, drain)
or higher-is-better (fps, gaps_under_16ms).
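
Since key_metrics() returns a flat name-to-number dict and --save just dumps
it as JSON, two saved snapshots can also be diffed outside the harness.
A minimal sketch (the post-accel label is hypothetical):

  import json
  from pathlib import Path

  before = json.loads(Path("/tmp/perf-pre-accel.json").read_text())
  after = json.loads(Path("/tmp/perf-post-accel.json").read_text())
  for k in sorted(set(before) | set(after)):
      b, a = before.get(k, 0.0), after.get(k, 0.0)
      print(f"{k:<28} {b:>12.2f} {a:>12.2f} {a - b:>+12.2f}")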

Run-to-run noise on static code is ~5-15% on most metrics — big
signal (>30% change on renderer_p99 / fps) cuts through cleanly.
Useful both for validating a single fix and for detecting subtle
regressions during the wheel-accel port.

Usage during the next perf session:

  # one-shot with a baseline for later comparison
  scripts/profile-tui.py --seconds 6 --hold wheel_up --save pre-accel

  # after porting the wheel handler
  scripts/profile-tui.py --seconds 6 --hold wheel_up --compare pre-accel

  # continuous iteration
  scripts/profile-tui.py --seconds 6 --hold wheel_up --loop
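
  # pass extra flags through to node dist/entry.js; use the `=` form so
  # argparse doesn't swallow a value that starts with "--" (--no-fullscreen
  # is the planned flag mentioned above)
  scripts/profile-tui.py --seconds 6 --hold wheel_up --extra-flag=--no-fullscreen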
2026-04-26 17:08:07 -05:00


#!/usr/bin/env python3
"""Drive the Hermes TUI under HERMES_DEV_PERF and summarize the pipeline.
Usage:
scripts/profile-tui.py [--session SID] [--hold KEY] [--seconds N] [--rate HZ]
Defaults: picks the session with the most messages, holds PageUp for 8s at
~30 Hz (matching xterm key-repeat), summarizes ~/.hermes/perf.log on exit.
The --tui build must exist (run `npm run build` in ui-tui first). This script
launches `node dist/entry.js` directly with HERMES_TUI_RESUME set so it
bypasses the hermes_cli wrapper — we want repeatable timing, not the CLI's
session-picker flow.
Environment overrides:
HERMES_PERF_LOG (default ~/.hermes/perf.log)
HERMES_PERF_NODE (default node from $PATH)
HERMES_TUI_DIR (default /home/bb/hermes-agent/ui-tui)
Exit code is 0 if the harness ran and parsed results, 2 if the TUI crashed
or produced no perf data (suggests HERMES_DEV_PERF wiring is broken).
"""
from __future__ import annotations
import argparse
import json
import os
import pty
import select
import signal
import sqlite3
import statistics
import sys
import time
from pathlib import Path
from typing import Any
DEFAULT_TUI_DIR = Path(os.environ.get("HERMES_TUI_DIR", "/home/bb/hermes-agent/ui-tui"))
DEFAULT_LOG = Path(os.environ.get("HERMES_PERF_LOG", str(Path.home() / ".hermes" / "perf.log")))
DEFAULT_STATE_DB = Path.home() / ".hermes" / "state.db"

# Keystroke escape sequences. Matches what xterm/VT220 send when the
# terminal has bracketed-paste disabled and the key-repeat handler fires.
KEYS = {
    "page_up": b"\x1b[5~",
    "page_down": b"\x1b[6~",
    "wheel_up": b"\x1b[M`!!",  # mouse wheel up (SGR-less) — best-effort
    "shift_up": b"\x1b[1;2A",
    "shift_down": b"\x1b[1;2B",
}


def pick_longest_session(db: Path) -> str:
    conn = sqlite3.connect(db)
    row = conn.execute(
        "SELECT id FROM sessions s ORDER BY "
        "(SELECT COUNT(*) FROM messages m WHERE m.session_id = s.id) DESC LIMIT 1"
    ).fetchone()
    if not row:
        sys.exit(f"no sessions in {db}")
    return row[0]


def drain(fd: int, timeout: float) -> bytes:
    """Read whatever's available from fd within `timeout`, then return.

    A timeout of 0 still does one non-blocking sweep of whatever is already
    buffered, so hold_key can call drain(fd, 0) to keep the PTY flowing.
    """
    chunks = []
    end = time.monotonic() + timeout
    while True:
        remaining = max(0.0, end - time.monotonic())
        r, _, _ = select.select([fd], [], [], remaining)
        if not r:
            break
        try:
            data = os.read(fd, 4096)
        except OSError:
            break
        if not data:
            break
        chunks.append(data)
    return b"".join(chunks)


def hold_key(fd: int, seq: bytes, seconds: float, rate_hz: int) -> int:
    """Write `seq` to fd at ~rate_hz for `seconds`. Returns keystrokes sent."""
    interval = 1.0 / max(1, rate_hz)
    end = time.monotonic() + seconds
    sent = 0
    while time.monotonic() < end:
        try:
            os.write(fd, seq)
            sent += 1
        except OSError:
            break
        # Drain stdout to keep the PTY buffer flowing; ignore content.
        drain(fd, 0)
        time.sleep(interval)
    return sent


def summarize(log: Path, since_ts_ms: int) -> dict[str, Any]:
    """Parse perf.log, keep only events newer than since_ts_ms, return stats."""
    react_events: list[dict[str, Any]] = []
    frame_events: list[dict[str, Any]] = []
    if not log.exists():
        return {"error": f"no log at {log}", "react": [], "frame": []}
    for line in log.read_text().splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            continue
        if int(row.get("ts", 0)) < since_ts_ms:
            continue
        src = row.get("src")
        if src == "react":
            react_events.append(row)
        elif src == "frame":
            frame_events.append(row)
    return {
        "react": react_events,
        "frame": frame_events,
    }


def pct(values: list[float], p: float) -> float:
    """Nearest-rank percentile for p in [0, 1]; returns 0.0 for empty input."""
    if not values:
        return 0.0
    s = sorted(values)
    idx = min(len(s) - 1, int(len(s) * p))
    return s[idx]


def format_report(data: dict[str, Any]) -> str:
    react = data.get("react") or []
    frames = data.get("frame") or []
    out = []
    out.append("═══ React Profiler ═══")
    if not react:
        out.append(" (no react events — HERMES_DEV_PERF wired? threshold too high?)")
    else:
        by_id: dict[str, list[float]] = {}
        for r in react:
            by_id.setdefault(r["id"], []).append(r["actualMs"])
        out.append(f" {'pane':<14} {'count':>6} {'p50':>8} {'p95':>8} {'p99':>8} {'max':>8}")
        for pid, ms in sorted(by_id.items(), key=lambda kv: -pct(kv[1], 0.99)):
            out.append(
                f" {pid:<14} {len(ms):>6} {pct(ms,0.50):>8.2f} {pct(ms,0.95):>8.2f} "
                f"{pct(ms,0.99):>8.2f} {max(ms):>8.2f}"
            )
    out.append("")
    out.append("═══ Ink pipeline ═══")
    if not frames:
        out.append(" (no frame events — onFrame wiring broken?)")
    else:
        dur = [f["durationMs"] for f in frames]
        phases_present = any(f.get("phases") for f in frames)
        out.append(f" frames captured: {len(frames)}")
        out.append(
            f" durationMs p50={pct(dur,0.50):.2f} p95={pct(dur,0.95):.2f} "
            f"p99={pct(dur,0.99):.2f} max={max(dur):.2f}"
        )
        # Effective FPS during the run: frames / elapsed seconds.
        ts = sorted(f["ts"] for f in frames)
        if len(ts) >= 2:
            elapsed_s = (ts[-1] - ts[0]) / 1000.0
            fps = len(frames) / elapsed_s if elapsed_s > 0 else float("inf")
            out.append(f" throughput: {len(frames)} frames / {elapsed_s:.2f}s = {fps:.1f} fps")
        if phases_present:
            fields = ["yoga", "renderer", "diff", "optimize", "write", "commit"]
            out.append("")
            out.append(f" {'phase':<10} {'p50':>8} {'p95':>8} {'p99':>8} {'max':>8} (ms)")
            for field in fields:
                vals = [f["phases"][field] for f in frames if f.get("phases")]
                if vals:
                    out.append(
                        f" {field:<10} {pct(vals,0.50):>8.2f} {pct(vals,0.95):>8.2f} "
                        f"{pct(vals,0.99):>8.2f} {max(vals):>8.2f}"
                    )
            # Derived: sum of phases vs durationMs (reveals hidden time).
            sum_ps = [
                sum(f["phases"][k] for k in fields)
                for f in frames if f.get("phases")
            ]
            if sum_ps:
                dur_match = [f["durationMs"] for f in frames if f.get("phases")]
                deltas = [d - s for d, s in zip(dur_match, sum_ps)]
                out.append(
                    f" {'dur-Σphases':<10} {pct(deltas,0.50):>8.2f} {pct(deltas,0.95):>8.2f} "
                    f"{pct(deltas,0.99):>8.2f} {max(deltas):>8.2f} (unaccounted-for time)"
                )
            # Yoga counters
            visited = [f["phases"]["yogaVisited"] for f in frames if f.get("phases")]
            measured = [f["phases"]["yogaMeasured"] for f in frames if f.get("phases")]
            cache_hits = [f["phases"]["yogaCacheHits"] for f in frames if f.get("phases")]
            live = [f["phases"]["yogaLive"] for f in frames if f.get("phases")]
            out.append("")
            out.append(" Yoga counters (per frame):")
            for name, vals in (
                ("visited", visited),
                ("measured", measured),
                ("cacheHits", cache_hits),
                ("live", live),
            ):
                if vals:
                    out.append(f" {name:<11} p50={pct(vals,0.5):.0f} p99={pct(vals,0.99):.0f} max={max(vals)}")
            # Patch counts — proxy for "how much changed each frame"
            patches = [f["phases"]["patches"] for f in frames if f.get("phases")]
            if patches:
                out.append(
                    f" patches p50={pct(patches,0.5):.0f} p99={pct(patches,0.99):.0f} "
                    f"max={max(patches)} total={sum(patches)}"
                )
            optimized = [
                f["phases"].get("optimizedPatches", 0)
                for f in frames if f.get("phases")
            ]
            if any(optimized):
                out.append(
                    f" optimized p50={pct(optimized,0.5):.0f} p99={pct(optimized,0.99):.0f} "
                    f"max={max(optimized)} total={sum(optimized)}"
                    f" (ratio: {sum(optimized)/max(1,sum(patches)):.2f})"
                )
            # Write bytes + drain telemetry — the outer-terminal bottleneck gauge.
            bytes_written = [
                f["phases"].get("writeBytes", 0)
                for f in frames if f.get("phases")
            ]
            if any(bytes_written):
                total_b = sum(bytes_written)
                kb = total_b / 1024
                out.append(
                    f" writeBytes p50={pct(bytes_written,0.5):.0f}B p99={pct(bytes_written,0.99):.0f}B "
                    f"max={max(bytes_written)}B total={kb:.1f}KB"
                )
            drains = [
                f["phases"].get("prevFrameDrainMs", 0)
                for f in frames if f.get("phases")
            ]
            if any(d > 0 for d in drains):
                nonzero = [d for d in drains if d > 0]
                out.append(
                    f" drainMs p50={pct(nonzero,0.5):.2f} p95={pct(nonzero,0.95):.2f} "
                    f"p99={pct(nonzero,0.99):.2f} max={max(nonzero):.2f} (terminal flush latency)"
                )
            backpressure = sum(1 for f in frames if f.get("phases", {}).get("backpressure"))
            if backpressure:
                out.append(
                    f" backpressure: {backpressure}/{len(frames)} frames "
                    f"({100*backpressure/len(frames):.0f}%) (Node stdout buffer full — terminal slow)"
                )
        # Flickers
        flicker_frames = [f for f in frames if f.get("flickers")]
        if flicker_frames:
            out.append("")
            out.append(f" ⚠ flickers detected in {len(flicker_frames)} frames")
            reasons: dict[str, int] = {}
            for f in flicker_frames:
                for fl in f["flickers"]:
                    reasons[fl["reason"]] = reasons.get(fl["reason"], 0) + 1
            for reason, n in sorted(reasons.items(), key=lambda kv: -kv[1]):
                out.append(f" {reason}: {n}")
    return "\n".join(out)


def key_metrics(data: dict[str, Any]) -> dict[str, float]:
    """Flatten the report into a dict of scalar metrics for A/B diffing."""
    metrics: dict[str, float] = {}
    frames = data.get("frame") or []
    react = data.get("react") or []
    if frames:
        durs = [f["durationMs"] for f in frames]
        metrics["frames"] = len(frames)
        metrics["dur_p50"] = pct(durs, 0.50)
        metrics["dur_p95"] = pct(durs, 0.95)
        metrics["dur_p99"] = pct(durs, 0.99)
        metrics["dur_max"] = max(durs)
        ts = sorted(f["ts"] for f in frames)
        if len(ts) >= 2:
            elapsed = (ts[-1] - ts[0]) / 1000.0
            metrics["fps_throughput"] = len(frames) / elapsed if elapsed > 0 else 0.0
            # Interframe gaps distribution — complementary view to throughput:
            gaps = [ts[i] - ts[i - 1] for i in range(1, len(ts))]
            if gaps:
                metrics["gap_p50_ms"] = pct(gaps, 0.50)
                metrics["gap_p99_ms"] = pct(gaps, 0.99)
                metrics["gaps_under_16ms"] = sum(1 for g in gaps if g < 16)
                metrics["gaps_over_200ms"] = sum(1 for g in gaps if g >= 200)
        for phase in ("renderer", "yoga", "diff", "write"):
            vals = [f["phases"][phase] for f in frames if f.get("phases")]
            if vals:
                metrics[f"{phase}_p99"] = pct(vals, 0.99)
                metrics[f"{phase}_max"] = max(vals)
        patches = [f["phases"]["patches"] for f in frames if f.get("phases")]
        if patches:
            metrics["patches_total"] = sum(patches)
            metrics["patches_p99"] = pct(patches, 0.99)
        optimized = [
            f["phases"].get("optimizedPatches", 0) for f in frames if f.get("phases")
        ]
        if any(optimized):
            metrics["optimized_total"] = sum(optimized)
        bytes_list = [
            f["phases"].get("writeBytes", 0) for f in frames if f.get("phases")
        ]
        if any(bytes_list):
            metrics["writeBytes_total"] = sum(bytes_list)
        drains = [
            f["phases"].get("prevFrameDrainMs", 0)
            for f in frames if f.get("phases")
        ]
        drain_nonzero = [d for d in drains if d > 0]
        if drain_nonzero:
            metrics["drain_p99"] = pct(drain_nonzero, 0.99)
            metrics["drain_max"] = max(drain_nonzero)
        bp = sum(1 for f in frames if f.get("phases", {}).get("backpressure"))
        metrics["backpressure_frames"] = bp
    if react:
        for pid in set(e["id"] for e in react):
            ms = [e["actualMs"] for e in react if e["id"] == pid]
            metrics[f"react_{pid}_p99"] = pct(ms, 0.99)
            metrics[f"react_{pid}_max"] = max(ms)
    return metrics


def format_diff(before: dict[str, float], after: dict[str, float]) -> str:
    """Render a side-by-side A/B comparison table."""
    keys = sorted(set(before) | set(after))
    lines = [f"{'metric':<28} {'before':>12} {'after':>12} {'delta':>12} {'%':>6}"]
    lines.append("─" * 76)
    for k in keys:
        b = before.get(k, 0.0)
        a = after.get(k, 0.0)
        d = a - b
        pct_change = ((a / b) - 1) * 100 if b not in (0, 0.0) else float("inf") if a else 0
        # Flag improvements vs regressions. For _p99 / _max / _total / gaps_over /
        # patches / writeBytes / backpressure, LOWER is better. For fps / gaps_under,
        # HIGHER is better.
        lower_is_better = any(
            token in k
            for token in (
                "p50",
                "p95",
                "p99",
                "_max",
                "_total",
                "gaps_over",
                "backpressure",
                "drain",
            )
        )
        higher_is_better = "fps_" in k or "gaps_under" in k
        mark = ""
        if d and not (lower_is_better or higher_is_better):
            mark = "↑" if d > 0 else "↓"
        elif d < 0 and lower_is_better:
            mark = "↓"
        elif d > 0 and higher_is_better:
            mark = "↑"
        elif d > 0 and lower_is_better:
            mark = "↑ ⚠"  # regression
        elif d < 0 and higher_is_better:
            mark = "↓ ⚠"  # regression
        pct_str = "" if pct_change == float("inf") else f"{pct_change:+6.1f}%"
        lines.append(
            f"{k:<28} {b:>12.2f} {a:>12.2f} {d:>+12.2f} {pct_str} {mark}"
        )
    return "\n".join(lines)


def run_once(args: argparse.Namespace) -> dict[str, Any]:
    """Launch the TUI in a PTY, drive it per args, and return parsed perf events."""
    tui_dir = Path(args.tui_dir).resolve()
    entry = tui_dir / "dist" / "entry.js"
    if not entry.exists():
        sys.exit(f"{entry} missing — run `npm run build` in {tui_dir} first")
    sid = args.session or pick_longest_session(DEFAULT_STATE_DB)
    print(f"• session: {sid}")
    print(f"• hold: {args.hold} x {args.rate}Hz for {args.seconds}s after {args.warmup}s warmup")
    print(f"• terminal: {args.cols}x{args.rows}")
    log = Path(args.log)
    if not args.keep_log and log.exists():
        log.unlink()
    since_ms = int(time.time() * 1000)
    env = os.environ.copy()
    env["HERMES_DEV_PERF"] = "1"
    env["HERMES_DEV_PERF_MS"] = str(args.threshold_ms)
    env["HERMES_DEV_PERF_LOG"] = str(log)
    env["HERMES_TUI_RESUME"] = sid
    env["COLUMNS"] = str(args.cols)
    env["LINES"] = str(args.rows)
    env["TERM"] = env.get("TERM", "xterm-256color")
    # Pass through extra flags the TUI wrapper recognizes (e.g. --no-fullscreen).
    # Stored on args as `extra_flags` list.
    node = os.environ.get("HERMES_PERF_NODE", "node")
    node_args = [node, str(entry), *getattr(args, "extra_flags", [])]
    pid, fd = pty.fork()
    if pid == 0:
        # Child: become the TUI process.
        os.execvpe(node, node_args, env)
    try:
        import fcntl, struct, termios
        winsize = struct.pack("HHHH", args.rows, args.cols, 0, 0)
        fcntl.ioctl(fd, termios.TIOCSWINSZ, winsize)
        print(f"• pid: {pid} fd: {fd}")
        print(f"• warmup {args.warmup}s (drain startup output)…")
        drain(fd, args.warmup)
        print(f"• holding {args.hold}")
        sent = hold_key(fd, KEYS[args.hold], args.seconds, args.rate)
        print(f" sent {sent} keystrokes")
        drain(fd, 0.5)
    finally:
        try:
            os.kill(pid, signal.SIGTERM)
            for _ in range(10):
                pid_done, _ = os.waitpid(pid, os.WNOHANG)
                if pid_done == pid:
                    break
                time.sleep(0.1)
            else:
                os.kill(pid, signal.SIGKILL)
                os.waitpid(pid, 0)
        except (ProcessLookupError, ChildProcessError):
            pass
        try:
            os.close(fd)
        except OSError:
            pass
    time.sleep(0.2)
    return summarize(log, since_ms)


def main() -> int:
    p = argparse.ArgumentParser()
    p.add_argument("--session", help="session id to resume (default: longest in db)")
    p.add_argument("--hold", default="page_up", choices=sorted(KEYS.keys()), help="key to hold")
    p.add_argument("--seconds", type=float, default=8.0, help="how long to hold the key")
    p.add_argument("--rate", type=int, default=30, help="keystrokes per second")
    p.add_argument("--warmup", type=float, default=3.0, help="seconds to wait after launch before input")
    p.add_argument("--threshold-ms", type=float, default=0.0, help="HERMES_DEV_PERF_MS (0 = capture all)")
    p.add_argument("--cols", type=int, default=120)
    p.add_argument("--rows", type=int, default=40)
    p.add_argument("--keep-log", action="store_true", help="don't wipe perf.log before run")
    p.add_argument("--tui-dir", default=str(DEFAULT_TUI_DIR))
    p.add_argument("--log", default=str(DEFAULT_LOG))
    p.add_argument("--save", metavar="LABEL",
                   help="save the final metrics as /tmp/perf-<LABEL>.json for later --compare")
    p.add_argument("--compare", metavar="LABEL",
                   help="diff against /tmp/perf-<LABEL>.json after running")
    p.add_argument("--loop", action="store_true",
                   help="watch for source changes, rebuild, rerun, and diff vs previous run")
    p.add_argument("--extra-flag", dest="extra_flags", action="append", default=[],
                   help="pass through to node dist/entry.js (repeatable)")
    args = p.parse_args()
    if args.loop:
        return loop_mode(args)
    # Single-shot path.
    data = run_once(args)
    print()
    print(format_report(data))
    metrics = key_metrics(data)
    if args.save:
        path = Path(f"/tmp/perf-{args.save}.json")
        path.write_text(json.dumps(metrics, indent=2))
        print(f"\n• saved: {path}")
    if args.compare:
        path = Path(f"/tmp/perf-{args.compare}.json")
        if not path.exists():
            print(f"\n⚠ no baseline at {path} — run with --save {args.compare} first")
        else:
            before = json.loads(path.read_text())
            print(f"\n═══ A/B diff vs /tmp/perf-{args.compare}.json ═══")
            print(format_diff(before, metrics))
    if not data["react"] and not data["frame"]:
        return 2
    return 0


def loop_mode(args: argparse.Namespace) -> int:
    """Watch source files, rebuild, rerun, print A/B diff against previous run.

    Keeps a rolling 'previous run' baseline in memory so each iteration
    reports delta vs the last one — visibility into whether the last
    edit moved the needle. Press Ctrl+C to stop.
    """
    import subprocess

    tui_dir = Path(args.tui_dir).resolve()
    src_root = tui_dir / "src"
    pkg_root = tui_dir / "packages" / "hermes-ink" / "src"

    def collect_mtimes() -> dict[str, float]:
        mtimes: dict[str, float] = {}
        for root in (src_root, pkg_root):
            if not root.exists():
                continue
            for path in root.rglob("*"):
                if path.suffix in {".ts", ".tsx"} and "__tests__" not in str(path):
                    try:
                        mtimes[str(path)] = path.stat().st_mtime
                    except OSError:
                        pass
        return mtimes

    previous_metrics: dict[str, float] | None = None
    previous_mtimes = collect_mtimes()
    iteration = 0
    print(f"• loop mode — watching {src_root} + {pkg_root} for *.ts(x) changes")
    print("• edit any TS file, the harness rebuilds + reruns automatically")
    print("• Ctrl+C to stop\n")
    try:
        while True:
            iteration += 1
            print(f"\n{'═' * 76}")
            print(f"Iteration {iteration} @ {time.strftime('%H:%M:%S')}")
            print("═" * 76)
            if iteration > 1:
                print("• rebuilding…")
                result = subprocess.run(
                    ["npm", "run", "build"],
                    cwd=tui_dir,
                    capture_output=True,
                    text=True,
                )
                if result.returncode != 0:
                    print("✗ build failed:")
                    print(result.stdout[-2000:])
                    print(result.stderr[-2000:])
                    print("\n• waiting for source changes to retry…")
                    previous_mtimes = wait_for_change(previous_mtimes, collect_mtimes)
                    continue
                print("✓ build ok")
            data = run_once(args)
            metrics = key_metrics(data)
            print()
            print(format_report(data))
            if previous_metrics is not None:
                print(f"\n═══ A/B diff vs iteration {iteration - 1} ═══")
                print(format_diff(previous_metrics, metrics))
            previous_metrics = metrics
            print("\n• waiting for source changes…")
            previous_mtimes = wait_for_change(previous_mtimes, collect_mtimes)
    except KeyboardInterrupt:
        print("\n• loop stopped")
        return 0


def wait_for_change(prev: dict[str, float], collect) -> dict[str, float]:
    """Poll every 1s until a watched file's mtime changes. Debounced 500ms."""
    while True:
        time.sleep(1)
        current = collect()
        changed = [
            path for path, mtime in current.items() if prev.get(path) != mtime
        ]
        if changed:
            print(f"{len(changed)} file(s) changed:")
            for path in changed[:5]:
                print(f" {path}")
            # Debounce — editor save bursts can take ~500ms to settle
            time.sleep(0.5)
            return collect()


if __name__ == "__main__":
    sys.exit(main())