hermes-agent/tests/stress/test_concurrency.py

"""Multi-process concurrency stress test for the Kanban kernel.

5 worker processes race for claims on a shared DB with 100 tasks. Each
worker loops: claim -> simulate work -> complete. Asserts the invariants
that make the system worth building:

  - No task claimed by two workers simultaneously
  - No task completed twice
  - Every claim produces exactly one run row
  - Every completion closes exactly one run row
  - Zero SQLite locking errors that escape the retry layer
  - Total run count == total claim events == total completed events

This test is the primary justification for WAL + CAS-based claim. If it
passes, the architecture holds. If it fails, we have a real bug to fix
before anyone runs this in anger.
"""

import json
import multiprocessing as mp
import os
import random
import sqlite3
import subprocess
import sys
import tempfile
import time
from pathlib import Path


NUM_WORKERS = 5
NUM_TASKS = 100
WORKER_TIMEOUT_S = 60
WT = str(Path(__file__).resolve().parents[2])


def worker_loop(worker_id: int, hermes_home: str, result_file: str) -> None:
    """One worker's inner loop. Runs in a fresh Python process.

    Tries to claim a ready task, marks it done with a per-worker summary,
    repeats until the ready pool is empty. Records every claim + complete
    into its own JSON result file for later aggregation.
    """
    os.environ["HERMES_HOME"] = hermes_home
    os.environ["HOME"] = hermes_home
    sys.path.insert(0, WT)

    from hermes_cli import kanban_db as kb

    events = []
    empty_polls = 0
    start = time.monotonic()

    while time.monotonic() - start < WORKER_TIMEOUT_S:
        conn = kb.connect()
        try:
            # Find any ready task (non-deterministic order intentional — we
            # want workers to race on popular assignees).
            row = conn.execute(
                "SELECT id FROM tasks WHERE status = 'ready' "
                "AND claim_lock IS NULL LIMIT 1"
            ).fetchone()
            if row is None:
                empty_polls += 1
                if empty_polls > 20:
                    break  # queue empty long enough, stop
                time.sleep(0.01)
                continue
            empty_polls = 0

            tid = row["id"]
            try:
                claimed = kb.claim_task(
                    conn, tid, claimer=f"worker-{worker_id}",
                )
            except sqlite3.OperationalError as e:
                events.append({"kind": "sqlite_err_on_claim", "task": tid, "err": str(e)})
                continue
            if claimed is None:
                # Someone else beat us — expected contention, not an error.
                events.append({"kind": "lost_claim_race", "task": tid})
                continue

            run = kb.latest_run(conn, tid)
            events.append({
                "kind": "claimed",
                "task": tid,
                "worker": worker_id,
                "run_id": run.id,
                "t": time.monotonic() - start,
            })

            # Simulate short, variable work
            time.sleep(random.uniform(0.001, 0.05))

            try:
                kb.complete_task(
                    conn, tid,
                    result=f"done by worker-{worker_id}",
                    summary=f"worker-{worker_id} finished task {tid}",
                    metadata={"worker_id": worker_id, "run_id": run.id},
                )
            except sqlite3.OperationalError as e:
                events.append({"kind": "sqlite_err_on_complete", "task": tid, "err": str(e)})
                continue
            events.append({
                "kind": "completed",
                "task": tid,
                "worker": worker_id,
                "run_id": run.id,
                "t": time.monotonic() - start,
            })
        finally:
            conn.close()

    with open(result_file, "w") as f:
        json.dump(events, f)


def main():
    home = tempfile.mkdtemp(prefix="hermes_concurrency_")
    print(f"HERMES_HOME = {home}")

    # Seed.
    os.environ["HERMES_HOME"] = home
    os.environ["HOME"] = home
    sys.path.insert(0, WT)
    from hermes_cli import kanban_db as kb

    kb.init_db()
    conn = kb.connect()
    tids = []
    for i in range(NUM_TASKS):
        tid = kb.create_task(
            conn, title=f"task #{i}", assignee="shared",
            tenant="concurrency-test",
        )
        tids.append(tid)
    conn.close()
    print(f"Seeded {NUM_TASKS} tasks.")

    # Spawn workers.
    ctx = mp.get_context("spawn")
    result_files = [f"/tmp/concurrency_worker_{i}.json" for i in range(NUM_WORKERS)]
    procs = []
    start = time.monotonic()
    for i in range(NUM_WORKERS):
        p = ctx.Process(target=worker_loop, args=(i, home, result_files[i]))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(timeout=WORKER_TIMEOUT_S + 30)
        if p.is_alive():
            p.terminate()
            p.join()

    elapsed = time.monotonic() - start
    print(f"All workers done in {elapsed:.1f}s")

    # Aggregate worker events.
    all_events = []
    for i, f in enumerate(result_files):
        if not os.path.isfile(f):
            print(f"  WORKER {i} produced no result file — died?")
            continue
        with open(f) as fh:
            events = json.load(fh)
        all_events.extend(events)

    # ============ INVARIANT CHECKS ============
    print()
    print("=" * 60)
    print("INVARIANT CHECKS")
    print("=" * 60)

    failures = []

    # Check 1: no task claimed by two different workers
    claims_by_task = {}
    for e in all_events:
        if e["kind"] == "claimed":
            if e["task"] in claims_by_task:
                prev = claims_by_task[e["task"]]
                if prev["worker"] != e["worker"]:
                    failures.append(
                        f"DOUBLE CLAIM: task {e['task']} claimed by "
                        f"worker {prev['worker']} AND worker {e['worker']}"
                    )
            claims_by_task[e["task"]] = e

    # Check 2: every completion has a matching claim from the same worker
    for e in all_events:
        if e["kind"] == "completed":
            prev_claim = claims_by_task.get(e["task"])
            if prev_claim is None:
                failures.append(f"COMPLETION WITHOUT CLAIM: task {e['task']}")
            elif prev_claim["worker"] != e["worker"]:
                failures.append(
                    f"WORKER MISMATCH: task {e['task']} claimed by "
                    f"{prev_claim['worker']} but completed by {e['worker']}"
                )

    # Check 3: DB state — every task should be in 'done', no dangling claims
    conn = kb.connect()
    try:
        bad_status = conn.execute(
            "SELECT id, status, claim_lock, current_run_id FROM tasks "
            "WHERE status != 'done' OR claim_lock IS NOT NULL "
            "OR current_run_id IS NOT NULL"
        ).fetchall()
        if bad_status:
            for row in bad_status:
                failures.append(
                    f"BAD FINAL STATE: task {row['id']} status={row['status']} "
                    f"claim_lock={row['claim_lock']} current_run_id={row['current_run_id']}"
                )

        # Check 4: exactly one run per task, all closed as completed
        bad_runs = conn.execute(
            "SELECT task_id, COUNT(*) as n FROM task_runs "
            "GROUP BY task_id HAVING n != 1"
        ).fetchall()
        if bad_runs:
            for row in bad_runs:
                failures.append(
                    f"WRONG RUN COUNT: task {row['task_id']} has {row['n']} runs (expected 1)"
                )

        open_runs = conn.execute(
            "SELECT id, task_id FROM task_runs WHERE ended_at IS NULL"
        ).fetchall()
        for row in open_runs:
            failures.append(f"OPEN RUN: run {row['id']} on task {row['task_id']}")

        wrong_outcomes = conn.execute(
            "SELECT task_id, outcome FROM task_runs "
            "WHERE outcome IS NULL OR outcome != 'completed'"
        ).fetchall()
        for row in wrong_outcomes:
            failures.append(
                f"WRONG OUTCOME: task {row['task_id']} run outcome={row['outcome']}"
            )

        # Check 5: event counts — exactly NUM_TASKS completed events
        completed_events = conn.execute(
            "SELECT COUNT(*) as n FROM task_events WHERE kind='completed'"
        ).fetchone()["n"]
        if completed_events != NUM_TASKS:
            failures.append(
                f"EVENT COUNT MISMATCH: {completed_events} completed events "
                f"expected {NUM_TASKS}"
            )

        # Check 6: count SQLite errors that escaped retry
        sqlite_errs = sum(
            1 for e in all_events if e["kind"].startswith("sqlite_err")
        )
        if sqlite_errs > 0:
            failures.append(f"UNRETRIED SQLITE ERRORS: {sqlite_errs}")

    finally:
        conn.close()

    # ============ STATS ============
    print()
    total_claims = sum(1 for e in all_events if e["kind"] == "claimed")
    total_completes = sum(1 for e in all_events if e["kind"] == "completed")
    total_lost_races = sum(1 for e in all_events if e["kind"] == "lost_claim_race")

    per_worker = {}
    for e in all_events:
        if e["kind"] == "completed":
            per_worker.setdefault(e["worker"], 0)
            per_worker[e["worker"]] += 1

    print(f"Total claims:      {total_claims}")
    print(f"Total completes:   {total_completes}")
    print(f"Lost claim races:  {total_lost_races}  (expected contention; not a bug)")
    print(f"Elapsed:           {elapsed:.2f}s")
    print(f"Throughput:        {NUM_TASKS/elapsed:.1f} tasks/sec")
    print(f"Per-worker completions:")
    for w in sorted(per_worker.keys()):
        print(f"  worker-{w}: {per_worker[w]}")

    if failures:
        print()
        print("=" * 60)
        print(f"FAILURES ({len(failures)}):")
        print("=" * 60)
        for f in failures[:20]:
            print(f"  {f}")
        if len(failures) > 20:
            print(f"  ... and {len(failures) - 20} more")
        sys.exit(1)
    else:
        print()
        print("✔ ALL INVARIANTS HELD")


if __name__ == "__main__":
    main()