mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-03 17:27:37 +08:00
Salvage of PR #16100 onto current main (after emozilla's #17514 fix that unblocks plugin Pydantic body validation). History preserved on the standing `feat/kanban-standing` branch; this squashes the 22 iterative commits into one clean landing. What this lands: - SQLite kernel (hermes_cli/kanban_db.py) — durable task board with tasks, task_links, task_runs, task_comments, task_events, kanban_notify_subs tables. WAL mode, atomic claim via CAS, tenant-namespaced, skills JSON array per task, max-runtime timeouts, worker heartbeats, idempotency keys, circuit breaker on repeated spawn failures, crash detection via /proc/<pid>/status, run history preserved across attempts. - Dispatcher — runs inside the gateway by default (`kanban.dispatch_in_gateway: true`). Ticks every 60s, reclaims stale claims, promotes ready tasks, spawns `hermes -p <assignee> chat -q "work kanban task <id>"` with HERMES_KANBAN_TASK + HERMES_KANBAN_WORKSPACE env. Auto-loads `--skills kanban-worker` plus any per-task skills. Health telemetry warns on stuck ready queue. - Structured tool surface (tools/kanban_tools.py) — 7 tools (kanban_show, kanban_complete, kanban_block, kanban_heartbeat, kanban_comment, kanban_create, kanban_link). Gated on HERMES_KANBAN_TASK via check_fn so zero schema footprint in normal sessions. - System-prompt guidance (agent/prompt_builder.py KANBAN_GUIDANCE) injected only when kanban tools are active. - Dashboard plugin (plugins/kanban/dashboard/) — Linear-style board UI: triage/todo/ready/running/blocked/done columns, drag-drop, inline create, task drawer with markdown, comments, run history, dependency editor, bulk ops, lanes-by-profile grouping, WS-driven live refresh. Matches active dashboard theme via CSS variables. - CLI — `hermes kanban init|create|list|show|assign|link|unlink| claim|comment|complete|block|unblock|archive|tail|dispatch|context| init|gc|watch|stats|notify|log|heartbeat|runs|assignees` + `/kanban` slash in-session. 
- Worker + orchestrator skills (skills/devops/kanban-worker + kanban-orchestrator) — pattern library for good summary/metadata shapes, retry diagnostics, block-reason examples, fan-out patterns. - Per-task force-loaded skills — `--skill <name>` (repeatable), stored as JSON, threaded through to dispatcher argv as one `--skills X` pair per skill alongside the built-in kanban-worker. Dashboard + CLI + tool parity. - Deprecation of standalone `hermes kanban daemon` — stub exits 2 with migration guidance; `--force` escape hatch for headless hosts. - Docs (website/docs/user-guide/features/kanban.md + kanban-tutorial.md) with 11 dashboard screenshots walking through four user stories (Solo Dev, Fleet Farming, Role Pipeline, Circuit Breaker). - Tests (251 passing): kernel schema + migration + CAS atomicity, dispatcher logic, circuit breaker, crash detection, max-runtime timeouts, claim lifecycle, tenant isolation, idempotency keys, per-task skills round-trip + validation + dispatcher argv, tool surface (7 tools × round-trip + error paths), dashboard REST (CRUD + bulk + links + warnings), gateway-embedded dispatcher (config gate, env override, graceful shutdown), CLI deprecation stub, migration from legacy schemas. Gateway integration: - GatewayRunner._kanban_dispatcher_watcher — new asyncio background task, symmetric with _kanban_notifier_watcher. Runs dispatch_once via asyncio.to_thread so SQLite WAL never blocks the loop. Sleeps in 1s slices for snappy shutdown. Respects HERMES_KANBAN_DISPATCH_IN_GATEWAY=0 env override for debugging. - Config: new `kanban` section in DEFAULT_CONFIG with `dispatch_in_gateway: true` (default) + `dispatch_interval_seconds: 60`. Additive — no _config_version bump needed. Forward-compat: - workflow_template_id / current_step_key columns on tasks (v1 writes NULL; v2 will use them for routing). - task_runs holds claim machinery (claim_lock, claim_expires, worker_pid, last_heartbeat_at) so multi-attempt history is first-class from day one.
Closes #16102. Co-authored-by: emozilla <emozilla@nousresearch.com>
351 lines
12 KiB
Python
351 lines
12 KiB
Python
"""Harder concurrency stress: mixed operations + larger scale.
|
|
|
|
Scales to 500 tasks, 10 workers, 30s runtime. Each worker randomly:
|
|
- claims + completes (70%)
|
|
- claims + blocks with a reason (15%)
|
|
- unblocks a random blocked task (10%)
|
|
- archives a random done task (5%)
|
|
|
|
Adds a background "dispatcher" process that calls release_stale_claims
|
|
and detect_crashed_workers every 200ms, racing against the workers to
|
|
surface TTL + crash detection races.
|
|
|
|
Pass criteria: runs invariant holds, no double-completions, no orphan
|
|
runs, no SQLite errors escape the retry layer.
|
|
"""
|
|
|
|
import json
|
|
import multiprocessing as mp
|
|
import os
|
|
import random
|
|
import sqlite3
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Number of concurrent worker processes launched by main().
NUM_WORKERS = 10
# Number of tasks seeded into the board before the workers start.
NUM_TASKS = 500
# Wall-clock budget, in seconds, for each worker loop; the reclaimer loop
# runs for this budget plus 2 seconds.
RUN_DURATION_S = 30
# Directory two levels above this file's directory, prepended to sys.path so
# `hermes_cli` can be imported (presumably the repository root — confirm).
WT = str(Path(__file__).resolve().parents[2])
|
|
|
|
|
|
def worker_loop(worker_id: int, hermes_home: str, result_file: str) -> None:
    """Run one stress worker and dump its event log to *result_file* as JSON.

    Each iteration opens a fresh connection and draws ``op`` uniformly from
    ``[0, 1)``:

    - ``op < 0.10``: try to unblock one randomly chosen blocked task;
    - ``0.10 <= op < 0.15``: try to archive one randomly chosen done task;
    - otherwise: claim the first unclaimed ready task, then block it (20% of
      claims) or complete it (the remaining 80%).

    The loop stops after ``RUN_DURATION_S`` seconds, or once more than 50
    consecutive claim attempts found no ready task.

    The event log is written in a ``finally`` clause, so events gathered
    before an unexpected exception are not lost; such an exception is logged
    as a ``worker_crash`` event and then re-raised so the process still
    exits with a non-zero status.

    Args:
        worker_id: Index of this worker; used in the claimer name and events.
        hermes_home: Directory exported as both HERMES_HOME and HOME.
        result_file: Path the JSON event list is written to.
    """
    # Exported before the import below so the child process targets the
    # temporary home (presumably kanban_db resolves its DB path from
    # HERMES_HOME — confirm).
    os.environ["HERMES_HOME"] = hermes_home
    os.environ["HOME"] = hermes_home
    sys.path.insert(0, WT)
    from hermes_cli import kanban_db as kb

    events = []
    start = time.monotonic()
    idle_rounds = 0

    def record_sqlite_err(op_name, task_id, exc):
        # One shared shape for every SQLite error that surfaced to the worker.
        events.append({"kind": "sqlite_err", "op": op_name,
                       "task": task_id, "err": str(exc)[:100]})

    try:
        while time.monotonic() - start < RUN_DURATION_S:
            conn = kb.connect()
            try:
                op = random.random()

                if op < 0.10:
                    # Try to unblock a blocked task.
                    row = conn.execute(
                        "SELECT id FROM tasks WHERE status='blocked' "
                        "ORDER BY RANDOM() LIMIT 1"
                    ).fetchone()
                    if row:
                        try:
                            ok = kb.unblock_task(conn, row["id"])
                            events.append({"kind": "unblocked" if ok else "unblock_noop",
                                           "task": row["id"], "worker": worker_id})
                        except sqlite3.OperationalError as e:
                            record_sqlite_err("unblock", row["id"], e)
                    continue

                if op < 0.15:
                    # Try to archive a done task.
                    row = conn.execute(
                        "SELECT id FROM tasks WHERE status='done' "
                        "ORDER BY RANDOM() LIMIT 1"
                    ).fetchone()
                    if row:
                        try:
                            kb.archive_task(conn, row["id"])
                            events.append({"kind": "archived", "task": row["id"],
                                           "worker": worker_id})
                        except sqlite3.OperationalError as e:
                            record_sqlite_err("archive", row["id"], e)
                    continue

                # Default: claim + complete-or-block.
                row = conn.execute(
                    "SELECT id FROM tasks WHERE status='ready' "
                    "AND claim_lock IS NULL LIMIT 1"
                ).fetchone()
                if row is None:
                    idle_rounds += 1
                    if idle_rounds > 50:
                        break
                    time.sleep(0.02)
                    continue
                idle_rounds = 0

                tid = row["id"]
                try:
                    claimed = kb.claim_task(
                        conn, tid, claimer=f"worker-{worker_id}",
                        ttl_seconds=5,  # short TTL so reclaim races in
                    )
                except sqlite3.OperationalError as e:
                    record_sqlite_err("claim", tid, e)
                    continue
                if claimed is None:
                    events.append({"kind": "lost_claim_race", "task": tid})
                    continue

                run = kb.latest_run(conn, tid)
                events.append({"kind": "claimed", "task": tid, "worker": worker_id,
                               "run_id": run.id, "t": time.monotonic() - start})

                # Simulated work between claiming and finishing the task.
                time.sleep(random.uniform(0.005, 0.05))

                # 20% of the time, block instead of complete
                if random.random() < 0.20:
                    try:
                        kb.block_task(conn, tid,
                                      reason=f"blocked by worker-{worker_id}")
                        events.append({"kind": "blocked", "task": tid,
                                       "worker": worker_id, "run_id": run.id})
                    except sqlite3.OperationalError as e:
                        record_sqlite_err("block", tid, e)
                else:
                    try:
                        kb.complete_task(
                            conn, tid,
                            result=f"done by worker-{worker_id}",
                            summary=f"worker-{worker_id} ok",
                            metadata={"worker_id": worker_id},
                        )
                        events.append({"kind": "completed", "task": tid,
                                       "worker": worker_id, "run_id": run.id,
                                       "t": time.monotonic() - start})
                    except sqlite3.OperationalError as e:
                        record_sqlite_err("complete", tid, e)
            finally:
                conn.close()
    except Exception as exc:
        # Keep a trace of the crash in the event log, then let it propagate
        # so the process exit code still reflects the failure.
        events.append({"kind": "worker_crash", "worker": worker_id,
                       "err": repr(exc)[:200]})
        raise
    finally:
        # Always persist what was collected, even on an unexpected exception.
        with open(result_file, "w") as f:
            json.dump(events, f)
|
|
|
|
|
|
def reclaimer_loop(hermes_home: str, result_file: str) -> None:
    """Dispatcher-like background loop that keeps releasing stale claims.

    Roughly every 200ms, for ``RUN_DURATION_S + 2`` seconds, a fresh
    connection is opened and ``release_stale_claims`` is called. A truthy
    result is logged as a ``reclaimed`` event; an ``sqlite3.OperationalError``
    is logged as a ``sqlite_err`` event. The log is written to *result_file*
    as JSON once the loop ends.

    Args:
        hermes_home: Directory exported as both HERMES_HOME and HOME.
        result_file: Path the JSON event list is written to.
    """
    os.environ["HERMES_HOME"] = hermes_home
    os.environ["HOME"] = hermes_home
    sys.path.insert(0, WT)
    from hermes_cli import kanban_db as kb

    log = []
    t0 = time.monotonic()

    while time.monotonic() - t0 < RUN_DURATION_S + 2:
        db = kb.connect()
        try:
            released = kb.release_stale_claims(db)
        except sqlite3.OperationalError as exc:
            log.append({"kind": "sqlite_err", "op": "reclaim",
                        "err": str(exc)[:100]})
        else:
            if released:
                log.append({"kind": "reclaimed", "count": released,
                            "t": time.monotonic() - t0})
        finally:
            db.close()
        time.sleep(0.2)

    with open(result_file, "w") as out:
        json.dump(log, out)
|
|
|
|
|
|
def _load_events(path):
    """Return the JSON event list stored at *path*, or None if it is missing or unreadable."""
    try:
        with open(path) as fh:
            return json.load(fh)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError, e.g. a file truncated
        # because its writer was terminated mid-dump.
        return None


def _event_failures(worker_events, reclaim_events):
    """Return failure messages that can be derived from the recorded events alone."""
    failures = []

    # Every completion must have a matching claim on the same run_id AND
    # the same worker (workers don't steal each other's runs).
    claims_by_run = {
        e["run_id"]: e for e in worker_events if e["kind"] == "claimed"
    }
    for comp in worker_events:
        if comp["kind"] != "completed":
            continue
        claim = claims_by_run.get(comp["run_id"])
        if claim is None:
            # A worker must not be able to complete a run that no worker
            # recorded a claim for.
            failures.append(
                f"COMPLETION WITHOUT CLAIM: task {comp['task']} run {comp['run_id']} "
                f"by worker {comp['worker']}"
            )
        elif claim["worker"] != comp["worker"]:
            failures.append(
                f"CROSS-WORKER COMPLETION: run {comp['run_id']} claimed by "
                f"worker {claim['worker']} but completed by worker {comp['worker']}"
            )

    # SQLite errors that escaped the retry layer, from workers and reclaimer.
    sqlite_errs = [
        e for e in list(worker_events) + list(reclaim_events)
        if e["kind"] == "sqlite_err"
    ]
    for e in sqlite_errs[:5]:
        failures.append(f"SQLITE ERROR: op={e.get('op')} err={e.get('err')}")
    if len(sqlite_errs) > 5:
        failures.append(f" ... and {len(sqlite_errs) - 5} more sqlite errs")

    return failures


def main():
    """Seed a temporary board, run the workers and reclaimer, then check invariants.

    Steps:

    1. Create a fresh temporary HERMES_HOME and seed ``NUM_TASKS`` tasks.
    2. Spawn ``NUM_WORKERS`` ``worker_loop`` processes plus one
       ``reclaimer_loop`` process, each writing its event log into a fresh
       temporary results directory (so files from earlier runs can never be
       picked up).
    3. Collect the event logs and verify: every process exited cleanly and
       left a readable log, completions match claims, no SQLite errors were
       recorded, no task points at an ended run, and no open run is orphaned.
    4. Print statistics; exit with status 1 if any check failed.
    """
    home = tempfile.mkdtemp(prefix="hermes_mixed_stress_")
    print(f"HERMES_HOME = {home}")

    os.environ["HERMES_HOME"] = home
    os.environ["HOME"] = home
    sys.path.insert(0, WT)
    from hermes_cli import kanban_db as kb

    kb.init_db()
    conn = kb.connect()
    try:
        for i in range(NUM_TASKS):
            kb.create_task(
                conn, title=f"t#{i}", assignee="shared", tenant="mixed-stress",
            )
    finally:
        conn.close()
    print(f"Seeded {NUM_TASKS} tasks, launching {NUM_WORKERS} workers + 1 reclaimer")

    ctx = mp.get_context("spawn")
    # Per-run directory: fixed paths under /tmp could serve stale results
    # from a previous run and hide a worker that died.
    results_dir = tempfile.mkdtemp(prefix="hermes_mixed_stress_results_")
    worker_results = [
        os.path.join(results_dir, f"mixed_worker_{i}.json")
        for i in range(NUM_WORKERS)
    ]
    reclaim_result = os.path.join(results_dir, "mixed_reclaim.json")

    procs = []
    start = time.monotonic()
    for i in range(NUM_WORKERS):
        p = ctx.Process(target=worker_loop, args=(i, home, worker_results[i]))
        p.start()
        procs.append((f"WORKER {i}", p))
    r = ctx.Process(target=reclaimer_loop, args=(home, reclaim_result))
    r.start()
    procs.append(("RECLAIMER", r))

    failures = []

    for label, p in procs:
        p.join(timeout=RUN_DURATION_S + 30)
        if p.is_alive():
            p.terminate()
            p.join()
            failures.append(f"{label} HUNG: terminated after join timeout")
        elif p.exitcode != 0:
            failures.append(f"{label} CRASHED: exit code {p.exitcode}")

    elapsed = time.monotonic() - start
    print(f"Done in {elapsed:.1f}s")

    # Aggregate.
    all_events = []
    for i, path in enumerate(worker_results):
        loaded = _load_events(path)
        if loaded is None:
            print(f" WORKER {i} died with no result file!")
            failures.append(f"WORKER {i} NO RESULT: missing or unreadable {path}")
        else:
            all_events.extend(loaded)
    reclaim_events = _load_events(reclaim_result)
    if reclaim_events is None:
        failures.append(f"RECLAIMER NO RESULT: missing or unreadable {reclaim_result}")
        reclaim_events = []

    # ============ INVARIANT CHECKS ============
    print()
    print("=" * 60)
    print("INVARIANT CHECKS")
    print("=" * 60)

    failures.extend(_event_failures(all_events, reclaim_events))

    # DB final state.
    conn = kb.connect()
    try:
        # Invariant: a task must not keep pointing at a run that has ended.
        inconsistent = conn.execute("""
            SELECT t.id, t.status, t.current_run_id
            FROM tasks t
            WHERE t.current_run_id IS NOT NULL
            AND EXISTS (SELECT 1 FROM task_runs r
            WHERE r.id = t.current_run_id AND r.ended_at IS NOT NULL)
        """).fetchall()
        for row in inconsistent:
            failures.append(
                f"INVARIANT VIOLATION: task {row['id']} status={row['status']} "
                f"has current_run_id={row['current_run_id']} but run is ended"
            )

        # Invariant: no orphan open runs (open run that no task points at).
        orphans = conn.execute("""
            SELECT r.id, r.task_id, r.status
            FROM task_runs r
            LEFT JOIN tasks t ON t.current_run_id = r.id
            WHERE r.ended_at IS NULL AND t.id IS NULL
        """).fetchall()
        for row in orphans:
            failures.append(
                f"ORPHAN OPEN RUN: run {row['id']} on task {row['task_id']}"
            )

        # Counts — should roughly balance.
        status_counts = dict(
            conn.execute("SELECT status, COUNT(*) FROM tasks GROUP BY status").fetchall()
        )
        run_outcome_counts = dict(
            conn.execute(
                "SELECT outcome, COUNT(*) FROM task_runs "
                "WHERE ended_at IS NOT NULL GROUP BY outcome"
            ).fetchall()
        )
        active_runs = conn.execute(
            "SELECT COUNT(*) FROM task_runs WHERE ended_at IS NULL"
        ).fetchone()[0]
    finally:
        conn.close()

    # ============ STATS ============
    print()
    print(f"Workers: {NUM_WORKERS}, Tasks: {NUM_TASKS}")
    print(f"Elapsed: {elapsed:.1f}s")
    print(f"Events collected: {len(all_events)} (+{len(reclaim_events)} reclaim)")
    print()
    print("Operations:")
    op_counts = {}
    for e in all_events:
        op_counts[e["kind"]] = op_counts.get(e["kind"], 0) + 1
    for k in sorted(op_counts):
        print(f" {k:<25} {op_counts[k]}")

    print()
    print("Final task status:")
    for s, n in sorted(status_counts.items()):
        print(f" {s:<10} {n}")
    print("Final run outcomes:")
    for o, n in sorted(run_outcome_counts.items(), key=lambda x: (x[0] or '',)):
        # A NULL outcome arrives as None, which rejects the '<12' format
        # spec with TypeError; render it through a string label instead.
        label = "(none)" if o is None else str(o)
        print(f" {label:<12} {n}")
    print(f" active {active_runs}")

    if failures:
        print()
        print("=" * 60)
        print(f"FAILURES ({len(failures)}):")
        print("=" * 60)
        for msg in failures[:30]:
            print(f" {msg}")
        if len(failures) > 30:
            print(f" ... and {len(failures) - 30} more")
        sys.exit(1)
    else:
        print()
        print("✔ ALL INVARIANTS HELD UNDER MIXED STRESS")
|
|
|
|
|
|
# Entry-point guard. main() uses the "spawn" start method, under which child
# processes import this module again; the guard keeps them from re-running
# main() themselves.
if __name__ == "__main__":
    main()
|