Files
hermes-agent/tui_gateway/entry.py

109 lines
3.9 KiB
Python
Raw Normal View History

2026-04-02 19:06:42 -05:00
import json
import os
2026-04-03 19:52:50 -05:00
import signal
2026-04-02 19:06:42 -05:00
import sys
import time
import traceback
2026-04-02 19:06:42 -05:00
from tui_gateway.server import _CRASH_LOG, dispatch, resolve_skin, write_json
2026-04-02 19:06:42 -05:00
def _log_signal(signum: int, frame) -> None:
    """Record a termination signal in the crash log, then exit with status 0.

    SIG_DFL for SIGPIPE kills the process silently the instant any
    background thread (TTS playback, beep, voice status emitter, etc.)
    writes to a stdout the TUI has stopped reading.  Without this
    handler the gateway-exited banner in the TUI has no trace — the
    crash log never sees a Python exception because the kernel reaps
    the process before the interpreter runs anything.

    The log entry contains the stack of the interrupted frame followed by
    the stack of every live thread, so the offending writer can be found
    even when it is not the thread that received the signal.

    Args:
        signum: Number of the signal that was delivered.
        frame: Frame that was executing when the signal arrived, or None.

    Raises:
        SystemExit: Always, with status 0, once logging has been attempted.
    """
    name = {
        signal.SIGPIPE: "SIGPIPE",
        signal.SIGTERM: "SIGTERM",
        signal.SIGHUP: "SIGHUP",
    }.get(signum, f"signal {signum}")
    try:
        os.makedirs(os.path.dirname(_CRASH_LOG), exist_ok=True)
        with open(_CRASH_LOG, "a", encoding="utf-8") as f:
            f.write(
                f"\n=== {name} received · {time.strftime('%Y-%m-%d %H:%M:%S')} ===\n"
            )
            if frame is not None:
                f.write("main-thread stack at signal delivery:\n")
                traceback.print_stack(frame, file=f)
            # All live threads — signal may have been triggered by a
            # background thread (write to broken stdout from TTS, etc.).
            import threading as _threading

            # One snapshot of every thread's frame, so all stacks in the
            # dump describe the same moment.
            frames = sys._current_frames()
            # Copy the registry before iterating: other threads can start or
            # finish while we write, and mutating the live dict mid-iteration
            # would raise and abort the rest of the dump.  The private dict is
            # read directly (rather than threading.enumerate()) so that the
            # handler never has to wait on a threading-module lock.
            for tid, th in list(_threading._active.items()):
                f.write(f"\n--- thread {th.name} (id={tid}) ---\n")
                th_frame = frames.get(tid)
                if th_frame is None:
                    # format_stack(None) would print *this handler's* stack
                    # under the thread's header, which is misleading.
                    f.write("(no frame available)\n")
                    continue
                f.write("".join(traceback.format_stack(th_frame)))
    except Exception:
        # Best effort only: a logging failure must not prevent the exit below.
        pass
    print(f"[gateway-signal] {name}", file=sys.stderr, flush=True)
    sys.exit(0)
# Route the termination signals through _log_signal at import time so the
# crash log records what ended the process before it exits.
signal.signal(signal.SIGPIPE, _log_signal)
signal.signal(signal.SIGTERM, _log_signal)
signal.signal(signal.SIGHUP, _log_signal)
2026-04-13 21:20:55 -05:00
# SIGINT is ignored outright rather than logged.
# NOTE(review): presumably so Ctrl+C in the TUI's terminal does not kill the
# gateway subprocess — confirm against the TUI's interrupt handling.
signal.signal(signal.SIGINT, signal.SIG_IGN)
2026-04-03 19:52:50 -05:00
def _log_exit(reason: str) -> None:
    """Leave a trail explaining why the gateway subprocess is shutting down.

    Four exit paths (startup write fail, parse-error-response write fail,
    dispatch-response write fail, stdin EOF) all end in a quiet exit.
    Without this trail the TUI shows "gateway exited" with no actionable
    clue about WHICH broken pipe or WHICH message triggered it — the main
    reason voice-mode turns look like phantom crashes when the real story
    is "TUI read pipe closed on this event".

    The reason is appended to the crash log on a best-effort basis and is
    always echoed to stderr.  This function only records; it does not exit.

    Args:
        reason: Human-readable description of the exit path taken.
    """
    try:
        log_dir = os.path.dirname(_CRASH_LOG)
        os.makedirs(log_dir, exist_ok=True)
        with open(_CRASH_LOG, "a", encoding="utf-8") as log:
            stamp = time.strftime("%Y-%m-%d %H:%M:%S")
            log.write(f"\n=== gateway exit · {stamp} · reason={reason} ===\n")
    except Exception:
        # Best effort only: the stderr line below must still be emitted.
        pass
    message = f"[gateway-exit] {reason}"
    print(message, file=sys.stderr, flush=True)
2026-04-02 19:06:42 -05:00
def main():
    """Run the gateway's JSON-RPC loop over stdin/stdout.

    Announces readiness with a ``gateway.ready`` event, then reads one JSON
    request per stdin line and writes back whatever ``dispatch`` returns.
    Blank lines are skipped and unparseable lines are answered with a
    JSON-RPC parse error (code -32700).

    Any failed ``write_json`` call is logged via ``_log_exit`` and ends the
    process with status 0.  Reaching stdin EOF is logged and the function
    returns normally.
    """
    ready_event = {
        "jsonrpc": "2.0",
        "method": "event",
        "params": {"type": "gateway.ready", "payload": {"skin": resolve_skin()}},
    }
    if not write_json(ready_event):
        _log_exit("startup write failed (broken stdout pipe before first event)")
        sys.exit(0)

    for raw in sys.stdin:
        line = raw.strip()
        if not line:
            continue

        try:
            req = json.loads(line)
        except json.JSONDecodeError:
            parse_error = {
                "jsonrpc": "2.0",
                "error": {"code": -32700, "message": "parse error"},
                "id": None,
            }
            if write_json(parse_error):
                continue
            _log_exit("parse-error-response write failed (broken stdout pipe)")
            sys.exit(0)

        # Only used to label the log line if the response cannot be written.
        if isinstance(req, dict):
            method = req.get("method")
        else:
            method = None

        # dispatch (rather than an inline handler call) keeps this read loop
        # responsive: slow handlers (slash.exec, cli.exec, shell.exec,
        # session.resume, session.branch) run on a thread pool, so later
        # requests such as approval.respond or session.interrupt are not
        # stuck behind them.  A None result means there is nothing to write
        # from here.
        resp = dispatch(req)
        if resp is None:
            continue
        if write_json(resp):
            continue
        _log_exit(f"response write failed for method={method!r} (broken stdout pipe)")
        sys.exit(0)

    _log_exit("stdin EOF (TUI closed the command pipe)")
2026-04-02 19:06:42 -05:00
# Start the gateway loop only when this file is executed as a script.
if __name__ == "__main__":
    main()