"""Process-wide voice recording + TTS API for the TUI gateway. Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool`` (text-to-speech) behind idempotent, stateful entry points that the gateway's ``voice.record``, ``voice.toggle``, and ``voice.tts`` JSON-RPC handlers can call from a dedicated thread. The gateway imports this module lazily so that missing optional audio deps (sounddevice, faster-whisper, numpy) surface as an ``ImportError`` at call time, not at startup. Two usage modes are exposed: * **Push-to-talk** (``start_recording`` / ``stop_and_transcribe``) — single manually-bounded capture used when the caller drives the start/stop pair explicitly. * **Continuous (VAD)** (``start_continuous`` / ``stop_continuous``) — mirrors the classic CLI voice mode: recording auto-stops on silence, transcribes, hands the result to a callback, and then auto-restarts for the next turn. Three consecutive no-speech cycles stop the loop and fire ``on_silent_limit`` so the UI can turn the mode off. """ from __future__ import annotations import logging import os import sys import threading from typing import Any, Callable, Optional # Modifier aliases mirrored from the TUI parser (``ui-tui/src/lib/platform.ts``) # ``_MOD_ALIASES`` table — the contract that removes the cross-runtime # mismatch Copilot flagged in round-9 on #19835. # # ``super``/``win``/``windows`` are intentionally absent: prompt_toolkit # has no super/meta modifier for the Cmd key, so those spellings are # TUI-only. The normalizer below returns the documented default # (``c-b``) for them — a silent fallback was preferred to a hard # startup crash (Copilot round-11). The CLI binding site # (``_register_voice_handler`` in cli.py) logs a warning when that # fallback fires so users see why their TUI-only shortcut isn't # bound in the classic CLI. _VOICE_MOD_ALIASES = { "ctrl": "c-", "control": "c-", "alt": "a-", "option": "a-", "opt": "a-", } # Named keys prompt_toolkit accepts in ``c-`` / ``a-`` form. # Aliases collapse to prompt_toolkit's canonical spelling so the same # config value binds identically in both runtimes (Copilot round-10 on # #19835). _VOICE_NAMED_KEYS = { "space": "space", "spc": "space", "enter": "enter", "return": "enter", "ret": "enter", "tab": "tab", "escape": "escape", "esc": "escape", "backspace": "backspace", "bs": "backspace", "delete": "delete", "del": "delete", } # ``useInputHandlers()`` intercepts these before the voice check runs, # so a binding like ``ctrl+c`` (interrupt), ``ctrl+d`` (quit), or # ``ctrl+l`` (clear screen) would be advertised in /voice status but # never fire push-to-talk — the same blocklist the TUI parser uses. _VOICE_RESERVED_CTRL_CHARS = frozenset({"c", "d", "l"}) # On macOS the classic CLI's prompt_toolkit bindings for copy / exit / # clear also claim ``a-c`` / ``a-d`` / ``a-l`` via the action-modifier # lookup, and hermes-ink reports Alt as ``key.meta`` on many terminals. # Mirror the TUI parser's darwin-only reservation so ``option+c`` etc. # don't bind Alt+C in the CLI while the TUI silently falls back to # Ctrl+B (Copilot round-14 on #19835). _VOICE_RESERVED_ALT_CHARS_MAC = frozenset({"c", "d", "l"}) _DEFAULT_PT_KEY = "c-b" def voice_record_key_from_config(cfg: Any) -> Any: """Shape-safe ``cfg.voice.record_key`` lookup. ``load_config()`` deep-merges raw YAML and preserves scalar overrides, so a hand-edited ``voice: true`` / ``voice: cmd+b`` leaves ``cfg["voice"]`` as a bool/str instead of a dict, and the naive ``.get("voice", {}).get("record_key")`` chain raises AttributeError before voice can even start (Copilot round-11 on #19835). Return ``None`` for malformed shapes so call sites can feed the result straight into the normalizer/formatter and get the documented default. """ if not isinstance(cfg, dict): return None voice = cfg.get("voice") if not isinstance(voice, dict): return None return voice.get("record_key") def normalize_voice_record_key_for_prompt_toolkit(raw: Any) -> str: """Coerce ``voice.record_key`` into prompt_toolkit's ``c-x`` / ``a-x`` format. Mirrors the TUI parser contract (``ui-tui/src/lib/platform.ts``) so one config value binds the same shortcut in both runtimes: * non-string / empty / typo'd / bare-char / multi-modifier / reserved ``ctrl+c|d|l`` → documented default ``c-b`` * single-char keys: ``ctrl+o`` → ``c-o`` * named keys: ``ctrl+space`` → ``c-space`` (aliases collapse: ``ctrl+return`` → ``c-enter``) * ``super`` / ``win`` / ``windows`` → ``c-b`` (TUI-only modifiers — prompt_toolkit has no super mod; the CLI binding site is expected to warn when this fallback fires so users see the cross-runtime split, Copilot round-11 on #19835) """ if not isinstance(raw, str): return _DEFAULT_PT_KEY lowered = raw.strip().lower() if not lowered: return _DEFAULT_PT_KEY parts = [p.strip() for p in lowered.split("+") if p.strip()] if not parts: return _DEFAULT_PT_KEY # Multi-modifier chords like ``ctrl+alt+r`` bind different shortcuts # in prompt_toolkit (a-c-r form) and hermes-ink rejects them; collapse # to the documented default instead of silently diverging. if len(parts) > 2: return _DEFAULT_PT_KEY # Bare char / bare named key (no explicit modifier) — the CLI's # prompt_toolkit binds the raw key without a modifier, which the TUI # parser refuses; reject here too so both runtimes agree. if len(parts) == 1: return _DEFAULT_PT_KEY modifier_token, key_token = parts # ``super`` / ``win`` / ``windows`` are TUI-only (prompt_toolkit has # no super modifier, so ``@kb.add(super+b)`` crashes the CLI at # startup). Fall back to the documented default here; the CLI # binding site is expected to log a warning when the configured # value is one of these spellings so users know the TUI+CLI # runtimes diverge on that shortcut (Copilot round-11 on #19835). if modifier_token in {"super", "win", "windows"}: return _DEFAULT_PT_KEY normalized_mod = _VOICE_MOD_ALIASES.get(modifier_token) if not normalized_mod: return _DEFAULT_PT_KEY # Single-char key: reject reserved-ctrl chords that the TUI would # also block at parse time, plus the mac-only alt reservation. if len(key_token) == 1: if normalized_mod == "c-" and key_token in _VOICE_RESERVED_CTRL_CHARS: return _DEFAULT_PT_KEY if ( normalized_mod == "a-" and sys.platform == "darwin" and key_token in _VOICE_RESERVED_ALT_CHARS_MAC ): return _DEFAULT_PT_KEY return f"{normalized_mod}{key_token}" # Multi-char key token must be a known named key; typos like # ``ctrl+spcae`` fall back to the default rather than being passed # through as ``c-spcae`` (which prompt_toolkit would reject). named = _VOICE_NAMED_KEYS.get(key_token) if not named: return _DEFAULT_PT_KEY return f"{normalized_mod}{named}" def format_voice_record_key_for_status(raw: Any) -> str: """Render ``voice.record_key`` for ``/voice status`` in CLI-friendly form. Mirrors the TUI's ``formatVoiceRecordKey``: returns ``Ctrl+B`` / ``Alt+Space`` / ``Ctrl+Enter``. Malformed configs surface as the documented default so status never advertises a shortcut that won't bind (Copilot round-10 on #19835). """ normalized = normalize_voice_record_key_for_prompt_toolkit(raw) if normalized.startswith("c-"): prefix, key = "Ctrl+", normalized[2:] elif normalized.startswith("a-"): prefix, key = "Alt+", normalized[2:] elif "+" in normalized: # ``super+`` / ``win+`` — CLI won't bind them, but # render in title case so status output is still readable. mod, key = normalized.split("+", 1) prefix = mod[0].upper() + mod[1:] + "+" else: return "Ctrl+B" if not key: return prefix.rstrip("+") if len(key) == 1: return prefix + key.upper() return prefix + key[0].upper() + key[1:] from tools.voice_mode import ( create_audio_recorder, is_whisper_hallucination, play_audio_file, transcribe_recording, ) logger = logging.getLogger(__name__) def _debug(msg: str) -> None: """Emit a debug breadcrumb when HERMES_VOICE_DEBUG=1. Goes to stderr so the TUI gateway wraps it as a gateway.stderr event, which createGatewayEventHandler shows as an Activity line — exactly what we need to diagnose "why didn't the loop auto-restart?" in the user's real terminal without shipping a separate debug RPC. Any OSError / BrokenPipeError is swallowed because this fires from background threads (silence callback, TTS daemon, beep) where a broken stderr pipe must not kill the whole gateway — the main command pipe (stdin+stdout) is what actually matters. """ if os.environ.get("HERMES_VOICE_DEBUG", "").strip() != "1": return try: print(f"[voice] {msg}", file=sys.stderr, flush=True) except (BrokenPipeError, OSError): pass def _beeps_enabled() -> bool: """CLI parity: voice.beep_enabled in config.yaml (default True).""" try: from hermes_cli.config import load_config voice_cfg = load_config().get("voice", {}) if isinstance(voice_cfg, dict): return bool(voice_cfg.get("beep_enabled", True)) except Exception: pass return True def _play_beep(frequency: int, count: int = 1) -> None: """Audible cue matching cli.py's record/stop beeps. 880 Hz single-beep on start (cli.py:_voice_start_recording line 7532), 660 Hz double-beep on stop (cli.py:_voice_stop_and_transcribe line 7585). Best-effort — sounddevice failures are silently swallowed so the voice loop never breaks because a speaker was unavailable. """ if not _beeps_enabled(): return try: from tools.voice_mode import play_beep play_beep(frequency=frequency, count=count) except Exception as e: _debug(f"beep {frequency}Hz failed: {e}") # ── Push-to-talk state ─────────────────────────────────────────────── _recorder = None _recorder_lock = threading.Lock() # ── Continuous (VAD) state ─────────────────────────────────────────── _continuous_lock = threading.Lock() _continuous_active = False _continuous_recorder: Any = None # ── TTS-vs-STT feedback guard ──────────────────────────────────────── # When TTS plays the agent reply over the speakers, the live microphone # picks it up and transcribes the agent's own voice as user input — an # infinite loop the agent happily joins ("Ha, looks like we're in a loop"). # This Event mirrors cli.py:_voice_tts_done: cleared while speak_text is # playing, set while silent. _continuous_on_silence waits on it before # re-arming the recorder, and speak_text itself cancels any live capture # before starting playback so the tail of the previous utterance doesn't # leak into the mic. _tts_playing = threading.Event() _tts_playing.set() # initially "not playing" _continuous_on_transcript: Optional[Callable[[str], None]] = None _continuous_on_status: Optional[Callable[[str], None]] = None _continuous_on_silent_limit: Optional[Callable[[], None]] = None _continuous_no_speech_count = 0 _CONTINUOUS_NO_SPEECH_LIMIT = 3 # ── Push-to-talk API ───────────────────────────────────────────────── def start_recording() -> None: """Begin capturing from the default input device (push-to-talk). Idempotent — calling again while a recording is in progress is a no-op. """ global _recorder with _recorder_lock: if _recorder is not None and getattr(_recorder, "is_recording", False): return rec = create_audio_recorder() rec.start() _recorder = rec def stop_and_transcribe() -> Optional[str]: """Stop the active push-to-talk recording, transcribe, return text. Returns ``None`` when no recording is active, when the microphone captured no speech, or when Whisper returned a known hallucination. """ global _recorder with _recorder_lock: rec = _recorder _recorder = None if rec is None: return None wav_path = rec.stop() if not wav_path: return None try: result = transcribe_recording(wav_path) except Exception as e: logger.warning("voice transcription failed: %s", e) return None finally: try: if os.path.isfile(wav_path): os.unlink(wav_path) except Exception: pass # transcribe_recording returns {"success": bool, "transcript": str, ...} # — matches cli.py:_voice_stop_and_transcribe's result.get("transcript"). if not result.get("success"): return None text = (result.get("transcript") or "").strip() if not text or is_whisper_hallucination(text): return None return text # ── Continuous (VAD) API ───────────────────────────────────────────── def start_continuous( on_transcript: Callable[[str], None], on_status: Optional[Callable[[str], None]] = None, on_silent_limit: Optional[Callable[[], None]] = None, silence_threshold: int = 200, silence_duration: float = 3.0, ) -> None: """Start a VAD-driven continuous recording loop. The loop calls ``on_transcript(text)`` each time speech is detected and transcribed successfully, then auto-restarts. After ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech picked up at all) the loop stops itself and calls ``on_silent_limit`` so the UI can reflect "voice off". Idempotent — calling while already active is a no-op. ``on_status`` is called with ``"listening"`` / ``"transcribing"`` / ``"idle"`` so the UI can show a live indicator. """ global _continuous_active, _continuous_recorder global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit global _continuous_no_speech_count with _continuous_lock: if _continuous_active: _debug("start_continuous: already active — no-op") return _continuous_active = True _continuous_on_transcript = on_transcript _continuous_on_status = on_status _continuous_on_silent_limit = on_silent_limit _continuous_no_speech_count = 0 if _continuous_recorder is None: _continuous_recorder = create_audio_recorder() _continuous_recorder._silence_threshold = silence_threshold _continuous_recorder._silence_duration = silence_duration rec = _continuous_recorder _debug( f"start_continuous: begin (threshold={silence_threshold}, duration={silence_duration}s)" ) # CLI parity: single 880 Hz beep *before* opening the stream — placing # the beep after stream.start() on macOS triggers a CoreAudio conflict # (cli.py:7528 comment). _play_beep(frequency=880, count=1) try: rec.start(on_silence_stop=_continuous_on_silence) except Exception as e: logger.error("failed to start continuous recording: %s", e) _debug(f"start_continuous: rec.start raised {type(e).__name__}: {e}") with _continuous_lock: _continuous_active = False raise if on_status: try: on_status("listening") except Exception: pass def stop_continuous() -> None: """Stop the active continuous loop and release the microphone. Idempotent — calling while not active is a no-op. Any in-flight transcription completes but its result is discarded (the callback checks ``_continuous_active`` before firing). """ global _continuous_active, _continuous_on_transcript global _continuous_on_status, _continuous_on_silent_limit global _continuous_recorder, _continuous_no_speech_count with _continuous_lock: if not _continuous_active: return _continuous_active = False rec = _continuous_recorder on_status = _continuous_on_status _continuous_on_transcript = None _continuous_on_status = None _continuous_on_silent_limit = None _continuous_no_speech_count = 0 if rec is not None: try: # cancel() (not stop()) discards buffered frames — the loop # is over, we don't want to transcribe a half-captured turn. rec.cancel() except Exception as e: logger.warning("failed to cancel recorder: %s", e) # Audible "recording stopped" cue (CLI parity: same 660 Hz × 2 the # silence-auto-stop path plays). _play_beep(frequency=660, count=2) if on_status: try: on_status("idle") except Exception: pass def is_continuous_active() -> bool: """Whether a continuous voice loop is currently running.""" with _continuous_lock: return _continuous_active def _continuous_on_silence() -> None: """AudioRecorder silence callback — runs in a daemon thread. Stops the current capture, transcribes, delivers the text via ``on_transcript``, and — if the loop is still active — starts the next capture. Three consecutive silent cycles end the loop. """ global _continuous_active, _continuous_no_speech_count _debug("_continuous_on_silence: fired") with _continuous_lock: if not _continuous_active: _debug("_continuous_on_silence: loop inactive — abort") return rec = _continuous_recorder on_transcript = _continuous_on_transcript on_status = _continuous_on_status on_silent_limit = _continuous_on_silent_limit if rec is None: _debug("_continuous_on_silence: no recorder — abort") return if on_status: try: on_status("transcribing") except Exception: pass wav_path = rec.stop() # Peak RMS is the critical diagnostic when stop() returns None despite # the VAD firing — tells us at a glance whether the mic was too quiet # for SILENCE_RMS_THRESHOLD (200) or the VAD + peak checks disagree. peak_rms = getattr(rec, "_peak_rms", -1) _debug( f"_continuous_on_silence: rec.stop -> {wav_path!r} (peak_rms={peak_rms})" ) # CLI parity: double 660 Hz beep after the stream stops (safe from the # CoreAudio conflict that blocks pre-start beeps). _play_beep(frequency=660, count=2) transcript: Optional[str] = None if wav_path: try: result = transcribe_recording(wav_path) # transcribe_recording returns {"success": bool, "transcript": str, # "error": str?} — NOT {"text": str}. Using the wrong key silently # produced empty transcripts even when Groq/local STT returned fine, # which masqueraded as "not hearing the user" to the caller. success = bool(result.get("success")) text = (result.get("transcript") or "").strip() err = result.get("error") _debug( f"_continuous_on_silence: transcribe -> success={success} " f"text={text!r} err={err!r}" ) if success and text and not is_whisper_hallucination(text): transcript = text except Exception as e: logger.warning("continuous transcription failed: %s", e) _debug(f"_continuous_on_silence: transcribe raised {type(e).__name__}: {e}") finally: try: if os.path.isfile(wav_path): os.unlink(wav_path) except Exception: pass with _continuous_lock: if not _continuous_active: # User stopped us while we were transcribing — discard. _debug("_continuous_on_silence: stopped during transcribe — no restart") return if transcript: _continuous_no_speech_count = 0 else: _continuous_no_speech_count += 1 should_halt = _continuous_no_speech_count >= _CONTINUOUS_NO_SPEECH_LIMIT no_speech = _continuous_no_speech_count if transcript and on_transcript: try: on_transcript(transcript) except Exception as e: logger.warning("on_transcript callback raised: %s", e) if should_halt: _debug(f"_continuous_on_silence: {no_speech} silent cycles — halting") with _continuous_lock: _continuous_active = False _continuous_no_speech_count = 0 if on_silent_limit: try: on_silent_limit() except Exception: pass try: rec.cancel() except Exception: pass if on_status: try: on_status("idle") except Exception: pass return # CLI parity (cli.py:10619-10621): wait for any in-flight TTS to # finish before re-arming the mic, then leave a small gap to avoid # catching the tail of the speaker output. Without this the voice # loop becomes a feedback loop — the agent's spoken reply lands # back in the mic and gets re-submitted. if not _tts_playing.is_set(): _debug("_continuous_on_silence: waiting for TTS to finish") _tts_playing.wait(timeout=60) import time as _time _time.sleep(0.3) # User may have stopped the loop during the wait. with _continuous_lock: if not _continuous_active: _debug("_continuous_on_silence: stopped while waiting for TTS") return # Restart for the next turn. _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})") _play_beep(frequency=880, count=1) try: rec.start(on_silence_stop=_continuous_on_silence) except Exception as e: logger.error("failed to restart continuous recording: %s", e) _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}") with _continuous_lock: _continuous_active = False return if on_status: try: on_status("listening") except Exception: pass # ── TTS API ────────────────────────────────────────────────────────── def speak_text(text: str) -> None: """Synthesize ``text`` with the configured TTS provider and play it. Mirrors cli.py:_voice_speak_response exactly — same markdown strip pipeline, same 4000-char cap, same explicit mp3 output path, same MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup of both extensions. Keeping these in sync means a voice-mode TTS session in the TUI sounds identical to one in the classic CLI. While playback is in flight the module-level _tts_playing Event is cleared so the continuous-recording loop knows to wait before re-arming the mic (otherwise the agent's spoken reply feedback-loops through the microphone and the agent ends up replying to itself). """ if not text or not text.strip(): return import re import tempfile import time # Cancel any live capture before we open the speakers — otherwise the # last ~200ms of the user's turn tail + the first syllables of our TTS # both end up in the next recording window. The continuous loop will # re-arm itself after _tts_playing flips back (see _continuous_on_silence). paused_recording = False with _continuous_lock: if ( _continuous_active and _continuous_recorder is not None and getattr(_continuous_recorder, "is_recording", False) ): try: _continuous_recorder.cancel() paused_recording = True except Exception as e: logger.warning("failed to pause recorder for TTS: %s", e) _tts_playing.clear() _debug(f"speak_text: TTS begin (paused_recording={paused_recording})") try: from tools.tts_tool import text_to_speech_tool tts_text = text[:4000] if len(text) > 4000 else text tts_text = re.sub(r'```[\s\S]*?```', ' ', tts_text) # fenced code blocks tts_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', tts_text) # [text](url) → text tts_text = re.sub(r'https?://\S+', '', tts_text) # bare URLs tts_text = re.sub(r'\*\*(.+?)\*\*', r'\1', tts_text) # bold tts_text = re.sub(r'\*(.+?)\*', r'\1', tts_text) # italic tts_text = re.sub(r'`(.+?)`', r'\1', tts_text) # inline code tts_text = re.sub(r'^#+\s*', '', tts_text, flags=re.MULTILINE) # headers tts_text = re.sub(r'^\s*[-*]\s+', '', tts_text, flags=re.MULTILINE) # list bullets tts_text = re.sub(r'---+', '', tts_text) # horizontal rules tts_text = re.sub(r'\n{3,}', '\n\n', tts_text) # excess newlines tts_text = tts_text.strip() if not tts_text: return # MP3 output path, pre-chosen so we can play the MP3 directly even # when text_to_speech_tool auto-converts to OGG for messaging # platforms. afplay's OGG support is flaky, MP3 always works. os.makedirs(os.path.join(tempfile.gettempdir(), "hermes_voice"), exist_ok=True) mp3_path = os.path.join( tempfile.gettempdir(), "hermes_voice", f"tts_{time.strftime('%Y%m%d_%H%M%S')}.mp3", ) _debug(f"speak_text: synthesizing {len(tts_text)} chars -> {mp3_path}") text_to_speech_tool(text=tts_text, output_path=mp3_path) if os.path.isfile(mp3_path) and os.path.getsize(mp3_path) > 0: _debug(f"speak_text: playing {mp3_path} ({os.path.getsize(mp3_path)} bytes)") play_audio_file(mp3_path) try: os.unlink(mp3_path) ogg_path = mp3_path.rsplit(".", 1)[0] + ".ogg" if os.path.isfile(ogg_path): os.unlink(ogg_path) except OSError: pass else: _debug(f"speak_text: TTS tool produced no audio at {mp3_path}") except Exception as e: logger.warning("Voice TTS playback failed: %s", e) _debug(f"speak_text raised {type(e).__name__}: {e}") finally: _tts_playing.set() _debug("speak_text: TTS done") # Re-arm the mic so the user can answer without pressing Ctrl+B. # Small delay lets the OS flush speaker output and afplay fully # release the audio device before sounddevice re-opens the input. if paused_recording: time.sleep(0.3) with _continuous_lock: if _continuous_active and _continuous_recorder is not None: try: _continuous_recorder.start( on_silence_stop=_continuous_on_silence ) _debug("speak_text: recording resumed after TTS") except Exception as e: logger.warning( "failed to resume recorder after TTS: %s", e )