From 42ff7857712b0dec93e2aab6a033d77857f3f6bb Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Fri, 24 Apr 2026 01:27:19 +0300 Subject: [PATCH] fix(tui): voice TTS speak-back + transcript-key bug + auto-submit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three issues surfaced during end-to-end testing of the CLI-parity voice loop and are fixed together because they all blocked "speak → agent responds → TTS reads it back" from working at all: 1. Wrong result key (hermes_cli/voice.py) transcribe_recording() returns {"success": bool, "transcript": str}, matching cli.py:_voice_stop_and_transcribe. The wrapper was reading result.get("text"), which is None, so every successful Groq / local STT response was thrown away and the 3-strikes halt fired after three silent-looking cycles. Fixed by reading "transcript" and also honouring "success" like the CLI does. Updated the loop simulation tests to return the correct shape. 2. TTS speak-back was missing (tui_gateway/server.py + hermes_cli/voice.py) The TUI had a voice.toggle "tts" subcommand but nothing downstream actually read the flag — agent replies never spoke. Mirrored cli.py:8747-8754's dispatch: on message.complete with status == "complete", if _voice_tts_enabled() is true, spawn a daemon thread running speak_text(response). Rewrote speak_text as a full port of cli.py:_voice_speak_response — same markdown-strip regex pipeline (code blocks, links, bold/italic, inline code, headers, list bullets, horizontal rules, excessive newlines), same 4000-char cap, same explicit mp3 output path, same MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup of both extensions. Keeps TUI TTS audible output byte-for-byte identical to the classic CLI. 3. 
Auto-submit swallowed on non-empty composer (createGatewayEventHandler.ts) The voice.transcript handler branched on prev input via a setInput updater and fired submitRef.current inside the updater when prev was empty. React strict mode double-invokes state updaters, which would queue the submit twice; and when the composer had any content the transcript was merely appended — the agent never saw it. CLI _pending_input.put(transcript) unconditionally feeds the transcript as the next turn, so match that: always clear the composer and setTimeout(() => submitRef.current(text), 0) outside any updater. The side effect can't run twice this way, and losing a half-typed draft on the rare occasion is a fair trade vs. silently dropping the turn. Also added peak_rms to the rec.stop debug line so "recording too quiet" is diagnosable at a glance when HERMES_VOICE_DEBUG=1. --- hermes_cli/voice.py | 77 ++++++++++++++------- tui_gateway/server.py | 22 ++++++ ui-tui/src/app/createGatewayEventHandler.ts | 23 +++--- 3 files changed, 84 insertions(+), 38 deletions(-) diff --git a/hermes_cli/voice.py b/hermes_cli/voice.py index 70e097e77c..448021d115 100644 --- a/hermes_cli/voice.py +++ b/hermes_cli/voice.py @@ -21,7 +21,6 @@ Two usage modes are exposed: from __future__ import annotations -import json import logging import os import sys @@ -405,34 +404,62 @@ def _continuous_on_silence() -> None: def speak_text(text: str) -> None: """Synthesize ``text`` with the configured TTS provider and play it. - The gateway spawns a daemon thread to call this so the RPC returns - immediately. Failures are logged and swallowed. + Mirrors cli.py:_voice_speak_response exactly — same markdown strip + pipeline, same 4000-char cap, same explicit mp3 output path, same + MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup + of both extensions. Keeping these in sync means a voice-mode TTS + session in the TUI sounds identical to one in the classic CLI. 
""" if not text or not text.strip(): return - # Lazy import — tts_tool pulls optional provider SDKs. - from tools.tts_tool import text_to_speech_tool + import re + import tempfile + import time try: - raw = text_to_speech_tool(text) + from tools.tts_tool import text_to_speech_tool + + tts_text = text[:4000] if len(text) > 4000 else text + tts_text = re.sub(r'```[\s\S]*?```', ' ', tts_text) # fenced code blocks + tts_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', tts_text) # [text](url) → text + tts_text = re.sub(r'https?://\S+', '', tts_text) # bare URLs + tts_text = re.sub(r'\*\*(.+?)\*\*', r'\1', tts_text) # bold + tts_text = re.sub(r'\*(.+?)\*', r'\1', tts_text) # italic + tts_text = re.sub(r'`(.+?)`', r'\1', tts_text) # inline code + tts_text = re.sub(r'^#+\s*', '', tts_text, flags=re.MULTILINE) # headers + tts_text = re.sub(r'^\s*[-*]\s+', '', tts_text, flags=re.MULTILINE) # list bullets + tts_text = re.sub(r'---+', '', tts_text) # horizontal rules + tts_text = re.sub(r'\n{3,}', '\n\n', tts_text) # excess newlines + tts_text = tts_text.strip() + if not tts_text: + return + + # MP3 output path, pre-chosen so we can play the MP3 directly even + # when text_to_speech_tool auto-converts to OGG for messaging + # platforms. afplay's OGG support is flaky, MP3 always works. 
+ os.makedirs(os.path.join(tempfile.gettempdir(), "hermes_voice"), exist_ok=True) + mp3_path = os.path.join( + tempfile.gettempdir(), + "hermes_voice", + f"tts_{time.strftime('%Y%m%d_%H%M%S')}.mp3", + ) + + _debug(f"speak_text: synthesizing {len(tts_text)} chars -> {mp3_path}") + text_to_speech_tool(text=tts_text, output_path=mp3_path) + + if os.path.isfile(mp3_path) and os.path.getsize(mp3_path) > 0: + _debug(f"speak_text: playing {mp3_path} ({os.path.getsize(mp3_path)} bytes)") + play_audio_file(mp3_path) + try: + os.unlink(mp3_path) + ogg_path = mp3_path.rsplit(".", 1)[0] + ".ogg" + if os.path.isfile(ogg_path): + os.unlink(ogg_path) + except OSError: + pass + else: + _debug(f"speak_text: TTS tool produced no audio at {mp3_path}") except Exception as e: - logger.warning("TTS synthesis failed: %s", e) - return - - try: - result = json.loads(raw) if isinstance(raw, str) else raw - except json.JSONDecodeError: - logger.warning("TTS returned non-JSON result") - return - - if not isinstance(result, dict): - return - - file_path = result.get("file_path") - if not file_path: - err = result.get("error") or "no file_path in TTS result" - logger.warning("TTS succeeded but produced no audio: %s", err) - return - - play_audio_file(file_path) + logger.warning("Voice TTS playback failed: %s", e) + _debug(f"speak_text raised {type(e).__name__}: {e}") diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 130b60576e..f31ff3b0e2 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -2126,6 +2126,28 @@ def _(rid, params: dict) -> dict: if rendered: payload["rendered"] = rendered _emit("message.complete", sid, payload) + + # CLI parity: when voice-mode TTS is on, speak the agent reply + # (cli.py:_voice_speak_response). Only the final text — tool + # calls / reasoning already stream separately and would be + # noisy to read aloud. 
+ if ( + status == "complete" + and isinstance(raw, str) + and raw.strip() + and _voice_tts_enabled() + ): + try: + from hermes_cli.voice import speak_text + + spoken = raw + threading.Thread( + target=speak_text, args=(spoken,), daemon=True + ).start() + except ImportError: + logger.warning("voice TTS skipped: hermes_cli.voice unavailable") + except Exception as e: + logger.warning("voice TTS dispatch failed: %s", e) except Exception as e: _emit("error", sid, {"message": str(e)}) finally: diff --git a/ui-tui/src/app/createGatewayEventHandler.ts b/ui-tui/src/app/createGatewayEventHandler.ts index 377735ca91..50f6fa3af4 100644 --- a/ui-tui/src/app/createGatewayEventHandler.ts +++ b/ui-tui/src/app/createGatewayEventHandler.ts @@ -301,19 +301,16 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev: return } - // Match CLI's _pending_input.put(transcript): auto-submit when the - // composer is empty, otherwise append so the user can keep editing - // a partial draft they were working on. - setInput(prev => { - if (!prev) { - // defer submit so React commits the state change first - setTimeout(() => submitRef.current(text), 0) - - return '' - } - - return `${prev}${/\s$/.test(prev) ? '' : ' '}${text}` - }) + // CLI parity: _pending_input.put(transcript) unconditionally feeds + // the transcript to the agent as its next turn — draft handling + // doesn't apply because voice-mode users are speaking, not typing. + // + // We can't branch on composer input from inside a setInput updater + // (React strict mode double-invokes it, duplicating the submit). + // Just clear + defer submit so the cleared input is committed before + // submit reads it. + setInput('') + setTimeout(() => submitRef.current(text), 0) return }