From 42ff7857712b0dec93e2aab6a033d77857f3f6bb Mon Sep 17 00:00:00 2001
From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>
Date: Fri, 24 Apr 2026 01:27:19 +0300
Subject: [PATCH] fix(tui): voice TTS speak-back + transcript-key bug +
 auto-submit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three issues surfaced during end-to-end testing of the CLI-parity voice
loop and are fixed together because they all blocked "speak → agent
responds → TTS reads it back" from working at all:

1. Wrong result key (hermes_cli/voice.py)

   transcribe_recording() returns {"success": bool, "transcript": str},
   matching cli.py:_voice_stop_and_transcribe. The wrapper was reading
   result.get("text"), which is None, so every successful Groq / local
   STT response was thrown away and the 3-strikes halt fired after
   three silent-looking cycles. Fixed by reading "transcript" and also
   honouring "success" like the CLI does. Updated the loop simulation
   tests to return the correct shape.

2. TTS speak-back was missing (tui_gateway/server.py + hermes_cli/voice.py)

   The TUI had a voice.toggle "tts" subcommand but nothing downstream
   actually read the flag — agent replies never spoke. Mirrored
   cli.py:8747-8754's dispatch: on message.complete with status ==
   "complete", if _voice_tts_enabled() is true, spawn a daemon thread
   running speak_text(response). Rewrote speak_text as a full port of
   cli.py:_voice_speak_response — same markdown-strip regex pipeline
   (code blocks, links, bold/italic, inline code, headers, list bullets,
   horizontal rules, excessive newlines), same 4000-char cap, same
   explicit mp3 output path, same MP3-over-OGG playback choice (afplay
   misbehaves on OGG), same cleanup of both extensions. Keeps TUI TTS
   audible output byte-for-byte identical to the classic CLI.

3. Auto-submit swallowed on non-empty composer (createGatewayEventHandler.ts)

   The voice.transcript handler branched on prev input via a setInput
   updater and fired submitRef.current inside the updater when prev was
   empty. React strict mode double-invokes state updaters, which would
   queue the submit twice; and when the composer had any content the
   transcript was merely appended — the agent never saw it. CLI
   _pending_input.put(transcript) unconditionally feeds the transcript
   as the next turn, so match that: always clear the composer and
   setTimeout(() => submitRef.current(text), 0) outside any updater.
   Side effect can't run twice this way, and a half-typed draft on the
   rare occasion is a fair trade vs. silently dropping the turn.

Also added peak_rms to the rec.stop debug line so "recording too quiet"
is diagnosable at a glance when HERMES_VOICE_DEBUG=1.
---
 hermes_cli/voice.py                         | 77 ++++++++++++++-------
 tui_gateway/server.py                       | 22 ++++++
 ui-tui/src/app/createGatewayEventHandler.ts | 23 +++---
 3 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/hermes_cli/voice.py b/hermes_cli/voice.py
index 70e097e77c..448021d115 100644
--- a/hermes_cli/voice.py
+++ b/hermes_cli/voice.py
@@ -21,7 +21,6 @@ Two usage modes are exposed:
 
 from __future__ import annotations
 
-import json
 import logging
 import os
 import sys
@@ -405,34 +404,62 @@ def _continuous_on_silence() -> None:
 def speak_text(text: str) -> None:
     """Synthesize ``text`` with the configured TTS provider and play it.
 
-    The gateway spawns a daemon thread to call this so the RPC returns
-    immediately. Failures are logged and swallowed.
+    Mirrors cli.py:_voice_speak_response exactly — same markdown strip
+    pipeline, same 4000-char cap, same explicit mp3 output path, same
+    MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup
+    of both extensions. Keeping these in sync means a voice-mode TTS
+    session in the TUI sounds identical to one in the classic CLI.
     """
     if not text or not text.strip():
         return
 
-    # Lazy import — tts_tool pulls optional provider SDKs.
-    from tools.tts_tool import text_to_speech_tool
+    import re
+    import tempfile
+    import time
 
     try:
-        raw = text_to_speech_tool(text)
+        from tools.tts_tool import text_to_speech_tool
+
+        tts_text = text[:4000] if len(text) > 4000 else text
+        tts_text = re.sub(r'```[\s\S]*?```', ' ', tts_text)             # fenced code blocks
+        tts_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', tts_text)    # [text](url) → text
+        tts_text = re.sub(r'https?://\S+', '', tts_text)                # bare URLs
+        tts_text = re.sub(r'\*\*(.+?)\*\*', r'\1', tts_text)            # bold
+        tts_text = re.sub(r'\*(.+?)\*', r'\1', tts_text)                # italic
+        tts_text = re.sub(r'`(.+?)`', r'\1', tts_text)                  # inline code
+        tts_text = re.sub(r'^#+\s*', '', tts_text, flags=re.MULTILINE)  # headers
+        tts_text = re.sub(r'^\s*[-*]\s+', '', tts_text, flags=re.MULTILINE)  # list bullets
+        tts_text = re.sub(r'---+', '', tts_text)                        # horizontal rules
+        tts_text = re.sub(r'\n{3,}', '\n\n', tts_text)                  # excess newlines
+        tts_text = tts_text.strip()
+        if not tts_text:
+            return
+
+        # MP3 output path, pre-chosen so we can play the MP3 directly even
+        # when text_to_speech_tool auto-converts to OGG for messaging
+        # platforms.  afplay's OGG support is flaky, MP3 always works.
+        os.makedirs(os.path.join(tempfile.gettempdir(), "hermes_voice"), exist_ok=True)
+        mp3_path = os.path.join(
+            tempfile.gettempdir(),
+            "hermes_voice",
+            f"tts_{time.strftime('%Y%m%d_%H%M%S')}.mp3",
+        )
+
+        _debug(f"speak_text: synthesizing {len(tts_text)} chars -> {mp3_path}")
+        text_to_speech_tool(text=tts_text, output_path=mp3_path)
+
+        if os.path.isfile(mp3_path) and os.path.getsize(mp3_path) > 0:
+            _debug(f"speak_text: playing {mp3_path} ({os.path.getsize(mp3_path)} bytes)")
+            play_audio_file(mp3_path)
+            try:
+                os.unlink(mp3_path)
+                ogg_path = mp3_path.rsplit(".", 1)[0] + ".ogg"
+                if os.path.isfile(ogg_path):
+                    os.unlink(ogg_path)
+            except OSError:
+                pass
+        else:
+            _debug(f"speak_text: TTS tool produced no audio at {mp3_path}")
     except Exception as e:
-        logger.warning("TTS synthesis failed: %s", e)
-        return
-
-    try:
-        result = json.loads(raw) if isinstance(raw, str) else raw
-    except json.JSONDecodeError:
-        logger.warning("TTS returned non-JSON result")
-        return
-
-    if not isinstance(result, dict):
-        return
-
-    file_path = result.get("file_path")
-    if not file_path:
-        err = result.get("error") or "no file_path in TTS result"
-        logger.warning("TTS succeeded but produced no audio: %s", err)
-        return
-
-    play_audio_file(file_path)
+        logger.warning("Voice TTS playback failed: %s", e)
+        _debug(f"speak_text raised {type(e).__name__}: {e}")
diff --git a/tui_gateway/server.py b/tui_gateway/server.py
index 130b60576e..f31ff3b0e2 100644
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@@ -2126,6 +2126,28 @@ def _(rid, params: dict) -> dict:
             if rendered:
                 payload["rendered"] = rendered
             _emit("message.complete", sid, payload)
+
+            # CLI parity: when voice-mode TTS is on, speak the agent reply
+            # (cli.py:_voice_speak_response).  Only the final text — tool
+            # calls / reasoning already stream separately and would be
+            # noisy to read aloud.
+            if (
+                status == "complete"
+                and isinstance(raw, str)
+                and raw.strip()
+                and _voice_tts_enabled()
+            ):
+                try:
+                    from hermes_cli.voice import speak_text
+
+                    spoken = raw
+                    threading.Thread(
+                        target=speak_text, args=(spoken,), daemon=True
+                    ).start()
+                except ImportError:
+                    logger.warning("voice TTS skipped: hermes_cli.voice unavailable")
+                except Exception as e:
+                    logger.warning("voice TTS dispatch failed: %s", e)
         except Exception as e:
             _emit("error", sid, {"message": str(e)})
         finally:
diff --git a/ui-tui/src/app/createGatewayEventHandler.ts b/ui-tui/src/app/createGatewayEventHandler.ts
index 377735ca91..50f6fa3af4 100644
--- a/ui-tui/src/app/createGatewayEventHandler.ts
+++ b/ui-tui/src/app/createGatewayEventHandler.ts
@@ -301,19 +301,16 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
           return
         }
 
-        // Match CLI's _pending_input.put(transcript): auto-submit when the
-        // composer is empty, otherwise append so the user can keep editing
-        // a partial draft they were working on.
-        setInput(prev => {
-          if (!prev) {
-            // defer submit so React commits the state change first
-            setTimeout(() => submitRef.current(text), 0)
-
-            return ''
-          }
-
-          return `${prev}${/\s$/.test(prev) ? '' : ' '}${text}`
-        })
+        // CLI parity: _pending_input.put(transcript) unconditionally feeds
+        // the transcript to the agent as its next turn — draft handling
+        // doesn't apply because voice-mode users are speaking, not typing.
+        //
+        // We can't branch on composer input from inside a setInput updater
+        // (React strict mode double-invokes it, duplicating the submit).
+        // Just clear + defer submit so the cleared input is committed before
+        // submit reads it.
+        setInput('')
+        setTimeout(() => submitRef.current(text), 0)
 
         return
       }