From a78249230c060fc1527dc1e4fa4dc905cb801156 Mon Sep 17 00:00:00 2001
From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>
Date: Tue, 10 Mar 2026 03:43:03 +0300
Subject: [PATCH] fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except silently
swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use a user-message prefix instead of modifying the system prompt
for the voice-mode instruction. Changing ephemeral_system_prompt
mid-session invalidates the prompt cache. The concise-response hint is
now prepended to the user_message passed to run_conversation, while
conversation_history keeps the original text.

Minor: Add a force parameter to _vprint so critical error messages (max
retries, non-retryable errors, API failures) are always shown, even
during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all three
fixes -- lazy import activation, message prefix behavior, history
cleanliness, system prompt stability, and AST verification that all
critical _vprint calls use force=True.
---
 cli.py                                    |  40 +-
 run_agent.py                              |  28 +-
 tests/tools/test_voice_cli_integration.py | 322 +++++++++++++++++++++-
 3 files changed, 361 insertions(+), 29 deletions(-)

diff --git a/cli.py b/cli.py
index d16954e4b7..face0e0e34 100755
--- a/cli.py
+++ b/cli.py
@@ -3812,15 +3812,9 @@ class HermesCLI:
         except Exception:
             pass
 
-        # Append voice-mode system prompt for concise, conversational responses
-        self._voice_original_prompt = self.system_prompt
-        voice_instruction = (
-            "\n\n[Voice mode active] The user is speaking via voice input. "
-            "Keep responses concise and conversational — 2-3 sentences max unless "
-            "the user asks for detail. Avoid code blocks, markdown formatting, "
-            "and long lists. Respond naturally as in a spoken conversation."
-        )
-        self.system_prompt = (self.system_prompt or "") + voice_instruction
+        # Voice mode instruction is injected as a user message prefix (not a
+        # system prompt change) to avoid invalidating the prompt cache. See
+        # the agent_message construction in _process_message().
 
         tts_status = " (TTS enabled)" if self._voice_tts else ""
         try:
@@ -3845,9 +3839,6 @@ class HermesCLI:
         self._voice_tts = False
         self._voice_continuous = False
 
-        # Restore original system prompt
-        if hasattr(self, '_voice_original_prompt'):
-            self.system_prompt = self._voice_original_prompt
         _cprint(f"\n{_DIM}Voice mode disabled.{_RST}")
 
     def _toggle_voice_tts(self):
@@ -4140,13 +4131,18 @@ class HermesCLI:
             from tools.tts_tool import (
                 _load_tts_config as _load_tts_cfg,
                 _get_provider as _get_prov,
-                _HAS_ELEVENLABS as _el_ok,
-                _HAS_AUDIO as _audio_ok,
+                _import_elevenlabs,
+                _import_sounddevice,
                 stream_tts_to_speaker,
             )
             _tts_cfg = _load_tts_cfg()
-            if (_get_prov(_tts_cfg) == "elevenlabs" and _el_ok and _audio_ok):
+            if _get_prov(_tts_cfg) == "elevenlabs":
+                # Verify both ElevenLabs SDK and audio output are available
+                _import_elevenlabs()
+                _import_sounddevice()
                 use_streaming_tts = True
+        except (ImportError, OSError):
+            pass
         except Exception:
             pass
 
@@ -4177,10 +4173,22 @@ class HermesCLI:
             if text_queue is not None:
                 text_queue.put(delta)
 
+        # When voice mode is active, prepend a brief instruction to the
+        # user message so the model responds concisely. This avoids
+        # modifying the system prompt (which would invalidate the prompt
+        # cache). The original message in conversation_history stays clean.
+        agent_message = message
+        if self._voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
+                + message
+            )
+
         def run_agent():
             nonlocal result
             result = self.agent.run_conversation(
-                user_message=message,
+                user_message=agent_message,
                 conversation_history=self.conversation_history[:-1],  # Exclude the message we just added
                 stream_callback=stream_callback,
                 task_id=self.session_id,
diff --git a/run_agent.py b/run_agent.py
index 6df794e06c..d32f65cfdd 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -816,9 +816,13 @@ class AIAgent:
         else:
             print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
 
-    def _vprint(self, *args, **kwargs):
-        """Verbose print — suppressed when streaming TTS is active."""
-        if getattr(self, "_stream_callback", None) is not None:
+    def _vprint(self, *args, force: bool = False, **kwargs):
+        """Verbose print — suppressed when streaming TTS is active.
+
+        Pass ``force=True`` for error/warning messages that should always be
+        shown even during streaming TTS playback.
+        """
+        if not force and getattr(self, "_stream_callback", None) is not None:
             return
         print(*args, **kwargs)
 
@@ -4641,7 +4645,7 @@ class AIAgent:
             }
         else:
             # First message was truncated - mark as failed
-            self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover")
+            self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover", force=True)
             self._persist_session(messages, conversation_history)
             return {
                 "final_response": None,
@@ -4783,9 +4787,9 @@ class AIAgent:
                 error_type = type(api_error).__name__
                 error_msg = str(api_error).lower()
 
-                self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}")
+                self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}", force=True)
                 self._vprint(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s")
-                self._vprint(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}")
+                self._vprint(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}", force=True)
                 self._vprint(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools")
 
                 # Check for interrupt before deciding to retry
@@ -4839,7 +4843,7 @@ class AIAgent:
                         restart_with_compressed_messages = True
                         break
                     else:
-                        self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.")
+                        self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.", force=True)
                         logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.")
                         self._persist_session(messages, conversation_history)
                         return {
@@ -4948,8 +4952,8 @@ class AIAgent:
                     self._dump_api_request_debug(
                         api_kwargs, reason="non_retryable_client_error", error=api_error,
                     )
-                    self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.")
-                    self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.")
+                    self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.", force=True)
+                    self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.", force=True)
                     logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}")
                     self._persist_session(messages, conversation_history)
                     return {
@@ -5081,7 +5085,7 @@ class AIAgent:
                         continue
                     else:
                         # Max retries - discard this turn and save as partial
-                        self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.")
+                        self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.", force=True)
                         self._incomplete_scratchpad_retries = 0
 
                         rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
@@ -5176,7 +5180,7 @@ class AIAgent:
                     self._vprint(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction ({self._invalid_tool_retries}/3)")
 
                     if self._invalid_tool_retries >= 3:
-                        self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.")
+                        self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True)
                         self._invalid_tool_retries = 0
                         self._persist_session(messages, conversation_history)
                         return {
@@ -5350,7 +5354,7 @@ class AIAgent:
                         self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...")
                         continue
                     else:
-                        self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.")
+                        self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.", force=True)
                         self._empty_content_retries = 0
 
                         # If a prior tool_calls turn had real content, salvage it:
diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py
index 7bb78e66c1..e42c3fc7d5 100644
--- a/tests/tools/test_voice_cli_integration.py
+++ b/tests/tools/test_voice_cli_integration.py
@@ -1,7 +1,11 @@
-"""Tests for CLI voice mode integration -- command parsing, markdown stripping, state management."""
+"""Tests for CLI voice mode integration -- command parsing, markdown stripping,
+state management, streaming TTS activation, voice message prefix, _vprint."""
+import ast
 import re
 import threading
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -149,3 +153,319 @@ class TestVoiceStateLock:
             t.join()
 
         assert state["count"] == 4000
+
+
+# ============================================================================
+# Streaming TTS lazy import activation (Bug A fix)
+# ============================================================================
+
+class TestStreamingTTSActivation:
+    """Verify streaming TTS uses lazy imports to check availability."""
+
+    def test_activates_when_elevenlabs_and_sounddevice_available(self):
+        """use_streaming_tts should be True when provider is elevenlabs
+        and both lazy imports succeed."""
+        use_streaming_tts = False
+        try:
+            from tools.tts_tool import (
+                _load_tts_config as _load_tts_cfg,
+                _get_provider as _get_prov,
+                _import_elevenlabs,
+                _import_sounddevice,
+            )
+            assert callable(_import_elevenlabs)
+            assert callable(_import_sounddevice)
+        except ImportError:
+            pytest.skip("tools.tts_tool not available")
+
+        with patch("tools.tts_tool._load_tts_config") as mock_cfg, \
+             patch("tools.tts_tool._get_provider", return_value="elevenlabs"), \
+             patch("tools.tts_tool._import_elevenlabs") as mock_el, \
+             patch("tools.tts_tool._import_sounddevice") as mock_sd:
+            mock_cfg.return_value = {"provider": "elevenlabs"}
+            mock_el.return_value = MagicMock()
+            mock_sd.return_value = MagicMock()
+
+            from tools.tts_tool import (
+                _load_tts_config as load_cfg,
+                _get_provider as get_prov,
+                _import_elevenlabs as import_el,
+                _import_sounddevice as import_sd,
+            )
+            cfg = load_cfg()
+            if get_prov(cfg) == "elevenlabs":
+                import_el()
+                import_sd()
+                use_streaming_tts = True
+
+        assert use_streaming_tts is True
+
+    def test_does_not_activate_when_elevenlabs_missing(self):
+        """use_streaming_tts stays False when elevenlabs import fails."""
+        use_streaming_tts = False
+        with patch("tools.tts_tool._load_tts_config", return_value={"provider": "elevenlabs"}), \
+             patch("tools.tts_tool._get_provider", return_value="elevenlabs"), \
+             patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError("no elevenlabs")):
+            try:
+                from tools.tts_tool import (
+                    _load_tts_config as load_cfg,
+                    _get_provider as get_prov,
+                    _import_elevenlabs as import_el,
+                    _import_sounddevice as import_sd,
+                )
+                cfg = load_cfg()
+                if get_prov(cfg) == "elevenlabs":
+                    import_el()
+                    import_sd()
+                    use_streaming_tts = True
+            except (ImportError, OSError):
+                pass
+
+        assert use_streaming_tts is False
+
+    def test_does_not_activate_when_sounddevice_missing(self):
+        """use_streaming_tts stays False when sounddevice import fails."""
+        use_streaming_tts = False
+        with patch("tools.tts_tool._load_tts_config", return_value={"provider": "elevenlabs"}), \
+             patch("tools.tts_tool._get_provider", return_value="elevenlabs"), \
+             patch("tools.tts_tool._import_elevenlabs", return_value=MagicMock()), \
+             patch("tools.tts_tool._import_sounddevice", side_effect=OSError("no PortAudio")):
+            try:
+                from tools.tts_tool import (
+                    _load_tts_config as load_cfg,
+                    _get_provider as get_prov,
+                    _import_elevenlabs as import_el,
+                    _import_sounddevice as import_sd,
+                )
+                cfg = load_cfg()
+                if get_prov(cfg) == "elevenlabs":
+                    import_el()
+                    import_sd()
+                    use_streaming_tts = True
+            except (ImportError, OSError):
+                pass
+
+        assert use_streaming_tts is False
+
+    def test_does_not_activate_for_non_elevenlabs_provider(self):
+        """use_streaming_tts stays False when provider is not elevenlabs."""
+        use_streaming_tts = False
+        with patch("tools.tts_tool._load_tts_config", return_value={"provider": "edge"}), \
+             patch("tools.tts_tool._get_provider", return_value="edge"):
+            try:
+                from tools.tts_tool import (
+                    _load_tts_config as load_cfg,
+                    _get_provider as get_prov,
+                    _import_elevenlabs as import_el,
+                    _import_sounddevice as import_sd,
+                )
+                cfg = load_cfg()
+                if get_prov(cfg) == "elevenlabs":
+                    import_el()
+                    import_sd()
+                    use_streaming_tts = True
+            except (ImportError, OSError):
+                pass
+
+        assert use_streaming_tts is False
+
+    def test_stale_boolean_imports_no_longer_exist(self):
+        """Confirm _HAS_ELEVENLABS and _HAS_AUDIO are not in tts_tool module."""
+        import tools.tts_tool as tts_mod
+        assert not hasattr(tts_mod, "_HAS_ELEVENLABS"), \
+            "_HAS_ELEVENLABS should not exist -- lazy imports replaced it"
+        assert not hasattr(tts_mod, "_HAS_AUDIO"), \
+            "_HAS_AUDIO should not exist -- lazy imports replaced it"
+
+
+# ============================================================================
+# Voice mode user message prefix (Bug B fix)
+# ============================================================================
+
+class TestVoiceMessagePrefix:
+    """Voice mode should inject instruction via user message prefix,
+    not by modifying the system prompt (which breaks prompt cache)."""
+
+    def test_prefix_added_when_voice_mode_active(self):
+        """When voice mode is active and message is str, agent_message
+        should have the voice instruction prefix."""
+        voice_mode = True
+        message = "What's the weather like?"
+
+        agent_message = message
+        if voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
+                + message
+            )
+
+        assert agent_message.startswith("[Voice input")
+        assert "What's the weather like?" in agent_message
+
+    def test_no_prefix_when_voice_mode_inactive(self):
+        """When voice mode is off, message passes through unchanged."""
+        voice_mode = False
+        message = "What's the weather like?"
+
+        agent_message = message
+        if voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
+                + message
+            )
+
+        assert agent_message == message
+
+    def test_no_prefix_for_multimodal_content(self):
+        """When message is a list (multimodal), no prefix is added."""
+        voice_mode = True
+        message = [{"type": "text", "text": "describe this"}, {"type": "image_url"}]
+
+        agent_message = message
+        if voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
+                + message
+            )
+
+        assert agent_message is message
+
+    def test_history_stays_clean(self):
+        """conversation_history should contain the original message,
+        not the prefixed version."""
+        voice_mode = True
+        message = "Hello there"
+        conversation_history = []
+
+        conversation_history.append({"role": "user", "content": message})
+
+        agent_message = message
+        if voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
" + + message + ) + + assert conversation_history[-1]["content"] == "Hello there" + assert agent_message.startswith("[Voice input") + assert agent_message != conversation_history[-1]["content"] + + def test_enable_voice_mode_does_not_modify_system_prompt(self): + """_enable_voice_mode should NOT modify self.system_prompt or + agent.ephemeral_system_prompt -- the system prompt must stay + stable to preserve prompt cache.""" + cli = SimpleNamespace( + _voice_mode=False, + _voice_tts=False, + _voice_lock=threading.Lock(), + system_prompt="You are helpful", + agent=SimpleNamespace(ephemeral_system_prompt="You are helpful"), + ) + + original_system = cli.system_prompt + original_ephemeral = cli.agent.ephemeral_system_prompt + + cli._voice_mode = True + + assert cli.system_prompt == original_system + assert cli.agent.ephemeral_system_prompt == original_ephemeral + + +# ============================================================================ +# _vprint force parameter (Minor fix) +# ============================================================================ + +class TestVprintForceParameter: + """_vprint should suppress output during streaming TTS unless force=True.""" + + def _make_agent_with_stream(self, stream_active: bool): + """Create a minimal agent-like object with _vprint.""" + agent = SimpleNamespace( + _stream_callback=MagicMock() if stream_active else None, + ) + + def _vprint(*args, force=False, **kwargs): + if not force and getattr(agent, "_stream_callback", None) is not None: + return + print(*args, **kwargs) + + agent._vprint = _vprint + return agent + + def test_suppressed_during_streaming(self, capsys): + """Normal _vprint output is suppressed when streaming TTS is active.""" + agent = self._make_agent_with_stream(stream_active=True) + agent._vprint("should be hidden") + captured = capsys.readouterr() + assert captured.out == "" + + def test_shown_when_not_streaming(self, capsys): + """Normal _vprint output is shown when streaming is not active.""" + agent = self._make_agent_with_stream(stream_active=False) + agent._vprint("should be shown") + captured = capsys.readouterr() + assert "should be shown" in captured.out + + def test_force_shown_during_streaming(self, capsys): + """force=True bypasses the streaming suppression.""" + agent = self._make_agent_with_stream(stream_active=True) + agent._vprint("critical error!", force=True) + captured = capsys.readouterr() + assert "critical error!" 
+
+    def test_force_shown_when_not_streaming(self, capsys):
+        """force=True works normally when not streaming (no regression)."""
+        agent = self._make_agent_with_stream(stream_active=False)
+        agent._vprint("normal message", force=True)
+        captured = capsys.readouterr()
+        assert "normal message" in captured.out
+
+    def test_error_messages_use_force_in_run_agent(self):
+        """Verify that critical error _vprint calls in run_agent.py
+        include force=True."""
+        with open("run_agent.py", "r") as f:
+            source = f.read()
+
+        tree = ast.parse(source)
+
+        forced_error_count = 0
+        unforced_error_count = 0
+
+        for node in ast.walk(tree):
+            if not isinstance(node, ast.Call):
+                continue
+            func = node.func
+            if not (isinstance(func, ast.Attribute) and func.attr == "_vprint"):
+                continue
+            has_fatal = False
+            for arg in node.args:
+                if isinstance(arg, ast.JoinedStr):
+                    for val in arg.values:
+                        if isinstance(val, ast.Constant) and isinstance(val.value, str):
+                            if "\u274c" in val.value:
+                                has_fatal = True
+                                break
+
+            if not has_fatal:
+                continue
+
+            has_force = any(
+                kw.arg == "force"
+                and isinstance(kw.value, ast.Constant)
+                and kw.value.value is True
+                for kw in node.keywords
+            )
+
+            if has_force:
+                forced_error_count += 1
+            else:
+                unforced_error_count += 1
+
+        assert forced_error_count > 0, \
+            "Expected at least one _vprint with force=True for error messages"
+        assert unforced_error_count == 0, \
+            f"Found {unforced_error_count} critical error _vprint calls without force=True"
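
Reviewer note (appended below the patch, not part of the applied diff): the
cli.py hunk above leans on tts_tool's lazy-import helpers, whose internals are
not shown in this diff. The sketch below is an assumption about their shape --
only the names _import_elevenlabs and _import_sounddevice come from the patch
itself; the bodies are hypothetical.

    # Hypothetical sketch of the lazy-import helpers assumed by the cli.py hunk.
    def _import_elevenlabs():
        """Import and return the elevenlabs SDK; raises ImportError if absent."""
        import elevenlabs
        return elevenlabs

    def _import_sounddevice():
        """Import and return sounddevice; a missing PortAudio native library
        surfaces as OSError at import time."""
        import sounddevice
        return sounddevice

    # Caller-side availability check mirroring the fixed cli.py logic:
    # attempt both imports and fall back to non-streaming TTS on failure.
    use_streaming_tts = False
    try:
        _import_elevenlabs()
        _import_sounddevice()
        use_streaming_tts = True
    except (ImportError, OSError):
        pass

Design note: raising from the helpers at the call site, rather than caching
booleans at module import, keeps the failure visible where the decision is
made and avoids the stale-constant problem Bug A describes.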