fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace the stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool, so the import failed and
the try/except silently swallowed the ImportError, leaving streaming
TTS dead.

Bug B: Inject the voice-mode instruction as a user-message prefix
instead of modifying the system prompt. Changing ephemeral_system_prompt
mid-session invalidates the prompt cache. The concise-response hint is
now prepended to the user_message passed to run_conversation, while
conversation_history keeps the original text.

Minor: Add a force parameter to _vprint so that critical error messages
(max retries, non-retryable errors, API failures) are always shown,
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.
This commit is contained in:
0xbyt4
2026-03-10 03:43:03 +03:00
parent fc893f98f4
commit a78249230c
3 changed files with 361 additions and 29 deletions

40
cli.py
View File

@@ -3812,15 +3812,9 @@ class HermesCLI:
except Exception:
pass
# Append voice-mode system prompt for concise, conversational responses
self._voice_original_prompt = self.system_prompt
voice_instruction = (
"\n\n[Voice mode active] The user is speaking via voice input. "
"Keep responses concise and conversational — 2-3 sentences max unless "
"the user asks for detail. Avoid code blocks, markdown formatting, "
"and long lists. Respond naturally as in a spoken conversation."
)
self.system_prompt = (self.system_prompt or "") + voice_instruction
# Voice mode instruction is injected as a user message prefix (not a
# system prompt change) to avoid invalidating the prompt cache. See
# _voice_message_prefix property and its usage in _process_message().
tts_status = " (TTS enabled)" if self._voice_tts else ""
try:
@@ -3845,9 +3839,6 @@ class HermesCLI:
self._voice_tts = False
self._voice_continuous = False
# Restore original system prompt
if hasattr(self, '_voice_original_prompt'):
self.system_prompt = self._voice_original_prompt
_cprint(f"\n{_DIM}Voice mode disabled.{_RST}")
def _toggle_voice_tts(self):
@@ -4140,13 +4131,18 @@ class HermesCLI:
from tools.tts_tool import (
_load_tts_config as _load_tts_cfg,
_get_provider as _get_prov,
_HAS_ELEVENLABS as _el_ok,
_HAS_AUDIO as _audio_ok,
_import_elevenlabs,
_import_sounddevice,
stream_tts_to_speaker,
)
_tts_cfg = _load_tts_cfg()
if (_get_prov(_tts_cfg) == "elevenlabs" and _el_ok and _audio_ok):
if _get_prov(_tts_cfg) == "elevenlabs":
# Verify both ElevenLabs SDK and audio output are available
_import_elevenlabs()
_import_sounddevice()
use_streaming_tts = True
except (ImportError, OSError):
pass
except Exception:
pass
@@ -4177,10 +4173,22 @@ class HermesCLI:
if text_queue is not None:
text_queue.put(delta)
# When voice mode is active, prepend a brief instruction to the
# user message so the model responds concisely. This avoids
# modifying the system prompt (which would invalidate the prompt
# cache). The original message in conversation_history stays clean.
agent_message = message
if self._voice_mode and isinstance(message, str):
agent_message = (
"[Voice input — respond concisely and conversationally, "
"2-3 sentences max. No code blocks or markdown.] "
+ message
)
def run_agent():
nonlocal result
result = self.agent.run_conversation(
user_message=message,
user_message=agent_message,
conversation_history=self.conversation_history[:-1], # Exclude the message we just added
stream_callback=stream_callback,
task_id=self.session_id,