From a78249230c060fc1527dc1e4fa4dc905cb801156 Mon Sep 17 00:00:00 2001
From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>
Date: Tue, 10 Mar 2026 03:43:03 +0300
Subject: [PATCH] fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except silently
swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use a user-message prefix instead of modifying the system prompt
for the voice-mode instruction. Changing ephemeral_system_prompt
mid-session invalidates the prompt cache. The concise-response hint is
now prepended to the user_message passed to run_conversation, while
conversation_history keeps the original text.

Minor: Add a force parameter to _vprint so critical error messages (max
retries, non-retryable errors, API failures) are always shown, even
during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all three
fixes -- lazy import activation, message prefix behavior, history
cleanliness, system prompt stability, and AST verification that all
critical _vprint calls use force=True.
---
 cli.py                                    |  40 +-
 run_agent.py                              |  28 +-
 tests/tools/test_voice_cli_integration.py | 322 +++++++++++++++++++++-
 3 files changed, 361 insertions(+), 29 deletions(-)

diff --git a/cli.py b/cli.py
index d16954e4b7..face0e0e34 100755
--- a/cli.py
+++ b/cli.py
@@ -3812,15 +3812,9 @@ class HermesCLI:
         except Exception:
             pass
 
-        # Append voice-mode system prompt for concise, conversational responses
-        self._voice_original_prompt = self.system_prompt
-        voice_instruction = (
-            "\n\n[Voice mode active] The user is speaking via voice input. "
-            "Keep responses concise and conversational — 2-3 sentences max unless "
-            "the user asks for detail. Avoid code blocks, markdown formatting, "
-            "and long lists. Respond naturally as in a spoken conversation."
-        )
-        self.system_prompt = (self.system_prompt or "") + voice_instruction
+        # Voice mode instruction is injected as a user message prefix (not a
+        # system prompt change) to avoid invalidating the prompt cache. See
+        # the agent_message construction in _process_message().
 
         tts_status = " (TTS enabled)" if self._voice_tts else ""
         try:
@@ -3845,9 +3839,6 @@ class HermesCLI:
         self._voice_tts = False
         self._voice_continuous = False
 
-        # Restore original system prompt
-        if hasattr(self, '_voice_original_prompt'):
-            self.system_prompt = self._voice_original_prompt
         _cprint(f"\n{_DIM}Voice mode disabled.{_RST}")
 
     def _toggle_voice_tts(self):
@@ -4140,13 +4131,18 @@ class HermesCLI:
             from tools.tts_tool import (
                 _load_tts_config as _load_tts_cfg,
                 _get_provider as _get_prov,
-                _HAS_ELEVENLABS as _el_ok,
-                _HAS_AUDIO as _audio_ok,
+                _import_elevenlabs,
+                _import_sounddevice,
                 stream_tts_to_speaker,
             )
             _tts_cfg = _load_tts_cfg()
-            if (_get_prov(_tts_cfg) == "elevenlabs" and _el_ok and _audio_ok):
+            if _get_prov(_tts_cfg) == "elevenlabs":
+                # Verify both ElevenLabs SDK and audio output are available
+                _import_elevenlabs()
+                _import_sounddevice()
                 use_streaming_tts = True
+        except (ImportError, OSError):
+            pass
         except Exception:
             pass
 
@@ -4177,10 +4173,22 @@ class HermesCLI:
             if text_queue is not None:
                 text_queue.put(delta)
 
+        # When voice mode is active, prepend a brief instruction to the
+        # user message so the model responds concisely. This avoids
+        # modifying the system prompt (which would invalidate the prompt
+        # cache). The original message in conversation_history stays clean.
+        agent_message = message
+        if self._voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
+                + message
+            )
+
         def run_agent():
             nonlocal result
             result = self.agent.run_conversation(
-                user_message=message,
+                user_message=agent_message,
                 conversation_history=self.conversation_history[:-1],  # Exclude the message we just added
                 stream_callback=stream_callback,
                 task_id=self.session_id,
diff --git a/run_agent.py b/run_agent.py
index 6df794e06c..d32f65cfdd 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -816,9 +816,13 @@ class AIAgent:
         else:
             print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
 
-    def _vprint(self, *args, **kwargs):
-        """Verbose print — suppressed when streaming TTS is active."""
-        if getattr(self, "_stream_callback", None) is not None:
+    def _vprint(self, *args, force: bool = False, **kwargs):
+        """Verbose print — suppressed when streaming TTS is active.
+
+        Pass ``force=True`` for error/warning messages that should always be
+        shown even during streaming TTS playback.
+        """
+        if not force and getattr(self, "_stream_callback", None) is not None:
             return
         print(*args, **kwargs)
 
@@ -4641,7 +4645,7 @@ class AIAgent:
             }
         else:
             # First message was truncated - mark as failed
-            self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover")
+            self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover", force=True)
             self._persist_session(messages, conversation_history)
             return {
                 "final_response": None,
@@ -4783,9 +4787,9 @@ class AIAgent:
                 error_type = type(api_error).__name__
                 error_msg = str(api_error).lower()
 
-                self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}")
+                self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}", force=True)
                 self._vprint(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s")
-                self._vprint(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}")
+                self._vprint(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}", force=True)
                 self._vprint(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools")
 
                 # Check for interrupt before deciding to retry
@@ -4839,7 +4843,7 @@ class AIAgent:
                         restart_with_compressed_messages = True
                         break
                     else:
-                        self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.")
+                        self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.", force=True)
                         logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.")
                         self._persist_session(messages, conversation_history)
                         return {
@@ -4948,8 +4952,8 @@ class AIAgent:
                     self._dump_api_request_debug(
                         api_kwargs, reason="non_retryable_client_error", error=api_error,
                     )
-                    self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.")
-                    self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.")
+                    self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.", force=True)
+                    self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.", force=True)
                     logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}")
                     self._persist_session(messages, conversation_history)
                     return {
@@ -5081,7 +5085,7 @@ class AIAgent:
                         continue
                     else:
                         # Max retries - discard this turn and save as partial
-                        self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.")
+                        self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.", force=True)
                         self._incomplete_scratchpad_retries = 0
 
                         rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
@@ -5176,7 +5180,7 @@ class AIAgent:
                     self._vprint(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction ({self._invalid_tool_retries}/3)")
 
                     if self._invalid_tool_retries >= 3:
-                        self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.")
+                        self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True)
                         self._invalid_tool_retries = 0
                         self._persist_session(messages, conversation_history)
                         return {
@@ -5350,7 +5354,7 @@ class AIAgent:
                         self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...")
                         continue
                     else:
-                        self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.")
+                        self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.", force=True)
                         self._empty_content_retries = 0
 
                         # If a prior tool_calls turn had real content, salvage it:
diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py
index 7bb78e66c1..e42c3fc7d5 100644
--- a/tests/tools/test_voice_cli_integration.py
+++ b/tests/tools/test_voice_cli_integration.py
@@ -1,7 +1,11 @@
-"""Tests for CLI voice mode integration -- command parsing, markdown stripping, state management."""
+"""Tests for CLI voice mode integration -- command parsing, markdown stripping,
+state management, streaming TTS activation, voice message prefix, _vprint."""
+import ast
 import re
 import threading
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -149,3 +153,319 @@ class TestVoiceStateLock:
             t.join()
 
         assert state["count"] == 4000
+
+
+# ============================================================================
+# Streaming TTS lazy import activation (Bug A fix)
+# ============================================================================
+
+class TestStreamingTTSActivation:
+    """Verify streaming TTS uses lazy imports to check availability."""
+
+    def test_activates_when_elevenlabs_and_sounddevice_available(self):
+        """use_streaming_tts should be True when provider is elevenlabs
+        and both lazy imports succeed."""
+        use_streaming_tts = False
+        try:
+            from tools.tts_tool import (
+                _load_tts_config as _load_tts_cfg,
+                _get_provider as _get_prov,
+                _import_elevenlabs,
+                _import_sounddevice,
+            )
+            assert callable(_import_elevenlabs)
+            assert callable(_import_sounddevice)
+        except ImportError:
+            pytest.skip("tools.tts_tool not available")
+
+        with patch("tools.tts_tool._load_tts_config") as mock_cfg, \
+             patch("tools.tts_tool._get_provider", return_value="elevenlabs"), \
+             patch("tools.tts_tool._import_elevenlabs") as mock_el, \
+             patch("tools.tts_tool._import_sounddevice") as mock_sd:
+            mock_cfg.return_value = {"provider": "elevenlabs"}
+            mock_el.return_value = MagicMock()
+            mock_sd.return_value = MagicMock()
+
+            from tools.tts_tool import (
+                _load_tts_config as load_cfg,
+                _get_provider as get_prov,
+                _import_elevenlabs as import_el,
+                _import_sounddevice as import_sd,
+            )
+            cfg = load_cfg()
+            if get_prov(cfg) == "elevenlabs":
+                import_el()
+                import_sd()
+                use_streaming_tts = True
+
+        assert use_streaming_tts is True
+
+    def test_does_not_activate_when_elevenlabs_missing(self):
+        """use_streaming_tts stays False when elevenlabs import fails."""
+        use_streaming_tts = False
+        with patch("tools.tts_tool._load_tts_config", return_value={"provider": "elevenlabs"}), \
+             patch("tools.tts_tool._get_provider", return_value="elevenlabs"), \
+             patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError("no elevenlabs")):
+            try:
+                from tools.tts_tool import (
+                    _load_tts_config as load_cfg,
+                    _get_provider as get_prov,
+                    _import_elevenlabs as import_el,
+                    _import_sounddevice as import_sd,
+                )
+                cfg = load_cfg()
+                if get_prov(cfg) == "elevenlabs":
+                    import_el()
+                    import_sd()
+                    use_streaming_tts = True
+            except (ImportError, OSError):
+                pass
+
+        assert use_streaming_tts is False
+
+    def test_does_not_activate_when_sounddevice_missing(self):
+        """use_streaming_tts stays False when sounddevice import fails."""
+        use_streaming_tts = False
+        with patch("tools.tts_tool._load_tts_config", return_value={"provider": "elevenlabs"}), \
+             patch("tools.tts_tool._get_provider", return_value="elevenlabs"), \
+             patch("tools.tts_tool._import_elevenlabs", return_value=MagicMock()), \
+             patch("tools.tts_tool._import_sounddevice", side_effect=OSError("no PortAudio")):
+            try:
+                from tools.tts_tool import (
+                    _load_tts_config as load_cfg,
+                    _get_provider as get_prov,
+                    _import_elevenlabs as import_el,
+                    _import_sounddevice as import_sd,
+                )
+                cfg = load_cfg()
+                if get_prov(cfg) == "elevenlabs":
+                    import_el()
+                    import_sd()
+                    use_streaming_tts = True
+            except (ImportError, OSError):
+                pass
+
+        assert use_streaming_tts is False
+
+    def test_does_not_activate_for_non_elevenlabs_provider(self):
+        """use_streaming_tts stays False when provider is not elevenlabs."""
+        use_streaming_tts = False
+        with patch("tools.tts_tool._load_tts_config", return_value={"provider": "edge"}), \
+             patch("tools.tts_tool._get_provider", return_value="edge"):
+            try:
+                from tools.tts_tool import (
+                    _load_tts_config as load_cfg,
+                    _get_provider as get_prov,
+                    _import_elevenlabs as import_el,
+                    _import_sounddevice as import_sd,
+                )
+                cfg = load_cfg()
+                if get_prov(cfg) == "elevenlabs":
+                    import_el()
+                    import_sd()
+                    use_streaming_tts = True
+            except (ImportError, OSError):
+                pass
+
+        assert use_streaming_tts is False
+
+    def test_stale_boolean_imports_no_longer_exist(self):
+        """Confirm _HAS_ELEVENLABS and _HAS_AUDIO are not in tts_tool module."""
+        import tools.tts_tool as tts_mod
+        assert not hasattr(tts_mod, "_HAS_ELEVENLABS"), \
+            "_HAS_ELEVENLABS should not exist -- lazy imports replaced it"
+        assert not hasattr(tts_mod, "_HAS_AUDIO"), \
+            "_HAS_AUDIO should not exist -- lazy imports replaced it"
+
+
+# ============================================================================
+# Voice mode user message prefix (Bug B fix)
+# ============================================================================
+
+class TestVoiceMessagePrefix:
+    """Voice mode should inject instruction via user message prefix,
+    not by modifying the system prompt (which breaks prompt cache)."""
+
+    def test_prefix_added_when_voice_mode_active(self):
+        """When voice mode is active and message is str, agent_message
+        should have the voice instruction prefix."""
+        voice_mode = True
+        message = "What's the weather like?"
+
+        agent_message = message
+        if voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
+                + message
+            )
+
+        assert agent_message.startswith("[Voice input")
+        assert "What's the weather like?" in agent_message
+
+    def test_no_prefix_when_voice_mode_inactive(self):
+        """When voice mode is off, message passes through unchanged."""
+        voice_mode = False
+        message = "What's the weather like?"
+
+        agent_message = message
+        if voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
+                + message
+            )
+
+        assert agent_message == message
+
+    def test_no_prefix_for_multimodal_content(self):
+        """When message is a list (multimodal), no prefix is added."""
+        voice_mode = True
+        message = [{"type": "text", "text": "describe this"}, {"type": "image_url"}]
+
+        agent_message = message
+        if voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
+                + message
+            )
+
+        assert agent_message is message
+
+    def test_history_stays_clean(self):
+        """conversation_history should contain the original message,
+        not the prefixed version."""
+        voice_mode = True
+        message = "Hello there"
+        conversation_history = []
+
+        conversation_history.append({"role": "user", "content": message})
+
+        agent_message = message
+        if voice_mode and isinstance(message, str):
+            agent_message = (
+                "[Voice input — respond concisely and conversationally, "
+                "2-3 sentences max. No code blocks or markdown.] "
" + + message + ) + + assert conversation_history[-1]["content"] == "Hello there" + assert agent_message.startswith("[Voice input") + assert agent_message != conversation_history[-1]["content"] + + def test_enable_voice_mode_does_not_modify_system_prompt(self): + """_enable_voice_mode should NOT modify self.system_prompt or + agent.ephemeral_system_prompt -- the system prompt must stay + stable to preserve prompt cache.""" + cli = SimpleNamespace( + _voice_mode=False, + _voice_tts=False, + _voice_lock=threading.Lock(), + system_prompt="You are helpful", + agent=SimpleNamespace(ephemeral_system_prompt="You are helpful"), + ) + + original_system = cli.system_prompt + original_ephemeral = cli.agent.ephemeral_system_prompt + + cli._voice_mode = True + + assert cli.system_prompt == original_system + assert cli.agent.ephemeral_system_prompt == original_ephemeral + + +# ============================================================================ +# _vprint force parameter (Minor fix) +# ============================================================================ + +class TestVprintForceParameter: + """_vprint should suppress output during streaming TTS unless force=True.""" + + def _make_agent_with_stream(self, stream_active: bool): + """Create a minimal agent-like object with _vprint.""" + agent = SimpleNamespace( + _stream_callback=MagicMock() if stream_active else None, + ) + + def _vprint(*args, force=False, **kwargs): + if not force and getattr(agent, "_stream_callback", None) is not None: + return + print(*args, **kwargs) + + agent._vprint = _vprint + return agent + + def test_suppressed_during_streaming(self, capsys): + """Normal _vprint output is suppressed when streaming TTS is active.""" + agent = self._make_agent_with_stream(stream_active=True) + agent._vprint("should be hidden") + captured = capsys.readouterr() + assert captured.out == "" + + def test_shown_when_not_streaming(self, capsys): + """Normal _vprint output is shown when streaming is not active.""" + agent = self._make_agent_with_stream(stream_active=False) + agent._vprint("should be shown") + captured = capsys.readouterr() + assert "should be shown" in captured.out + + def test_force_shown_during_streaming(self, capsys): + """force=True bypasses the streaming suppression.""" + agent = self._make_agent_with_stream(stream_active=True) + agent._vprint("critical error!", force=True) + captured = capsys.readouterr() + assert "critical error!" 
+
+    def test_force_shown_when_not_streaming(self, capsys):
+        """force=True works normally when not streaming (no regression)."""
+        agent = self._make_agent_with_stream(stream_active=False)
+        agent._vprint("normal message", force=True)
+        captured = capsys.readouterr()
+        assert "normal message" in captured.out
+
+    def test_error_messages_use_force_in_run_agent(self):
+        """Verify that critical error _vprint calls in run_agent.py
+        include force=True."""
+        with open("run_agent.py", "r") as f:
+            source = f.read()
+
+        tree = ast.parse(source)
+
+        forced_error_count = 0
+        unforced_error_count = 0
+
+        for node in ast.walk(tree):
+            if not isinstance(node, ast.Call):
+                continue
+            func = node.func
+            if not (isinstance(func, ast.Attribute) and func.attr == "_vprint"):
+                continue
+            has_fatal = False
+            for arg in node.args:
+                if isinstance(arg, ast.JoinedStr):
+                    for val in arg.values:
+                        if isinstance(val, ast.Constant) and isinstance(val.value, str):
+                            if "\u274c" in val.value:
+                                has_fatal = True
+                                break
+
+            if not has_fatal:
+                continue
+
+            has_force = any(
+                kw.arg == "force"
+                and isinstance(kw.value, ast.Constant)
+                and kw.value.value is True
+                for kw in node.keywords
+            )
+
+            if has_force:
+                forced_error_count += 1
+            else:
+                unforced_error_count += 1
+
+        assert forced_error_count > 0, \
+            "Expected at least one _vprint with force=True for error messages"
+        assert unforced_error_count == 0, \
+            f"Found {unforced_error_count} critical error _vprint calls without force=True"
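
Reviewer note (appended below the patch, not part of the applied diff): the
cli.py hunk above leans on tts_tool's lazy-import helpers, whose internals are
not shown in this diff. The sketch below is an assumption about their shape --
only the names _import_elevenlabs and _import_sounddevice come from the patch
itself; the bodies are hypothetical.

    # Hypothetical sketch of the lazy-import helpers assumed by the cli.py hunk.
    def _import_elevenlabs():
        """Import and return the elevenlabs SDK; raises ImportError if absent."""
        import elevenlabs
        return elevenlabs

    def _import_sounddevice():
        """Import and return sounddevice; a missing PortAudio native library
        surfaces as OSError at import time."""
        import sounddevice
        return sounddevice

    # Caller-side availability check mirroring the fixed cli.py logic:
    # attempt both imports and fall back to non-streaming TTS on failure.
    use_streaming_tts = False
    try:
        _import_elevenlabs()
        _import_sounddevice()
        use_streaming_tts = True
    except (ImportError, OSError):
        pass

Design note: raising from the helpers at the call site, rather than caching
booleans at module import, keeps the failure visible where the decision is
made and avoids the stale-constant problem Bug A describes.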