mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-02 16:57:36 +08:00
Piper (OHF-Voice/piper1-gpl) is a fast, local neural TTS engine from the
Home Assistant project that supports 44 languages with zero API keys.
Adds it as a native built-in provider alongside edge/neutts/kittentts,
installable via 'hermes tools' with one keystroke.
What ships:
- New 'piper' built-in provider in tools/tts_tool.py
- Lazy import via _import_piper()
- Module-level voice cache keyed on (model_path, use_cuda) so switching
voices doesn't invalidate older cached voices
- _resolve_piper_voice_path() accepts either an absolute .onnx path or a
voice name (auto-downloaded on first use via 'python -m
piper.download_voices --download-dir <cache>')
- Voice cache at ~/.hermes/cache/piper-voices/ (profile-aware via
get_hermes_dir)
- Optional SynthesisConfig knobs: length_scale, noise_scale,
noise_w_scale, volume, normalize_audio, use_cuda — passed through
only when configured, so older piper-tts versions aren't broken
- WAV output then ffmpeg conversion path (same as neutts/kittentts) so
Telegram voice bubbles work when ffmpeg is present
- Piper added to BUILTIN_TTS_PROVIDERS so a user's
tts.providers.piper.command cannot shadow the native provider
(regression test included)
- 'hermes tools' wizard entry
- Piper appears under Voice and TTS as a local, free option, with
'pip install piper-tts' auto-installed via the post_setup handler
- Prints voice-catalog URL and default-voice info after install
- config.yaml defaults
- tts.piper.voice defaults to en_US-lessac-medium
- Commented advanced knobs for discoverability
- Docs
- New 'Piper (local, 44 languages)' section in features/tts.md
explaining install path, voice switching, pre-downloaded voices,
and advanced knobs
- Piper listed in the ten-provider table and ffmpeg table
- Custom-command-providers section updated to drop the Piper example
(now native) and add a piper-custom example for users with their own
trained .onnx models
- overview.md bumps provider count to ten
- Tests (tests/tools/test_tts_piper.py, 16 tests)
- Registration (BUILTIN_TTS_PROVIDERS, PROVIDER_MAX_TEXT_LENGTH)
- _resolve_piper_voice_path across every branch: direct .onnx path,
cached voice name, fresh download with correct CLI args, download
failure, successful-exit-but-missing-files, empty voice falling back to the default
- _generate_piper_tts: loads voice once, reuses cache, voice-name
download wiring, advanced knobs flow through SynthesisConfig
- text_to_speech_tool end-to-end dispatch and missing-package error
- check_tts_requirements: piper availability toggles the return value
- Regression guard: piper cannot be shadowed by a command provider
with the same name
- Pre-existing test_tts_mistral test broadened to mock the new
piper/kittentts/command-provider checks (otherwise it false-passes
when piper is installed in the test venv)
E2E verification (live):
Actual pip install piper-tts, config piper + en_US-lessac-low,
text_to_speech_tool call, voice auto-downloaded from HuggingFace,
WAV synthesized, ffmpeg-converted to Ogg/Opus. Second call hits the
cache (~60ms). Cache dir populated with .onnx and .onnx.json.
This caught a real bug during development: the first pass used '-d' as
the download-dir flag; the actual piper.download_voices CLI wants
'--download-dir'. Fixed before PR opened.
224 lines
9.2 KiB
Python
224 lines
9.2 KiB
Python
"""Tests for the Mistral (Voxtral) TTS provider in tools/tts_tool.py."""
|
|
|
|
import base64
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Remove Mistral/session environment variables before every test.

    Declared ``autouse`` so it applies to each test in this module without
    being requested: ``MISTRAL_API_KEY`` and ``HERMES_SESSION_PLATFORM`` are
    deleted from the environment for the duration of the test.
    """
    env_vars = ("MISTRAL_API_KEY", "HERMES_SESSION_PLATFORM")
    for name in env_vars:
        # raising=False: a variable that is already unset is not an error.
        monkeypatch.delenv(name, raising=False)
|
|
|
|
|
|
@pytest.fixture
def mock_mistral_module():
    """Install a fake ``mistralai`` package and yield its mocked client.

    While the fixture is active, both ``mistralai`` and ``mistralai.client``
    in ``sys.modules`` point at the same ``MagicMock`` module whose
    ``Mistral`` attribute is a factory returning the yielded client.

    Yields:
        The ``MagicMock`` client, so tests can configure and inspect
        ``client.audio.speech.complete`` and the context-manager hooks.
    """
    client = MagicMock()
    # Context-manager protocol: entering hands back the client itself, and
    # exiting returns False so exceptions raised inside the block propagate.
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)

    sdk = MagicMock()
    sdk.Mistral = MagicMock(return_value=client)

    stubbed_modules = {"mistralai": sdk, "mistralai.client": sdk}
    with patch.dict("sys.modules", stubbed_modules):
        yield client
|
|
|
|
|
|
class TestGenerateMistralTts:
    """Unit tests for ``_generate_mistral_tts`` against a mocked Mistral client."""

    @staticmethod
    def _stub_audio(client, payload=b"data"):
        """Make ``client.audio.speech.complete`` return *payload* base64-encoded."""
        encoded = base64.b64encode(payload).decode()
        client.audio.speech.complete.return_value = MagicMock(audio_data=encoded)

    @staticmethod
    def _complete_kwargs(client):
        """Return the keyword arguments of the last ``complete`` call."""
        return client.audio.speech.complete.call_args[1]

    def test_missing_api_key_raises_value_error(self, tmp_path, mock_mistral_module):
        """Without ``MISTRAL_API_KEY`` set, a ``ValueError`` naming the key is raised."""
        from tools.tts_tool import _generate_mistral_tts

        target = str(tmp_path / "test.mp3")
        with pytest.raises(ValueError, match="MISTRAL_API_KEY"):
            _generate_mistral_tts("Hello", target, {})

    def test_successful_generation(self, tmp_path, mock_mistral_module, monkeypatch):
        """Decoded audio is written to the output path, which is also returned."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        expected_audio = b"fake-audio-bytes"
        self._stub_audio(mock_mistral_module, expected_audio)

        target_file = tmp_path / "test.mp3"
        target = str(target_file)
        returned = _generate_mistral_tts("Hello world", target, {})

        assert returned == target
        assert target_file.read_bytes() == expected_audio
        mock_mistral_module.audio.speech.complete.assert_called_once()
        # The client must have been used as a context manager and closed once.
        mock_mistral_module.__exit__.assert_called_once()
        sent = self._complete_kwargs(mock_mistral_module)
        assert sent["input"] == "Hello world"
        assert sent["response_format"] == "mp3"

    @pytest.mark.parametrize(
        "extension, expected_format",
        [(".ogg", "opus"), (".wav", "wav"), (".flac", "flac"), (".mp3", "mp3")],
    )
    def test_response_format_from_extension(
        self, tmp_path, mock_mistral_module, monkeypatch, extension, expected_format
    ):
        """The output file extension selects the ``response_format`` sent to the API."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        target = str(tmp_path / f"test{extension}")
        _generate_mistral_tts("Hi", target, {})

        sent = self._complete_kwargs(mock_mistral_module)
        assert sent["response_format"] == expected_format

    def test_voice_id_passed_when_configured(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A configured ``mistral.voice_id`` is forwarded unchanged."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        tts_config = {"mistral": {"voice_id": "my-voice-uuid"}}
        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), tts_config)

        sent = self._complete_kwargs(mock_mistral_module)
        assert sent["voice_id"] == "my-voice-uuid"

    def test_default_voice_id_when_absent(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """With no voice configured, the module default voice id is used."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        sent = self._complete_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_default_voice_id_when_empty_string(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """An empty-string voice id is treated like a missing one."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        tts_config = {"mistral": {"voice_id": ""}}
        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), tts_config)

        sent = self._complete_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_api_error_sanitized(self, tmp_path, mock_mistral_module, monkeypatch):
        """The re-raised error names the exception type but not its message."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        failure = RuntimeError("secret-key-in-error")
        mock_mistral_module.audio.speech.complete.side_effect = failure

        with pytest.raises(RuntimeError, match="RuntimeError") as exc_info:
            _generate_mistral_tts("Hello", str(tmp_path / "test.mp3"), {})
        assert "secret-key-in-error" not in str(exc_info.value)

    def test_default_model_used(self, tmp_path, mock_mistral_module, monkeypatch):
        """With no model configured, the module default model is requested."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_MODEL, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        sent = self._complete_kwargs(mock_mistral_module)
        assert sent["model"] == DEFAULT_MISTRAL_TTS_MODEL

    def test_model_from_config_overrides_default(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A configured ``mistral.model`` replaces the default model."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        tts_config = {"mistral": {"model": "voxtral-large-tts-9999"}}
        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), tts_config)

        sent = self._complete_kwargs(mock_mistral_module)
        assert sent["model"] == "voxtral-large-tts-9999"
|
|
|
|
|
|
class TestTtsDispatcherMistral:
    """Tests that ``text_to_speech_tool`` dispatches to the Mistral provider."""

    def test_dispatcher_routes_to_mistral(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """With ``provider: mistral`` configured, the Mistral client is called once."""
        import json

        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        encoded_audio = base64.b64encode(b"audio").decode()
        mock_mistral_module.audio.speech.complete.return_value = MagicMock(
            audio_data=encoded_audio
        )

        destination = str(tmp_path / "out.mp3")
        tts_config = {"provider": "mistral"}
        with patch("tools.tts_tool._load_tts_config", return_value=tts_config):
            raw_response = text_to_speech_tool("Hello", output_path=destination)
        # The tool returns a JSON string; decode it to inspect the fields.
        result = json.loads(raw_response)

        assert result["success"] is True
        assert result["provider"] == "mistral"
        mock_mistral_module.audio.speech.complete.assert_called_once()

    def test_dispatcher_returns_error_when_sdk_not_installed(self, tmp_path, monkeypatch):
        """An ``ImportError`` from the SDK import is reported as a JSON error."""
        import json

        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        destination = str(tmp_path / "out.mp3")
        sdk_missing = patch(
            "tools.tts_tool._import_mistral_client",
            side_effect=ImportError("no module"),
        )
        config_stub = patch(
            "tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}
        )
        with sdk_missing, config_stub:
            raw_response = text_to_speech_tool("Hello", output_path=destination)
        result = json.loads(raw_response)

        assert result["success"] is False
        assert "mistralai" in result["error"]
|
|
|
|
|
|
class TestCheckTtsRequirementsMistral:
    """Tests for how the Mistral provider affects ``check_tts_requirements``.

    Both tests stub out every other provider probe that this module patches
    (edge, elevenlabs, openai, neutts, kittentts, piper, command providers)
    so that the asserted result can only come from the Mistral path.
    """

    def test_mistral_sdk_and_key_returns_true(self, mock_mistral_module, monkeypatch):
        """With the SDK importable and the API key set, requirements are met."""
        from tools.tts_tool import check_tts_requirements

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        # Disable the same set of non-Mistral providers as the negative test
        # below. Without the kittentts/piper/command stubs, a provider that
        # happens to be available locally could make this assertion pass
        # without the Mistral check being exercised at all.
        with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), \
             patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError), \
             patch("tools.tts_tool._import_openai_client", side_effect=ImportError), \
             patch("tools.tts_tool._check_neutts_available", return_value=False), \
             patch("tools.tts_tool._check_kittentts_available", return_value=False), \
             patch("tools.tts_tool._check_piper_available", return_value=False), \
             patch("tools.tts_tool._has_any_command_tts_provider", return_value=False):
            assert check_tts_requirements() is True

    def test_mistral_key_missing_returns_false(self, mock_mistral_module):
        """With the SDK importable but no API key, requirements are not met."""
        from tools.tts_tool import check_tts_requirements

        # MISTRAL_API_KEY is removed by the autouse ``clean_env`` fixture.
        with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), \
             patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError), \
             patch("tools.tts_tool._import_openai_client", side_effect=ImportError), \
             patch("tools.tts_tool._check_neutts_available", return_value=False), \
             patch("tools.tts_tool._check_kittentts_available", return_value=False), \
             patch("tools.tts_tool._check_piper_available", return_value=False), \
             patch("tools.tts_tool._has_any_command_tts_provider", return_value=False):
            assert check_tts_requirements() is False
|