Compare commits

...

1 Commits

Author SHA1 Message Date
jjovalle99
7767dc3c09 feat(tools): add Voxtral TTS provider (Mistral AI) 2026-04-11 01:50:59 -07:00
11 changed files with 379 additions and 12 deletions

View File

@@ -588,7 +588,7 @@ platform_toolsets:
# skills_hub - skill_hub (search/install/manage from online registries — user-driven only) # skills_hub - skill_hub (search/install/manage from online registries — user-driven only)
# moa - mixture_of_agents (requires OPENROUTER_API_KEY) # moa - mixture_of_agents (requires OPENROUTER_API_KEY)
# todo - todo (in-memory task planning, no deps) # todo - todo (in-memory task planning, no deps)
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX key) # tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key)
# cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks) # cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY) # rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
# #
@@ -617,7 +617,7 @@ platform_toolsets:
# todo - Task planning and tracking for multi-step work # todo - Task planning and tracking for multi-step work
# memory - Persistent memory across sessions (personal notes + user profile) # memory - Persistent memory across sessions (personal notes + user profile)
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization) # session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax) # tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral)
# cronjob - Schedule and manage automated tasks (CLI-only) # cronjob - Schedule and manage automated tasks (CLI-only)
# rl - RL training tools (Tinker-Atropos) # rl - RL training tools (Tinker-Atropos)
# #

View File

@@ -458,7 +458,7 @@ DEFAULT_CONFIG = {
# Text-to-speech configuration # Text-to-speech configuration
"tts": { "tts": {
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "neutts" (local) "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local)
"edge": { "edge": {
"voice": "en-US-AriaNeural", "voice": "en-US-AriaNeural",
# Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural # Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
@@ -472,6 +472,10 @@ DEFAULT_CONFIG = {
"voice": "alloy", "voice": "alloy",
# Voices: alloy, echo, fable, onyx, nova, shimmer # Voices: alloy, echo, fable, onyx, nova, shimmer
}, },
"mistral": {
"model": "voxtral-mini-tts-2603",
"voice_id": "c69964a6-ab8b-4f8a-9465-ec0925096ec8", # Paul - Neutral
},
"neutts": { "neutts": {
"ref_audio": "", # Path to reference voice audio (empty = bundled default) "ref_audio": "", # Path to reference voice audio (empty = bundled default)
"ref_text": "", # Path to reference voice transcript (empty = bundled default) "ref_text": "", # Path to reference voice transcript (empty = bundled default)
@@ -1016,6 +1020,13 @@ OPTIONAL_ENV_VARS = {
"password": True, "password": True,
"category": "tool", "category": "tool",
}, },
"MISTRAL_API_KEY": {
"description": "Mistral API key for Voxtral TTS and transcription (STT)",
"prompt": "Mistral API key",
"url": "https://console.mistral.ai/",
"password": True,
"category": "tool",
},
"GITHUB_TOKEN": { "GITHUB_TOKEN": {
"description": "GitHub token for Skills Hub (higher API rate limits, skill publish)", "description": "GitHub token for Skills Hub (higher API rate limits, skill publish)",
"prompt": "GitHub Token", "prompt": "GitHub Token",

View File

@@ -143,6 +143,7 @@ def _tts_label(current_provider: str) -> str:
"openai": "OpenAI TTS", "openai": "OpenAI TTS",
"elevenlabs": "ElevenLabs", "elevenlabs": "ElevenLabs",
"edge": "Edge TTS", "edge": "Edge TTS",
"mistral": "Mistral Voxtral TTS",
"neutts": "NeuTTS", "neutts": "NeuTTS",
} }
return mapping.get(current_provider or "edge", current_provider or "Edge TTS") return mapping.get(current_provider or "edge", current_provider or "Edge TTS")
@@ -309,6 +310,7 @@ def get_nous_subscription_features(
tts_current_provider in {"edge", "neutts"} tts_current_provider in {"edge", "neutts"}
or (tts_current_provider == "openai" and (managed_tts_available or direct_openai_tts)) or (tts_current_provider == "openai" and (managed_tts_available or direct_openai_tts))
or (tts_current_provider == "elevenlabs" and direct_elevenlabs) or (tts_current_provider == "elevenlabs" and direct_elevenlabs)
or (tts_current_provider == "mistral" and bool(get_env_value("MISTRAL_API_KEY")))
) )
tts_active = bool(tts_tool_enabled and tts_available) tts_active = bool(tts_tool_enabled and tts_available)

View File

@@ -557,6 +557,8 @@ def _print_setup_summary(config: dict, hermes_home):
tool_status.append(("Text-to-Speech (OpenAI)", True, None)) tool_status.append(("Text-to-Speech (OpenAI)", True, None))
elif tts_provider == "minimax" and get_env_value("MINIMAX_API_KEY"): elif tts_provider == "minimax" and get_env_value("MINIMAX_API_KEY"):
tool_status.append(("Text-to-Speech (MiniMax)", True, None)) tool_status.append(("Text-to-Speech (MiniMax)", True, None))
elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"):
tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None))
elif tts_provider == "neutts": elif tts_provider == "neutts":
try: try:
import importlib.util import importlib.util
@@ -1044,6 +1046,7 @@ def _setup_tts_provider(config: dict):
"elevenlabs": "ElevenLabs", "elevenlabs": "ElevenLabs",
"openai": "OpenAI TTS", "openai": "OpenAI TTS",
"minimax": "MiniMax TTS", "minimax": "MiniMax TTS",
"mistral": "Mistral Voxtral TTS",
"neutts": "NeuTTS", "neutts": "NeuTTS",
} }
current_label = provider_labels.get(current_provider, current_provider) current_label = provider_labels.get(current_provider, current_provider)
@@ -1064,10 +1067,11 @@ def _setup_tts_provider(config: dict):
"ElevenLabs (premium quality, needs API key)", "ElevenLabs (premium quality, needs API key)",
"OpenAI TTS (good quality, needs API key)", "OpenAI TTS (good quality, needs API key)",
"MiniMax TTS (high quality with voice cloning, needs API key)", "MiniMax TTS (high quality with voice cloning, needs API key)",
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
"NeuTTS (local on-device, free, ~300MB model download)", "NeuTTS (local on-device, free, ~300MB model download)",
] ]
) )
providers.extend(["edge", "elevenlabs", "openai", "minimax", "neutts"]) providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts"])
choices.append(f"Keep current ({current_label})") choices.append(f"Keep current ({current_label})")
keep_current_idx = len(choices) - 1 keep_current_idx = len(choices) - 1
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx) idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
@@ -1145,6 +1149,18 @@ def _setup_tts_provider(config: dict):
print_warning("No API key provided. Falling back to Edge TTS.") print_warning("No API key provided. Falling back to Edge TTS.")
selected = "edge" selected = "edge"
elif selected == "mistral":
existing = get_env_value("MISTRAL_API_KEY")
if not existing:
print()
api_key = prompt("Mistral API key for TTS", password=True)
if api_key:
save_env_value("MISTRAL_API_KEY", api_key)
print_success("Mistral TTS API key saved")
else:
print_warning("No API key provided. Falling back to Edge TTS.")
selected = "edge"
# Save the selection # Save the selection
if "tts" not in config: if "tts" not in config:
config["tts"] = {} config["tts"] = {}

View File

@@ -181,6 +181,14 @@ TOOL_CATEGORIES = {
], ],
"tts_provider": "elevenlabs", "tts_provider": "elevenlabs",
}, },
{
"name": "Mistral (Voxtral TTS)",
"tag": "Multilingual, native Opus, needs MISTRAL_API_KEY",
"env_vars": [
{"key": "MISTRAL_API_KEY", "prompt": "Mistral API key", "url": "https://console.mistral.ai/"},
],
"tts_provider": "mistral",
},
], ],
}, },
"web": { "web": {

View File

@@ -249,8 +249,12 @@ def check_config(groq_key, eleven_key):
if stt_provider == "groq" and not groq_key: if stt_provider == "groq" and not groq_key:
warn("STT config says groq but GROQ_API_KEY is missing") warn("STT config says groq but GROQ_API_KEY is missing")
if stt_provider == "mistral" and not os.getenv("MISTRAL_API_KEY"):
warn("STT config says mistral but MISTRAL_API_KEY is missing")
if tts_provider == "elevenlabs" and not eleven_key: if tts_provider == "elevenlabs" and not eleven_key:
warn("TTS config says elevenlabs but ELEVENLABS_API_KEY is missing") warn("TTS config says elevenlabs but ELEVENLABS_API_KEY is missing")
if tts_provider == "mistral" and not os.getenv("MISTRAL_API_KEY"):
warn("TTS config says mistral but MISTRAL_API_KEY is missing")
except Exception as e: except Exception as e:
warn("config.yaml", f"parse error: {e}") warn("config.yaml", f"parse error: {e}")
else: else:

View File

@@ -0,0 +1,245 @@
"""Tests for the Mistral (Voxtral) TTS provider in tools/tts_tool.py."""
import base64
from unittest.mock import MagicMock, patch
import pytest
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Remove TTS-related environment variables before every test.

    ``MINIMAX_API_KEY`` is cleared alongside the Mistral variables because
    ``check_tts_requirements()`` returns ``True`` as soon as that key is set;
    leaving it in place would make the Mistral availability tests depend on
    whatever happens to be exported in the developer's or CI shell.
    """
    for key in ("MISTRAL_API_KEY", "MINIMAX_API_KEY", "HERMES_SESSION_PLATFORM"):
        # raising=False: the variable is usually absent, which is fine.
        monkeypatch.delenv(key, raising=False)
@pytest.fixture
def mock_mistral_module():
    """Install a fake ``mistralai`` SDK in ``sys.modules`` and yield its client.

    The yielded mock is what ``Mistral(...)`` returns and also what its
    ``with`` block binds, so tests can configure and inspect
    ``audio.speech.complete`` directly on it.
    """
    client = MagicMock()
    # The client is used as a context manager; entering it yields itself.
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)

    sdk = MagicMock()
    sdk.Mistral = MagicMock(return_value=client)

    # Both the package and its ``client`` submodule resolve to the same fake.
    replacements = dict.fromkeys(("mistralai", "mistralai.client"), sdk)
    with patch.dict("sys.modules", replacements):
        yield client
class TestGenerateMistralTts:
    """Unit tests for ``_generate_mistral_tts`` against a stubbed Mistral SDK."""

    @staticmethod
    def _stub_audio(client, payload=b"data"):
        """Make the fake speech endpoint return *payload* base64-encoded."""
        encoded = base64.b64encode(payload).decode()
        client.audio.speech.complete.return_value = MagicMock(audio_data=encoded)

    @staticmethod
    def _sent_kwargs(client):
        """Return the keyword arguments of the last ``speech.complete`` call."""
        _positional, keywords = client.audio.speech.complete.call_args
        return keywords

    def test_missing_api_key_raises_value_error(self, tmp_path, mock_mistral_module):
        from tools.tts_tool import _generate_mistral_tts

        target = tmp_path / "test.mp3"
        with pytest.raises(ValueError, match="MISTRAL_API_KEY"):
            _generate_mistral_tts("Hello", str(target), {})

    def test_successful_generation(self, tmp_path, mock_mistral_module, monkeypatch):
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        expected_bytes = b"fake-audio-bytes"
        self._stub_audio(mock_mistral_module, expected_bytes)
        target = tmp_path / "test.mp3"

        returned = _generate_mistral_tts("Hello world", str(target), {})

        # The decoded payload must land on disk and the path be echoed back.
        assert returned == str(target)
        assert target.read_bytes() == expected_bytes
        mock_mistral_module.audio.speech.complete.assert_called_once()
        # The client context manager must have been exited.
        mock_mistral_module.__exit__.assert_called_once()
        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["input"] == "Hello world"
        assert sent["response_format"] == "mp3"

    @pytest.mark.parametrize(
        "extension, expected_format",
        [
            (".ogg", "opus"),
            (".wav", "wav"),
            (".flac", "flac"),
            (".mp3", "mp3"),
        ],
    )
    def test_response_format_from_extension(
        self, tmp_path, mock_mistral_module, monkeypatch, extension, expected_format
    ):
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)
        target = tmp_path / f"test{extension}"

        _generate_mistral_tts("Hi", str(target), {})

        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["response_format"] == expected_format

    def test_voice_id_passed_when_configured(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)
        settings = {"mistral": {"voice_id": "my-voice-uuid"}}

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), settings)

        assert self._sent_kwargs(mock_mistral_module)["voice_id"] == "my-voice-uuid"

    def test_default_voice_id_when_absent(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_default_voice_id_when_empty_string(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)
        # An empty voice_id is treated the same as a missing one.
        settings = {"mistral": {"voice_id": ""}}

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), settings)

        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_api_error_sanitized(self, tmp_path, mock_mistral_module, monkeypatch):
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        failure = RuntimeError("secret-key-in-error")
        mock_mistral_module.audio.speech.complete.side_effect = failure

        with pytest.raises(RuntimeError, match="RuntimeError") as exc_info:
            _generate_mistral_tts("Hello", str(tmp_path / "test.mp3"), {})

        # Only the exception type may surface, never the SDK's message text.
        assert "secret-key-in-error" not in str(exc_info.value)

    def test_default_model_used(self, tmp_path, mock_mistral_module, monkeypatch):
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_MODEL, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        assert self._sent_kwargs(mock_mistral_module)["model"] == DEFAULT_MISTRAL_TTS_MODEL

    def test_model_from_config_overrides_default(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)
        settings = {"mistral": {"model": "voxtral-large-tts-9999"}}

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), settings)

        assert self._sent_kwargs(mock_mistral_module)["model"] == "voxtral-large-tts-9999"
class TestTtsDispatcherMistral:
    """Checks that ``text_to_speech_tool`` dispatches to the Mistral provider."""

    def test_dispatcher_routes_to_mistral(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        import json
        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        encoded = base64.b64encode(b"audio").decode()
        mock_mistral_module.audio.speech.complete.return_value = MagicMock(
            audio_data=encoded
        )
        target = str(tmp_path / "out.mp3")

        config_patch = patch(
            "tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}
        )
        with config_patch:
            raw = text_to_speech_tool("Hello", output_path=target)
            payload = json.loads(raw)

        assert payload["success"] is True
        assert payload["provider"] == "mistral"
        mock_mistral_module.audio.speech.complete.assert_called_once()

    def test_dispatcher_returns_error_when_sdk_not_installed(self, tmp_path, monkeypatch):
        import json
        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        target = str(tmp_path / "out.mp3")

        # Simulate a missing SDK by making the lazy importer fail.
        with patch(
            "tools.tts_tool._import_mistral_client", side_effect=ImportError("no module")
        ):
            with patch(
                "tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}
            ):
                payload = json.loads(text_to_speech_tool("Hello", output_path=target))

        assert payload["success"] is False
        assert "mistralai" in payload["error"]
class TestCheckTtsRequirementsMistral:
    """``check_tts_requirements`` with Mistral as the only candidate provider."""

    @staticmethod
    def _other_providers_unavailable():
        """Return a patcher that disables every non-Mistral provider probe."""
        return patch.multiple(
            "tools.tts_tool",
            _import_edge_tts=MagicMock(side_effect=ImportError),
            _import_elevenlabs=MagicMock(side_effect=ImportError),
            _import_openai_client=MagicMock(side_effect=ImportError),
            _check_neutts_available=MagicMock(return_value=False),
        )

    def test_mistral_sdk_and_key_returns_true(self, mock_mistral_module, monkeypatch):
        from tools.tts_tool import check_tts_requirements

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        with self._other_providers_unavailable():
            assert check_tts_requirements() is True

    def test_mistral_key_missing_returns_false(self, mock_mistral_module):
        from tools.tts_tool import check_tts_requirements

        # SDK is importable (faked) but no API key is present.
        with self._other_providers_unavailable():
            assert check_tts_requirements() is False
class TestMistralTtsOpus:
    """Native Opus output when the session platform is Telegram."""

    def test_telegram_produces_ogg_and_voice_compatible(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        import json
        from tools.tts_tool import text_to_speech_tool

        for name, value in (
            ("MISTRAL_API_KEY", "test-key"),
            ("HERMES_SESSION_PLATFORM", "telegram"),
        ):
            monkeypatch.setenv(name, value)

        speech = mock_mistral_module.audio.speech.complete
        speech.return_value = MagicMock(
            audio_data=base64.b64encode(b"opus-audio").decode()
        )

        # No output_path is given, so the tool picks the file name itself.
        with patch("tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}):
            payload = json.loads(text_to_speech_tool("Hello"))

        assert payload["success"] is True
        assert payload["file_path"].endswith(".ogg")
        assert payload["voice_compatible"] is True
        assert "[[audio_as_voice]]" in payload["media_tag"]
        _positional, sent = speech.call_args
        assert sent["response_format"] == "opus"

View File

@@ -2,11 +2,12 @@
""" """
Text-to-Speech Tool Module Text-to-Speech Tool Module
Supports five TTS providers: Supports six TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices - Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY - ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY - OpenAI TTS: Good quality, needs OPENAI_API_KEY
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY - MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed - NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
Output formats: Output formats:
@@ -23,6 +24,7 @@ Usage:
""" """
import asyncio import asyncio
import base64
import datetime import datetime
import json import json
import logging import logging
@@ -62,6 +64,11 @@ def _import_openai_client():
from openai import OpenAI as OpenAIClient from openai import OpenAI as OpenAIClient
return OpenAIClient return OpenAIClient
def _import_mistral_client():
    """Import the Mistral SDK on demand and return its ``Mistral`` client class.

    Raises ImportError when the ``mistralai`` package is unavailable.
    """
    # Deferred so the module loads without the optional dependency installed.
    from mistralai.client import Mistral as mistral_client_cls
    return mistral_client_cls
def _import_sounddevice(): def _import_sounddevice():
"""Lazy import sounddevice. Returns the module or raises ImportError/OSError.""" """Lazy import sounddevice. Returns the module or raises ImportError/OSError."""
import sounddevice as sd import sounddevice as sd
@@ -82,6 +89,8 @@ DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
DEFAULT_MINIMAX_MODEL = "speech-2.8-hd" DEFAULT_MINIMAX_MODEL = "speech-2.8-hd"
DEFAULT_MINIMAX_VOICE_ID = "English_Graceful_Lady" DEFAULT_MINIMAX_VOICE_ID = "English_Graceful_Lady"
DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2" DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2"
DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral
def _get_default_output_dir() -> str: def _get_default_output_dir() -> str:
from hermes_constants import get_hermes_dir from hermes_constants import get_hermes_dir
@@ -365,6 +374,55 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
return output_path return output_path
# ===========================================================================
# Provider: Mistral (Voxtral TTS)
# ===========================================================================
def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate audio using Mistral Voxtral TTS API.

    The API returns base64-encoded audio; this function decodes it
    and writes the raw bytes to *output_path*.
    Supports native Opus output for Telegram voice bubbles.

    Args:
        text: The text to synthesize.
        output_path: Destination file. Its extension (matched
            case-insensitively) selects the requested format:
            ``.ogg`` -> opus, ``.wav`` -> wav, ``.flac`` -> flac,
            anything else -> mp3.
        tts_config: TTS configuration. An optional ``"mistral"`` section may
            supply ``"model"`` and ``"voice_id"``; missing, ``None`` or empty
            values fall back to the module defaults.

    Returns:
        *output_path*, once the audio has been written.

    Raises:
        ValueError: If ``MISTRAL_API_KEY`` is not set. A ``ValueError`` raised
            during the API call or base64 decoding is also re-raised as-is.
        ImportError: If the Mistral SDK cannot be imported.
        RuntimeError: For any other failure during the API call or decoding;
            the message carries only the original exception's type name.
    """
    api_key = os.getenv("MISTRAL_API_KEY", "")
    if not api_key:
        raise ValueError("MISTRAL_API_KEY not set. Get one at https://console.mistral.ai/")

    # A ``mistral:`` key with no value loads as None rather than a dict;
    # treat that the same as a missing section instead of crashing on .get().
    mi_config = tts_config.get("mistral") or {}
    # ``or`` (not a .get default) so blank values also fall back, for both keys.
    model = mi_config.get("model") or DEFAULT_MISTRAL_TTS_MODEL
    voice_id = mi_config.get("voice_id") or DEFAULT_MISTRAL_TTS_VOICE_ID

    # Compare against a lowercased path so "clip.OGG" still requests Opus.
    lowered_path = output_path.lower()
    if lowered_path.endswith(".ogg"):
        response_format = "opus"
    elif lowered_path.endswith(".wav"):
        response_format = "wav"
    elif lowered_path.endswith(".flac"):
        response_format = "flac"
    else:
        response_format = "mp3"

    Mistral = _import_mistral_client()

    try:
        with Mistral(api_key=api_key) as client:
            response = client.audio.speech.complete(
                model=model,
                input=text,
                voice_id=voice_id,
                response_format=response_format,
            )
            audio_bytes = base64.b64decode(response.audio_data)
    except ValueError:
        # NOTE(review): ValueError (including base64 decode errors) bypasses
        # the sanitizing wrapper below — confirm this is intentional.
        raise
    except Exception as e:
        # Full details go to the log; the raised error exposes only the type
        # name so provider messages do not leak to the caller.
        logger.error("Mistral TTS failed: %s", e, exc_info=True)
        raise RuntimeError(f"Mistral TTS failed: {type(e).__name__}") from e

    with open(output_path, "wb") as f:
        f.write(audio_bytes)

    return output_path
# =========================================================================== # ===========================================================================
# NeuTTS (local, on-device TTS via neutts_cli) # NeuTTS (local, on-device TTS via neutts_cli)
# =========================================================================== # ===========================================================================
@@ -493,7 +551,7 @@ def text_to_speech_tool(
out_dir.mkdir(parents=True, exist_ok=True) out_dir.mkdir(parents=True, exist_ok=True)
# Use .ogg for Telegram with providers that support native Opus output, # Use .ogg for Telegram with providers that support native Opus output,
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later). # otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
if want_opus and provider in ("openai", "elevenlabs"): if want_opus and provider in ("openai", "elevenlabs", "mistral"):
file_path = out_dir / f"tts_{timestamp}.ogg" file_path = out_dir / f"tts_{timestamp}.ogg"
else: else:
file_path = out_dir / f"tts_{timestamp}.mp3" file_path = out_dir / f"tts_{timestamp}.mp3"
@@ -530,6 +588,18 @@ def text_to_speech_tool(
logger.info("Generating speech with MiniMax TTS...") logger.info("Generating speech with MiniMax TTS...")
_generate_minimax_tts(text, file_str, tts_config) _generate_minimax_tts(text, file_str, tts_config)
elif provider == "mistral":
try:
_import_mistral_client()
except ImportError:
return json.dumps({
"success": False,
"error": "Mistral provider selected but 'mistralai' package not installed. "
"Run: pip install 'hermes-agent[mistral]'"
}, ensure_ascii=False)
logger.info("Generating speech with Mistral Voxtral TTS...")
_generate_mistral_tts(text, file_str, tts_config)
elif provider == "neutts": elif provider == "neutts":
if not _check_neutts_available(): if not _check_neutts_available():
return json.dumps({ return json.dumps({
@@ -584,8 +654,7 @@ def text_to_speech_tool(
if opus_path: if opus_path:
file_str = opus_path file_str = opus_path
voice_compatible = True voice_compatible = True
elif provider in ("elevenlabs", "openai"): elif provider in ("elevenlabs", "openai", "mistral"):
# These providers can output Opus natively if the path ends in .ogg
voice_compatible = file_str.endswith(".ogg") voice_compatible = file_str.endswith(".ogg")
file_size = os.path.getsize(file_str) file_size = os.path.getsize(file_str)
@@ -653,6 +722,12 @@ def check_tts_requirements() -> bool:
pass pass
if os.getenv("MINIMAX_API_KEY"): if os.getenv("MINIMAX_API_KEY"):
return True return True
try:
_import_mistral_client()
if os.getenv("MISTRAL_API_KEY"):
return True
except ImportError:
pass
if _check_neutts_available(): if _check_neutts_available():
return True return True
return False return False

View File

@@ -145,6 +145,7 @@ ELEVENLABS_API_KEY=***
- `neutts` → free local/on-device TTS - `neutts` → free local/on-device TTS
- `elevenlabs` → best quality - `elevenlabs` → best quality
- `openai` → good middle ground - `openai` → good middle ground
- `mistral` → multilingual, native Opus
### If you use `hermes setup` ### If you use `hermes setup`

View File

@@ -864,6 +864,7 @@ You can switch between providers at any time with `hermes model` — no restart
| Image generation | [FAL](https://fal.ai/) | `FAL_KEY` | | Image generation | [FAL](https://fal.ai/) | `FAL_KEY` |
| Premium TTS voices | [ElevenLabs](https://elevenlabs.io/) | `ELEVENLABS_API_KEY` | | Premium TTS voices | [ElevenLabs](https://elevenlabs.io/) | `ELEVENLABS_API_KEY` |
| OpenAI TTS + voice transcription | [OpenAI](https://platform.openai.com/api-keys) | `VOICE_TOOLS_OPENAI_KEY` | | OpenAI TTS + voice transcription | [OpenAI](https://platform.openai.com/api-keys) | `VOICE_TOOLS_OPENAI_KEY` |
| Mistral TTS + voice transcription | [Mistral](https://console.mistral.ai/) | `MISTRAL_API_KEY` |
| RL Training | [Tinker](https://tinker-console.thinkingmachines.ai/) + [WandB](https://wandb.ai/) | `TINKER_API_KEY`, `WANDB_API_KEY` | | RL Training | [Tinker](https://tinker-console.thinkingmachines.ai/) + [WandB](https://wandb.ai/) | `TINKER_API_KEY`, `WANDB_API_KEY` |
| Cross-session user modeling | [Honcho](https://honcho.dev/) | `HONCHO_API_KEY` | | Cross-session user modeling | [Honcho](https://honcho.dev/) | `HONCHO_API_KEY` |
| Semantic long-term memory | [Supermemory](https://supermemory.ai) | `SUPERMEMORY_API_KEY` | | Semantic long-term memory | [Supermemory](https://supermemory.ai) | `SUPERMEMORY_API_KEY` |

View File

@@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription
## Text-to-Speech ## Text-to-Speech
Convert text to speech with five providers: Convert text to speech with six providers:
| Provider | Quality | Cost | API Key | | Provider | Quality | Cost | API Key |
|----------|---------|------|---------| |----------|---------|------|---------|
@@ -18,6 +18,7 @@ Convert text to speech with five providers:
| **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` | | **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` |
| **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` | | **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` |
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` | | **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
| **NeuTTS** | Good | Free | None needed | | **NeuTTS** | Good | Free | None needed |
### Platform Delivery ### Platform Delivery
@@ -34,7 +35,7 @@ Convert text to speech with five providers:
```yaml ```yaml
# In ~/.hermes/config.yaml # In ~/.hermes/config.yaml
tts: tts:
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "neutts" provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts"
edge: edge:
voice: "en-US-AriaNeural" # 322 voices, 74 languages voice: "en-US-AriaNeural" # 322 voices, 74 languages
elevenlabs: elevenlabs:
@@ -50,6 +51,9 @@ tts:
speed: 1 # 0.5 - 2.0 speed: 1 # 0.5 - 2.0
vol: 1 # 0 - 10 vol: 1 # 0 - 10
pitch: 0 # -12 - 12 pitch: 0 # -12 - 12
mistral:
model: "voxtral-mini-tts-2603"
voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default)
neutts: neutts:
ref_audio: '' ref_audio: ''
ref_text: '' ref_text: ''
@@ -61,7 +65,7 @@ tts:
Telegram voice bubbles require Opus/OGG audio format: Telegram voice bubbles require Opus/OGG audio format:
- **OpenAI and ElevenLabs** produce Opus natively — no extra setup - **OpenAI, ElevenLabs, and Mistral** produce Opus natively — no extra setup
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert: - **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles - **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles - **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
@@ -80,7 +84,7 @@ sudo dnf install ffmpeg
Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble). Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
:::tip :::tip
If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider. If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider.
::: :::
## Voice Message Transcription (STT) ## Voice Message Transcription (STT)