Compare commits

...

3 Commits

Author SHA1 Message Date
kshitijk4poor
b945f9e0d5 fix: follow-up for Gemini TTS salvage
Review findings addressed:
- Scan response parts for inlineData instead of blindly picking parts[0]
- Validate empty PCM bytes (prevents silent 44-byte WAV)
- Catch URLError for network/DNS failures
- 6 new tests: empty PCM, text-before-audio part, base_url override,
  WAV→MP3 ffmpeg conversion, no-ffmpeg rename fallback, URLError
2026-04-16 20:59:09 +05:30
kshitijk4poor
acff9d36db chore: add zhonghui5207 to AUTHOR_MAP 2026-04-16 20:47:52 +05:30
zhonghui5207
0671201c05 feat(tts): add Gemini TTS provider
Add Google's Gemini speech-generation API as 8th TTS backend.
Returns base64-encoded signed 16-bit PCM at 24 kHz mono, wrapped
in WAV natively via stdlib wave module. Optional ffmpeg conversion
to mp3/ogg for Telegram voice bubbles.

Supports GEMINI_API_KEY and GOOGLE_API_KEY (fallback), 30 prebuilt
voices, configurable model (flash/pro).

Cherry-picked from #10922 by @zhonghui5207. Fixes #10918.
2026-04-16 20:47:29 +05:30
8 changed files with 511 additions and 26 deletions

View File

@@ -26,7 +26,6 @@ model:
# "huggingface" - Hugging Face Inference (requires: HF_TOKEN)
# "xiaomi" - Xiaomi MiMo (requires: XIAOMI_API_KEY)
# "arcee" - Arcee AI Trinity models (requires: ARCEEAI_API_KEY)
# "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY — https://ollama.com/settings)
# "kilocode" - KiloCode gateway (requires: KILOCODE_API_KEY)
# "ai-gateway" - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY)
#
@@ -38,6 +37,12 @@ model:
# base_url: "http://localhost:1234/v1"
# No API key needed — local servers typically ignore auth.
#
# For Ollama Cloud (https://ollama.com/pricing):
# provider: "custom"
# base_url: "https://ollama.com/v1"
# Set OLLAMA_API_KEY in .env — automatically picked up when base_url
# points to ollama.com.
#
# Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var.
provider: "auto"
@@ -332,7 +337,6 @@ compression:
# "openrouter" - Force OpenRouter (requires OPENROUTER_API_KEY)
# "nous" - Force Nous Portal (requires: hermes login)
# "gemini" - Force Google AI Studio direct (requires: GOOGLE_API_KEY or GEMINI_API_KEY)
# "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY)
# "codex" - Force Codex OAuth (requires: hermes model → Codex).
# Uses gpt-5.3-codex which supports vision.
# "main" - Use your custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY).
@@ -593,7 +597,7 @@ platform_toolsets:
# skills_hub - skill_hub (search/install/manage from online registries — user-driven only)
# moa - mixture_of_agents (requires OPENROUTER_API_KEY)
# todo - todo (in-memory task planning, no deps)
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key)
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/XAI/MINIMAX/MISTRAL/GEMINI key)
# cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
#
@@ -622,7 +626,7 @@ platform_toolsets:
# todo - Task planning and tracking for multi-step work
# memory - Persistent memory across sessions (personal notes + user profile)
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral)
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, xAI, MiniMax, Mistral, Gemini)
# cronjob - Schedule and manage automated tasks (CLI-only)
# rl - RL training tools (Tinker-Atropos)
#

View File

@@ -61,9 +61,7 @@ from hermes_cli.colors import Colors, color
from hermes_cli.default_soul import DEFAULT_SOUL_MD
# =============================================================================
# Managed mode (NixOS declarative config)
# =============================================================================
_MANAGED_TRUE_VALUES = ("true", "1", "yes")
_MANAGED_SYSTEM_NAMES = {
@@ -147,9 +145,7 @@ def managed_error(action: str = "modify configuration"):
print(format_managed_message(action), file=sys.stderr)
# =============================================================================
# Container-aware CLI (NixOS container mode)
# =============================================================================
def get_container_exec_info() -> Optional[dict]:
"""Read container mode metadata from HERMES_HOME/.container-mode.
@@ -196,9 +192,7 @@ def get_container_exec_info() -> Optional[dict]:
}
# =============================================================================
# Config paths
# =============================================================================
# Re-export from hermes_constants — canonical definition lives there.
from hermes_constants import get_hermes_home # noqa: F811,E402
@@ -335,9 +329,7 @@ def _ensure_hermes_home_managed(home: Path):
_ensure_default_soul_md(home)
# =============================================================================
# Config loading/saving
# =============================================================================
DEFAULT_CONFIG = {
"model": "",
@@ -570,7 +562,7 @@ DEFAULT_CONFIG = {
# Text-to-speech configuration
"tts": {
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local)
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local) | "gemini"
"edge": {
"voice": "en-US-AriaNeural",
# Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
@@ -600,6 +592,15 @@ DEFAULT_CONFIG = {
"model": "neuphonic/neutts-air-q4-gguf", # HuggingFace model repo
"device": "cpu", # cpu, cuda, or mps
},
"gemini": {
"model": "gemini-2.5-flash-preview-tts",
"voice": "Kore",
# 30 prebuilt voices: Zephyr, Puck, Charon, Kore, Fenrir, Leda,
# Orus, Aoede, Callirrhoe, Autonoe, Enceladus, Iapetus, Umbriel,
# Algieba, Despina, Erinome, Algenib, Rasalgethi, Laomedeia,
# Achernar, Alnilam, Schedar, Gacrux, Pulcherrima, Achird,
# Zubenelgenubi, Vindemiatrix, Sadachbia, Sadaltager, Sulafat
},
},
"stt": {
@@ -781,9 +782,7 @@ DEFAULT_CONFIG = {
"_config_version": 18,
}
# =============================================================================
# Config Migration System
# =============================================================================
# Track which env vars were introduced in each config version.
# Migration only mentions vars new since the user's previous version.
@@ -1901,9 +1900,7 @@ def check_config_version() -> Tuple[int, int]:
return current, latest
# =============================================================================
# Config structure validation
# =============================================================================
# Fields that are valid at root level of config.yaml
_KNOWN_ROOT_KEYS = {
@@ -3167,9 +3164,7 @@ def get_env_value(key: str) -> Optional[str]:
return env_vars.get(key)
# =============================================================================
# Config display
# =============================================================================
def redact_key(key: str) -> str:
"""Redact an API key for display."""
@@ -3461,9 +3456,7 @@ def set_config_value(key: str, value: str):
print(f"✓ Set {key} = {value} in {config_path}")
# =============================================================================
# Command handler
# =============================================================================
def config_command(args):
"""Handle config subcommands."""

View File

@@ -433,6 +433,10 @@ def _print_setup_summary(config: dict, hermes_home):
tool_status.append(("Text-to-Speech (MiniMax)", True, None))
elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"):
tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None))
elif tts_provider == "gemini" and (
get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")
):
tool_status.append(("Text-to-Speech (Gemini)", True, None))
elif tts_provider == "neutts":
try:
import importlib.util
@@ -924,6 +928,7 @@ def _setup_tts_provider(config: dict):
"minimax": "MiniMax TTS",
"mistral": "Mistral Voxtral TTS",
"neutts": "NeuTTS",
"gemini": "Gemini TTS",
}
current_label = provider_labels.get(current_provider, current_provider)
@@ -946,9 +951,10 @@ def _setup_tts_provider(config: dict):
"MiniMax TTS (high quality with voice cloning, needs API key)",
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
"NeuTTS (local on-device, free, ~300MB model download)",
"Gemini TTS (Google speech generation, 30 voices, needs GEMINI_API_KEY)",
]
)
providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts"])
providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts", "gemini"])
choices.append(f"Keep current ({current_label})")
keep_current_idx = len(choices) - 1
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
@@ -1055,6 +1061,19 @@ def _setup_tts_provider(config: dict):
print_warning("No API key provided. Falling back to Edge TTS.")
selected = "edge"
elif selected == "gemini":
existing = get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")
if not existing:
print()
print_info("Get a key at https://aistudio.google.com/apikey")
api_key = prompt("Gemini API key for TTS", password=True)
if api_key:
save_env_value("GEMINI_API_KEY", api_key)
print_success("Gemini API key saved")
else:
print_warning("No API key provided. Falling back to Edge TTS.")
selected = "edge"
# Save the selection
if "tts" not in config:
config["tts"] = {}

View File

@@ -186,6 +186,7 @@ AUTHOR_MAP = {
"danieldoderlein@users.noreply.github.com": "danieldoderlein",
"lrawnsley@users.noreply.github.com": "lrawnsley",
"taeuk178@users.noreply.github.com": "taeuk178",
"zhonghui5207@users.noreply.github.com": "zhonghui5207",
"ogzerber@users.noreply.github.com": "ogzerber",
"cola-runner@users.noreply.github.com": "cola-runner",
"ygd58@users.noreply.github.com": "ygd58",

View File

@@ -0,0 +1,330 @@
"""Tests for the Gemini TTS provider in tools/tts_tool.py."""
import base64
import os
import struct
from unittest.mock import MagicMock, patch
import pytest
# Autouse fixture: every test in this module starts with NO TTS credentials
# or platform markers in the environment, so key-fallback and provider
# detection tests are deterministic regardless of the developer's shell env.
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Strip all TTS-related API keys / env flags before each test."""
    for key in (
        "GEMINI_API_KEY",
        "GOOGLE_API_KEY",
        "HERMES_SESSION_PLATFORM",
        "MINIMAX_API_KEY",
        "ELEVENLABS_API_KEY",
        "OPENAI_API_KEY",
        "VOICE_TOOLS_OPENAI_KEY",
        "MISTRAL_API_KEY",
    ):
        # raising=False: it is fine if the variable was never set.
        monkeypatch.delenv(key, raising=False)
def _gemini_response(pcm_bytes: bytes) -> dict:
return {
"candidates": [
{
"content": {
"parts": [
{"inlineData": {"data": base64.b64encode(pcm_bytes).decode()}}
]
}
}
]
}
def _mock_urlopen(response_payload: dict):
resp_body = __import__("json").dumps(response_payload).encode("utf-8")
mock_resp = MagicMock()
mock_resp.read.return_value = resp_body
mock_resp.__enter__ = MagicMock(return_value=mock_resp)
mock_resp.__exit__ = MagicMock(return_value=False)
return mock_resp
class TestGenerateGeminiTts:
    """Unit tests for _generate_gemini_tts: key handling, request payload
    construction, WAV output parameters, and error surfacing.

    All network I/O is mocked — no real Gemini calls are made.
    """

    def test_missing_api_key_raises_value_error(self, tmp_path):
        # clean_env fixture guarantees neither key is present.
        from tools.tts_tool import _generate_gemini_tts

        with pytest.raises(ValueError, match="GEMINI_API_KEY"):
            _generate_gemini_tts("Hello", str(tmp_path / "out.wav"), {})

    def test_google_api_key_fallback_accepted(self, tmp_path, monkeypatch):
        # GOOGLE_API_KEY alone must be accepted as the fallback credential.
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GOOGLE_API_KEY", "test-key")
        pcm = b"\x01\x00\x02\x00\x03\x00"
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(pcm)),
        ):
            result = _generate_gemini_tts("Hi", str(tmp_path / "out.wav"), {})
        assert result == str(tmp_path / "out.wav")

    def test_writes_wav_with_correct_pcm_params(self, tmp_path, monkeypatch):
        # The WAV container must be mono / 16-bit / 24 kHz and carry the
        # exact PCM frames the API returned.
        import wave

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        pcm = struct.pack("<6h", 0, 1, 2, 3, 4, 5)
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(pcm)),
        ):
            out = tmp_path / "out.wav"
            _generate_gemini_tts("Hi", str(out), {})
        with wave.open(str(out), "rb") as wf:
            assert wf.getnchannels() == 1
            assert wf.getsampwidth() == 2
            assert wf.getframerate() == 24000
            assert wf.readframes(wf.getnframes()) == pcm

    def test_default_model_and_voice_in_payload(self, tmp_path, monkeypatch):
        # With an empty config, the request must use the module defaults
        # and carry the API key in the x-goog-api-key header.
        import json as _json

        from tools.tts_tool import (
            DEFAULT_GEMINI_TTS_MODEL,
            DEFAULT_GEMINI_TTS_VOICE,
            _generate_gemini_tts,
        )

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        captured = {}

        def fake_urlopen(req, timeout=None):
            # Capture URL, decoded JSON body, and headers for assertions.
            captured["url"] = req.full_url
            captured["body"] = _json.loads(req.data.decode())
            captured["headers"] = dict(req.headers)
            return _mock_urlopen(_gemini_response(b"\x00\x00"))

        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=fake_urlopen):
            _generate_gemini_tts("hello", str(tmp_path / "out.wav"), {})
        assert DEFAULT_GEMINI_TTS_MODEL in captured["url"]
        voice_cfg = captured["body"]["generationConfig"]["speechConfig"]["voiceConfig"][
            "prebuiltVoiceConfig"
        ]
        assert voice_cfg["voiceName"] == DEFAULT_GEMINI_TTS_VOICE
        # Header keys normalize to capitalized form via urllib
        assert captured["headers"].get("X-goog-api-key") == "test-key"

    def test_config_overrides(self, tmp_path, monkeypatch):
        # Model and voice from the "gemini" config sub-dict must win over defaults.
        import json as _json

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        captured = {}

        def fake_urlopen(req, timeout=None):
            captured["url"] = req.full_url
            captured["body"] = _json.loads(req.data.decode())
            return _mock_urlopen(_gemini_response(b"\x00\x00"))

        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=fake_urlopen):
            config = {"gemini": {"model": "gemini-2.5-pro-preview-tts", "voice": "Puck"}}
            _generate_gemini_tts("hi", str(tmp_path / "out.wav"), config)
        assert "gemini-2.5-pro-preview-tts" in captured["url"]
        voice_cfg = captured["body"]["generationConfig"]["speechConfig"]["voiceConfig"][
            "prebuiltVoiceConfig"
        ]
        assert voice_cfg["voiceName"] == "Puck"

    def test_http_error_surfaced_as_runtime_error(self, tmp_path, monkeypatch):
        # HTTPError (e.g. 429 rate limit) must be wrapped in a RuntimeError
        # that includes the status code.
        import urllib.error

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        err = urllib.error.HTTPError(
            "https://example", 429, "Too Many Requests", {}, None
        )
        err.read = MagicMock(return_value=b'{"error": "rate limit"}')
        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=err):
            with pytest.raises(RuntimeError, match="429"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})

    def test_missing_audio_payload_raises_runtime_error(self, tmp_path, monkeypatch):
        # A response with no inlineData part must fail loudly, not write junk.
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        bad_response = {"candidates": [{"content": {"parts": []}}]}
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(bad_response),
        ):
            with pytest.raises(RuntimeError, match="missing audio payload"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})
class TestTtsDispatcherGemini:
    """The text_to_speech_tool dispatcher must route provider "gemini"
    to the Gemini backend and report it in the JSON result."""

    def test_dispatcher_routes_to_gemini(self, tmp_path, monkeypatch):
        import json

        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        pcm = struct.pack("<2h", 100, -100)
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(pcm)),
        ), patch(
            "tools.tts_tool._load_tts_config", return_value={"provider": "gemini"}
        ):
            # Force .wav output so we skip the ffmpeg / Opus conversion branch
            output_path = str(tmp_path / "out.wav")
            result = json.loads(text_to_speech_tool("Hello", output_path=output_path))
        assert result["success"] is True
        assert result["provider"] == "gemini"
class TestCheckTtsRequirementsGemini:
    """check_tts_requirements() should consider a Gemini/Google key
    sufficient when every other TTS backend is unavailable."""

    def test_gemini_key_returns_true(self, monkeypatch):
        from tools.tts_tool import check_tts_requirements

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        # Disable every other backend so only the Gemini key can satisfy the check.
        with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), patch(
            "tools.tts_tool._import_elevenlabs", side_effect=ImportError
        ), patch("tools.tts_tool._import_openai_client", side_effect=ImportError), patch(
            "tools.tts_tool._import_mistral_client", side_effect=ImportError
        ), patch("tools.tts_tool._check_neutts_available", return_value=False):
            assert check_tts_requirements() is True

    def test_google_api_key_also_accepted(self, monkeypatch):
        # GOOGLE_API_KEY is the documented fallback credential.
        from tools.tts_tool import check_tts_requirements

        monkeypatch.setenv("GOOGLE_API_KEY", "test-key")
        with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), patch(
            "tools.tts_tool._import_elevenlabs", side_effect=ImportError
        ), patch("tools.tts_tool._import_openai_client", side_effect=ImportError), patch(
            "tools.tts_tool._import_mistral_client", side_effect=ImportError
        ), patch("tools.tts_tool._check_neutts_available", return_value=False):
            assert check_tts_requirements() is True

    def test_no_key_returns_false(self):
        # No backend importable and no keys set (clean_env): requirements unmet.
        from tools.tts_tool import check_tts_requirements

        with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), patch(
            "tools.tts_tool._import_elevenlabs", side_effect=ImportError
        ), patch("tools.tts_tool._import_openai_client", side_effect=ImportError), patch(
            "tools.tts_tool._import_mistral_client", side_effect=ImportError
        ), patch("tools.tts_tool._check_neutts_available", return_value=False):
            assert check_tts_requirements() is False
class TestGeminiTtsEdgeCases:
    """Tests for edge cases and conversion paths added during salvage review."""

    def test_empty_pcm_raises_runtime_error(self, tmp_path, monkeypatch):
        # Empty PCM would otherwise yield a "valid" 44-byte silent WAV.
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(b"")),
        ):
            with pytest.raises(RuntimeError, match="empty audio data"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})

    def test_text_part_before_audio_is_handled(self, tmp_path, monkeypatch):
        """If the response has a text part before the audio part, still extract audio."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        pcm = b"\x01\x00\x02\x00"
        mixed_response = {
            "candidates": [
                {
                    "content": {
                        "parts": [
                            {"text": "Here is your audio"},
                            {"inlineData": {"data": base64.b64encode(pcm).decode()}},
                        ]
                    }
                }
            ]
        }
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(mixed_response),
        ):
            result = _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})
        assert result == str(tmp_path / "out.wav")

    def test_base_url_config_override(self, tmp_path, monkeypatch):
        # A custom base_url in the "gemini" config must be used verbatim.
        import json as _json

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        captured = {}

        def fake_urlopen(req, timeout=None):
            captured["url"] = req.full_url
            return _mock_urlopen(_gemini_response(b"\x00\x00"))

        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=fake_urlopen):
            config = {"gemini": {"base_url": "https://custom.api.example.com/v1"}}
            _generate_gemini_tts("hi", str(tmp_path / "out.wav"), config)
        assert "custom.api.example.com" in captured["url"]

    def test_wav_to_mp3_conversion_with_ffmpeg(self, tmp_path, monkeypatch):
        # When ffmpeg is on PATH, a non-.wav target triggers a conversion call.
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        pcm = b"\x01\x00\x02\x00\x03\x00"
        mp3_path = str(tmp_path / "out.mp3")
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(pcm)),
        ), patch("shutil.which", return_value="/usr/bin/ffmpeg"), patch(
            "subprocess.run"
        ) as mock_run:
            result = _generate_gemini_tts("hi", mp3_path, {})
        # ffmpeg should be called to convert .wav -> .mp3
        mock_run.assert_called_once()
        cmd = mock_run.call_args[0][0]
        assert cmd[0] == "/usr/bin/ffmpeg"
        assert mp3_path in cmd

    def test_wav_to_ogg_no_ffmpeg_renames(self, tmp_path, monkeypatch):
        # Without ffmpeg, the function keeps WAV bytes but honors the .ogg path.
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        pcm = b"\x01\x00\x02\x00"
        ogg_path = str(tmp_path / "out.ogg")
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(pcm)),
        ), patch("shutil.which", return_value=None):
            result = _generate_gemini_tts("hi", ogg_path, {})
        # Without ffmpeg, the WAV content gets renamed to .ogg path
        assert result == ogg_path
        assert os.path.exists(ogg_path)

    def test_url_error_surfaced_as_runtime_error(self, tmp_path, monkeypatch):
        # DNS / connection failures (URLError) are wrapped in RuntimeError.
        import urllib.error

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        err = urllib.error.URLError("Name or service not known")
        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=err):
            with pytest.raises(RuntimeError, match="connection failed"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})

View File

@@ -8,7 +8,16 @@ import pytest
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
for key in ("MISTRAL_API_KEY", "HERMES_SESSION_PLATFORM"):
for key in (
"MISTRAL_API_KEY",
"HERMES_SESSION_PLATFORM",
"MINIMAX_API_KEY",
"ELEVENLABS_API_KEY",
"OPENAI_API_KEY",
"VOICE_TOOLS_OPENAI_KEY",
"GEMINI_API_KEY",
"GOOGLE_API_KEY",
):
monkeypatch.delenv(key, raising=False)

View File

@@ -2,13 +2,14 @@
"""
Text-to-Speech Tool Module
Supports six TTS providers:
Supports eight TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
- xAI TTS: needs an xAI API key
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
- Gemini TTS: Google speech generation, 30 prebuilt voices, needs GEMINI_API_KEY or GOOGLE_API_KEY
Output formats:
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS)
@@ -35,7 +36,10 @@ import shutil
import subprocess
import tempfile
import threading
import urllib.error
import urllib.request
import uuid
import wave
from pathlib import Path
from typing import Callable, Dict, Any, Optional
from urllib.parse import urljoin
@@ -99,6 +103,12 @@ DEFAULT_XAI_LANGUAGE = "en"
DEFAULT_XAI_SAMPLE_RATE = 24000
DEFAULT_XAI_BIT_RATE = 128000
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
# Gemini TTS defaults — see https://ai.google.dev/gemini-api/docs/speech-generation
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
DEFAULT_GEMINI_TTS_VOICE = "Kore"
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
# PCM format returned by the API (used when wrapping raw audio in a WAV container).
GEMINI_TTS_SAMPLE_RATE = 24000  # Hz
GEMINI_TTS_CHANNELS = 1  # mono
GEMINI_TTS_SAMPLE_WIDTH = 2  # signed 16-bit PCM
def _get_default_output_dir() -> str:
from hermes_constants import get_hermes_dir
@@ -582,6 +592,114 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) ->
return output_path
# ===========================================================================
# Provider: Gemini TTS
# ===========================================================================
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate audio using Google's Gemini speech-generation API.

    Gemini returns base64-encoded PCM (signed 16-bit, 24 kHz, mono). This
    function wraps the PCM in a WAV container natively (no ffmpeg needed
    for the base case), then converts to the caller's requested extension
    via ffmpeg if available. Mirrors the NeuTTS output handling.

    Args:
        text: Text to synthesize.
        output_path: Destination file; its extension decides the final format.
        tts_config: Full TTS config dict; the "gemini" sub-dict may override
            model / voice / base_url.

    Returns:
        output_path (the written audio file).

    Raises:
        ValueError: no GEMINI_API_KEY / GOOGLE_API_KEY configured.
        RuntimeError: HTTP or network failure, malformed/empty audio
            response, or ffmpeg conversion failure.

    Reference: https://ai.google.dev/gemini-api/docs/speech-generation
    """
    api_key = (
        os.getenv("GEMINI_API_KEY")
        or os.getenv("GOOGLE_API_KEY")
        or ""
    ).strip()
    if not api_key:
        raise ValueError(
            "GEMINI_API_KEY (or GOOGLE_API_KEY) not set. "
            "Get one at https://aistudio.google.com/apikey"
        )

    gm_config = tts_config.get("gemini", {})
    model = gm_config.get("model", DEFAULT_GEMINI_TTS_MODEL)
    voice = gm_config.get("voice", DEFAULT_GEMINI_TTS_VOICE)
    base_url = gm_config.get("base_url", DEFAULT_GEMINI_TTS_BASE_URL).rstrip("/")
    endpoint = f"{base_url}/models/{model}:generateContent"

    payload = {
        "contents": [{"parts": [{"text": text}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {"voiceName": voice},
                },
            },
        },
    }
    body = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(
        endpoint,
        data=body,
        headers={
            "Content-Type": "application/json",
            "x-goog-api-key": api_key,
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=60) as resp:
            response_data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        # Truncate so a huge HTML error page can't flood logs.
        err_body = exc.read().decode("utf-8", errors="ignore")[:500]
        raise RuntimeError(f"Gemini TTS HTTP {exc.code}: {err_body}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"Gemini TTS connection failed: {exc.reason}") from exc

    # Scan all parts for inlineData — the model may emit a text part first.
    try:
        parts = response_data["candidates"][0]["content"]["parts"]
        audio_part = next((part for part in parts if "inlineData" in part), None)
        if audio_part is None:
            raise RuntimeError(
                f"Gemini TTS response missing audio payload: {str(response_data)[:300]}"
            )
        audio_b64 = audio_part["inlineData"]["data"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"Gemini TTS response missing audio payload: {str(response_data)[:300]}"
        ) from exc

    pcm_bytes = base64.b64decode(audio_b64)
    if not pcm_bytes:
        # Empty PCM would otherwise produce a "valid" 44-byte silent WAV.
        raise RuntimeError("Gemini TTS returned empty audio data")

    # Write PCM as WAV natively — ffmpeg is only needed if the caller
    # asked for a non-WAV extension (mp3/ogg).
    wav_path = output_path
    if not output_path.endswith(".wav"):
        # splitext (not rsplit(".")) so a dotted directory name can't be
        # mistaken for an extension when the filename itself has none.
        wav_path = os.path.splitext(output_path)[0] + ".wav"
    with wave.open(wav_path, "wb") as wf:
        wf.setnchannels(GEMINI_TTS_CHANNELS)
        wf.setsampwidth(GEMINI_TTS_SAMPLE_WIDTH)
        wf.setframerate(GEMINI_TTS_SAMPLE_RATE)
        wf.writeframes(pcm_bytes)

    if wav_path != output_path:
        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg:
            conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
            try:
                subprocess.run(conv_cmd, check=True, timeout=30)
            except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:
                # Don't leak the intermediate WAV, and surface the same
                # RuntimeError type as every other failure path here.
                try:
                    os.remove(wav_path)
                except OSError:
                    pass
                raise RuntimeError(f"Gemini TTS ffmpeg conversion failed: {exc}") from exc
            os.remove(wav_path)
        else:
            # No ffmpeg — keep WAV content but honor the caller's path.
            os.rename(wav_path, output_path)
    return output_path
# ===========================================================================
# Main tool function
# ===========================================================================
@@ -697,6 +815,10 @@ def text_to_speech_tool(
logger.info("Generating speech with NeuTTS (local)...")
_generate_neutts(text, file_str, tts_config)
elif provider == "gemini":
logger.info("Generating speech with Gemini TTS...")
_generate_gemini_tts(text, file_str, tts_config)
else:
# Default: Edge TTS (free), with NeuTTS as local fallback
edge_available = True
@@ -736,7 +858,7 @@ def text_to_speech_tool(
# Try Opus conversion for Telegram compatibility
# Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
voice_compatible = False
if provider in ("edge", "neutts", "minimax", "xai") and not file_str.endswith(".ogg"):
if provider in ("edge", "neutts", "minimax", "xai", "gemini") and not file_str.endswith(".ogg"):
opus_path = _convert_to_opus(file_str)
if opus_path:
file_str = opus_path
@@ -817,6 +939,8 @@ def check_tts_requirements() -> bool:
return True
except ImportError:
pass
if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"):
return True
if _check_neutts_available():
return True
return False

View File

@@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription
## Text-to-Speech
Convert text to speech with six providers:
Convert text to speech with seven providers:
| Provider | Quality | Cost | API Key |
|----------|---------|------|---------|
@@ -20,6 +20,7 @@ Convert text to speech with six providers:
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
| **NeuTTS** | Good | Free | None needed |
| **Gemini TTS** | Excellent | Paid (free tier) | `GEMINI_API_KEY` |
### Platform Delivery
@@ -62,6 +63,9 @@ tts:
ref_text: ''
model: neuphonic/neutts-air-q4-gguf
device: cpu
gemini:
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
voice: "Kore" # 30 prebuilt voices (Zephyr, Puck, Charon, ...)
```
**Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed).
@@ -74,6 +78,7 @@ Telegram voice bubbles require Opus/OGG audio format:
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
- **Gemini TTS** returns raw PCM (wrapped in WAV natively) and needs **ffmpeg** to convert for Telegram voice bubbles
```bash
# Ubuntu/Debian