mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-29 07:21:37 +08:00
Compare commits
3 Commits
fix/plugin
...
feat/gemin
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b945f9e0d5 | ||
|
|
acff9d36db | ||
|
|
0671201c05 |
@@ -26,7 +26,6 @@ model:
|
||||
# "huggingface" - Hugging Face Inference (requires: HF_TOKEN)
|
||||
# "xiaomi" - Xiaomi MiMo (requires: XIAOMI_API_KEY)
|
||||
# "arcee" - Arcee AI Trinity models (requires: ARCEEAI_API_KEY)
|
||||
# "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY — https://ollama.com/settings)
|
||||
# "kilocode" - KiloCode gateway (requires: KILOCODE_API_KEY)
|
||||
# "ai-gateway" - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY)
|
||||
#
|
||||
@@ -38,6 +37,12 @@ model:
|
||||
# base_url: "http://localhost:1234/v1"
|
||||
# No API key needed — local servers typically ignore auth.
|
||||
#
|
||||
# For Ollama Cloud (https://ollama.com/pricing):
|
||||
# provider: "custom"
|
||||
# base_url: "https://ollama.com/v1"
|
||||
# Set OLLAMA_API_KEY in .env — automatically picked up when base_url
|
||||
# points to ollama.com.
|
||||
#
|
||||
# Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var.
|
||||
provider: "auto"
|
||||
|
||||
@@ -332,7 +337,6 @@ compression:
|
||||
# "openrouter" - Force OpenRouter (requires OPENROUTER_API_KEY)
|
||||
# "nous" - Force Nous Portal (requires: hermes login)
|
||||
# "gemini" - Force Google AI Studio direct (requires: GOOGLE_API_KEY or GEMINI_API_KEY)
|
||||
# "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY)
|
||||
# "codex" - Force Codex OAuth (requires: hermes model → Codex).
|
||||
# Uses gpt-5.3-codex which supports vision.
|
||||
# "main" - Use your custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY).
|
||||
@@ -593,7 +597,7 @@ platform_toolsets:
|
||||
# skills_hub - skill_hub (search/install/manage from online registries — user-driven only)
|
||||
# moa - mixture_of_agents (requires OPENROUTER_API_KEY)
|
||||
# todo - todo (in-memory task planning, no deps)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/XAI/MINIMAX/MISTRAL/GEMINI key)
|
||||
# cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
|
||||
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
|
||||
#
|
||||
@@ -622,7 +626,7 @@ platform_toolsets:
|
||||
# todo - Task planning and tracking for multi-step work
|
||||
# memory - Persistent memory across sessions (personal notes + user profile)
|
||||
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, xAI, MiniMax, Mistral, Gemini)
|
||||
# cronjob - Schedule and manage automated tasks (CLI-only)
|
||||
# rl - RL training tools (Tinker-Atropos)
|
||||
#
|
||||
|
||||
@@ -61,9 +61,7 @@ from hermes_cli.colors import Colors, color
|
||||
from hermes_cli.default_soul import DEFAULT_SOUL_MD
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Managed mode (NixOS declarative config)
|
||||
# =============================================================================
|
||||
|
||||
_MANAGED_TRUE_VALUES = ("true", "1", "yes")
|
||||
_MANAGED_SYSTEM_NAMES = {
|
||||
@@ -147,9 +145,7 @@ def managed_error(action: str = "modify configuration"):
|
||||
print(format_managed_message(action), file=sys.stderr)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Container-aware CLI (NixOS container mode)
|
||||
# =============================================================================
|
||||
|
||||
def get_container_exec_info() -> Optional[dict]:
|
||||
"""Read container mode metadata from HERMES_HOME/.container-mode.
|
||||
@@ -196,9 +192,7 @@ def get_container_exec_info() -> Optional[dict]:
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Config paths
|
||||
# =============================================================================
|
||||
|
||||
# Re-export from hermes_constants — canonical definition lives there.
|
||||
from hermes_constants import get_hermes_home # noqa: F811,E402
|
||||
@@ -335,9 +329,7 @@ def _ensure_hermes_home_managed(home: Path):
|
||||
_ensure_default_soul_md(home)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Config loading/saving
|
||||
# =============================================================================
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
"model": "",
|
||||
@@ -570,7 +562,7 @@ DEFAULT_CONFIG = {
|
||||
|
||||
# Text-to-speech configuration
|
||||
"tts": {
|
||||
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local)
|
||||
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local) | "gemini"
|
||||
"edge": {
|
||||
"voice": "en-US-AriaNeural",
|
||||
# Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
|
||||
@@ -600,6 +592,15 @@ DEFAULT_CONFIG = {
|
||||
"model": "neuphonic/neutts-air-q4-gguf", # HuggingFace model repo
|
||||
"device": "cpu", # cpu, cuda, or mps
|
||||
},
|
||||
"gemini": {
|
||||
"model": "gemini-2.5-flash-preview-tts",
|
||||
"voice": "Kore",
|
||||
# 30 prebuilt voices: Zephyr, Puck, Charon, Kore, Fenrir, Leda,
|
||||
# Orus, Aoede, Callirrhoe, Autonoe, Enceladus, Iapetus, Umbriel,
|
||||
# Algieba, Despina, Erinome, Algenib, Rasalgethi, Laomedeia,
|
||||
# Achernar, Alnilam, Schedar, Gacrux, Pulcherrima, Achird,
|
||||
# Zubenelgenubi, Vindemiatrix, Sadachbia, Sadaltager, Sulafat
|
||||
},
|
||||
},
|
||||
|
||||
"stt": {
|
||||
@@ -781,9 +782,7 @@ DEFAULT_CONFIG = {
|
||||
"_config_version": 18,
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Config Migration System
|
||||
# =============================================================================
|
||||
|
||||
# Track which env vars were introduced in each config version.
|
||||
# Migration only mentions vars new since the user's previous version.
|
||||
@@ -1901,9 +1900,7 @@ def check_config_version() -> Tuple[int, int]:
|
||||
return current, latest
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Config structure validation
|
||||
# =============================================================================
|
||||
|
||||
# Fields that are valid at root level of config.yaml
|
||||
_KNOWN_ROOT_KEYS = {
|
||||
@@ -3167,9 +3164,7 @@ def get_env_value(key: str) -> Optional[str]:
|
||||
return env_vars.get(key)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Config display
|
||||
# =============================================================================
|
||||
|
||||
def redact_key(key: str) -> str:
|
||||
"""Redact an API key for display."""
|
||||
@@ -3461,9 +3456,7 @@ def set_config_value(key: str, value: str):
|
||||
print(f"✓ Set {key} = {value} in {config_path}")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Command handler
|
||||
# =============================================================================
|
||||
|
||||
def config_command(args):
|
||||
"""Handle config subcommands."""
|
||||
|
||||
@@ -433,6 +433,10 @@ def _print_setup_summary(config: dict, hermes_home):
|
||||
tool_status.append(("Text-to-Speech (MiniMax)", True, None))
|
||||
elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"):
|
||||
tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None))
|
||||
elif tts_provider == "gemini" and (
|
||||
get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")
|
||||
):
|
||||
tool_status.append(("Text-to-Speech (Gemini)", True, None))
|
||||
elif tts_provider == "neutts":
|
||||
try:
|
||||
import importlib.util
|
||||
@@ -924,6 +928,7 @@ def _setup_tts_provider(config: dict):
|
||||
"minimax": "MiniMax TTS",
|
||||
"mistral": "Mistral Voxtral TTS",
|
||||
"neutts": "NeuTTS",
|
||||
"gemini": "Gemini TTS",
|
||||
}
|
||||
current_label = provider_labels.get(current_provider, current_provider)
|
||||
|
||||
@@ -946,9 +951,10 @@ def _setup_tts_provider(config: dict):
|
||||
"MiniMax TTS (high quality with voice cloning, needs API key)",
|
||||
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
|
||||
"NeuTTS (local on-device, free, ~300MB model download)",
|
||||
"Gemini TTS (Google speech generation, 30 voices, needs GEMINI_API_KEY)",
|
||||
]
|
||||
)
|
||||
providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts"])
|
||||
providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts", "gemini"])
|
||||
choices.append(f"Keep current ({current_label})")
|
||||
keep_current_idx = len(choices) - 1
|
||||
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
|
||||
@@ -1055,6 +1061,19 @@ def _setup_tts_provider(config: dict):
|
||||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
elif selected == "gemini":
|
||||
existing = get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")
|
||||
if not existing:
|
||||
print()
|
||||
print_info("Get a key at https://aistudio.google.com/apikey")
|
||||
api_key = prompt("Gemini API key for TTS", password=True)
|
||||
if api_key:
|
||||
save_env_value("GEMINI_API_KEY", api_key)
|
||||
print_success("Gemini API key saved")
|
||||
else:
|
||||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
# Save the selection
|
||||
if "tts" not in config:
|
||||
config["tts"] = {}
|
||||
|
||||
@@ -186,6 +186,7 @@ AUTHOR_MAP = {
|
||||
"danieldoderlein@users.noreply.github.com": "danieldoderlein",
|
||||
"lrawnsley@users.noreply.github.com": "lrawnsley",
|
||||
"taeuk178@users.noreply.github.com": "taeuk178",
|
||||
"zhonghui5207@users.noreply.github.com": "zhonghui5207",
|
||||
"ogzerber@users.noreply.github.com": "ogzerber",
|
||||
"cola-runner@users.noreply.github.com": "cola-runner",
|
||||
"ygd58@users.noreply.github.com": "ygd58",
|
||||
|
||||
330
tests/tools/test_tts_gemini.py
Normal file
330
tests/tools/test_tts_gemini.py
Normal file
@@ -0,0 +1,330 @@
|
||||
"""Tests for the Gemini TTS provider in tools/tts_tool.py."""
|
||||
|
||||
import base64
|
||||
import os
|
||||
import struct
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Remove provider-related environment variables before every test.

    Runs automatically (``autouse=True``) so a key present in the developer's
    shell cannot leak into a test; each test sets only the variables it needs.
    """
    scrubbed_vars = [
        "GEMINI_API_KEY",
        "GOOGLE_API_KEY",
        "HERMES_SESSION_PLATFORM",
        "MINIMAX_API_KEY",
        "ELEVENLABS_API_KEY",
        "OPENAI_API_KEY",
        "VOICE_TOOLS_OPENAI_KEY",
        "MISTRAL_API_KEY",
    ]
    for var_name in scrubbed_vars:
        # raising=False: a variable that is already absent is not an error.
        monkeypatch.delenv(var_name, raising=False)
|
||||
|
||||
|
||||
def _gemini_response(pcm_bytes: bytes) -> dict:
    """Build a fake Gemini response dict carrying *pcm_bytes* as base64.

    The result has a single candidate whose content holds a single part with
    the audio under ``inlineData.data``.
    """
    encoded_audio = base64.b64encode(pcm_bytes).decode()
    audio_part = {"inlineData": {"data": encoded_audio}}
    candidate = {"content": {"parts": [audio_part]}}
    return {"candidates": [candidate]}
|
||||
|
||||
|
||||
def _mock_urlopen(response_payload: dict):
    """Return a mock usable as the result of a patched ``urlopen`` call.

    Args:
        response_payload: JSON-serializable dict the fake response should carry.

    Returns:
        A ``MagicMock`` whose ``read()`` yields *response_payload* encoded as
        UTF-8 JSON bytes and which works as a context manager that yields
        itself, so ``with urlopen(...) as resp: resp.read()`` behaves as
        expected.
    """
    # Plain function-scope import instead of ``__import__("json")``; it stays
    # local so the helper does not depend on a file-level import.
    import json

    fake_response = MagicMock()
    fake_response.read.return_value = json.dumps(response_payload).encode("utf-8")
    fake_response.__enter__ = MagicMock(return_value=fake_response)
    # Returning False from __exit__ lets exceptions raised inside the
    # ``with`` body propagate.
    fake_response.__exit__ = MagicMock(return_value=False)
    return fake_response
|
||||
|
||||
|
||||
class TestGenerateGeminiTts:
    """Tests for ``_generate_gemini_tts`` with ``urlopen`` patched out."""

    def test_missing_api_key_raises_value_error(self, tmp_path):
        from tools.tts_tool import _generate_gemini_tts

        destination = str(tmp_path / "out.wav")
        with pytest.raises(ValueError, match="GEMINI_API_KEY"):
            _generate_gemini_tts("Hello", destination, {})

    def test_google_api_key_fallback_accepted(self, tmp_path, monkeypatch):
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GOOGLE_API_KEY", "test-key")
        destination = str(tmp_path / "out.wav")
        fake_response = _mock_urlopen(_gemini_response(b"\x01\x00\x02\x00\x03\x00"))
        with patch("tools.tts_tool.urllib.request.urlopen", return_value=fake_response):
            returned_path = _generate_gemini_tts("Hi", destination, {})

        assert returned_path == destination

    def test_writes_wav_with_correct_pcm_params(self, tmp_path, monkeypatch):
        import wave

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        samples = struct.pack("<6h", 0, 1, 2, 3, 4, 5)
        wav_file = tmp_path / "out.wav"
        fake_response = _mock_urlopen(_gemini_response(samples))
        with patch("tools.tts_tool.urllib.request.urlopen", return_value=fake_response):
            _generate_gemini_tts("Hi", str(wav_file), {})

        # Read the file back and compare the header fields and raw frames.
        with wave.open(str(wav_file), "rb") as reader:
            channels = reader.getnchannels()
            sample_width = reader.getsampwidth()
            frame_rate = reader.getframerate()
            frames = reader.readframes(reader.getnframes())

        assert channels == 1
        assert sample_width == 2
        assert frame_rate == 24000
        assert frames == samples

    def test_default_model_and_voice_in_payload(self, tmp_path, monkeypatch):
        import json as _json

        from tools.tts_tool import (
            DEFAULT_GEMINI_TTS_MODEL,
            DEFAULT_GEMINI_TTS_VOICE,
            _generate_gemini_tts,
        )

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        seen = {}

        def recording_urlopen(req, timeout=None):
            # Record what the function under test actually sent.
            seen.update(
                url=req.full_url,
                body=_json.loads(req.data.decode()),
                headers=dict(req.headers),
            )
            return _mock_urlopen(_gemini_response(b"\x00\x00"))

        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=recording_urlopen):
            _generate_gemini_tts("hello", str(tmp_path / "out.wav"), {})

        assert DEFAULT_GEMINI_TTS_MODEL in seen["url"]
        speech_config = seen["body"]["generationConfig"]["speechConfig"]
        prebuilt_voice = speech_config["voiceConfig"]["prebuiltVoiceConfig"]
        assert prebuilt_voice["voiceName"] == DEFAULT_GEMINI_TTS_VOICE
        # urllib capitalizes header names, hence "X-goog-api-key".
        assert seen["headers"].get("X-goog-api-key") == "test-key"

    def test_config_overrides(self, tmp_path, monkeypatch):
        import json as _json

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        seen = {}

        def recording_urlopen(req, timeout=None):
            seen.update(url=req.full_url, body=_json.loads(req.data.decode()))
            return _mock_urlopen(_gemini_response(b"\x00\x00"))

        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=recording_urlopen):
            overrides = {"gemini": {"model": "gemini-2.5-pro-preview-tts", "voice": "Puck"}}
            _generate_gemini_tts("hi", str(tmp_path / "out.wav"), overrides)

        assert "gemini-2.5-pro-preview-tts" in seen["url"]
        speech_config = seen["body"]["generationConfig"]["speechConfig"]
        prebuilt_voice = speech_config["voiceConfig"]["prebuiltVoiceConfig"]
        assert prebuilt_voice["voiceName"] == "Puck"

    def test_http_error_surfaced_as_runtime_error(self, tmp_path, monkeypatch):
        import urllib.error

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        rate_limited = urllib.error.HTTPError(
            "https://example", 429, "Too Many Requests", {}, None
        )
        # The error was built without a body stream, so stub read() directly.
        rate_limited.read = MagicMock(return_value=b'{"error": "rate limit"}')

        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=rate_limited):
            with pytest.raises(RuntimeError, match="429"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})

    def test_missing_audio_payload_raises_runtime_error(self, tmp_path, monkeypatch):
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        response_without_audio = {"candidates": [{"content": {"parts": []}}]}
        fake_response = _mock_urlopen(response_without_audio)
        with patch("tools.tts_tool.urllib.request.urlopen", return_value=fake_response):
            with pytest.raises(RuntimeError, match="missing audio payload"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})
|
||||
|
||||
|
||||
class TestTtsDispatcherGemini:
    """Checks that ``text_to_speech_tool`` reports the gemini provider."""

    def test_dispatcher_routes_to_gemini(self, tmp_path, monkeypatch):
        import json

        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        samples = struct.pack("<2h", 100, -100)
        urlopen_patch = patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(samples)),
        )
        config_patch = patch(
            "tools.tts_tool._load_tts_config", return_value={"provider": "gemini"}
        )
        with urlopen_patch, config_patch:
            # An explicit .wav path is requested; per the original author's
            # note this is meant to avoid the ffmpeg / Opus conversion branch.
            output_path = str(tmp_path / "out.wav")
            raw_result = text_to_speech_tool("Hello", output_path=output_path)
            result = json.loads(raw_result)

        assert result["success"] is True
        assert result["provider"] == "gemini"
|
||||
|
||||
|
||||
class TestCheckTtsRequirementsGemini:
    """Tests for how Gemini API keys affect ``check_tts_requirements``."""

    @staticmethod
    def _call_with_other_providers_disabled(check):
        """Call *check* while every non-Gemini provider probe is patched out.

        The four import helpers are made to raise ``ImportError`` and the
        NeuTTS availability probe returns ``False``, so only the environment
        variables set by the calling test can influence the result.
        """
        no_edge = patch("tools.tts_tool._import_edge_tts", side_effect=ImportError)
        no_elevenlabs = patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError)
        no_openai = patch("tools.tts_tool._import_openai_client", side_effect=ImportError)
        no_mistral = patch("tools.tts_tool._import_mistral_client", side_effect=ImportError)
        no_neutts = patch("tools.tts_tool._check_neutts_available", return_value=False)
        with no_edge, no_elevenlabs, no_openai, no_mistral, no_neutts:
            return check()

    def test_gemini_key_returns_true(self, monkeypatch):
        from tools.tts_tool import check_tts_requirements

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        outcome = self._call_with_other_providers_disabled(check_tts_requirements)
        assert outcome is True

    def test_google_api_key_also_accepted(self, monkeypatch):
        from tools.tts_tool import check_tts_requirements

        monkeypatch.setenv("GOOGLE_API_KEY", "test-key")
        outcome = self._call_with_other_providers_disabled(check_tts_requirements)
        assert outcome is True

    def test_no_key_returns_false(self):
        from tools.tts_tool import check_tts_requirements

        outcome = self._call_with_other_providers_disabled(check_tts_requirements)
        assert outcome is False
|
||||
|
||||
|
||||
class TestGeminiTtsEdgeCases:
    """Edge cases and output-conversion paths of ``_generate_gemini_tts``."""

    def test_empty_pcm_raises_runtime_error(self, tmp_path, monkeypatch):
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        fake_response = _mock_urlopen(_gemini_response(b""))
        with patch("tools.tts_tool.urllib.request.urlopen", return_value=fake_response):
            with pytest.raises(RuntimeError, match="empty audio data"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})

    def test_text_part_before_audio_is_handled(self, tmp_path, monkeypatch):
        """Audio is still extracted when a text part precedes the audio part."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        samples = b"\x01\x00\x02\x00"
        text_part = {"text": "Here is your audio"}
        audio_part = {"inlineData": {"data": base64.b64encode(samples).decode()}}
        text_then_audio = {
            "candidates": [{"content": {"parts": [text_part, audio_part]}}]
        }
        destination = str(tmp_path / "out.wav")
        fake_response = _mock_urlopen(text_then_audio)
        with patch("tools.tts_tool.urllib.request.urlopen", return_value=fake_response):
            returned_path = _generate_gemini_tts("hi", destination, {})

        assert returned_path == destination

    def test_base_url_config_override(self, tmp_path, monkeypatch):
        import json as _json

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        seen = {}

        def recording_urlopen(req, timeout=None):
            seen["url"] = req.full_url
            return _mock_urlopen(_gemini_response(b"\x00\x00"))

        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=recording_urlopen):
            overrides = {"gemini": {"base_url": "https://custom.api.example.com/v1"}}
            _generate_gemini_tts("hi", str(tmp_path / "out.wav"), overrides)

        assert "custom.api.example.com" in seen["url"]

    def test_wav_to_mp3_conversion_with_ffmpeg(self, tmp_path, monkeypatch):
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        samples = b"\x01\x00\x02\x00\x03\x00"
        mp3_path = str(tmp_path / "out.mp3")

        urlopen_patch = patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(samples)),
        )
        which_patch = patch("shutil.which", return_value="/usr/bin/ffmpeg")
        run_patch = patch("subprocess.run")
        with urlopen_patch, which_patch, run_patch as fake_run:
            _generate_gemini_tts("hi", mp3_path, {})

        # Exactly one ffmpeg invocation, targeting the requested .mp3 path.
        fake_run.assert_called_once()
        ffmpeg_argv = fake_run.call_args[0][0]
        assert ffmpeg_argv[0] == "/usr/bin/ffmpeg"
        assert mp3_path in ffmpeg_argv

    def test_wav_to_ogg_no_ffmpeg_renames(self, tmp_path, monkeypatch):
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        samples = b"\x01\x00\x02\x00"
        ogg_path = str(tmp_path / "out.ogg")

        urlopen_patch = patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(samples)),
        )
        which_patch = patch("shutil.which", return_value=None)
        with urlopen_patch, which_patch:
            returned_path = _generate_gemini_tts("hi", ogg_path, {})

        # With no ffmpeg found, a file must still exist at the .ogg path.
        assert returned_path == ogg_path
        assert os.path.exists(ogg_path)

    def test_url_error_surfaced_as_runtime_error(self, tmp_path, monkeypatch):
        import urllib.error

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        dns_failure = urllib.error.URLError("Name or service not known")

        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=dns_failure):
            with pytest.raises(RuntimeError, match="connection failed"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})
|
||||
@@ -8,7 +8,16 @@ import pytest
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def clean_env(monkeypatch):
|
||||
for key in ("MISTRAL_API_KEY", "HERMES_SESSION_PLATFORM"):
|
||||
for key in (
|
||||
"MISTRAL_API_KEY",
|
||||
"HERMES_SESSION_PLATFORM",
|
||||
"MINIMAX_API_KEY",
|
||||
"ELEVENLABS_API_KEY",
|
||||
"OPENAI_API_KEY",
|
||||
"VOICE_TOOLS_OPENAI_KEY",
|
||||
"GEMINI_API_KEY",
|
||||
"GOOGLE_API_KEY",
|
||||
):
|
||||
monkeypatch.delenv(key, raising=False)
|
||||
|
||||
|
||||
|
||||
@@ -2,13 +2,14 @@
|
||||
"""
|
||||
Text-to-Speech Tool Module
|
||||
|
||||
Supports six TTS providers:
|
||||
Supports seven TTS providers:
|
||||
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
|
||||
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
|
||||
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
|
||||
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
|
||||
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
|
||||
- Gemini TTS: Google speech generation, 30 prebuilt voices, needs GEMINI_API_KEY or GOOGLE_API_KEY
|
||||
|
||||
Output formats:
|
||||
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS)
|
||||
@@ -35,7 +36,10 @@ import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
import uuid
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, Any, Optional
|
||||
from urllib.parse import urljoin
|
||||
@@ -99,6 +103,12 @@ DEFAULT_XAI_LANGUAGE = "en"
|
||||
DEFAULT_XAI_SAMPLE_RATE = 24000
|
||||
DEFAULT_XAI_BIT_RATE = 128000
|
||||
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
|
||||
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
|
||||
DEFAULT_GEMINI_TTS_VOICE = "Kore"
|
||||
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
|
||||
GEMINI_TTS_SAMPLE_RATE = 24000
|
||||
GEMINI_TTS_CHANNELS = 1
|
||||
GEMINI_TTS_SAMPLE_WIDTH = 2 # signed 16-bit PCM
|
||||
|
||||
def _get_default_output_dir() -> str:
|
||||
from hermes_constants import get_hermes_dir
|
||||
@@ -582,6 +592,114 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) ->
|
||||
return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Provider: Gemini TTS
|
||||
# ===========================================================================
|
||||
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate audio using Google's Gemini speech-generation API.

    Gemini returns base64-encoded PCM (signed 16-bit, 24 kHz, mono). The PCM
    is wrapped in a WAV container with the standard library, so ffmpeg is only
    needed when the caller asks for a different container.

    Reference: https://ai.google.dev/gemini-api/docs/speech-generation

    Args:
        text: Text to synthesize.
        output_path: Destination file. A ``.wav`` suffix (any letter case) or
            no suffix at all receives the WAV data directly. Any other suffix
            is produced by converting a sibling ``<stem>.wav`` staging file
            with ffmpeg; when ffmpeg is not on PATH the WAV data is stored
            under the requested name unchanged.
        tts_config: TTS configuration mapping. The optional ``"gemini"``
            sub-mapping may provide ``"model"``, ``"voice"`` and
            ``"base_url"``; missing or empty values fall back to the module
            defaults.

    Returns:
        ``output_path``, unchanged.

    Raises:
        ValueError: Neither GEMINI_API_KEY nor GOOGLE_API_KEY is set.
        RuntimeError: HTTP error, connection failure, or a response that is
            not JSON, carries no audio part, or carries undecodable or empty
            audio.
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg did not finish within 30 seconds.
    """
    # Strip each variable before falling back, so a blank GEMINI_API_KEY does
    # not hide a usable GOOGLE_API_KEY.
    api_key = ""
    for env_name in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
        api_key = (os.getenv(env_name) or "").strip()
        if api_key:
            break
    if not api_key:
        raise ValueError(
            "GEMINI_API_KEY (or GOOGLE_API_KEY) not set. "
            "Get one at https://aistudio.google.com/apikey"
        )

    # ``or`` rather than a .get() default: a key that is present but null
    # (e.g. ``gemini:`` with no value in YAML) must also use the defaults.
    gm_config = (tts_config or {}).get("gemini") or {}
    model = gm_config.get("model") or DEFAULT_GEMINI_TTS_MODEL
    voice = gm_config.get("voice") or DEFAULT_GEMINI_TTS_VOICE
    base_url = (gm_config.get("base_url") or DEFAULT_GEMINI_TTS_BASE_URL).rstrip("/")

    endpoint = f"{base_url}/models/{model}:generateContent"
    payload = {
        "contents": [{"parts": [{"text": text}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {"voiceName": voice},
                },
            },
        },
    }

    req = urllib.request.Request(
        endpoint,
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "x-goog-api-key": api_key,
        },
        method="POST",
    )

    try:
        with urllib.request.urlopen(req, timeout=60) as resp:
            raw_body = resp.read()
    except urllib.error.HTTPError as exc:
        err_body = exc.read().decode("utf-8", errors="ignore")[:500]
        raise RuntimeError(f"Gemini TTS HTTP {exc.code}: {err_body}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"Gemini TTS connection failed: {exc.reason}") from exc
    except OSError as exc:
        # Socket-level failures while reading the body (e.g. a read timeout)
        # are not wrapped in URLError by urllib.
        raise RuntimeError(f"Gemini TTS connection failed: {exc}") from exc

    try:
        response_data = json.loads(raw_body.decode("utf-8"))
    except ValueError as exc:
        # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        # Re-raise as RuntimeError so a bad server reply is not confused with
        # the ValueError used above for a missing API key.
        raise RuntimeError(
            f"Gemini TTS returned a non-JSON response: {raw_body[:300]!r}"
        ) from exc

    missing_payload_msg = (
        f"Gemini TTS response missing audio payload: {str(response_data)[:300]}"
    )
    audio_b64 = None
    try:
        # The audio is not necessarily the first part; take the first part
        # that carries inline data.
        for part in response_data["candidates"][0]["content"]["parts"]:
            if "inlineData" in part:
                audio_b64 = part["inlineData"]["data"]
                break
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(missing_payload_msg) from exc
    if audio_b64 is None:
        raise RuntimeError(missing_payload_msg)

    try:
        pcm_bytes = base64.b64decode(audio_b64)
    except (ValueError, TypeError) as exc:
        # binascii.Error (bad padding) is a ValueError subclass.
        raise RuntimeError("Gemini TTS returned undecodable audio data") from exc
    if not pcm_bytes:
        raise RuntimeError("Gemini TTS returned empty audio data")

    # splitext only inspects the final path component, so a dot in a parent
    # directory name cannot redirect the staging file elsewhere.
    stem, ext = os.path.splitext(output_path)
    # Without a suffix ffmpeg has nothing to choose an output format from,
    # so such paths receive the WAV data directly.
    needs_conversion = bool(ext) and ext.lower() != ".wav"
    wav_path = f"{stem}.wav" if needs_conversion else output_path

    with wave.open(wav_path, "wb") as wf:
        wf.setnchannels(GEMINI_TTS_CHANNELS)
        wf.setsampwidth(GEMINI_TTS_SAMPLE_WIDTH)
        wf.setframerate(GEMINI_TTS_SAMPLE_RATE)
        wf.writeframes(pcm_bytes)

    if needs_conversion:
        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg:
            conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
            try:
                subprocess.run(conv_cmd, check=True, timeout=30)
            finally:
                # Remove the staging file even when ffmpeg fails or times out.
                try:
                    os.remove(wav_path)
                except FileNotFoundError:
                    pass
        else:
            # No ffmpeg — keep WAV content but honor the caller's path.
            # os.replace overwrites an existing target on every platform,
            # matching ffmpeg's ``-y`` behavior above.
            os.replace(wav_path, output_path)

    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Main tool function
|
||||
# ===========================================================================
|
||||
@@ -697,6 +815,10 @@ def text_to_speech_tool(
|
||||
logger.info("Generating speech with NeuTTS (local)...")
|
||||
_generate_neutts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "gemini":
|
||||
logger.info("Generating speech with Gemini TTS...")
|
||||
_generate_gemini_tts(text, file_str, tts_config)
|
||||
|
||||
else:
|
||||
# Default: Edge TTS (free), with NeuTTS as local fallback
|
||||
edge_available = True
|
||||
@@ -736,7 +858,7 @@ def text_to_speech_tool(
|
||||
# Try Opus conversion for Telegram compatibility
|
||||
# Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
|
||||
voice_compatible = False
|
||||
if provider in ("edge", "neutts", "minimax", "xai") and not file_str.endswith(".ogg"):
|
||||
if provider in ("edge", "neutts", "minimax", "xai", "gemini") and not file_str.endswith(".ogg"):
|
||||
opus_path = _convert_to_opus(file_str)
|
||||
if opus_path:
|
||||
file_str = opus_path
|
||||
@@ -817,6 +939,8 @@ def check_tts_requirements() -> bool:
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"):
|
||||
return True
|
||||
if _check_neutts_available():
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription
|
||||
|
||||
## Text-to-Speech
|
||||
|
||||
Convert text to speech with six providers:
|
||||
Convert text to speech with seven providers:
|
||||
|
||||
| Provider | Quality | Cost | API Key |
|
||||
|----------|---------|------|---------|
|
||||
@@ -20,6 +20,7 @@ Convert text to speech with six providers:
|
||||
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
|
||||
| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
|
||||
| **NeuTTS** | Good | Free | None needed |
|
||||
| **Gemini TTS** | Excellent | Paid (free tier) | `GEMINI_API_KEY` (or `GOOGLE_API_KEY`) |
|
||||
|
||||
### Platform Delivery
|
||||
|
||||
@@ -62,6 +63,9 @@ tts:
|
||||
ref_text: ''
|
||||
model: neuphonic/neutts-air-q4-gguf
|
||||
device: cpu
|
||||
gemini:
|
||||
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
|
||||
voice: "Kore" # 30 prebuilt voices (Zephyr, Puck, Charon, ...)
|
||||
```
|
||||
|
||||
**Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed).
|
||||
@@ -74,6 +78,7 @@ Telegram voice bubbles require Opus/OGG audio format:
|
||||
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
|
||||
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
|
||||
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
|
||||
- **Gemini TTS** returns raw PCM (wrapped in WAV natively) and needs **ffmpeg** to convert for Telegram voice bubbles
|
||||
|
||||
```bash
|
||||
# Ubuntu/Debian
|
||||
|
||||
Reference in New Issue
Block a user