diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index c843405417..1baced60fc 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -24,7 +24,7 @@ Usage: import logging import os from pathlib import Path -from typing import Optional +from typing import Optional, Dict, Any logger = logging.getLogger(__name__) @@ -32,8 +32,14 @@ logger = logging.getLogger(__name__) # Default STT model -- cheapest and widely available DEFAULT_STT_MODEL = "whisper-1" +# Supported audio formats +SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"} -def transcribe_audio(file_path: str, model: Optional[str] = None) -> dict: +# Maximum file size (25MB - OpenAI limit) +MAX_FILE_SIZE = 25 * 1024 * 1024 + + +def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, Any]: """ Transcribe an audio file using OpenAI's Whisper API. @@ -55,16 +61,50 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> dict: return { "success": False, "transcript": "", - "error": "VOICE_TOOLS_OPENAI_KEY not set", + "error": "VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY not set", } audio_path = Path(file_path) - if not audio_path.is_file(): + + # Validate file exists + if not audio_path.exists(): return { "success": False, "transcript": "", "error": f"Audio file not found: {file_path}", } + + if not audio_path.is_file(): + return { + "success": False, + "transcript": "", + "error": f"Path is not a file: {file_path}", + } + + # Validate file extension + if audio_path.suffix.lower() not in SUPPORTED_FORMATS: + return { + "success": False, + "transcript": "", + "error": f"Unsupported file format: {audio_path.suffix}. Supported formats: {', '.join(SUPPORTED_FORMATS)}", + } + + # Validate file size + try: + file_size = audio_path.stat().st_size + if file_size > MAX_FILE_SIZE: + return { + "success": False, + "transcript": "", + "error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024)}MB)", + } + except OSError as e: + logger.error("Failed to get file size for %s: %s", file_path, e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Failed to access file: {e}", + } # Use provided model, or fall back to default if model is None: @@ -72,6 +112,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> dict: try: from openai import OpenAI + from openai import APIError, APIConnectionError, APITimeoutError client = OpenAI(api_key=api_key, base_url="https://api.openai.com/v1") @@ -92,10 +133,45 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> dict: "transcript": transcript_text, } - except Exception as e: - logger.error("Transcription error: %s", e) + except FileNotFoundError: + logger.error("Audio file not found: %s", file_path, exc_info=True) return { "success": False, "transcript": "", - "error": str(e), + "error": f"Audio file not found: {file_path}", + } + except PermissionError: + logger.error("Permission denied accessing file: %s", file_path, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Permission denied: {file_path}", + } + except APIConnectionError as e: + logger.error("API connection error during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Connection error: {e}", + } + except APITimeoutError as e: + logger.error("API timeout during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Request timeout: {e}", + } + except APIError as e: + logger.error("OpenAI API error during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"API error: {e}", + } + except Exception as e: + logger.error("Unexpected error during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Transcription failed: {e}", }