From 5e4fa297eb9b4e828acafd895116073e43d36fea Mon Sep 17 00:00:00 2001 From: noestelar Date: Wed, 25 Mar 2026 08:36:13 -0700 Subject: [PATCH] fix(whatsapp): download documents, audio, and video media from messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add downloadMediaMessage() calls for documents, audio/voice notes, and video in bridge.js — previously only images were downloaded, leaving all other file types inaccessible to the agent. Handle local file paths from the bridge for DOCUMENT, VOICE, and VIDEO types in whatsapp.py with proper MIME detection. Inject text content inline for readable files (.txt, .md, .csv, .json, etc.). Follow-up fixes applied during salvage: - Remove unused cache_document_from_bytes import - Add 100KB size cap on text injection (matches Telegram/Discord/Slack) - Align injection format with other platforms Cherry-picked from PR #2818. Also fixes #2856 (bugs 1 & 2). PR #2865 by ayberkesn fixed the same voice note issue. --- gateway/platforms/whatsapp.py | 54 +++++++++++++++++++++++++++++-- scripts/whatsapp-bridge/bridge.js | 38 +++++++++++++++++++++- 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py index 6697800e509..6ab68a627dd 100644 --- a/gateway/platforms/whatsapp.py +++ b/gateway/platforms/whatsapp.py @@ -74,6 +74,7 @@ from gateway.platforms.base import ( MessageEvent, MessageType, SendResult, + SUPPORTED_DOCUMENT_TYPES, cache_image_from_url, cache_audio_from_url, ) @@ -665,7 +666,7 @@ class WhatsAppAdapter(BasePlatformAdapter): user_name=data.get("senderName"), ) - # Download image media URLs to the local cache so the vision tool + # Download media URLs to the local cache so agent tools # can access them reliably regardless of URL expiration. 
raw_urls = data.get("mediaUrls", []) cached_urls = [] @@ -696,12 +697,59 @@ class WhatsAppAdapter(BasePlatformAdapter): print(f"[{self.name}] Failed to cache voice: {e}", flush=True) cached_urls.append(url) media_types.append("audio/ogg") + elif msg_type == MessageType.VOICE and os.path.isabs(url): + # Local file path — bridge already downloaded the audio + cached_urls.append(url) + media_types.append("audio/ogg") + print(f"[{self.name}] Using bridge-cached audio: {url}", flush=True) + elif msg_type == MessageType.DOCUMENT and os.path.isabs(url): + # Local file path — bridge already downloaded the document + cached_urls.append(url) + ext = Path(url).suffix.lower() + mime = SUPPORTED_DOCUMENT_TYPES.get(ext, "application/octet-stream") + media_types.append(mime) + print(f"[{self.name}] Using bridge-cached document: {url}", flush=True) + elif msg_type == MessageType.VIDEO and os.path.isabs(url): + cached_urls.append(url) + media_types.append("video/mp4") + print(f"[{self.name}] Using bridge-cached video: {url}", flush=True) else: cached_urls.append(url) media_types.append("unknown") - + + # For text-readable documents, inject file content directly into + # the message text so the agent can read it inline. + # Cap at 100KB to match Telegram/Discord/Slack behaviour. 
+ body = data.get("body", "") + MAX_TEXT_INJECT_BYTES = 100 * 1024 + if msg_type == MessageType.DOCUMENT and cached_urls: + for doc_path in cached_urls: + ext = Path(doc_path).suffix.lower() + if ext in (".txt", ".md", ".csv", ".json", ".xml", ".yaml", ".yml", ".log", ".py", ".js", ".ts", ".html", ".css"): + try: + file_size = Path(doc_path).stat().st_size + if file_size > MAX_TEXT_INJECT_BYTES: + print(f"[{self.name}] Skipping text injection for {doc_path} ({file_size} bytes > {MAX_TEXT_INJECT_BYTES})", flush=True) + continue + content = Path(doc_path).read_text(errors="replace") + fname = Path(doc_path).name + # Remove the doc_<hex>_ prefix for display + display_name = fname + if "_" in fname: + parts = fname.split("_", 2) + if len(parts) >= 3: + display_name = parts[2] + injection = f"[Content of {display_name}]:\n{content}" + if body: + body = f"{injection}\n\n{body}" + else: + body = injection + print(f"[{self.name}] Injected text content from: {doc_path}", flush=True) + except Exception as e: + print(f"[{self.name}] Failed to read document text: {e}", flush=True) + return MessageEvent( - text=data.get("body", ""), + text=body, message_type=msg_type, source=source, raw_message=data, diff --git a/scripts/whatsapp-bridge/bridge.js b/scripts/whatsapp-bridge/bridge.js index c573aa89b54..0dff8c2e241 100644 --- a/scripts/whatsapp-bridge/bridge.js +++ b/scripts/whatsapp-bridge/bridge.js @@ -43,6 +43,8 @@ const WHATSAPP_DEBUG = const PORT = parseInt(getArg('port', '3000'), 10); const SESSION_DIR = getArg('session', path.join(process.env.HOME || '~', '.hermes', 'whatsapp', 'session')); const IMAGE_CACHE_DIR = path.join(process.env.HOME || '~', '.hermes', 'image_cache'); +const DOCUMENT_CACHE_DIR = path.join(process.env.HOME || '~', '.hermes', 'document_cache'); +const AUDIO_CACHE_DIR = path.join(process.env.HOME || '~', '.hermes', 'audio_cache'); const PAIR_ONLY = args.includes('--pair-only'); const WHATSAPP_MODE = getArg('mode', process.env.WHATSAPP_MODE || 'self-chat'); 
// "bot" or "self-chat" const ALLOWED_USERS = (process.env.WHATSAPP_ALLOWED_USERS || '').split(',').map(s => s.trim()).filter(Boolean); @@ -224,13 +226,47 @@ async function startSocket() { body = msg.message.videoMessage.caption || ''; hasMedia = true; mediaType = 'video'; + try { + const buf = await downloadMediaMessage(msg, 'buffer', {}, { logger, reuploadRequest: sock.updateMediaMessage }); + const mime = msg.message.videoMessage.mimetype || 'video/mp4'; + const ext = mime.includes('mp4') ? '.mp4' : '.mkv'; + mkdirSync(DOCUMENT_CACHE_DIR, { recursive: true }); + const filePath = path.join(DOCUMENT_CACHE_DIR, `vid_${randomBytes(6).toString('hex')}${ext}`); + writeFileSync(filePath, buf); + mediaUrls.push(filePath); + } catch (err) { + console.error('[bridge] Failed to download video:', err.message); + } } else if (msg.message.audioMessage || msg.message.pttMessage) { hasMedia = true; mediaType = msg.message.pttMessage ? 'ptt' : 'audio'; + try { + const audioMsg = msg.message.pttMessage || msg.message.audioMessage; + const buf = await downloadMediaMessage(msg, 'buffer', {}, { logger, reuploadRequest: sock.updateMediaMessage }); + const mime = audioMsg.mimetype || 'audio/ogg'; + const ext = mime.includes('ogg') ? '.ogg' : mime.includes('mp4') ? 
'.m4a' : '.ogg'; + mkdirSync(AUDIO_CACHE_DIR, { recursive: true }); + const filePath = path.join(AUDIO_CACHE_DIR, `aud_${randomBytes(6).toString('hex')}${ext}`); + writeFileSync(filePath, buf); + mediaUrls.push(filePath); + } catch (err) { + console.error('[bridge] Failed to download audio:', err.message); + } } else if (msg.message.documentMessage) { - body = msg.message.documentMessage.caption || msg.message.documentMessage.fileName || ''; + body = msg.message.documentMessage.caption || ''; hasMedia = true; mediaType = 'document'; + const fileName = msg.message.documentMessage.fileName || 'document'; + try { + const buf = await downloadMediaMessage(msg, 'buffer', {}, { logger, reuploadRequest: sock.updateMediaMessage }); + mkdirSync(DOCUMENT_CACHE_DIR, { recursive: true }); + const safeFileName = path.basename(fileName).replace(/[^a-zA-Z0-9._-]/g, '_'); + const filePath = path.join(DOCUMENT_CACHE_DIR, `doc_${randomBytes(6).toString('hex')}_${safeFileName}`); + writeFileSync(filePath, buf); + mediaUrls.push(filePath); + } catch (err) { + console.error('[bridge] Failed to download document:', err.message); + } } // For media without caption, use a placeholder so the API message is never empty