Compare commits

...

1 Commits

Author SHA1 Message Date
Ben
291d5a2b4b fix(slack): transcribe in-app voice messages (audio/mp4) instead of failing
Slack in-app voice clips ("record a clip") arrive as MP4/AAC containers
(mimetype audio/mp4, filename audio_message*.mp4), and Slack sometimes
labels them video/mp4. The inbound audio handler derived the cache
extension from the mimetype and fell back to ".ogg" for anything not in
{.ogg,.mp3,.wav,.webm,.m4a} — so audio/mp4 voice messages were cached as
.ogg. OpenAI STT (whisper-1, gpt-4o-transcribe) sniffs the container from
the FILENAME extension, so it received MP4 bytes named .ogg and rejected
them. WhatsApp .ogg and uploaded .m4a worked only because their extension
happened to match the bytes.

Fix:
- _resolve_slack_audio_ext(): pick the cache extension from the real
  filename first, then a mimetype map (audio/mp4 -> .m4a), defaulting to
  .m4a — never the bogus .ogg fallback. Mirrors the video branch and the
  audio map already in gateway/platforms/bluebubbles.py.
- _is_slack_voice_clip(): detect audio-only clips mislabeled video/mp4
  via the slack_audio subtype / audio_message* filename, and route them
  through the audio path (cached as audio, reported as audio/*) so they
  reach STT instead of video understanding. Genuine videos (and
  slack_video screen recordings) are left on the video path.

Verified end-to-end against a real audio-only MP4: old path cached it as
.ogg (ffprobe shows MP4 bytes -> container mismatch -> OpenAI rejects);
new path caches it as .mp4 (extension matches bytes -> accepted).

Adds inbound-audio tests (previously none): helper unit tests plus
_handle_slack_message E2E coverage for audio/mp4, video/mp4-mislabeled
voice clips, and a real video staying on the video path. Confirmed the
two voice-message tests fail without the fix (mutation check).
2026-06-23 13:19:40 +10:00
2 changed files with 298 additions and 3 deletions

View File

@@ -303,6 +303,81 @@ def _resolve_slack_proxy_url() -> Optional[str]:
return proxy_url
# Map Slack audio mimetypes to the file extension that matches the actual
# container bytes. Critically, Slack's in-app "record a clip" voice messages
# arrive as MP4/AAC containers (``audio/mp4``, filename ``audio_message*.mp4``),
# NOT Ogg — so the extension we cache them under must be one a downstream STT
# backend (OpenAI Whisper / gpt-4o-transcribe) will accept for that container.
# OpenAI sniffs the container from the FILENAME extension, so a wrong extension
# (e.g. caching MP4 bytes as ``.ogg``) makes transcription fail outright.
# Mirrors the proven map in gateway/platforms/bluebubbles.py.
_SLACK_AUDIO_MIME_TO_EXT = {
"audio/ogg": ".ogg",
"audio/opus": ".ogg",
"audio/mpeg": ".mp3",
"audio/mp3": ".mp3",
"audio/wav": ".wav",
"audio/x-wav": ".wav",
"audio/webm": ".webm",
"audio/mp4": ".m4a",
"audio/x-m4a": ".m4a",
"audio/m4a": ".m4a",
"audio/aac": ".m4a",
"audio/flac": ".flac",
"audio/x-flac": ".flac",
}
# Extensions OpenAI/Whisper-family STT backends accept (kept in sync with
# tools/transcription_tools.SUPPORTED_FORMATS).
_SLACK_STT_SUPPORTED_EXTS = frozenset(
{".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg", ".aac", ".flac"}
)
def _resolve_slack_audio_ext(file_obj: Dict[str, Any], mimetype: str) -> str:
    """Choose the cache-file extension for an inbound Slack audio attachment.

    The candidates are tried in this order:

    1. The suffix of ``file_obj["name"]`` (lower-cased), if it is one of
       ``_SLACK_STT_SUPPORTED_EXTS`` — e.g. ``audio_message.mp4`` gives
       ``.mp4`` and ``clip.m4a`` gives ``.m4a``.
    2. The entry for ``mimetype`` in ``_SLACK_AUDIO_MIME_TO_EXT``, looked up
       after dropping any ``;parameter`` tail and lower-casing — e.g.
       ``audio/mp4`` gives ``.m4a``.
    3. ``.m4a``. The fallback is deliberately not ``.ogg``: caching MP4/AAC
       voice messages as ``.ogg`` was the original bug, because OpenAI rejects
       bytes that don't match the container the extension claims.

    Args:
        file_obj: Slack file dict; only its ``"name"`` key is read, and a
            missing or ``None`` name is treated as empty.
        mimetype: Mimetype reported for the file; ``None``/empty is tolerated.

    Returns:
        A dotted, lower-case extension such as ``".mp4"`` or ``".m4a"``.
    """
    filename = (file_obj.get("name") or "").strip()
    suffix = os.path.splitext(filename)[1].lower()
    if suffix in _SLACK_STT_SUPPORTED_EXTS:
        # The uploader's own extension is the best evidence of the container.
        return suffix

    # "audio/mp4; codecs=..." -> "audio/mp4"
    base_mime = (mimetype or "").partition(";")[0].strip().lower()
    return _SLACK_AUDIO_MIME_TO_EXT.get(base_mime, ".m4a")
def _is_slack_voice_clip(file_obj: Dict[str, Any]) -> bool:
"""Return True when a Slack file is an audio-only voice clip.
Slack's in-app voice recordings are audio-only MP4 containers, but Slack
sometimes reports them with a ``video/mp4`` mimetype, which would otherwise
route them to video understanding instead of speech-to-text. Detect them by
Slack's stable markers — the ``slack_audio`` subtype and the
``audio_message*`` filename pattern — so genuine videos are left untouched.
"""
subtype = (file_obj.get("subtype") or "").strip().lower()
if subtype == "slack_audio":
# slack_audio is always audio-only. (slack_video clips carry a real
# video track, so they are deliberately NOT matched here.)
return True
name = (file_obj.get("name") or "").strip().lower()
return name.startswith("audio_message")
class SlackAdapter(BasePlatformAdapter):
"""
Slack bot adapter using Socket Mode.
@@ -2637,9 +2712,7 @@ class SlackAdapter(BasePlatformAdapter):
)
elif mimetype.startswith("audio/") and url:
try:
ext = "." + mimetype.split("/")[-1].split(";")[0]
if ext not in {".ogg", ".mp3", ".wav", ".webm", ".m4a"}:
ext = ".ogg"
ext = _resolve_slack_audio_ext(f, mimetype)
cached = await self._download_slack_file(
url, ext, audio=True, team_id=team_id
)
@@ -2657,6 +2730,41 @@ class SlackAdapter(BasePlatformAdapter):
e,
exc_info=True,
)
elif mimetype.startswith("video/") and url and _is_slack_voice_clip(f):
# Slack in-app voice clips are audio-only MP4 containers that
# Slack sometimes mislabels with a ``video/mp4`` mimetype.
# Cache them as audio and report an ``audio/*`` type so the
# gateway routes them to speech-to-text instead of video
# understanding. Without this, voice messages recorded in Slack
# never get transcribed.
try:
ext = _resolve_slack_audio_ext(f, mimetype)
cached = await self._download_slack_file(
url, ext, audio=True, team_id=team_id
)
media_urls.append(cached)
# Report a coherent audio mimetype matching the cached
# extension so downstream STT routing recognizes it.
media_types.append(
{".m4a": "audio/mp4"}.get(ext, "audio/mp4")
)
logger.debug(
"[Slack] Cached voice clip (mislabeled %s) as audio: %s",
mimetype,
cached,
)
except Exception as e: # pragma: no cover - defensive logging
detail = self._describe_slack_download_failure(e, file_obj=f)
if detail:
attachment_notices.append(detail)
logger.warning("[Slack] %s", detail)
else:
logger.warning(
"[Slack] Failed to cache voice clip from %s: %s",
url,
e,
exc_info=True,
)
elif mimetype.startswith("video/") and url:
try:
original_filename = f.get("name", "")

View File

@@ -1754,6 +1754,193 @@ class TestIncomingDocumentHandling:
assert "> /deploy now" in msg_event.text
# ---------------------------------------------------------------------------
# Inbound audio handling — Slack voice messages (regression): extension
# resolver, voice-clip detector, and _handle_slack_message end-to-end tests
# ---------------------------------------------------------------------------
class TestSlackAudioExtResolution:
    """Unit tests for ``_resolve_slack_audio_ext``.

    Regression background: Slack in-app voice messages are MP4/AAC containers
    (``audio/mp4``, filename ``audio_message*.mp4``). The old code cached them
    as ``.ogg`` (its catch-all fallback), and OpenAI STT — which sniffs the
    container from the filename extension — rejected them. WhatsApp ``.ogg``
    and uploaded ``.m4a`` only worked because their extension happened to
    match the bytes.
    """

    @staticmethod
    def _ext_for(name, mimetype):
        """Resolve the extension for a minimal Slack file dict."""
        file_obj = {"name": name, "mimetype": mimetype}
        return _slack_mod._resolve_slack_audio_ext(file_obj, file_obj["mimetype"])

    def test_slack_voice_message_mp4_keeps_real_extension(self):
        """The core bug: audio/mp4 voice message must NOT become .ogg."""
        resolved = self._ext_for("audio_message.mp4", "audio/mp4")
        assert resolved != ".ogg", "regression: MP4 voice message mislabeled as .ogg"
        assert resolved in {".mp4", ".m4a"}
        assert resolved in _slack_mod._SLACK_STT_SUPPORTED_EXTS

    def test_whatsapp_ogg_preserved(self):
        """A genuine Ogg file keeps its .ogg extension."""
        assert self._ext_for("voice.ogg", "audio/ogg") == ".ogg"

    def test_m4a_upload_preserved(self):
        """An uploaded .m4a keeps its extension."""
        assert self._ext_for("clip.m4a", "audio/x-m4a") == ".m4a"

    def test_mp3_upload_preserved(self):
        """An uploaded .mp3 keeps its extension."""
        assert self._ext_for("song.mp3", "audio/mpeg") == ".mp3"

    def test_mimetype_used_when_filename_extension_missing(self):
        """No usable filename ext → fall back to the mime map, not .ogg."""
        assert self._ext_for("", "audio/mp4") == ".m4a"

    def test_unknown_audio_defaults_to_m4a_not_ogg(self):
        """A truly unknown audio type defaults to the broadly-decodable .m4a."""
        resolved = self._ext_for("weird", "audio/x-some-future-codec")
        assert resolved == ".m4a"
        assert resolved != ".ogg"
class TestSlackVoiceClipDetection:
    """Unit tests for ``_is_slack_voice_clip``, the detector for voice clips
    that Slack mislabels as ``video/mp4``."""

    def test_audio_message_filename_detected(self):
        """The ``audio_message*`` filename alone marks a voice clip."""
        file_obj = {"name": "audio_message.mp4", "mimetype": "video/mp4"}
        detected = _slack_mod._is_slack_voice_clip(file_obj)
        assert detected

    def test_slack_audio_subtype_detected(self):
        """The ``slack_audio`` subtype marks a voice clip whatever the name."""
        file_obj = {
            "name": "clip.mp4",
            "subtype": "slack_audio",
            "mimetype": "video/mp4",
        }
        detected = _slack_mod._is_slack_voice_clip(file_obj)
        assert detected

    def test_real_video_not_detected(self):
        """A genuine uploaded video must NOT be hijacked into the audio path."""
        file_obj = {"name": "vacation.mp4", "mimetype": "video/mp4"}
        detected = _slack_mod._is_slack_voice_clip(file_obj)
        assert not detected

    def test_slack_video_clip_not_detected(self):
        """slack_video clips carry a real video track — leave them as video."""
        file_obj = {"name": "screen_recording.mp4", "subtype": "slack_video"}
        detected = _slack_mod._is_slack_voice_clip(file_obj)
        assert not detected
class TestIncomingAudioHandling:
    """End-to-end tests that drive ``_handle_slack_message`` with audio and
    video attachments and inspect the resulting message event."""

    def _make_event(self, files=None, text="hello"):
        """Build a minimal Slack DM message event carrying *files*."""
        return {
            "text": text,
            "user": "U_USER",
            "channel": "D123",
            "channel_type": "im",
            "ts": "1234567890.000001",
            "files": files or [],
            "blocks": [],
            "attachments": [],
        }

    @staticmethod
    def _voice_clip_file(mimetype):
        """Slack file dict for an in-app voice clip reported as *mimetype*."""
        return {
            "mimetype": mimetype,
            "name": "audio_message.mp4",
            "subtype": "slack_audio",
            "url_private_download": "https://files.slack.com/audio_message.mp4",
            "size": 2048,
        }

    @staticmethod
    def _recording_download(tmp_path):
        """Return ``(captured, fake)``: a stand-in for ``_download_slack_file``
        that records the ``ext``/``audio`` arguments it was called with and
        writes a small fake MP4 payload under *tmp_path*."""
        captured = {}

        async def _fake_download(url, ext, audio=False, team_id=""):
            captured["ext"] = ext
            captured["audio"] = audio
            path = tmp_path / f"cached{ext}"
            path.write_bytes(b"\x00\x00\x00\x18ftypmp42fake mp4 bytes")
            return str(path)

        return captured, _fake_download

    @pytest.mark.asyncio
    async def test_voice_message_cached_with_correct_extension(self, adapter, tmp_path):
        """audio/mp4 voice message is cached with an STT-acceptable extension,
        not the old .ogg fallback, and routed as audio."""
        captured, fake_download = self._recording_download(tmp_path)
        event = self._make_event(files=[self._voice_clip_file("audio/mp4")])

        with patch.object(adapter, "_download_slack_file", side_effect=fake_download):
            await adapter._handle_slack_message(event)

        assert captured.get("audio") is True
        assert captured["ext"] != ".ogg", "regression: voice message cached as .ogg"
        assert captured["ext"] in {".mp4", ".m4a"}

        msg_event = adapter.handle_message.call_args[0][0]
        assert len(msg_event.media_urls) == 1
        # media_type stays audio/* so the gateway routes it to STT
        assert msg_event.media_types[0].startswith("audio/")

    @pytest.mark.asyncio
    async def test_video_mp4_voice_clip_rerouted_to_audio(self, adapter, tmp_path):
        """A voice clip mislabeled video/mp4 is rerouted to the audio path
        (cached as audio, reported as audio/*) instead of video understanding."""
        captured, fake_download = self._recording_download(tmp_path)
        event = self._make_event(files=[self._voice_clip_file("video/mp4")])

        with patch.object(adapter, "_download_slack_file", side_effect=fake_download):
            await adapter._handle_slack_message(event)

        assert captured.get("audio") is True
        assert captured["ext"] in {".mp4", ".m4a"}

        msg_event = adapter.handle_message.call_args[0][0]
        assert len(msg_event.media_urls) == 1
        assert msg_event.media_types[0].startswith("audio/"), (
            "voice clip should route to STT, not video understanding"
        )

    @pytest.mark.asyncio
    async def test_real_video_still_routed_as_video(self, adapter, tmp_path):
        """A genuine uploaded video must remain on the video path."""

        async def _fake_download_bytes(url, team_id=""):
            return b"\x00\x00\x00\x18ftypisomfake real video"

        video_file = {
            "mimetype": "video/mp4",
            "name": "vacation.mp4",
            "url_private_download": "https://files.slack.com/vacation.mp4",
            "size": 4096,
        }
        event = self._make_event(files=[video_file])

        with patch.object(
            adapter, "_download_slack_file_bytes", side_effect=_fake_download_bytes
        ):
            await adapter._handle_slack_message(event)

        msg_event = adapter.handle_message.call_args[0][0]
        assert len(msg_event.media_urls) == 1
        assert msg_event.media_types[0].startswith("video/"), (
            "a real video must not be hijacked into the audio path"
        )
# ---------------------------------------------------------------------------
# TestMessageRouting
# ---------------------------------------------------------------------------