mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-29 05:06:48 +08:00
Compare commits
1 Commits
chore/remo
...
fix/slack-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
291d5a2b4b |
@@ -303,6 +303,81 @@ def _resolve_slack_proxy_url() -> Optional[str]:
|
||||
return proxy_url
|
||||
|
||||
|
||||
# Slack audio mimetype -> cache-file extension, chosen so the extension matches
# the container the bytes are actually in. The important case: Slack's in-app
# "record a clip" voice messages arrive as MP4/AAC containers (``audio/mp4``,
# filename ``audio_message*.mp4``), NOT Ogg. The extension they are cached
# under therefore has to be one a downstream STT backend (OpenAI Whisper /
# gpt-4o-transcribe) accepts for that container. OpenAI sniffs the container
# from the FILENAME extension, so a wrong extension (e.g. MP4 bytes cached as
# ``.ogg``) makes transcription fail outright.
# Mirrors the proven map in gateway/platforms/bluebubbles.py.
_SLACK_AUDIO_MIME_TO_EXT = {
    # Cached as .ogg
    "audio/ogg": ".ogg",
    "audio/opus": ".ogg",
    # Cached as .mp3
    "audio/mpeg": ".mp3",
    "audio/mp3": ".mp3",
    # Cached as .wav
    "audio/wav": ".wav",
    "audio/x-wav": ".wav",
    # Cached as .webm
    "audio/webm": ".webm",
    # Cached as .m4a
    # NOTE(review): ``audio/aac`` is mapped to ``.m4a`` even though ``.aac`` is
    # itself listed in _SLACK_STT_SUPPORTED_EXTS -- confirm this is intended.
    "audio/mp4": ".m4a",
    "audio/x-m4a": ".m4a",
    "audio/m4a": ".m4a",
    "audio/aac": ".m4a",
    # Cached as .flac
    "audio/flac": ".flac",
    "audio/x-flac": ".flac",
}
|
||||
|
||||
# File extensions that OpenAI/Whisper-family STT backends accept. Keep this in
# sync with tools/transcription_tools.SUPPORTED_FORMATS.
_SLACK_STT_SUPPORTED_EXTS = frozenset(
    {
        ".mp3",
        ".mp4",
        ".mpeg",
        ".mpga",
        ".m4a",
        ".wav",
        ".webm",
        ".ogg",
        ".aac",
        ".flac",
    }
)
|
||||
|
||||
|
||||
def _resolve_slack_audio_ext(file_obj: Dict[str, Any], mimetype: str) -> str:
    """Choose the cache-file extension for an inbound Slack audio file.

    The extension has to agree with the container the bytes are in, because
    the STT backend identifies the container from the filename.  Candidates
    are tried in this order (mirrors the video branch + bluebubbles.py):

    1. The uploaded filename's own extension, if it is one a Whisper-family
       STT backend accepts (``audio_message.mp4`` → ``.mp4``,
       ``clip.m4a`` → ``.m4a``).
    2. The mimetype, looked up in ``_SLACK_AUDIO_MIME_TO_EXT`` after dropping
       any ``;parameter`` suffix (``audio/mp4`` → ``.m4a``).
    3. ``.m4a`` when neither of the above matches.  The fallback is
       deliberately not ``.ogg``: caching MP4/AAC voice messages as ``.ogg``
       was the original bug, since OpenAI rejects bytes that do not match the
       container the extension claims.

    Args:
        file_obj: Slack file dict; only its ``"name"`` entry is read, and a
            missing or ``None`` name is treated as empty.
        mimetype: Mimetype reported for the file; ``None`` or empty is
            tolerated.

    Returns:
        A lowercase extension including the leading dot.
    """
    filename = (file_obj.get("name") or "").strip()
    suffix = os.path.splitext(filename)[1].lower()
    if suffix in _SLACK_STT_SUPPORTED_EXTS:
        return suffix

    # "audio/mp4; codecs=..." -> "audio/mp4"
    bare_mime = (mimetype or "").split(";", 1)[0].strip().lower()
    return _SLACK_AUDIO_MIME_TO_EXT.get(bare_mime, ".m4a")
|
||||
|
||||
|
||||
def _is_slack_voice_clip(file_obj: Dict[str, Any]) -> bool:
    """Tell whether a Slack file is an audio-only in-app voice clip.

    Slack's in-app voice recordings are audio-only MP4 containers, yet Slack
    sometimes labels them ``video/mp4``.  Left alone, that label would send
    them to video understanding rather than speech-to-text.  Two Slack markers
    identify them, so that genuine videos are not affected:

    * the ``slack_audio`` subtype, or
    * a filename beginning with ``audio_message``.

    Both values are compared after trimming whitespace and lowercasing.
    ``slack_video`` clips carry a real video track and are deliberately NOT
    matched by the subtype check.

    Args:
        file_obj: Slack file dict; only ``"subtype"`` and ``"name"`` are read,
            and missing or ``None`` values are treated as empty.

    Returns:
        ``True`` if either marker matches, ``False`` otherwise.
    """
    # The subtype is checked first; the filename is only consulted when the
    # subtype does not already settle the question.
    return (
        (file_obj.get("subtype") or "").strip().lower() == "slack_audio"
        or (file_obj.get("name") or "").strip().lower().startswith("audio_message")
    )
|
||||
|
||||
|
||||
class SlackAdapter(BasePlatformAdapter):
|
||||
"""
|
||||
Slack bot adapter using Socket Mode.
|
||||
@@ -2637,9 +2712,7 @@ class SlackAdapter(BasePlatformAdapter):
|
||||
)
|
||||
elif mimetype.startswith("audio/") and url:
|
||||
try:
|
||||
ext = "." + mimetype.split("/")[-1].split(";")[0]
|
||||
if ext not in {".ogg", ".mp3", ".wav", ".webm", ".m4a"}:
|
||||
ext = ".ogg"
|
||||
ext = _resolve_slack_audio_ext(f, mimetype)
|
||||
cached = await self._download_slack_file(
|
||||
url, ext, audio=True, team_id=team_id
|
||||
)
|
||||
@@ -2657,6 +2730,41 @@ class SlackAdapter(BasePlatformAdapter):
|
||||
e,
|
||||
exc_info=True,
|
||||
)
|
||||
elif mimetype.startswith("video/") and url and _is_slack_voice_clip(f):
|
||||
# Slack in-app voice clips are audio-only MP4 containers that
|
||||
# Slack sometimes mislabels with a ``video/mp4`` mimetype.
|
||||
# Cache them as audio and report an ``audio/*`` type so the
|
||||
# gateway routes them to speech-to-text instead of video
|
||||
# understanding. Without this, voice messages recorded in Slack
|
||||
# never get transcribed.
|
||||
try:
|
||||
ext = _resolve_slack_audio_ext(f, mimetype)
|
||||
cached = await self._download_slack_file(
|
||||
url, ext, audio=True, team_id=team_id
|
||||
)
|
||||
media_urls.append(cached)
|
||||
# Report a coherent audio mimetype matching the cached
|
||||
# extension so downstream STT routing recognizes it.
|
||||
media_types.append(
|
||||
{".m4a": "audio/mp4"}.get(ext, "audio/mp4")
|
||||
)
|
||||
logger.debug(
|
||||
"[Slack] Cached voice clip (mislabeled %s) as audio: %s",
|
||||
mimetype,
|
||||
cached,
|
||||
)
|
||||
except Exception as e: # pragma: no cover - defensive logging
|
||||
detail = self._describe_slack_download_failure(e, file_obj=f)
|
||||
if detail:
|
||||
attachment_notices.append(detail)
|
||||
logger.warning("[Slack] %s", detail)
|
||||
else:
|
||||
logger.warning(
|
||||
"[Slack] Failed to cache voice clip from %s: %s",
|
||||
url,
|
||||
e,
|
||||
exc_info=True,
|
||||
)
|
||||
elif mimetype.startswith("video/") and url:
|
||||
try:
|
||||
original_filename = f.get("name", "")
|
||||
|
||||
@@ -1754,6 +1754,193 @@ class TestIncomingDocumentHandling:
|
||||
assert "> /deploy now" in msg_event.text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TestIncomingAudioHandling — Slack voice messages (regression)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSlackAudioExtResolution:
    """Unit tests for ``_resolve_slack_audio_ext``.

    Regression background: Slack in-app voice messages are MP4/AAC containers
    (``audio/mp4``, filename ``audio_message*.mp4``).  The old code cached
    them as ``.ogg`` (its catch-all fallback), and OpenAI STT -- which sniffs
    the container from the filename extension -- rejected them.  WhatsApp
    ``.ogg`` and uploaded ``.m4a`` files worked only because their extension
    happened to match.
    """

    @staticmethod
    def _resolve(name, mimetype):
        """Run the resolver on a minimal file dict built from the arguments."""
        slack_file = {"name": name, "mimetype": mimetype}
        return _slack_mod._resolve_slack_audio_ext(slack_file, slack_file["mimetype"])

    def test_slack_voice_message_mp4_keeps_real_extension(self):
        """The core bug: an audio/mp4 voice message must NOT become .ogg."""
        resolved = self._resolve("audio_message.mp4", "audio/mp4")
        assert resolved != ".ogg", "regression: MP4 voice message mislabeled as .ogg"
        assert resolved in {".mp4", ".m4a"}
        assert resolved in _slack_mod._SLACK_STT_SUPPORTED_EXTS

    def test_whatsapp_ogg_preserved(self):
        """A real Ogg file keeps its .ogg extension."""
        assert self._resolve("voice.ogg", "audio/ogg") == ".ogg"

    def test_m4a_upload_preserved(self):
        """An uploaded .m4a keeps its extension."""
        assert self._resolve("clip.m4a", "audio/x-m4a") == ".m4a"

    def test_mp3_upload_preserved(self):
        """An uploaded .mp3 keeps its extension."""
        assert self._resolve("song.mp3", "audio/mpeg") == ".mp3"

    def test_mimetype_used_when_filename_extension_missing(self):
        """No usable filename ext → fall back to the mime map, not .ogg."""
        assert self._resolve("", "audio/mp4") == ".m4a"

    def test_unknown_audio_defaults_to_m4a_not_ogg(self):
        """A truly unknown audio type defaults to the broadly-decodable .m4a."""
        resolved = self._resolve("weird", "audio/x-some-future-codec")
        assert resolved == ".m4a"
        assert resolved != ".ogg"
|
||||
|
||||
|
||||
class TestSlackVoiceClipDetection:
    """Unit tests for ``_is_slack_voice_clip``, the detector for voice clips
    that Slack mislabels as video/mp4."""

    def test_audio_message_filename_detected(self):
        """The ``audio_message*`` filename alone is enough to match."""
        voice_clip = {"name": "audio_message.mp4", "mimetype": "video/mp4"}
        assert _slack_mod._is_slack_voice_clip(voice_clip)

    def test_slack_audio_subtype_detected(self):
        """The ``slack_audio`` subtype matches regardless of the filename."""
        voice_clip = {
            "name": "clip.mp4",
            "subtype": "slack_audio",
            "mimetype": "video/mp4",
        }
        assert _slack_mod._is_slack_voice_clip(voice_clip)

    def test_real_video_not_detected(self):
        """A genuine uploaded video must NOT be hijacked into the audio path."""
        real_video = {"name": "vacation.mp4", "mimetype": "video/mp4"}
        assert not _slack_mod._is_slack_voice_clip(real_video)

    def test_slack_video_clip_not_detected(self):
        """slack_video clips carry a real video track — leave them as video."""
        video_clip = {"name": "screen_recording.mp4", "subtype": "slack_video"}
        assert not _slack_mod._is_slack_voice_clip(video_clip)
|
||||
|
||||
|
||||
class TestIncomingAudioHandling:
    """Routing tests that push inbound Slack files through
    ``adapter._handle_slack_message`` and inspect what reaches
    ``adapter.handle_message``."""

    def _make_event(self, files=None, text="hello"):
        """Build a minimal direct-message event carrying *files*."""
        return {
            "text": text,
            "user": "U_USER",
            "channel": "D123",
            "channel_type": "im",
            "ts": "1234567890.000001",
            "files": files or [],
            "blocks": [],
            "attachments": [],
        }

    @staticmethod
    def _voice_clip_file(mimetype):
        """Return a Slack in-app voice-clip file dict reported as *mimetype*."""
        return {
            "mimetype": mimetype,
            "name": "audio_message.mp4",
            "subtype": "slack_audio",
            "url_private_download": "https://files.slack.com/audio_message.mp4",
            "size": 2048,
        }

    @staticmethod
    def _recording_download(seen, tmp_path):
        """Return a stand-in for ``_download_slack_file``.

        The stand-in records the ``ext`` and ``audio`` arguments it receives
        into *seen*, writes a few MP4-looking bytes under *tmp_path*, and
        returns the written file's path as a string.
        """

        async def _fake_download(url, ext, audio=False, team_id=""):
            seen["ext"] = ext
            seen["audio"] = audio
            target = tmp_path / f"cached{ext}"
            target.write_bytes(b"\x00\x00\x00\x18ftypmp42fake mp4 bytes")
            return str(target)

        return _fake_download

    @pytest.mark.asyncio
    async def test_voice_message_cached_with_correct_extension(self, adapter, tmp_path):
        """An audio/mp4 voice message gets an STT-acceptable extension (not
        the old .ogg fallback) and is routed as audio."""
        seen = {}
        event = self._make_event(files=[self._voice_clip_file("audio/mp4")])
        fake_download = self._recording_download(seen, tmp_path)

        with patch.object(adapter, "_download_slack_file", side_effect=fake_download):
            await adapter._handle_slack_message(event)

        assert seen.get("audio") is True
        assert seen["ext"] != ".ogg", "regression: voice message cached as .ogg"
        assert seen["ext"] in {".mp4", ".m4a"}

        dispatched = adapter.handle_message.call_args[0][0]
        assert len(dispatched.media_urls) == 1
        # The media type stays audio/* so the gateway routes it to STT.
        assert dispatched.media_types[0].startswith("audio/")

    @pytest.mark.asyncio
    async def test_video_mp4_voice_clip_rerouted_to_audio(self, adapter, tmp_path):
        """A voice clip mislabeled video/mp4 takes the audio path (cached as
        audio, reported as audio/*) instead of video understanding."""
        seen = {}
        event = self._make_event(files=[self._voice_clip_file("video/mp4")])
        fake_download = self._recording_download(seen, tmp_path)

        with patch.object(adapter, "_download_slack_file", side_effect=fake_download):
            await adapter._handle_slack_message(event)

        assert seen.get("audio") is True
        assert seen["ext"] in {".mp4", ".m4a"}

        dispatched = adapter.handle_message.call_args[0][0]
        assert len(dispatched.media_urls) == 1
        assert dispatched.media_types[0].startswith("audio/"), (
            "voice clip should route to STT, not video understanding"
        )

    @pytest.mark.asyncio
    async def test_real_video_still_routed_as_video(self, adapter, tmp_path):
        """A genuine uploaded video must remain on the video path."""

        async def _fake_download_bytes(url, team_id=""):
            return b"\x00\x00\x00\x18ftypisomfake real video"

        real_video = {
            "mimetype": "video/mp4",
            "name": "vacation.mp4",
            "url_private_download": "https://files.slack.com/vacation.mp4",
            "size": 4096,
        }
        event = self._make_event(files=[real_video])

        with patch.object(
            adapter, "_download_slack_file_bytes", side_effect=_fake_download_bytes
        ):
            await adapter._handle_slack_message(event)

        dispatched = adapter.handle_message.call_args[0][0]
        assert len(dispatched.media_urls) == 1
        assert dispatched.media_types[0].startswith("video/"), (
            "a real video must not be hijacked into the audio path"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TestMessageRouting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user