Compare commits

...

1 Commits

Author SHA1 Message Date
Evi Nova
73a4c7ad4a fix(gateway): validate Slack image downloads before caching
Slack may return an HTML sign-in/redirect page instead of actual media
bytes (e.g. expired token, restricted file access). This adds two layers
of defense:

1. Content-Type check in slack.py rejects text/html responses early
2. Magic-byte validation in base.py's cache_image_from_bytes() rejects
   non-image data regardless of source platform

Also adds ValueError guards in wecom.py and email.py so the new
validation doesn't crash those adapters.

Closes #6829
2026-04-10 03:52:46 -07:00
5 changed files with 128 additions and 7 deletions

View File

@@ -216,6 +216,23 @@ def get_image_cache_dir() -> Path:
return IMAGE_CACHE_DIR return IMAGE_CACHE_DIR
def _looks_like_image(data: bytes) -> bool:
"""Return True if *data* starts with a known image magic-byte sequence."""
if len(data) < 4:
return False
if data[:8] == b"\x89PNG\r\n\x1a\n":
return True
if data[:3] == b"\xff\xd8\xff":
return True
if data[:6] in (b"GIF87a", b"GIF89a"):
return True
if data[:2] == b"BM":
return True
if data[:4] == b"RIFF" and len(data) >= 12 and data[8:12] == b"WEBP":
return True
return False
def cache_image_from_bytes(data: bytes, ext: str = ".jpg") -> str: def cache_image_from_bytes(data: bytes, ext: str = ".jpg") -> str:
""" """
Save raw image bytes to the cache and return the absolute file path. Save raw image bytes to the cache and return the absolute file path.
@@ -226,7 +243,17 @@ def cache_image_from_bytes(data: bytes, ext: str = ".jpg") -> str:
Returns: Returns:
Absolute path to the cached image file as a string. Absolute path to the cached image file as a string.
Raises:
ValueError: If *data* does not look like a valid image (e.g. an HTML
error page returned by the upstream server).
""" """
if not _looks_like_image(data):
snippet = data[:80].decode("utf-8", errors="replace")
raise ValueError(
f"Refusing to cache non-image data as {ext} "
f"(starts with: {snippet!r})"
)
cache_dir = get_image_cache_dir() cache_dir = get_image_cache_dir()
filename = f"img_{uuid.uuid4().hex[:12]}{ext}" filename = f"img_{uuid.uuid4().hex[:12]}{ext}"
filepath = cache_dir / filename filepath = cache_dir / filename

View File

@@ -195,7 +195,11 @@ def _extract_attachments(
ext = Path(filename).suffix.lower() ext = Path(filename).suffix.lower()
if ext in _IMAGE_EXTS: if ext in _IMAGE_EXTS:
cached_path = cache_image_from_bytes(payload, ext) try:
cached_path = cache_image_from_bytes(payload, ext)
except ValueError:
logger.debug("Skipping non-image attachment %s (invalid magic bytes)", filename)
continue
attachments.append({ attachments.append({
"path": cached_path, "path": cached_path,
"filename": filename, "filename": filename,

View File

@@ -1596,6 +1596,18 @@ class SlackAdapter(BasePlatformAdapter):
) )
response.raise_for_status() response.raise_for_status()
# Slack may return an HTML sign-in/redirect page
# instead of actual media bytes (e.g. expired token,
# restricted file access). Detect this early so we
# don't cache bogus data and confuse downstream tools.
ct = response.headers.get("content-type", "")
if "text/html" in ct:
raise ValueError(
"Slack returned HTML instead of media "
f"(content-type: {ct}); "
"check bot token scopes and file permissions"
)
if audio: if audio:
from gateway.platforms.base import cache_audio_from_bytes from gateway.platforms.base import cache_audio_from_bytes
return cache_audio_from_bytes(response.content, ext) return cache_audio_from_bytes(response.content, ext)

View File

@@ -696,7 +696,11 @@ class WeComAdapter(BasePlatformAdapter):
if kind == "image": if kind == "image":
ext = self._detect_image_ext(raw) ext = self._detect_image_ext(raw)
return cache_image_from_bytes(raw, ext), self._mime_for_ext(ext, fallback="image/jpeg") try:
return cache_image_from_bytes(raw, ext), self._mime_for_ext(ext, fallback="image/jpeg")
except ValueError as exc:
logger.warning("[%s] Rejected non-image bytes: %s", self.name, exc)
return None
filename = str(media.get("filename") or media.get("name") or "wecom_file") filename = str(media.get("filename") or media.get("name") or "wecom_file")
return cache_document_from_bytes(raw, filename), mimetypes.guess_type(filename)[0] or "application/octet-stream" return cache_document_from_bytes(raw, filename), mimetypes.guess_type(filename)[0] or "application/octet-stream"
@@ -722,7 +726,11 @@ class WeComAdapter(BasePlatformAdapter):
content_type = str(headers.get("content-type") or "").split(";", 1)[0].strip() or "application/octet-stream" content_type = str(headers.get("content-type") or "").split(";", 1)[0].strip() or "application/octet-stream"
if kind == "image": if kind == "image":
ext = self._guess_extension(url, content_type, fallback=self._detect_image_ext(raw)) ext = self._guess_extension(url, content_type, fallback=self._detect_image_ext(raw))
return cache_image_from_bytes(raw, ext), content_type or self._mime_for_ext(ext, fallback="image/jpeg") try:
return cache_image_from_bytes(raw, ext), content_type or self._mime_for_ext(ext, fallback="image/jpeg")
except ValueError as exc:
logger.warning("[%s] Rejected non-image bytes from %s: %s", self.name, url, exc)
return None
filename = self._guess_filename(url, headers.get("content-disposition"), content_type) filename = self._guess_filename(url, headers.get("content-disposition"), content_type)
return cache_document_from_bytes(raw, filename), content_type return cache_document_from_bytes(raw, filename), content_type

View File

@@ -34,6 +34,45 @@ def _make_timeout_error() -> httpx.TimeoutException:
return httpx.TimeoutException("timed out") return httpx.TimeoutException("timed out")
# ---------------------------------------------------------------------------
# cache_image_from_bytes (base.py)
# ---------------------------------------------------------------------------
class TestCacheImageFromBytes:
"""Tests for gateway.platforms.base.cache_image_from_bytes"""
def test_caches_valid_jpeg(self, tmp_path, monkeypatch):
monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img")
from gateway.platforms.base import cache_image_from_bytes
path = cache_image_from_bytes(b"\xff\xd8\xff fake jpeg data", ".jpg")
assert path.endswith(".jpg")
def test_caches_valid_png(self, tmp_path, monkeypatch):
monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img")
from gateway.platforms.base import cache_image_from_bytes
path = cache_image_from_bytes(b"\x89PNG\r\n\x1a\n fake png data", ".png")
assert path.endswith(".png")
def test_rejects_html_content(self, tmp_path, monkeypatch):
monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img")
from gateway.platforms.base import cache_image_from_bytes
with pytest.raises(ValueError, match="non-image data"):
cache_image_from_bytes(b"<!DOCTYPE html><html><title>Slack</title></html>", ".png")
def test_rejects_empty_data(self, tmp_path, monkeypatch):
monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img")
from gateway.platforms.base import cache_image_from_bytes
with pytest.raises(ValueError, match="non-image data"):
cache_image_from_bytes(b"", ".jpg")
def test_rejects_plain_text(self, tmp_path, monkeypatch):
monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img")
from gateway.platforms.base import cache_image_from_bytes
with pytest.raises(ValueError, match="non-image data"):
cache_image_from_bytes(b"just some text, not an image", ".jpg")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# cache_image_from_url (base.py) # cache_image_from_url (base.py)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -71,7 +110,7 @@ class TestCacheImageFromUrl:
monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img")
fake_response = MagicMock() fake_response = MagicMock()
fake_response.content = b"image data" fake_response.content = b"\xff\xd8\xff image data"
fake_response.raise_for_status = MagicMock() fake_response.raise_for_status = MagicMock()
mock_client = AsyncMock() mock_client = AsyncMock()
@@ -101,7 +140,7 @@ class TestCacheImageFromUrl:
monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img")
ok_response = MagicMock() ok_response = MagicMock()
ok_response.content = b"image data" ok_response.content = b"\xff\xd8\xff image data"
ok_response.raise_for_status = MagicMock() ok_response.raise_for_status = MagicMock()
mock_client = AsyncMock() mock_client = AsyncMock()
@@ -395,8 +434,9 @@ class TestSlackDownloadSlackFile:
adapter = _make_slack_adapter() adapter = _make_slack_adapter()
fake_response = MagicMock() fake_response = MagicMock()
fake_response.content = b"fake image bytes" fake_response.content = b"\x89PNG\r\n\x1a\n fake png"
fake_response.raise_for_status = MagicMock() fake_response.raise_for_status = MagicMock()
fake_response.headers = {"content-type": "image/png"}
mock_client = AsyncMock() mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=fake_response) mock_client.get = AsyncMock(return_value=fake_response)
@@ -413,14 +453,44 @@ class TestSlackDownloadSlackFile:
assert path.endswith(".jpg") assert path.endswith(".jpg")
mock_client.get.assert_called_once() mock_client.get.assert_called_once()
def test_rejects_html_response(self, tmp_path, monkeypatch):
"""An HTML sign-in page from Slack is rejected, not cached as image."""
monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img")
adapter = _make_slack_adapter()
fake_response = MagicMock()
fake_response.content = b"<!DOCTYPE html><html><title>Slack</title></html>"
fake_response.raise_for_status = MagicMock()
fake_response.headers = {"content-type": "text/html; charset=utf-8"}
mock_client = AsyncMock()
mock_client.get = AsyncMock(return_value=fake_response)
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
async def run():
with patch("httpx.AsyncClient", return_value=mock_client):
await adapter._download_slack_file(
"https://files.slack.com/img.jpg", ext=".jpg"
)
with pytest.raises(ValueError, match="HTML instead of media"):
asyncio.run(run())
# Verify nothing was cached
img_dir = tmp_path / "img"
if img_dir.exists():
assert list(img_dir.iterdir()) == []
def test_retries_on_timeout_then_succeeds(self, tmp_path, monkeypatch): def test_retries_on_timeout_then_succeeds(self, tmp_path, monkeypatch):
"""Timeout on first attempt triggers retry; success on second.""" """Timeout on first attempt triggers retry; success on second."""
monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img")
adapter = _make_slack_adapter() adapter = _make_slack_adapter()
fake_response = MagicMock() fake_response = MagicMock()
fake_response.content = b"image bytes" fake_response.content = b"\x89PNG\r\n\x1a\n image bytes"
fake_response.raise_for_status = MagicMock() fake_response.raise_for_status = MagicMock()
fake_response.headers = {"content-type": "image/png"}
mock_client = AsyncMock() mock_client = AsyncMock()
mock_client.get = AsyncMock( mock_client.get = AsyncMock(