mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 06:51:16 +08:00
* feat(plugins): google_meet — bundled plugin for join+transcribe Meet calls

  v1 ships transcribe-only. Spawns headless Chromium via Playwright, joins an explicit https://meet.google.com/ URL, enables live captions, and scrapes them into a transcript file the agent can read across turns. The agent then has the meeting content in context and can do follow-up work (send a recap, file issues, schedule follow-ups) with its regular tools.

  Surface:
  - Tools: meet_join, meet_status, meet_transcript, meet_leave, meet_say (meet_say is a v1 stub — returns not-implemented; v2 will wire realtime duplex audio via OpenAI Realtime / Gemini Live + BlackHole / PulseAudio null-sink.)
  - CLI: hermes meet setup | auth | join | status | transcript | stop
  - Lifecycle: on_session_end auto-leaves any still-running bot.

  Safety:
  - URL regex rejects anything that isn't https://meet.google.com/...
  - No calendar scanning, no auto-dial, no auto-consent announcement.
  - Single active meeting per install; a second meet_join leaves the first.
  - Platform-gated to Linux + macOS (Windows audio routing for v2 untested).
  - Opt-in: standalone plugin; the user must add 'google_meet' to plugins.enabled in config.yaml.

  Zero core changes. The plugin uses the existing register_tool / register_cli_command / register_hook surfaces. 21 new unit tests cover the URL safety gate, transcript dedup + status round-trip, process-manager refusals/start/stop paths, tool-handler JSON shape under each branch, session-end cleanup, and platform-gated register().

* feat(plugins/google_meet): v2 realtime audio + v3 remote node host

  v2 — agent speaks in-meeting

  audio_bridge.py: PulseAudio null-sink (Linux) + BlackHole probe (macOS). On Linux we load pactl module-null-sink + module-virtual-source and track module ids for teardown; Chrome gets PULSE_SOURCE=<virt src> in its env so its fake mic reads what we write to the sink.
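The Linux side of that bridge can be sketched roughly as follows. This is a hypothetical helper, not the plugin's actual audio_bridge.py; the sink/source names are illustrative, and the command runner is injectable so the pactl calls can be faked in tests:

```python
import subprocess


def load_null_sink(run=subprocess.run):
    """Load a null sink + virtual source via pactl; return module ids for teardown.

    pactl prints the numeric module index on stdout for each load-module call,
    which is exactly what unload-module needs later.
    """
    module_ids = []
    for args in (
        ["pactl", "load-module", "module-null-sink", "sink_name=hermes_meet"],
        ["pactl", "load-module", "module-virtual-source",
         "source_name=hermes_meet_src", "master=hermes_meet.monitor"],
    ):
        out = run(args, capture_output=True, text=True, check=True)
        module_ids.append(out.stdout.strip())
    return module_ids


def unload_modules(module_ids, run=subprocess.run):
    """Tear down in reverse order so the source goes before its master sink."""
    for mid in reversed(module_ids):
        run(["pactl", "unload-module", mid], check=True)
```

Chrome would then be spawned with PULSE_SOURCE=hermes_meet_src in its environment, so whatever is written into the sink shows up on its fake microphone.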
  macOS just probes BlackHole 2ch and returns its device name — the plugin refuses to switch the user's default audio input (that would surprise them).

  realtime/openai_client.py: sync WebSocket client for the OpenAI Realtime API. RealtimeSession.speak(text) sends conversation.item.create + response.create, accumulates response.audio.delta PCM bytes, and appends them to a file. RealtimeSpeaker runs a JSONL-queue loop consuming meet_say calls. 'websockets' is an optional dep imported lazily.

  meet_bot.py: when HERMES_MEET_MODE=realtime, provisions AudioBridge, starts RealtimeSession + a speaker thread, spawns paplay to pump PCM into the null-sink, then cleans everything up on SIGTERM. If any realtime setup step fails, it falls back cleanly to transcribe mode with an error flagged in status.json.

  process_manager.enqueue_say(): writes a JSONL line to say_queue.jsonl; refuses when there is no active meeting or the active meeting is transcribe-only.

  tools.meet_say: real implementation; requires active mode='realtime'. meet_join: adds a mode='transcribe'|'realtime' param.

  v3 — remote node host

  node/protocol.py: JSON envelope (type, id, token, payload) + validate.
  node/registry.py: $HERMES_HOME/workspace/meetings/nodes.json, with resolve() auto-selecting the sole registered node when name is None.
  node/server.py: NodeServer — websockets.serve, bearer-token auth, dispatches start_bot/stop/status/transcript/say/ping onto the local process_manager. Token auto-generated + persisted on first run.
  node/client.py: NodeClient — short-lived sync WS per RPC, raises RuntimeError on error envelopes, clean API matching the server.
  node/cli.py: 'hermes meet node {run,list,approve,remove,status,ping}' subtree; wired into the main meet CLI by cli.py so 'hermes meet node' Just Works.
  tools.py: every meet_* tool accepts node='<name>'|'auto'; when set, routes through NodeClient to the remote bot instead of running locally. Unknown node → clear 'no registered meet node matches ...' error.
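The node/protocol.py envelope described above could look like this. A minimal sketch based only on the commit's description (type, id, token, payload + validate); the function names and error messages are assumptions, not the actual module:

```python
import json
import uuid

REQUIRED_KEYS = ("type", "id", "token", "payload")


def make_envelope(msg_type: str, token: str, payload: dict) -> str:
    """Build a protocol envelope as a JSON string with a fresh message id."""
    return json.dumps({
        "type": msg_type,
        "id": uuid.uuid4().hex,
        "token": token,
        "payload": payload,
    })


def validate(raw: str) -> dict:
    """Parse an envelope and check required keys; raise ValueError if malformed."""
    try:
        env = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise ValueError(f"envelope is not valid JSON: {exc}") from exc
    missing = [k for k in REQUIRED_KEYS if k not in env]
    if missing:
        raise ValueError(f"envelope missing keys: {missing}")
    return env
```

The server would compare `env["token"]` against its persisted bearer token before dispatching `env["type"]` onto the local process_manager.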
  cli.py: 'hermes meet join --node my-mac --mode realtime' and 'hermes meet say "..." --node my-mac' route to the node; 'hermes meet node approve <name> <url> <token>' registers one.

  Tests:
  - 21 v1 tests updated (meet_say is no longer a stub; the active record now carries mode).
  - 20 new audio_bridge + realtime tests.
  - 42 new node tests (protocol/registry/server/client/cli).
  - 17 new v1/v2/v3 integration tests at the plugin level covering enqueue_say edge cases, env var passthrough, mode validation, node routing (known/unknown/auto/ambiguous), and argparse wiring for `hermes meet say` + `hermes meet node` + --mode/--node flags.
  - Total: 100 plugin tests + 58 plugin-system tests = 158 passing.

  E2E verified on Linux with a fresh HERMES_HOME: plugin loads, 5 tools register, the on_session_end hook wires, the 'hermes meet' CLI tree wires including the node subtree, NodeRegistry round-trips, meet_join routes correctly to NodeClient under node='my-mac' with mode='realtime', enqueue_say accepts realtime / rejects transcribe, and argparse parses every new flag cleanly. Zero changes to core. All new code lives under plugins/google_meet/.

* feat(plugins/google_meet): auto-install, admission detect, mac PCM pump, barge-in, richer status

  Ready-for-live-test follow-up on PR #16364. Five additions that matter for the first live run on a real Meet, in priority order:

  1. hermes meet install [--realtime] [--yes]
     pip install playwright websockets + python -m playwright install chromium.
     --realtime: installs platform audio deps (pulseaudio-utils on Linux via sudo apt, blackhole-2ch + ffmpeg on macOS via brew). Prompts before sudo/brew unless --yes. Refuses on Windows. Refuses to auto-flip the macOS default input — the user still selects BlackHole in System Settings (deliberate; surprise audio rerouting is worse than a manual step).

  2. Admission detection
     _detect_admission(page): Leave-button visible OR caption region attached OR participants list present → we're in-call.
     _detect_denied(page): 'You can't join this video call' / 'You were removed' / 'No one responded to your request' → bail out.
     HERMES_MEET_LOBBY_TIMEOUT (default 300s) caps how long we sit in the lobby before giving up. in_call stays False until admitted. Status surfaces leaveReason: duration_expired | lobby_timeout | denied | page_closed.

  3. macOS PCM pump
     ffmpeg reads speaker.pcm (24kHz s16le mono) and writes to the BlackHole AVFoundation output via -f audiotoolbox -audio_device_index <N>. _mac_audio_device_index() probes ffmpeg -f avfoundation -list_devices true to resolve 'BlackHole 2ch' → numeric index. Falls back to index 0 on probe failure. Linux paplay pump unchanged.

  4. Richer status dict
     _BotState now tracks realtime, realtimeReady, realtimeDevice, audioBytesOut, lastAudioOutAt, lastBargeInAt, joinAttemptedAt, leaveReason. RealtimeSession.audio_bytes_out / last_audio_out_at counters fold into the status file once a second so meet_status() can show the agent's voice activity in near-real-time.

  5. Barge-in
     RealtimeSession.cancel_response() sends type='response.cancel' over the same WS (lock-guarded so it's safe to call from the caption thread while speak() is reading frames). Handles response.cancelled as a terminal frame type. _looks_like_human_speaker() gates triggers so the bot's own name, 'You', 'Unknown', and blanks don't self-cancel. Called from the caption drain loop: when a new caption arrives attributed to a real participant while rt.session exists, we fire cancel_response() and stamp lastBargeInAt.

  Tests: 20 new unit tests across _BotState telemetry, barge-in gating, admission/denied probe error handling, cancel_response with and without a connected WS, and `hermes meet install` CLI wiring (flag parsing + end-to-end subprocess.run verification + the Linux-already-installed fast path). Total 171 passing across all google_meet test files + the plugin-system regression suite.
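The barge-in gate described above can be approximated like this. A hypothetical reimplementation of the _looks_like_human_speaker() idea, not the plugin's actual code; the default bot name is an assumption:

```python
def looks_like_human_speaker(name, bot_name="Hermes"):
    """Return True only for captions attributed to a real participant.

    The bot's own name, Meet's 'You'/'Unknown' placeholder attributions, and
    blank names must not trigger barge-in, otherwise the bot would cancel its
    own speech every time its captions were transcribed back to it.
    """
    if not name or not name.strip():
        return False
    cleaned = name.strip().lower()
    if cleaned in {"you", "unknown"}:
        return False
    if cleaned == bot_name.strip().lower():
        return False
    return True
```

The caption drain loop would only call cancel_response() when this gate passes, then stamp lastBargeInAt in the status dict.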
  E2E verified on Linux: plugin loads, all 5 tools register, `hermes meet install --realtime --yes` parses, a fresh bot's status.json has every new telemetry key, cancel_response on a disconnected session returns False without raising, and the barge-in helper gates the bot's own name correctly.

  Still out of scope (for a future PR, not blocking the live test): mic → Realtime duplex (the agent listening to meeting audio via WebRTC), node-host TLS/pairing UX, Windows audio, Meet create + Twilio.

  Docs updated: SKILL.md now lists the installer subcommand, the lobby timeout, the barge-in caveat, and the full status-dict reference table. README.md quick-start uses hermes meet install.
294 lines
9.5 KiB
Python
"""Tests for plugins.google_meet.realtime.openai_client (v2).
|
|
|
|
Uses a scripted fake WebSocket — no network, no API key required.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import json
|
|
import sys
|
|
import threading
|
|
import types
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _isolate_home(tmp_path, monkeypatch):
    hermes_home = tmp_path / ".hermes"
    hermes_home.mkdir()
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    yield hermes_home


# ---------------------------------------------------------------------------
# Fake WebSocket
# ---------------------------------------------------------------------------


class _FakeWS:
    """Scripted WS: send() records frames, recv() pops a queue."""

    def __init__(self, recv_frames: list):
        self.sent: list[dict] = []
        self._recv_q: list = list(recv_frames)
        self.closed = False

    def send(self, payload):
        # Always accept str payloads — client encodes JSON with json.dumps.
        if isinstance(payload, (bytes, bytearray)):
            payload = payload.decode()
        self.sent.append(json.loads(payload))

    def recv(self, timeout=None):  # noqa: ARG002
        if not self._recv_q:
            raise RuntimeError("fake ws: no more frames")
        frame = self._recv_q.pop(0)
        if isinstance(frame, dict):
            return json.dumps(frame)
        return frame

    def close(self):
        self.closed = True


def _install_fake_websockets(monkeypatch, fake_ws):
    """Install a fake ``websockets.sync.client`` module in sys.modules."""
    mod_websockets = types.ModuleType("websockets")
    mod_sync = types.ModuleType("websockets.sync")
    mod_sync_client = types.ModuleType("websockets.sync.client")

    captured = {"url": None, "headers": None, "kwargs": None}

    def _connect(url, **kwargs):
        captured["url"] = url
        captured["kwargs"] = kwargs
        captured["headers"] = (
            kwargs.get("additional_headers") or kwargs.get("extra_headers")
        )
        return fake_ws

    mod_sync_client.connect = _connect
    mod_sync.client = mod_sync_client
    mod_websockets.sync = mod_sync

    monkeypatch.setitem(sys.modules, "websockets", mod_websockets)
    monkeypatch.setitem(sys.modules, "websockets.sync", mod_sync)
    monkeypatch.setitem(sys.modules, "websockets.sync.client", mod_sync_client)
    return captured


# ---------------------------------------------------------------------------
# connect()
# ---------------------------------------------------------------------------


def test_connect_sends_session_update_with_voice_and_instructions(monkeypatch):
    from plugins.google_meet.realtime.openai_client import RealtimeSession

    ws = _FakeWS(recv_frames=[])
    captured = _install_fake_websockets(monkeypatch, ws)

    sess = RealtimeSession(
        api_key="sk-test",
        model="gpt-realtime",
        voice="verse",
        instructions="Be brief.",
    )
    sess.connect()

    # Auth + beta headers set.
    assert captured["url"].startswith("wss://api.openai.com/v1/realtime")
    assert "model=gpt-realtime" in captured["url"]
    headers = captured["headers"] or []
    hdict = dict(headers)
    assert hdict.get("Authorization") == "Bearer sk-test"
    assert hdict.get("OpenAI-Beta") == "realtime=v1"

    # First frame sent must be session.update with the right shape.
    assert len(ws.sent) == 1
    update = ws.sent[0]
    assert update["type"] == "session.update"
    s = update["session"]
    assert s["voice"] == "verse"
    assert s["instructions"] == "Be brief."
    assert set(s["modalities"]) == {"audio", "text"}
    assert s["output_audio_format"] == "pcm16"
    assert s["input_audio_format"] == "pcm16"


# ---------------------------------------------------------------------------
# speak()
# ---------------------------------------------------------------------------


def test_speak_sends_create_and_response_and_writes_audio(monkeypatch, tmp_path):
    from plugins.google_meet.realtime.openai_client import RealtimeSession

    audio_bytes = b"\x01\x02\x03\x04PCM!"
    b64 = base64.b64encode(audio_bytes).decode()

    recv_frames = [
        {"type": "response.created"},
        {"type": "response.audio.delta", "delta": b64},
        {"type": "response.audio.delta", "delta": base64.b64encode(b"more").decode()},
        {"type": "response.done"},
    ]
    ws = _FakeWS(recv_frames=recv_frames)
    _install_fake_websockets(monkeypatch, ws)

    sink = tmp_path / "out.pcm"
    sess = RealtimeSession(api_key="sk-test", audio_sink_path=sink)
    sess.connect()
    result = sess.speak("Hello everyone.")

    # Frames sent after session.update: conversation.item.create then response.create.
    types_sent = [f["type"] for f in ws.sent]
    assert types_sent == ["session.update", "conversation.item.create", "response.create"]

    item = ws.sent[1]["item"]
    assert item["role"] == "user"
    assert item["content"][0]["type"] == "input_text"
    assert item["content"][0]["text"] == "Hello everyone."

    resp = ws.sent[2]["response"]
    assert resp["modalities"] == ["audio"]

    # Audio file got decoded + appended bytes.
    data = sink.read_bytes()
    assert data == audio_bytes + b"more"
    assert result["ok"] is True
    assert result["bytes_written"] == len(audio_bytes) + len(b"more")
    assert result["duration_ms"] >= 0.0


def test_speak_raises_on_error_frame(monkeypatch, tmp_path):
    from plugins.google_meet.realtime.openai_client import RealtimeSession

    ws = _FakeWS(recv_frames=[
        {"type": "response.created"},
        {"type": "error", "error": {"message": "bad juju"}},
    ])
    _install_fake_websockets(monkeypatch, ws)

    sess = RealtimeSession(api_key="sk-test", audio_sink_path=tmp_path / "o.pcm")
    sess.connect()
    with pytest.raises(RuntimeError, match="bad juju"):
        sess.speak("hi")


def test_speak_without_connect_raises(monkeypatch):
    from plugins.google_meet.realtime.openai_client import RealtimeSession

    sess = RealtimeSession(api_key="sk-test")
    with pytest.raises(RuntimeError, match="connect"):
        sess.speak("hi")


def test_close_is_idempotent_and_closes_ws(monkeypatch):
    from plugins.google_meet.realtime.openai_client import RealtimeSession

    ws = _FakeWS(recv_frames=[])
    _install_fake_websockets(monkeypatch, ws)

    sess = RealtimeSession(api_key="sk-test")
    sess.connect()
    sess.close()
    assert ws.closed is True
    # Second close is a no-op.
    sess.close()


# ---------------------------------------------------------------------------
# websockets dependency missing
# ---------------------------------------------------------------------------


def test_connect_raises_clean_error_when_websockets_missing(monkeypatch):
    from plugins.google_meet.realtime.openai_client import RealtimeSession

    # Make `import websockets.sync.client` fail.
    monkeypatch.setitem(sys.modules, "websockets", None)
    monkeypatch.setitem(sys.modules, "websockets.sync", None)
    monkeypatch.setitem(sys.modules, "websockets.sync.client", None)

    sess = RealtimeSession(api_key="sk-test")
    with pytest.raises(RuntimeError, match="pip install websockets"):
        sess.connect()


# ---------------------------------------------------------------------------
# RealtimeSpeaker
# ---------------------------------------------------------------------------


class _StubSession:
    def __init__(self):
        self.spoken: list[str] = []

    def speak(self, text, timeout=30.0):  # noqa: ARG002
        self.spoken.append(text)
        return {"ok": True, "bytes_written": len(text), "duration_ms": 1.0}


def test_speaker_run_until_stopped_processes_queue(tmp_path):
    from plugins.google_meet.realtime.openai_client import RealtimeSpeaker

    queue = tmp_path / "queue.jsonl"
    processed = tmp_path / "processed.jsonl"
    queue.write_text(
        json.dumps({"id": "a", "text": "hello one"}) + "\n"
        + json.dumps({"id": "b", "text": "hello two"}) + "\n"
    )

    stub = _StubSession()
    speaker = RealtimeSpeaker(stub, queue_path=queue, processed_path=processed)

    # Stop once the queue is empty.
    def _stop():
        return queue.exists() and queue.read_text().strip() == ""

    speaker.run_until_stopped(_stop, poll_interval=0.01)

    assert stub.spoken == ["hello one", "hello two"]

    # Processed file has both entries, in order.
    lines = [json.loads(l) for l in processed.read_text().splitlines() if l.strip()]
    assert [l["id"] for l in lines] == ["a", "b"]
    assert all(l["result"]["ok"] for l in lines)

    # Queue is empty (possibly empty string) after processing.
    assert queue.read_text().strip() == ""


def test_speaker_exits_immediately_when_stop_fn_true(tmp_path):
    from plugins.google_meet.realtime.openai_client import RealtimeSpeaker

    queue = tmp_path / "q.jsonl"
    queue.write_text(json.dumps({"id": "x", "text": "never spoken"}) + "\n")

    stub = _StubSession()
    speaker = RealtimeSpeaker(stub, queue_path=queue)
    speaker.run_until_stopped(lambda: True, poll_interval=0.01)
    assert stub.spoken == []


def test_speaker_drops_line_without_processed_path_when_none(tmp_path):
    from plugins.google_meet.realtime.openai_client import RealtimeSpeaker

    queue = tmp_path / "q.jsonl"
    queue.write_text(json.dumps({"id": "only", "text": "once"}) + "\n")

    stub = _StubSession()
    speaker = RealtimeSpeaker(stub, queue_path=queue, processed_path=None)

    def _stop():
        return queue.read_text().strip() == ""

    speaker.run_until_stopped(_stop, poll_interval=0.01)
    assert stub.spoken == ["once"]
    assert queue.read_text().strip() == ""