feat: telegram voice transcription (#74)

2026-01-09 20:57:04 +04:00
parent 8421ec8b4a
commit 780ba72b3a
14 changed files with 440 additions and 7 deletions
@@ -16,6 +16,24 @@ This document captures current behavior so transport changes stay intentional.
 4. High-value messages enqueue a send.
 5. All writes go through the outbox.

+## Incoming messages
+
+`parse_incoming_update` accepts text messages and voice notes.
+
+If voice transcription is enabled, takopi downloads the voice payload from Telegram,
+transcribes it with OpenAI, and routes the transcript through the same command and
+directive pipeline as typed text.
+
+Configuration (under `[transports.telegram]`):
+
+```toml
+voice_transcription = true
+```
+
+Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key
+is missing or the audio download fails, takopi replies with a short error and skips
+the run.
+
 ## Outbox model

 - Single worker processes one op at a time.
@@ -18,6 +18,8 @@ parallel runs across threads, per thread queue support.

 `/cancel` a running task.

+optional voice note transcription for Telegram (routes transcript like typed text).
+
 ## requirements

 - `uv` for installation (`curl -LsSf https://astral.sh/uv/install.sh | sh`)
@@ -59,6 +61,9 @@ transport = "telegram"
 [transports.telegram]
 bot_token = "123456789:ABCdefGHIjklMNOpqrsTUVwxyz"
 chat_id = 123456789
+voice_transcription = true
+
+# set OPENAI_API_KEY in your environment for voice transcription

 [codex]
 # optional: profile from ~/.codex/config.toml
@@ -25,6 +25,18 @@ _RECONNECTING_RE = re.compile(
    r"^Reconnecting\.{3}\s*(?P<attempt>\d+)/(?P<max>\d+)\s*$",
    re.IGNORECASE,
 )
+_EXEC_ONLY_FLAGS = {"--skip-git-repo-check"}
+
+
+def _split_exec_flags(extra_args: list[str]) -> tuple[list[str], list[str]]:
+    base_args: list[str] = []
+    exec_args: list[str] = []
+    for arg in extra_args:
+        if arg in _EXEC_ONLY_FLAGS:
+            exec_args.append(arg)
+        else:
+            base_args.append(arg)
+    return base_args, exec_args


 def _parse_reconnect_message(message: str) -> tuple[int, int] | None:
@@ -397,7 +409,8 @@ class CodexRunner(ResumeTokenMixin, JsonlSubprocessRunner):
        state: Any,
    ) -> list[str]:
        _ = prompt, state
-        args = [*self.extra_args, "exec", "--json"]
+        base_args, exec_args = _split_exec_flags(self.extra_args)
+        args = [*base_args, "exec", *exec_args, "--json"]
        if resume:
            args.extend(["resume", resume.value, "-"])
        else:
@@ -25,6 +25,7 @@ class TelegramTransportSettings(BaseModel):

    bot_token: SecretStr | None = None
    chat_id: int | None = None
+    voice_transcription: bool = False

    @field_validator("bot_token", mode="before")
    @classmethod
@@ -1,10 +1,11 @@
 """Telegram-specific clients and adapters."""

 from .client import parse_incoming_update, poll_incoming
-from .types import TelegramIncomingMessage
+from .types import TelegramIncomingMessage, TelegramVoice

 __all__ = [
    "TelegramIncomingMessage",
+    "TelegramVoice",
    "parse_incoming_update",
    "poll_incoming",
 ]
@@ -14,6 +14,7 @@ from .bridge import (
    TelegramBridgeConfig,
    TelegramPresenter,
    TelegramTransport,
+    TelegramVoiceTranscriptionConfig,
    run_main_loop,
 )
 from .client import TelegramClient
@@ -43,6 +44,14 @@ def _build_startup_message(
    )


+def _build_voice_transcription_config(
+    transport_config: dict[str, object],
+) -> TelegramVoiceTranscriptionConfig:
+    return TelegramVoiceTranscriptionConfig(
+        enabled=bool(transport_config.get("voice_transcription", False)),
+    )
+
+
 class TelegramBackend(TransportBackend):
    id = "telegram"
    description = "Telegram bot"
@@ -87,12 +96,14 @@ class TelegramBackend(TransportBackend):
            presenter=presenter,
            final_notify=final_notify,
        )
+        voice_transcription = _build_voice_transcription_config(transport_config)
        cfg = TelegramBridgeConfig(
            bot=bot,
            runtime=runtime,
            chat_id=chat_id,
            startup_msg=startup_msg,
            exec_cfg=exec_cfg,
+            voice_transcription=voice_transcription,
        )
        anyio.run(run_main_loop, cfg)

@@ -1,8 +1,10 @@
 from __future__ import annotations

+import os
 import shlex
 from collections.abc import AsyncIterator, Awaitable, Callable, Sequence
 from dataclasses import dataclass
+from pathlib import Path

 import anyio

@@ -40,10 +42,14 @@ from ..transport_runtime import TransportRuntime
 from .client import BotClient, poll_incoming
 from .types import TelegramIncomingMessage
 from .render import prepare_telegram
+from .transcribe import transcribe_audio

 logger = get_logger(__name__)

 _MAX_BOT_COMMANDS = 100
+_OPENAI_AUDIO_MAX_BYTES = 25 * 1024 * 1024
+_OPENAI_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe"
+_OPENAI_TRANSCRIPTION_CHUNKING = "auto"


 def _is_cancel_command(text: str) -> bool:
@@ -191,6 +197,11 @@ class TelegramPresenter:
        return RenderedMessage(text=text, extra={"entities": entities})


+@dataclass(frozen=True)
+class TelegramVoiceTranscriptionConfig:
+    enabled: bool = False
+
+
 def _as_int(value: int | str, *, label: str) -> int:
    if isinstance(value, bool) or not isinstance(value, int):
        raise TypeError(f"Telegram {label} must be int")
@@ -285,6 +296,7 @@ class TelegramBridgeConfig:
    chat_id: int
    startup_msg: str
    exec_cfg: ExecBridgeConfig
+    voice_transcription: TelegramVoiceTranscriptionConfig | None = None


 async def _send_plain(
@@ -345,6 +357,125 @@ async def poll_updates(
        yield msg


+def _resolve_openai_api_key(
+    cfg: TelegramVoiceTranscriptionConfig,
+) -> str | None:
+    env_key = os.environ.get("OPENAI_API_KEY")
+    if isinstance(env_key, str):
+        env_key = env_key.strip()
+        if env_key:
+            return env_key
+    return None
+
+
+def _normalize_voice_filename(file_path: str | None, mime_type: str | None) -> str:
+    name = Path(file_path).name if file_path else ""
+    if not name:
+        if mime_type == "audio/ogg":
+            return "voice.ogg"
+        return "voice.dat"
+    if name.endswith(".oga"):
+        return f"{name[:-4]}.ogg"
+    return name
+
+
+async def _transcribe_voice(
+    cfg: TelegramBridgeConfig,
+    msg: TelegramIncomingMessage,
+) -> str | None:
+    voice = msg.voice
+    if voice is None:
+        return msg.text
+    settings = cfg.voice_transcription
+    if settings is None or not settings.enabled:
+        await _send_plain(
+            cfg.exec_cfg.transport,
+            chat_id=msg.chat_id,
+            user_msg_id=msg.message_id,
+            text="voice transcription is disabled.",
+        )
+        return None
+    api_key = _resolve_openai_api_key(settings)
+    if not api_key:
+        await _send_plain(
+            cfg.exec_cfg.transport,
+            chat_id=msg.chat_id,
+            user_msg_id=msg.message_id,
+            text="voice transcription requires OPENAI_API_KEY.",
+        )
+        return None
+    if voice.file_size is not None and voice.file_size > _OPENAI_AUDIO_MAX_BYTES:
+        await _send_plain(
+            cfg.exec_cfg.transport,
+            chat_id=msg.chat_id,
+            user_msg_id=msg.message_id,
+            text="voice message is too large to transcribe.",
+        )
+        return None
+    file_info = await cfg.bot.get_file(voice.file_id)
+    if not isinstance(file_info, dict):
+        await _send_plain(
+            cfg.exec_cfg.transport,
+            chat_id=msg.chat_id,
+            user_msg_id=msg.message_id,
+            text="failed to fetch voice file.",
+        )
+        return None
+    file_path = file_info.get("file_path")
+    if not isinstance(file_path, str) or not file_path:
+        await _send_plain(
+            cfg.exec_cfg.transport,
+            chat_id=msg.chat_id,
+            user_msg_id=msg.message_id,
+            text="failed to fetch voice file.",
+        )
+        return None
+    audio_bytes = await cfg.bot.download_file(file_path)
+    if not audio_bytes:
+        await _send_plain(
+            cfg.exec_cfg.transport,
+            chat_id=msg.chat_id,
+            user_msg_id=msg.message_id,
+            text="failed to download voice message.",
+        )
+        return None
+    if len(audio_bytes) > _OPENAI_AUDIO_MAX_BYTES:
+        await _send_plain(
+            cfg.exec_cfg.transport,
+            chat_id=msg.chat_id,
+            user_msg_id=msg.message_id,
+            text="voice message is too large to transcribe.",
+        )
+        return None
+    filename = _normalize_voice_filename(file_path, voice.mime_type)
+    transcript = await transcribe_audio(
+        audio_bytes,
+        filename=filename,
+        api_key=api_key,
+        model=_OPENAI_TRANSCRIPTION_MODEL,
+        chunking_strategy=_OPENAI_TRANSCRIPTION_CHUNKING,
+        mime_type=voice.mime_type,
+    )
+    if transcript is None:
+        await _send_plain(
+            cfg.exec_cfg.transport,
+            chat_id=msg.chat_id,
+            user_msg_id=msg.message_id,
+            text="voice transcription failed.",
+        )
+        return None
+    transcript = transcript.strip()
+    if not transcript:
+        await _send_plain(
+            cfg.exec_cfg.transport,
+            chat_id=msg.chat_id,
+            user_msg_id=msg.message_id,
+            text="voice transcription returned empty text.",
+        )
+        return None
+    return transcript
+
+
 async def _handle_cancel(
    cfg: TelegramBridgeConfig,
    msg: TelegramIncomingMessage,
@@ -702,6 +833,7 @@ class _TelegramCommandExecutor(CommandExecutor):
 async def _dispatch_command(
    cfg: TelegramBridgeConfig,
    msg: TelegramIncomingMessage,
+    text: str,
    command_id: str,
    args_text: str,
    running_tasks: RunningTasks,
@@ -738,7 +870,7 @@ async def _dispatch_command(
        return
    ctx = CommandContext(
        command=command_id,
-        text=msg.text,
+        text=text,
        args_text=args_text,
        args=_split_command_args(args_text),
        message=message_ref,
@@ -826,6 +958,10 @@ async def run_main_loop(

            async for msg in poller(cfg):
                text = msg.text
+                if msg.voice is not None:
+                    text = await _transcribe_voice(cfg, msg)
+                    if text is None:
+                        continue
                user_msg_id = msg.message_id
                chat_id = msg.chat_id
                reply_id = msg.reply_to_message_id
@@ -850,6 +986,7 @@ async def run_main_loop(
                            _dispatch_command,
                            cfg,
                            msg,
+                            text,
                            command_id,
                            args_text,
                            running_tasks,
@@ -18,7 +18,7 @@ import httpx
 import anyio

 from ..logging import get_logger
-from .types import TelegramIncomingMessage
+from .types import TelegramIncomingMessage, TelegramVoice

 logger = get_logger(__name__)

@@ -50,8 +50,30 @@ def parse_incoming_update(
    if not isinstance(msg, dict):
        return None
    text = msg.get("text")
+    voice_payload: TelegramVoice | None = None
    if not isinstance(text, str):
-        return None
+        voice = msg.get("voice")
+        if not isinstance(voice, dict):
+            return None
+        file_id = voice.get("file_id")
+        if not isinstance(file_id, str) or not file_id:
+            return None
+        voice_payload = TelegramVoice(
+            file_id=file_id,
+            mime_type=voice.get("mime_type")
+            if isinstance(voice.get("mime_type"), str)
+            else None,
+            file_size=voice.get("file_size")
+            if isinstance(voice.get("file_size"), int)
+            and not isinstance(voice.get("file_size"), bool)
+            else None,
+            duration=voice.get("duration")
+            if isinstance(voice.get("duration"), int)
+            and not isinstance(voice.get("duration"), bool)
+            else None,
+            raw=voice,
+        )
+        text = ""
    chat = msg.get("chat")
    if not isinstance(chat, dict):
        return None
@@ -87,6 +109,7 @@ def parse_incoming_update(
        reply_to_message_id=reply_to_message_id,
        reply_to_text=reply_to_text,
        sender_id=sender_id,
+        voice=voice_payload,
        raw=msg,
    )

@@ -123,6 +146,10 @@ class BotClient(Protocol):
        allowed_updates: list[str] | None = None,
    ) -> list[dict] | None: ...

+    async def get_file(self, file_id: str) -> dict | None: ...
+
+    async def download_file(self, file_path: str) -> bytes | None: ...
+
    async def send_message(
        self,
        chat_id: int,
@@ -356,6 +383,7 @@ class TelegramClient:
                raise ValueError("Provide either token or client, not both.")
            self._client_override = client
            self._base = None
+            self._file_base = None
            self._http_client = None
            self._owns_http_client = False
        else:
@@ -363,6 +391,7 @@ class TelegramClient:
                raise ValueError("Telegram token is empty")
            self._client_override = None
            self._base = f"https://api.telegram.org/bot{token}"
+            self._file_base = f"https://api.telegram.org/file/bot{token}"
            self._http_client = http_client or httpx.AsyncClient(timeout=timeout_s)
            self._owns_http_client = http_client is None
        self._clock = clock
@@ -556,6 +585,46 @@ class TelegramClient:
            except TelegramRetryAfter as exc:
                await self._sleep(exc.retry_after)

+    async def get_file(self, file_id: str) -> dict | None:
+        while True:
+            try:
+                if self._client_override is not None:
+                    return await self._client_override.get_file(file_id)
+                result = await self._post("getFile", {"file_id": file_id})
+                return result if isinstance(result, dict) else None
+            except TelegramRetryAfter as exc:
+                await self._sleep(exc.retry_after)
+
+    async def download_file(self, file_path: str) -> bytes | None:
+        if self._client_override is not None:
+            return await self._client_override.download_file(file_path)
+        if self._http_client is None or self._file_base is None:
+            raise RuntimeError("TelegramClient is configured without an HTTP client.")
+        url = f"{self._file_base}/{file_path}"
+        try:
+            resp = await self._http_client.get(url)
+        except httpx.HTTPError as exc:
+            request_url = getattr(exc.request, "url", None)
+            logger.error(
+                "telegram.file_network_error",
+                url=str(request_url) if request_url is not None else None,
+                error=str(exc),
+                error_type=exc.__class__.__name__,
+            )
+            return None
+        try:
+            resp.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            logger.error(
+                "telegram.file_http_error",
+                status=resp.status_code,
+                url=str(resp.request.url),
+                error=str(exc),
+                body=resp.text,
+            )
+            return None
+        return resp.content
+
    async def send_message(
        self,
        chat_id: int,
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+from typing import Any
+
+import httpx
+
+from ..logging import get_logger
+
+logger = get_logger(__name__)
+
+OPENAI_TRANSCRIBE_URL = "https://api.openai.com/v1/audio/transcriptions"
+
+
+async def transcribe_audio(
+    audio_bytes: bytes,
+    *,
+    filename: str,
+    api_key: str,
+    model: str,
+    language: str | None = None,
+    prompt: str | None = None,
+    chunking_strategy: str | None = "auto",
+    mime_type: str | None = None,
+    timeout_s: float = 120,
+    http_client: httpx.AsyncClient | None = None,
+) -> str | None:
+    data: dict[str, Any] = {"model": model}
+    if language:
+        data["language"] = language
+    if prompt:
+        data["prompt"] = prompt
+    if chunking_strategy:
+        data["chunking_strategy"] = chunking_strategy
+
+    files = {
+        "file": (
+            filename,
+            audio_bytes,
+            mime_type or "application/octet-stream",
+        )
+    }
+
+    headers = {"Authorization": f"Bearer {api_key}"}
+    close_client = False
+    client = http_client
+    if client is None:
+        client = httpx.AsyncClient(timeout=timeout_s)
+        close_client = True
+    try:
+        try:
+            resp = await client.post(
+                OPENAI_TRANSCRIBE_URL,
+                data=data,
+                files=files,
+                headers=headers,
+            )
+        except httpx.HTTPError as exc:
+            request_url = getattr(exc.request, "url", None)
+            logger.error(
+                "openai.transcribe.network_error",
+                url=str(request_url) if request_url is not None else None,
+                error=str(exc),
+                error_type=exc.__class__.__name__,
+            )
+            return None
+        try:
+            resp.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            logger.error(
+                "openai.transcribe.http_error",
+                status=resp.status_code,
+                url=str(resp.request.url),
+                error=str(exc),
+                body=resp.text,
+            )
+            return None
+        try:
+            payload = resp.json()
+        except Exception as exc:
+            logger.error(
+                "openai.transcribe.bad_response",
+                status=resp.status_code,
+                url=str(resp.request.url),
+                error=str(exc),
+                error_type=exc.__class__.__name__,
+                body=resp.text,
+            )
+            return None
+    finally:
+        if close_client:
+            await client.aclose()
+
+    text = payload.get("text")
+    if not isinstance(text, str):
+        logger.error(
+            "openai.transcribe.invalid_payload",
+            payload=payload,
+        )
+        return None
+    return text
@@ -4,6 +4,15 @@ from dataclasses import dataclass
 from typing import Any


+@dataclass(frozen=True, slots=True)
+class TelegramVoice:
+    file_id: str
+    mime_type: str | None
+    file_size: int | None
+    duration: int | None
+    raw: dict[str, Any]
+
+
@dataclass(frozen=True, slots=True)
 class TelegramIncomingMessage:
    transport: str
@@ -13,4 +22,5 @@ class TelegramIncomingMessage:
    reply_to_message_id: int | None
    reply_to_text: str | None
    sender_id: int | None
+    voice: TelegramVoice | None = None
    raw: dict[str, Any] | None = None
@@ -128,6 +128,23 @@ async def test_run_allows_parallel_different_sessions() -> None:
    assert max_in_flight == 2


+def test_codex_exec_flags_after_exec() -> None:
+    runner = CodexRunner(
+        codex_cmd="codex",
+        extra_args=["-c", "notify=[]", "--skip-git-repo-check"],
+    )
+    state = runner.new_state("hi", None)
+    args = runner.build_args("hi", None, state=state)
+    assert args == [
+        "-c",
+        "notify=[]",
+        "exec",
+        "--skip-git-repo-check",
+        "--json",
+        "-",
+    ]
+
+
@pytest.mark.anyio
 async def test_run_serializes_new_session_after_session_is_known(
    tmp_path, monkeypatch
@@ -106,6 +106,14 @@ class _FakeBot:
        _ = allowed_updates
        return []

+    async def get_file(self, file_id: str) -> dict | None:
+        _ = file_id
+        return None
+
+    async def download_file(self, file_path: str) -> bytes | None:
+        _ = file_path
+        return None
+
    async def send_message(
        self,
        chat_id: int,
@@ -386,6 +394,14 @@ async def test_telegram_transport_edit_wait_false_returns_ref() -> None:
        ) -> list[dict] | None:
            return None

+        async def get_file(self, file_id: str) -> dict | None:
+            _ = file_id
+            return None
+
+        async def download_file(self, file_path: str) -> bytes | None:
+            _ = file_path
+            return None
+
        async def send_message(
            self,
            chat_id: int,
@@ -22,6 +22,7 @@ def test_parse_incoming_update_maps_fields() -> None:
    assert msg.reply_to_message_id == 5
    assert msg.reply_to_text == "prev"
    assert msg.sender_id == 99
+    assert msg.voice is None
    assert msg.raw == update["message"]


@@ -38,10 +39,36 @@ def test_parse_incoming_update_filters_non_matching_chat() -> None:
    assert parse_incoming_update(update, chat_id=999) is None


-def test_parse_incoming_update_filters_non_text() -> None:
+def test_parse_incoming_update_filters_non_text_and_non_voice() -> None:
    update = {
        "update_id": 1,
-        "message": {"message_id": 10, "chat": {"id": 123}},
+        "message": {"message_id": 10, "chat": {"id": 123}, "photo": []},
    }

    assert parse_incoming_update(update, chat_id=123) is None
+
+
+def test_parse_incoming_update_voice_message() -> None:
+    update = {
+        "update_id": 1,
+        "message": {
+            "message_id": 10,
+            "chat": {"id": 123},
+            "voice": {
+                "file_id": "voice-id",
+                "file_unique_id": "uniq",
+                "duration": 3,
+                "mime_type": "audio/ogg",
+                "file_size": 1234,
+            },
+        },
+    }
+
+    msg = parse_incoming_update(update, chat_id=123)
+    assert msg is not None
+    assert msg.text == ""
+    assert msg.voice is not None
+    assert msg.voice.file_id == "voice-id"
+    assert msg.voice.mime_type == "audio/ogg"
+    assert msg.voice.file_size == 1234
+    assert msg.voice.duration == 3
@@ -92,6 +92,14 @@ class _FakeBot:
        self._updates_attempts += 1
        return []

+    async def get_file(self, file_id: str) -> dict | None:
+        _ = file_id
+        return None
+
+    async def download_file(self, file_path: str) -> bytes | None:
+        _ = file_path
+        return None
+
    async def close(self) -> None:
        return None