feat(telegram): add voice transcription model override (#98)

2026-01-12 17:35:21 +04:00
parent 2f05b46465
commit 04671593aa
10 changed files with 26 additions and 3 deletions
@@ -10,11 +10,13 @@

 - simplify runtime, config, and telegram internals [#85](https://github.com/banteg/takopi/pull/85)
 - refactor telegram boundary types [#90](https://github.com/banteg/takopi/pull/90)
+- allow configuring the telegram voice transcription model for local whisper servers

 ### docs

 - add tips section to user guide
 - rework readme
+- document OPENAI_BASE_URL and model overrides for telegram voice transcription

 ## v0.15.0 (2026-01-11)

@@ -28,12 +28,18 @@ Configuration (under `[transports.telegram]`):

 ```toml
 voice_transcription = true
+voice_transcription_model = "gpt-4o-mini-transcribe" # optional
 ```

 Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key
 is missing or the audio download fails, takopi replies with a short error and skips
 the run.

+To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` (for
+example, `http://localhost:8000/v1`). `OPENAI_API_KEY` must still be set; use a
+dummy value if your server ignores it. If your server requires a specific model
+name, set `voice_transcription_model` (for example, `whisper-1`).
+
 ## Forum topics (optional)

 Takopi can bind Telegram forum topics to a project/branch and persist resume tokens
@@ -324,10 +324,15 @@ Dictate tasks instead of typing:
 ```toml
 [transports.telegram]
 voice_transcription = true
+voice_transcription_model = "gpt-4o-mini-transcribe" # optional
 ```

 Set `OPENAI_API_KEY` in your environment (uses OpenAI's transcription API with the
-`gpt-4o-mini-transcribe` model).
+`gpt-4o-mini-transcribe` model by default). To use a local OpenAI-compatible
+Whisper server, also set `OPENAI_BASE_URL` (for example,
+`http://localhost:8000/v1`). `OPENAI_API_KEY` must still be set; use a dummy
+value if your server ignores it. If your server requires a specific model name,
+set `voice_transcription_model` accordingly (for example, `whisper-1`).

 When you send a voice note, takopi transcribes it and runs the result as a normal text message. If transcription fails, you'll get an error message and the run is skipped.

@@ -408,6 +413,7 @@ watch_config = true   # hot-reload on config changes (except transport)
 bot_token = "123456789:ABCdefGHIjklMNOpqrsTUVwxyz"
 chat_id = 123456789
 voice_transcription = true
+# voice_transcription_model = "gpt-4o-mini-transcribe"

 [transports.telegram.files]
 enabled = true
@@ -99,6 +99,7 @@ class TelegramTransportSettings(BaseModel):
    chat_id: StrictInt
    voice_transcription: bool = False
    voice_max_bytes: StrictInt = 10 * 1024 * 1024
+    voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe"
    topics: TelegramTopicsSettings = Field(default_factory=TelegramTopicsSettings)
    files: TelegramFilesSettings = Field(default_factory=TelegramFilesSettings)

@@ -115,6 +115,7 @@ class TelegramBackend(TransportBackend):
            exec_cfg=exec_cfg,
            voice_transcription=settings.voice_transcription,
            voice_max_bytes=int(settings.voice_max_bytes),
+            voice_transcription_model=settings.voice_transcription_model,
            topics=settings.topics,
            files=settings.files,
        )
@@ -97,6 +97,7 @@ class TelegramBridgeConfig:
    exec_cfg: ExecBridgeConfig
    voice_transcription: bool = False
    voice_max_bytes: int = 10 * 1024 * 1024
+    voice_transcription_model: str = "gpt-4o-mini-transcribe"
    files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings)
    chat_ids: tuple[int, ...] | None = None
    topics: TelegramTopicsSettings = field(default_factory=TelegramTopicsSettings)
@@ -477,6 +477,7 @@ async def run_main_loop(
                        bot=cfg.bot,
                        msg=msg,
                        enabled=cfg.voice_transcription,
+                        model=cfg.voice_transcription_model,
                        max_bytes=cfg.voice_max_bytes,
                        reply=reply,
                    )
@@ -13,7 +13,6 @@ logger = get_logger(__name__)

 __all__ = ["transcribe_voice"]

-OPENAI_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe"
 VOICE_TRANSCRIPTION_DISABLED_HINT = (
    "voice transcription is disabled. enable it in config:\n"
    "```toml\n"
@@ -28,6 +27,7 @@ async def transcribe_voice(
    bot: BotClient,
    msg: TelegramIncomingMessage,
    enabled: bool,
+    model: str,
    max_bytes: int | None = None,
    reply: Callable[..., Awaitable[None]],
 ) -> str | None:
@@ -60,7 +60,7 @@ async def transcribe_voice(
    async with AsyncOpenAI(timeout=120) as client:
        try:
            response = await client.audio.transcriptions.create(
-                model=OPENAI_TRANSCRIPTION_MODEL,
+                model=model,
                file=audio_file,
            )
        except OpenAIError as exc:
@@ -133,6 +133,7 @@ def test_telegram_backend_build_and_run_wires_config(
        chat_id=321,
        voice_transcription=True,
        voice_max_bytes=1234,
+        voice_transcription_model="whisper-1",
        files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]),
        topics=TelegramTopicsSettings(enabled=True, scope="main"),
    )
@@ -150,6 +151,7 @@ def test_telegram_backend_build_and_run_wires_config(
    assert cfg.chat_id == 321
    assert cfg.voice_transcription is True
    assert cfg.voice_max_bytes == 1234
+    assert cfg.voice_transcription_model == "whisper-1"
    assert cfg.files.enabled is True
    assert cfg.files.allowed_user_ids == [1, 2]
    assert cfg.topics.enabled is True
@@ -188,6 +188,7 @@ async def test_transcribe_voice_handles_missing_file() -> None:
        bot=bot,
        msg=_voice_message(),
        enabled=True,
+        model="whisper-1",
        reply=reply,
    )

@@ -207,6 +208,7 @@ async def test_transcribe_voice_handles_missing_download() -> None:
        bot=bot,
        msg=_voice_message(),
        enabled=True,
+        model="whisper-1",
        reply=reply,
    )

@@ -235,6 +237,7 @@ async def test_transcribe_voice_rejects_large_voice_without_downloading() -> Non
        bot=bot,
        msg=_voice_message(file_size=10_000),
        enabled=True,
+        model="whisper-1",
        max_bytes=100,
        reply=reply,
    )