From 04671593aa91c0df0fe7d8c2f2fcd606dc6f2930 Mon Sep 17 00:00:00 2001
From: banteg <4562643+banteg@users.noreply.github.com>
Date: Mon, 12 Jan 2026 17:35:21 +0400
Subject: [PATCH] feat(telegram): add voice transcription model override (#98)

---
 changelog.md                   | 2 ++
 docs/transports/telegram.md    | 6 ++++++
 docs/user-guide.md             | 8 +++++++-
 src/takopi/settings.py         | 1 +
 src/takopi/telegram/backend.py | 1 +
 src/takopi/telegram/bridge.py  | 1 +
 src/takopi/telegram/loop.py    | 1 +
 src/takopi/telegram/voice.py   | 4 ++--
 tests/test_telegram_backend.py | 2 ++
 tests/test_telegram_voice.py   | 3 +++
 10 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/changelog.md b/changelog.md
index f014523..2c193ac 100644
--- a/changelog.md
+++ b/changelog.md
@@ -10,11 +10,13 @@
 
 - simplify runtime, config, and telegram internals [#85](https://github.com/banteg/takopi/pull/85)
 - refactor telegram boundary types [#90](https://github.com/banteg/takopi/pull/90)
+- allow configuring the telegram voice transcription model for local whisper servers [#98](https://github.com/banteg/takopi/pull/98)
 
 ### docs
 
 - add tips section to user guide
 - rework readme
+- document OPENAI_BASE_URL and model overrides for telegram voice transcription
 
 ## v0.15.0 (2026-01-11)
 
diff --git a/docs/transports/telegram.md b/docs/transports/telegram.md
index bb26b48..6b9a406 100644
--- a/docs/transports/telegram.md
+++ b/docs/transports/telegram.md
@@ -28,12 +28,18 @@ Configuration (under `[transports.telegram]`):
 
 ```toml
 voice_transcription = true
+voice_transcription_model = "gpt-4o-mini-transcribe" # optional
 ```
 
 Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key
 is missing or the audio download fails, takopi replies with a short error and skips
 the run.
 
+To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` (for
+example, `http://localhost:8000/v1`). `OPENAI_API_KEY` must still be set; use any
+placeholder value if your server ignores it. If your server requires a specific
+model name, set `voice_transcription_model` (for example, `whisper-1`).
+
 ## Forum topics (optional)
 
 Takopi can bind Telegram forum topics to a project/branch and persist resume tokens
diff --git a/docs/user-guide.md b/docs/user-guide.md
index f92a47f..94b9ebd 100644
--- a/docs/user-guide.md
+++ b/docs/user-guide.md
@@ -324,10 +324,15 @@ Dictate tasks instead of typing:
 ```toml
 [transports.telegram]
 voice_transcription = true
+voice_transcription_model = "gpt-4o-mini-transcribe" # optional
 ```
 
 Set `OPENAI_API_KEY` in your environment (uses OpenAI's transcription API with the
-`gpt-4o-mini-transcribe` model).
+`gpt-4o-mini-transcribe` model by default). To use a local OpenAI-compatible
+Whisper server, also set `OPENAI_BASE_URL` (for example,
+`http://localhost:8000/v1`); `OPENAI_API_KEY` must still be set, so use any
+placeholder value if your server ignores it. If your server requires a specific
+model name, set `voice_transcription_model` accordingly (for example, `whisper-1`).
 
 When you send a voice note, takopi transcribes it and runs the result as a normal text message. If transcription fails, you'll get an error message and the run is skipped.
 
@@ -408,6 +413,7 @@ watch_config = true   # hot-reload on config changes (except transport)
 bot_token = "123456789:ABCdefGHIjklMNOpqrsTUVwxyz"
 chat_id = 123456789
 voice_transcription = true
+# voice_transcription_model = "gpt-4o-mini-transcribe"
 
 [transports.telegram.files]
 enabled = true
diff --git a/src/takopi/settings.py b/src/takopi/settings.py
index ea8ac1a..bcc0275 100644
--- a/src/takopi/settings.py
+++ b/src/takopi/settings.py
@@ -99,6 +99,7 @@ class TelegramTransportSettings(BaseModel):
     chat_id: StrictInt
     voice_transcription: bool = False
     voice_max_bytes: StrictInt = 10 * 1024 * 1024
+    voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe"
     topics: TelegramTopicsSettings = Field(default_factory=TelegramTopicsSettings)
     files: TelegramFilesSettings = Field(default_factory=TelegramFilesSettings)
 
diff --git a/src/takopi/telegram/backend.py b/src/takopi/telegram/backend.py
index 1c748ef..1ade6b6 100644
--- a/src/takopi/telegram/backend.py
+++ b/src/takopi/telegram/backend.py
@@ -115,6 +115,7 @@ class TelegramBackend(TransportBackend):
             exec_cfg=exec_cfg,
             voice_transcription=settings.voice_transcription,
             voice_max_bytes=int(settings.voice_max_bytes),
+            voice_transcription_model=settings.voice_transcription_model,
             topics=settings.topics,
             files=settings.files,
         )
diff --git a/src/takopi/telegram/bridge.py b/src/takopi/telegram/bridge.py
index e904be9..26961f0 100644
--- a/src/takopi/telegram/bridge.py
+++ b/src/takopi/telegram/bridge.py
@@ -97,6 +97,7 @@ class TelegramBridgeConfig:
     exec_cfg: ExecBridgeConfig
     voice_transcription: bool = False
     voice_max_bytes: int = 10 * 1024 * 1024
+    voice_transcription_model: str = "gpt-4o-mini-transcribe"
     files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings)
     chat_ids: tuple[int, ...] | None = None
     topics: TelegramTopicsSettings = field(default_factory=TelegramTopicsSettings)
diff --git a/src/takopi/telegram/loop.py b/src/takopi/telegram/loop.py
index 1cd9e57..67bbd61 100644
--- a/src/takopi/telegram/loop.py
+++ b/src/takopi/telegram/loop.py
@@ -477,6 +477,7 @@ async def run_main_loop(
                         bot=cfg.bot,
                         msg=msg,
                         enabled=cfg.voice_transcription,
+                        model=cfg.voice_transcription_model,
                         max_bytes=cfg.voice_max_bytes,
                         reply=reply,
                     )
diff --git a/src/takopi/telegram/voice.py b/src/takopi/telegram/voice.py
index 70ffea7..3439b3f 100644
--- a/src/takopi/telegram/voice.py
+++ b/src/takopi/telegram/voice.py
@@ -13,7 +13,6 @@ logger = get_logger(__name__)
 
 __all__ = ["transcribe_voice"]
 
-OPENAI_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe"
 VOICE_TRANSCRIPTION_DISABLED_HINT = (
     "voice transcription is disabled. enable it in config:\n"
     "```toml\n"
@@ -28,6 +27,7 @@ async def transcribe_voice(
     bot: BotClient,
     msg: TelegramIncomingMessage,
     enabled: bool,
+    model: str,
     max_bytes: int | None = None,
     reply: Callable[..., Awaitable[None]],
 ) -> str | None:
@@ -60,7 +60,7 @@ async def transcribe_voice(
     async with AsyncOpenAI(timeout=120) as client:
         try:
             response = await client.audio.transcriptions.create(
-                model=OPENAI_TRANSCRIPTION_MODEL,
+                model=model,
                 file=audio_file,
             )
         except OpenAIError as exc:
diff --git a/tests/test_telegram_backend.py b/tests/test_telegram_backend.py
index 87b3ce2..e3cb602 100644
--- a/tests/test_telegram_backend.py
+++ b/tests/test_telegram_backend.py
@@ -133,6 +133,7 @@ def test_telegram_backend_build_and_run_wires_config(
         chat_id=321,
         voice_transcription=True,
         voice_max_bytes=1234,
+        voice_transcription_model="whisper-1",
         files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]),
         topics=TelegramTopicsSettings(enabled=True, scope="main"),
     )
@@ -150,6 +151,7 @@ def test_telegram_backend_build_and_run_wires_config(
     assert cfg.chat_id == 321
     assert cfg.voice_transcription is True
     assert cfg.voice_max_bytes == 1234
+    assert cfg.voice_transcription_model == "whisper-1"
     assert cfg.files.enabled is True
     assert cfg.files.allowed_user_ids == [1, 2]
     assert cfg.topics.enabled is True
diff --git a/tests/test_telegram_voice.py b/tests/test_telegram_voice.py
index 2dc64bb..be88192 100644
--- a/tests/test_telegram_voice.py
+++ b/tests/test_telegram_voice.py
@@ -188,6 +188,7 @@ async def test_transcribe_voice_handles_missing_file() -> None:
         bot=bot,
         msg=_voice_message(),
         enabled=True,
+        model="whisper-1",
         reply=reply,
     )
 
@@ -207,6 +208,7 @@ async def test_transcribe_voice_handles_missing_download() -> None:
         bot=bot,
         msg=_voice_message(),
         enabled=True,
+        model="whisper-1",
         reply=reply,
     )
 
@@ -235,6 +237,7 @@ async def test_transcribe_voice_rejects_large_voice_without_downloading() -> Non
         bot=bot,
         msg=_voice_message(file_size=10_000),
         enabled=True,
+        model="whisper-1",
         max_bytes=100,
         reply=reply,
     )