From 04671593aa91c0df0fe7d8c2f2fcd606dc6f2930 Mon Sep 17 00:00:00 2001 From: banteg <4562643+banteg@users.noreply.github.com> Date: Mon, 12 Jan 2026 17:35:21 +0400 Subject: [PATCH] feat(telegram): add voice transcription model override (#98) --- changelog.md | 2 ++ docs/transports/telegram.md | 6 ++++++ docs/user-guide.md | 8 +++++++- src/takopi/settings.py | 1 + src/takopi/telegram/backend.py | 1 + src/takopi/telegram/bridge.py | 1 + src/takopi/telegram/loop.py | 1 + src/takopi/telegram/voice.py | 4 ++-- tests/test_telegram_backend.py | 2 ++ tests/test_telegram_voice.py | 3 +++ 10 files changed, 26 insertions(+), 3 deletions(-) diff --git a/changelog.md b/changelog.md index f014523..2c193ac 100644 --- a/changelog.md +++ b/changelog.md @@ -10,11 +10,13 @@ - simplify runtime, config, and telegram internals [#85](https://github.com/banteg/takopi/pull/85) - refactor telegram boundary types [#90](https://github.com/banteg/takopi/pull/90) +- allow configuring the telegram voice transcription model for local whisper servers ### docs - add tips section to user guide - rework readme +- document OPENAI_BASE_URL and model overrides for telegram voice transcription ## v0.15.0 (2026-01-11) diff --git a/docs/transports/telegram.md b/docs/transports/telegram.md index bb26b48..6b9a406 100644 --- a/docs/transports/telegram.md +++ b/docs/transports/telegram.md @@ -28,12 +28,18 @@ Configuration (under `[transports.telegram]`): ```toml voice_transcription = true +voice_transcription_model = "gpt-4o-mini-transcribe" # optional ``` Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key is missing or the audio download fails, takopi replies with a short error and skips the run. +To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` (for +example, `http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server +ignores it. If your server requires a specific model name, set +`voice_transcription_model` (for example, `whisper-1`). + ## Forum topics (optional) Takopi can bind Telegram forum topics to a project/branch and persist resume tokens diff --git a/docs/user-guide.md b/docs/user-guide.md index f92a47f..94b9ebd 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -324,10 +324,15 @@ Dictate tasks instead of typing: ```toml [transports.telegram] voice_transcription = true +voice_transcription_model = "gpt-4o-mini-transcribe" # optional ``` Set `OPENAI_API_KEY` in your environment (uses OpenAI's transcription API with the -`gpt-4o-mini-transcribe` model). +`gpt-4o-mini-transcribe` model by default). To use a local OpenAI-compatible +Whisper server, also set `OPENAI_BASE_URL` (for example, +`http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server ignores it. +If your server requires a specific model name, set `voice_transcription_model` +accordingly (for example, `whisper-1`). When you send a voice note, takopi transcribes it and runs the result as a normal text message. If transcription fails, you'll get an error message and the run is skipped. @@ -408,6 +413,7 @@ watch_config = true # hot-reload on config changes (except transport) bot_token = "123456789:ABCdefGHIjklMNOpqrsTUVwxyz" chat_id = 123456789 voice_transcription = true +# voice_transcription_model = "gpt-4o-mini-transcribe" [transports.telegram.files] enabled = true diff --git a/src/takopi/settings.py b/src/takopi/settings.py index ea8ac1a..bcc0275 100644 --- a/src/takopi/settings.py +++ b/src/takopi/settings.py @@ -99,6 +99,7 @@ class TelegramTransportSettings(BaseModel): chat_id: StrictInt voice_transcription: bool = False voice_max_bytes: StrictInt = 10 * 1024 * 1024 + voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe" topics: TelegramTopicsSettings = Field(default_factory=TelegramTopicsSettings) files: TelegramFilesSettings = Field(default_factory=TelegramFilesSettings) diff --git a/src/takopi/telegram/backend.py b/src/takopi/telegram/backend.py index 1c748ef..1ade6b6 100644 --- a/src/takopi/telegram/backend.py +++ b/src/takopi/telegram/backend.py @@ -115,6 +115,7 @@ class TelegramBackend(TransportBackend): exec_cfg=exec_cfg, voice_transcription=settings.voice_transcription, voice_max_bytes=int(settings.voice_max_bytes), + voice_transcription_model=settings.voice_transcription_model, topics=settings.topics, files=settings.files, ) diff --git a/src/takopi/telegram/bridge.py b/src/takopi/telegram/bridge.py index e904be9..26961f0 100644 --- a/src/takopi/telegram/bridge.py +++ b/src/takopi/telegram/bridge.py @@ -97,6 +97,7 @@ class TelegramBridgeConfig: exec_cfg: ExecBridgeConfig voice_transcription: bool = False voice_max_bytes: int = 10 * 1024 * 1024 + voice_transcription_model: str = "gpt-4o-mini-transcribe" files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings) chat_ids: tuple[int, ...] | None = None topics: TelegramTopicsSettings = field(default_factory=TelegramTopicsSettings) diff --git a/src/takopi/telegram/loop.py b/src/takopi/telegram/loop.py index 1cd9e57..67bbd61 100644 --- a/src/takopi/telegram/loop.py +++ b/src/takopi/telegram/loop.py @@ -477,6 +477,7 @@ async def run_main_loop( bot=cfg.bot, msg=msg, enabled=cfg.voice_transcription, + model=cfg.voice_transcription_model, max_bytes=cfg.voice_max_bytes, reply=reply, ) diff --git a/src/takopi/telegram/voice.py b/src/takopi/telegram/voice.py index 70ffea7..3439b3f 100644 --- a/src/takopi/telegram/voice.py +++ b/src/takopi/telegram/voice.py @@ -13,7 +13,6 @@ logger = get_logger(__name__) __all__ = ["transcribe_voice"] -OPENAI_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe" VOICE_TRANSCRIPTION_DISABLED_HINT = ( "voice transcription is disabled. enable it in config:\n" "```toml\n" @@ -28,6 +27,7 @@ async def transcribe_voice( bot: BotClient, msg: TelegramIncomingMessage, enabled: bool, + model: str, max_bytes: int | None = None, reply: Callable[..., Awaitable[None]], ) -> str | None: @@ -60,7 +60,7 @@ async def transcribe_voice( async with AsyncOpenAI(timeout=120) as client: try: response = await client.audio.transcriptions.create( - model=OPENAI_TRANSCRIPTION_MODEL, + model=model, file=audio_file, ) except OpenAIError as exc: diff --git a/tests/test_telegram_backend.py b/tests/test_telegram_backend.py index 87b3ce2..e3cb602 100644 --- a/tests/test_telegram_backend.py +++ b/tests/test_telegram_backend.py @@ -133,6 +133,7 @@ def test_telegram_backend_build_and_run_wires_config( chat_id=321, voice_transcription=True, voice_max_bytes=1234, + voice_transcription_model="whisper-1", files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]), topics=TelegramTopicsSettings(enabled=True, scope="main"), ) @@ -150,6 +151,7 @@ def test_telegram_backend_build_and_run_wires_config( assert cfg.chat_id == 321 assert cfg.voice_transcription is True assert cfg.voice_max_bytes == 1234 + assert cfg.voice_transcription_model == "whisper-1" assert cfg.files.enabled is True assert cfg.files.allowed_user_ids == [1, 2] assert cfg.topics.enabled is True diff --git a/tests/test_telegram_voice.py b/tests/test_telegram_voice.py index 2dc64bb..be88192 100644 --- a/tests/test_telegram_voice.py +++ b/tests/test_telegram_voice.py @@ -188,6 +188,7 @@ async def test_transcribe_voice_handles_missing_file() -> None: bot=bot, msg=_voice_message(), enabled=True, + model="whisper-1", reply=reply, ) @@ -207,6 +208,7 @@ async def test_transcribe_voice_handles_missing_download() -> None: bot=bot, msg=_voice_message(), enabled=True, + model="whisper-1", reply=reply, ) @@ -235,6 +237,7 @@ async def test_transcribe_voice_rejects_large_voice_without_downloading() -> Non bot=bot, msg=_voice_message(file_size=10_000), enabled=True, + model="whisper-1", max_bytes=100, reply=reply, )