feat(telegram): add voice transcription model override (#98)

This commit is contained in:
banteg
2026-01-12 17:35:21 +04:00
committed by GitHub
parent 2f05b46465
commit 04671593aa
10 changed files with 26 additions and 3 deletions
+2
View File
@@ -10,11 +10,13 @@
- simplify runtime, config, and telegram internals [#85](https://github.com/banteg/takopi/pull/85)
- refactor telegram boundary types [#90](https://github.com/banteg/takopi/pull/90)
- allow configuring the telegram voice transcription model for local whisper servers
### docs
- add tips section to user guide
- rework readme
- document OPENAI_BASE_URL and model overrides for telegram voice transcription
## v0.15.0 (2026-01-11)
+6
View File
@@ -28,12 +28,18 @@ Configuration (under `[transports.telegram]`):
```toml
voice_transcription = true
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
```
Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key
is missing or the audio download fails, takopi replies with a short error and skips
the run.
To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` (for
example, `http://localhost:8000/v1`). `OPENAI_API_KEY` must still be set; if your
server ignores it, any dummy value works. If your server requires a specific model
name, set `voice_transcription_model` (for example, `whisper-1`).
## Forum topics (optional)
Takopi can bind Telegram forum topics to a project/branch and persist resume tokens
+7 -1
View File
@@ -324,10 +324,15 @@ Dictate tasks instead of typing:
```toml
[transports.telegram]
voice_transcription = true
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
```
Set `OPENAI_API_KEY` in your environment (uses OpenAI's transcription API with the
`gpt-4o-mini-transcribe` model).
`gpt-4o-mini-transcribe` model by default). To use a local OpenAI-compatible
Whisper server, also set `OPENAI_BASE_URL` (for example,
`http://localhost:8000/v1`); `OPENAI_API_KEY` must still be set, but any dummy
value works if your server ignores it. If your server requires a specific model
name, set `voice_transcription_model` accordingly (for example, `whisper-1`).
When you send a voice note, takopi transcribes it and runs the result as a normal text message. If transcription fails, you'll get an error message and the run is skipped.
@@ -408,6 +413,7 @@ watch_config = true # hot-reload on config changes (except transport)
bot_token = "123456789:ABCdefGHIjklMNOpqrsTUVwxyz"
chat_id = 123456789
voice_transcription = true
# voice_transcription_model = "gpt-4o-mini-transcribe"
[transports.telegram.files]
enabled = true
+1
View File
@@ -99,6 +99,7 @@ class TelegramTransportSettings(BaseModel):
chat_id: StrictInt
voice_transcription: bool = False
voice_max_bytes: StrictInt = 10 * 1024 * 1024
voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe"
topics: TelegramTopicsSettings = Field(default_factory=TelegramTopicsSettings)
files: TelegramFilesSettings = Field(default_factory=TelegramFilesSettings)
+1
View File
@@ -115,6 +115,7 @@ class TelegramBackend(TransportBackend):
exec_cfg=exec_cfg,
voice_transcription=settings.voice_transcription,
voice_max_bytes=int(settings.voice_max_bytes),
voice_transcription_model=settings.voice_transcription_model,
topics=settings.topics,
files=settings.files,
)
+1
View File
@@ -97,6 +97,7 @@ class TelegramBridgeConfig:
exec_cfg: ExecBridgeConfig
voice_transcription: bool = False
voice_max_bytes: int = 10 * 1024 * 1024
voice_transcription_model: str = "gpt-4o-mini-transcribe"
files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings)
chat_ids: tuple[int, ...] | None = None
topics: TelegramTopicsSettings = field(default_factory=TelegramTopicsSettings)
+1
View File
@@ -477,6 +477,7 @@ async def run_main_loop(
bot=cfg.bot,
msg=msg,
enabled=cfg.voice_transcription,
model=cfg.voice_transcription_model,
max_bytes=cfg.voice_max_bytes,
reply=reply,
)
+2 -2
View File
@@ -13,7 +13,6 @@ logger = get_logger(__name__)
__all__ = ["transcribe_voice"]
OPENAI_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe"
VOICE_TRANSCRIPTION_DISABLED_HINT = (
"voice transcription is disabled. enable it in config:\n"
"```toml\n"
@@ -28,6 +27,7 @@ async def transcribe_voice(
bot: BotClient,
msg: TelegramIncomingMessage,
enabled: bool,
model: str,
max_bytes: int | None = None,
reply: Callable[..., Awaitable[None]],
) -> str | None:
@@ -60,7 +60,7 @@ async def transcribe_voice(
async with AsyncOpenAI(timeout=120) as client:
try:
response = await client.audio.transcriptions.create(
model=OPENAI_TRANSCRIPTION_MODEL,
model=model,
file=audio_file,
)
except OpenAIError as exc:
+2
View File
@@ -133,6 +133,7 @@ def test_telegram_backend_build_and_run_wires_config(
chat_id=321,
voice_transcription=True,
voice_max_bytes=1234,
voice_transcription_model="whisper-1",
files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]),
topics=TelegramTopicsSettings(enabled=True, scope="main"),
)
@@ -150,6 +151,7 @@ def test_telegram_backend_build_and_run_wires_config(
assert cfg.chat_id == 321
assert cfg.voice_transcription is True
assert cfg.voice_max_bytes == 1234
assert cfg.voice_transcription_model == "whisper-1"
assert cfg.files.enabled is True
assert cfg.files.allowed_user_ids == [1, 2]
assert cfg.topics.enabled is True
+3
View File
@@ -188,6 +188,7 @@ async def test_transcribe_voice_handles_missing_file() -> None:
bot=bot,
msg=_voice_message(),
enabled=True,
model="whisper-1",
reply=reply,
)
@@ -207,6 +208,7 @@ async def test_transcribe_voice_handles_missing_download() -> None:
bot=bot,
msg=_voice_message(),
enabled=True,
model="whisper-1",
reply=reply,
)
@@ -235,6 +237,7 @@ async def test_transcribe_voice_rejects_large_voice_without_downloading() -> Non
bot=bot,
msg=_voice_message(file_size=10_000),
enabled=True,
model="whisper-1",
max_bytes=100,
reply=reply,
)