feat(telegram): add voice transcription model override (#98)
This commit is contained in:
@@ -10,11 +10,13 @@
|
||||
|
||||
- simplify runtime, config, and telegram internals [#85](https://github.com/banteg/takopi/pull/85)
|
||||
- refactor telegram boundary types [#90](https://github.com/banteg/takopi/pull/90)
|
||||
- allow configuring the telegram voice transcription model for local whisper servers
|
||||
|
||||
### docs
|
||||
|
||||
- add tips section to user guide
|
||||
- rework readme
|
||||
- document OPENAI_BASE_URL and model overrides for telegram voice transcription
|
||||
|
||||
## v0.15.0 (2026-01-11)
|
||||
|
||||
|
||||
@@ -28,12 +28,18 @@ Configuration (under `[transports.telegram]`):
|
||||
|
||||
```toml
|
||||
voice_transcription = true
|
||||
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
|
||||
```
|
||||
|
||||
Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key
|
||||
is missing or the audio download fails, takopi replies with a short error and skips
|
||||
the run.
|
||||
|
||||
To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` (for
|
||||
example, `http://localhost:8000/v1`) and a placeholder `OPENAI_API_KEY` if your server
|
||||
does not check the key (it must still be set). If your server requires a specific model name, set
|
||||
`voice_transcription_model` (for example, `whisper-1`).
|
||||
|
||||
## Forum topics (optional)
|
||||
|
||||
Takopi can bind Telegram forum topics to a project/branch and persist resume tokens
|
||||
|
||||
+7
-1
@@ -324,10 +324,15 @@ Dictate tasks instead of typing:
|
||||
```toml
|
||||
[transports.telegram]
|
||||
voice_transcription = true
|
||||
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
|
||||
```
|
||||
|
||||
Set `OPENAI_API_KEY` in your environment (uses OpenAI's transcription API with the
|
||||
`gpt-4o-mini-transcribe` model).
|
||||
`gpt-4o-mini-transcribe` model by default). To use a local OpenAI-compatible
|
||||
Whisper server, also set `OPENAI_BASE_URL` (for example,
|
||||
`http://localhost:8000/v1`) and a placeholder `OPENAI_API_KEY` if your server does not check the key (it must still be set).
|
||||
If your server requires a specific model name, set `voice_transcription_model`
|
||||
accordingly (for example, `whisper-1`).
|
||||
|
||||
When you send a voice note, takopi transcribes it and runs the result as a normal text message. If transcription fails, you'll get an error message and the run is skipped.
|
||||
|
||||
@@ -408,6 +413,7 @@ watch_config = true # hot-reload on config changes (except transport)
|
||||
bot_token = "123456789:ABCdefGHIjklMNOpqrsTUVwxyz"
|
||||
chat_id = 123456789
|
||||
voice_transcription = true
|
||||
# voice_transcription_model = "gpt-4o-mini-transcribe"
|
||||
|
||||
[transports.telegram.files]
|
||||
enabled = true
|
||||
|
||||
@@ -99,6 +99,7 @@ class TelegramTransportSettings(BaseModel):
|
||||
chat_id: StrictInt
|
||||
voice_transcription: bool = False
|
||||
voice_max_bytes: StrictInt = 10 * 1024 * 1024
|
||||
voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe"
|
||||
topics: TelegramTopicsSettings = Field(default_factory=TelegramTopicsSettings)
|
||||
files: TelegramFilesSettings = Field(default_factory=TelegramFilesSettings)
|
||||
|
||||
|
||||
@@ -115,6 +115,7 @@ class TelegramBackend(TransportBackend):
|
||||
exec_cfg=exec_cfg,
|
||||
voice_transcription=settings.voice_transcription,
|
||||
voice_max_bytes=int(settings.voice_max_bytes),
|
||||
voice_transcription_model=settings.voice_transcription_model,
|
||||
topics=settings.topics,
|
||||
files=settings.files,
|
||||
)
|
||||
|
||||
@@ -97,6 +97,7 @@ class TelegramBridgeConfig:
|
||||
exec_cfg: ExecBridgeConfig
|
||||
voice_transcription: bool = False
|
||||
voice_max_bytes: int = 10 * 1024 * 1024
|
||||
voice_transcription_model: str = "gpt-4o-mini-transcribe"
|
||||
files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings)
|
||||
chat_ids: tuple[int, ...] | None = None
|
||||
topics: TelegramTopicsSettings = field(default_factory=TelegramTopicsSettings)
|
||||
|
||||
@@ -477,6 +477,7 @@ async def run_main_loop(
|
||||
bot=cfg.bot,
|
||||
msg=msg,
|
||||
enabled=cfg.voice_transcription,
|
||||
model=cfg.voice_transcription_model,
|
||||
max_bytes=cfg.voice_max_bytes,
|
||||
reply=reply,
|
||||
)
|
||||
|
||||
@@ -13,7 +13,6 @@ logger = get_logger(__name__)
|
||||
|
||||
__all__ = ["transcribe_voice"]
|
||||
|
||||
OPENAI_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe"
|
||||
VOICE_TRANSCRIPTION_DISABLED_HINT = (
|
||||
"voice transcription is disabled. enable it in config:\n"
|
||||
"```toml\n"
|
||||
@@ -28,6 +27,7 @@ async def transcribe_voice(
|
||||
bot: BotClient,
|
||||
msg: TelegramIncomingMessage,
|
||||
enabled: bool,
|
||||
model: str,
|
||||
max_bytes: int | None = None,
|
||||
reply: Callable[..., Awaitable[None]],
|
||||
) -> str | None:
|
||||
@@ -60,7 +60,7 @@ async def transcribe_voice(
|
||||
async with AsyncOpenAI(timeout=120) as client:
|
||||
try:
|
||||
response = await client.audio.transcriptions.create(
|
||||
model=OPENAI_TRANSCRIPTION_MODEL,
|
||||
model=model,
|
||||
file=audio_file,
|
||||
)
|
||||
except OpenAIError as exc:
|
||||
|
||||
@@ -133,6 +133,7 @@ def test_telegram_backend_build_and_run_wires_config(
|
||||
chat_id=321,
|
||||
voice_transcription=True,
|
||||
voice_max_bytes=1234,
|
||||
voice_transcription_model="whisper-1",
|
||||
files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]),
|
||||
topics=TelegramTopicsSettings(enabled=True, scope="main"),
|
||||
)
|
||||
@@ -150,6 +151,7 @@ def test_telegram_backend_build_and_run_wires_config(
|
||||
assert cfg.chat_id == 321
|
||||
assert cfg.voice_transcription is True
|
||||
assert cfg.voice_max_bytes == 1234
|
||||
assert cfg.voice_transcription_model == "whisper-1"
|
||||
assert cfg.files.enabled is True
|
||||
assert cfg.files.allowed_user_ids == [1, 2]
|
||||
assert cfg.topics.enabled is True
|
||||
|
||||
@@ -188,6 +188,7 @@ async def test_transcribe_voice_handles_missing_file() -> None:
|
||||
bot=bot,
|
||||
msg=_voice_message(),
|
||||
enabled=True,
|
||||
model="whisper-1",
|
||||
reply=reply,
|
||||
)
|
||||
|
||||
@@ -207,6 +208,7 @@ async def test_transcribe_voice_handles_missing_download() -> None:
|
||||
bot=bot,
|
||||
msg=_voice_message(),
|
||||
enabled=True,
|
||||
model="whisper-1",
|
||||
reply=reply,
|
||||
)
|
||||
|
||||
@@ -235,6 +237,7 @@ async def test_transcribe_voice_rejects_large_voice_without_downloading() -> Non
|
||||
bot=bot,
|
||||
msg=_voice_message(file_size=10_000),
|
||||
enabled=True,
|
||||
model="whisper-1",
|
||||
max_bytes=100,
|
||||
reply=reply,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user