fix(telegram): separate voice transcription client (#166)

This commit is contained in:
codyw912
2026-01-17 14:50:08 -05:00
committed by GitHub
parent 9d7c6fcd8c
commit ee365d76ff
12 changed files with 83 additions and 15 deletions
+12 -4
View File
@@ -9,6 +9,10 @@ Enable transcription so voice notes become normal text runs.
```sh
takopi config set transports.telegram.voice_transcription true
takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe"
# local OpenAI-compatible transcription server (optional)
takopi config set transports.telegram.voice_transcription_base_url "http://localhost:8000/v1"
takopi config set transports.telegram.voice_transcription_api_key "local"
```
=== "toml"
@@ -17,13 +21,17 @@ Enable transcription so voice notes become normal text runs.
[transports.telegram]
voice_transcription = true
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
voice_transcription_base_url = "http://localhost:8000/v1" # optional
voice_transcription_api_key = "local" # optional
```
Set `OPENAI_API_KEY` in your environment (uses OpenAI's transcription API).
Set `OPENAI_API_KEY` in your environment (or `voice_transcription_api_key` in config).
To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL`
(for example, `http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server ignores it.
If your server requires a specific model name, set `voice_transcription_model` (for example, `whisper-1`).
To use a local OpenAI-compatible Whisper server, set `voice_transcription_base_url`
(and `voice_transcription_api_key` if the server expects one). These settings apply
to voice transcription only, so engine requests keep their own base URL and you do
not need to set `OPENAI_BASE_URL`. If your server requires a specific model name,
set `voice_transcription_model` (for example, `whisper-1`).
## Behavior
+2
View File
@@ -51,6 +51,8 @@ If you expect to edit config while Takopi is running, set:
| `voice_transcription` | bool | `false` | Enable voice note transcription. |
| `voice_max_bytes` | int | `10485760` | Max voice note size (bytes). |
| `voice_transcription_model` | string | `"gpt-4o-mini-transcribe"` | OpenAI transcription model name. |
| `voice_transcription_base_url` | string\|null | `null` | Override base URL for voice transcription only. |
| `voice_transcription_api_key` | string\|null | `null` | Override API key for voice transcription only. |
| `session_mode` | `"stateless"`\|`"chat"` | `"stateless"` | Auto-resume mode. Onboarding sets `"chat"` for assistant/workspace. |
| `show_resume_line` | bool | `true` | Show resume line in message footer. Onboarding sets `false` for assistant/workspace. |
+14 -7
View File
@@ -33,6 +33,10 @@ Configuration (under `[transports.telegram]`):
```sh
takopi config set transports.telegram.voice_transcription true
takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe"
# local OpenAI-compatible transcription server (optional)
takopi config set transports.telegram.voice_transcription_base_url "http://localhost:8000/v1"
takopi config set transports.telegram.voice_transcription_api_key "local"
```
=== "toml"
@@ -40,16 +44,19 @@ Configuration (under `[transports.telegram]`):
```toml
voice_transcription = true
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
voice_transcription_base_url = "http://localhost:8000/v1" # optional
voice_transcription_api_key = "local" # optional
```
Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key
is missing or the audio download fails, takopi replies with a short error and skips
the run.
Set `OPENAI_API_KEY` in the environment (or `voice_transcription_api_key` in config).
If transcription is enabled but no API key is available or the audio download fails,
takopi replies with a short error and skips the run.
To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` (for
example, `http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server
ignores it. If your server requires a specific model name, set
`voice_transcription_model` (for example, `whisper-1`).
To use a local OpenAI-compatible Whisper server, set `voice_transcription_base_url`
(and `voice_transcription_api_key` if the server expects one). These settings apply
to voice transcription only, so engine requests keep their own base URL and you do
not need to set `OPENAI_BASE_URL`. If your server requires a specific model name,
set `voice_transcription_model` (for example, `whisper-1`).
### Trigger mode (mentions-only)
+6 -1
View File
@@ -47,9 +47,14 @@ def _doctor_file_checks(settings: TakopiSettings) -> list[DoctorCheck]:
def _doctor_voice_checks(settings: TakopiSettings) -> list[DoctorCheck]:
    """Report whether voice transcription has an API key available.

    Checks are evaluated in this order and the first match wins:

    1. Transcription disabled in the Telegram transport settings -> ``ok``.
    2. ``voice_transcription_api_key`` set in the Telegram settings -> ``ok``.
    3. ``OPENAI_API_KEY`` present (and non-empty) in the environment -> ``ok``.
    4. Otherwise -> ``error`` with the detail ``"API key not set"``.

    Args:
        settings: Loaded settings; only ``settings.transports.telegram`` is read.

    Returns:
        A single-element list holding the "voice transcription" check result.
    """
    telegram = settings.transports.telegram
    if not telegram.voice_transcription:
        return [DoctorCheck("voice transcription", "ok", "disabled")]

    # The transport-specific key is looked at before the environment variable.
    if telegram.voice_transcription_api_key:
        return [
            DoctorCheck("voice transcription", "ok", "voice_transcription_api_key set")
        ]

    if os.environ.get("OPENAI_API_KEY"):
        return [DoctorCheck("voice transcription", "ok", "OPENAI_API_KEY set")]

    # Neither source supplied a key, so the message names no single variable.
    return [DoctorCheck("voice transcription", "error", "API key not set")]
async def _doctor_telegram_checks(
+2
View File
@@ -98,6 +98,8 @@ class TelegramTransportSettings(BaseModel):
voice_transcription: bool = False
voice_max_bytes: StrictInt = 10 * 1024 * 1024
voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe"
voice_transcription_base_url: NonEmptyStr | None = None
voice_transcription_api_key: NonEmptyStr | None = None
session_mode: Literal["stateless", "chat"] = "stateless"
show_resume_line: bool = True
forward_coalesce_s: float = Field(default=1.0, ge=0)
+2
View File
@@ -138,6 +138,8 @@ class TelegramBackend(TransportBackend):
voice_transcription=settings.voice_transcription,
voice_max_bytes=int(settings.voice_max_bytes),
voice_transcription_model=settings.voice_transcription_model,
voice_transcription_base_url=settings.voice_transcription_base_url,
voice_transcription_api_key=settings.voice_transcription_api_key,
forward_coalesce_s=settings.forward_coalesce_s,
media_group_debounce_s=settings.media_group_debounce_s,
topics=settings.topics,
+2
View File
@@ -124,6 +124,8 @@ class TelegramBridgeConfig:
voice_transcription: bool = False
voice_max_bytes: int = 10 * 1024 * 1024
voice_transcription_model: str = "gpt-4o-mini-transcribe"
voice_transcription_base_url: str | None = None
voice_transcription_api_key: str | None = None
forward_coalesce_s: float = 1.0
media_group_debounce_s: float = 1.0
files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings)
+2
View File
@@ -1663,6 +1663,8 @@ async def run_main_loop(
model=cfg.voice_transcription_model,
max_bytes=cfg.voice_max_bytes,
reply=reply,
base_url=cfg.voice_transcription_base_url,
api_key=cfg.voice_transcription_api_key,
)
if text is None:
return
+17 -2
View File
@@ -28,10 +28,23 @@ class VoiceTranscriber(Protocol):
class OpenAIVoiceTranscriber:
def __init__(
self,
*,
base_url: str | None = None,
api_key: str | None = None,
) -> None:
self._base_url = base_url
self._api_key = api_key
async def transcribe(self, *, model: str, audio_bytes: bytes) -> str:
audio_file = io.BytesIO(audio_bytes)
audio_file.name = "voice.ogg"
async with AsyncOpenAI(timeout=120) as client:
async with AsyncOpenAI(
base_url=self._base_url,
api_key=self._api_key,
timeout=120,
) as client:
response = await client.audio.transcriptions.create(
model=model,
file=audio_file,
@@ -48,6 +61,8 @@ async def transcribe_voice(
max_bytes: int | None = None,
reply: Callable[..., Awaitable[None]],
transcriber: VoiceTranscriber | None = None,
base_url: str | None = None,
api_key: str | None = None,
) -> str | None:
voice = msg.voice
if voice is None:
@@ -74,7 +89,7 @@ async def transcribe_voice(
await reply(text="voice message is too large to transcribe.")
return None
if transcriber is None:
transcriber = OpenAIVoiceTranscriber()
transcriber = OpenAIVoiceTranscriber(base_url=base_url, api_key=api_key)
try:
return await transcriber.transcribe(model=model, audio_bytes=audio_bytes)
except OpenAIError as exc:
+17
View File
@@ -135,6 +135,23 @@ def test_doctor_voice_checks(monkeypatch) -> None:
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
checks = cli._doctor_voice_checks(settings)
assert checks[0].status == "error"
assert checks[0].detail == "API key not set"
settings_with_key = _settings(
{
"transports": {
"telegram": {
"bot_token": "token",
"chat_id": 1,
"voice_transcription": True,
"voice_transcription_api_key": "local",
}
}
}
)
checks = cli._doctor_voice_checks(settings_with_key)
assert checks[0].status == "ok"
assert checks[0].detail == "voice_transcription_api_key set"
monkeypatch.setenv("OPENAI_API_KEY", "key")
checks = cli._doctor_voice_checks(settings)
+4
View File
@@ -143,6 +143,8 @@ def test_telegram_backend_build_and_run_wires_config(
voice_transcription=True,
voice_max_bytes=1234,
voice_transcription_model="whisper-1",
voice_transcription_base_url="http://localhost:8000/v1",
voice_transcription_api_key="local",
files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]),
topics=TelegramTopicsSettings(enabled=True, scope="main"),
)
@@ -161,6 +163,8 @@ def test_telegram_backend_build_and_run_wires_config(
assert cfg.voice_transcription is True
assert cfg.voice_max_bytes == 1234
assert cfg.voice_transcription_model == "whisper-1"
assert cfg.voice_transcription_base_url == "http://localhost:8000/v1"
assert cfg.voice_transcription_api_key == "local"
assert cfg.files.enabled is True
assert cfg.files.allowed_user_ids == [1, 2]
assert cfg.topics.enabled is True
+3 -1
View File
@@ -1954,8 +1954,10 @@ async def test_run_main_loop_voice_transcript_preserves_directive(
model: str,
max_bytes: int | None = None,
reply,
base_url: str | None = None,
api_key: str | None = None,
) -> str:
_ = bot, msg, enabled, model, max_bytes, reply
_ = bot, msg, enabled, model, max_bytes, reply, base_url, api_key
return "/codex do thing"
monkeypatch.setattr(telegram_loop, "transcribe_voice", _fake_transcribe)