From ee365d76ffc29d62fd6ad86fcfdd39fbff00a883 Mon Sep 17 00:00:00 2001 From: codyw912 <32690983+codyw912@users.noreply.github.com> Date: Sat, 17 Jan 2026 14:50:08 -0500 Subject: [PATCH] fix(telegram): separate voice transcription client (#166) --- docs/how-to/voice-notes.md | 16 ++++++++++++---- docs/reference/config.md | 2 ++ docs/reference/transports/telegram.md | 21 ++++++++++++++------- src/takopi/cli/doctor.py | 7 ++++++- src/takopi/settings.py | 2 ++ src/takopi/telegram/backend.py | 2 ++ src/takopi/telegram/bridge.py | 2 ++ src/takopi/telegram/loop.py | 2 ++ src/takopi/telegram/voice.py | 19 +++++++++++++++++-- tests/test_cli_helpers.py | 17 +++++++++++++++++ tests/test_telegram_backend.py | 4 ++++ tests/test_telegram_bridge.py | 4 +++- 12 files changed, 83 insertions(+), 15 deletions(-) diff --git a/docs/how-to/voice-notes.md b/docs/how-to/voice-notes.md index eb56a35..925106f 100644 --- a/docs/how-to/voice-notes.md +++ b/docs/how-to/voice-notes.md @@ -9,6 +9,10 @@ Enable transcription so voice notes become normal text runs. ```sh takopi config set transports.telegram.voice_transcription true takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe" + + # local OpenAI-compatible transcription server (optional) + takopi config set transports.telegram.voice_transcription_base_url "http://localhost:8000/v1" + takopi config set transports.telegram.voice_transcription_api_key "local" ``` === "toml" @@ -17,13 +21,17 @@ Enable transcription so voice notes become normal text runs. [transports.telegram] voice_transcription = true voice_transcription_model = "gpt-4o-mini-transcribe" # optional + voice_transcription_base_url = "http://localhost:8000/v1" # optional + voice_transcription_api_key = "local" # optional ``` -Set `OPENAI_API_KEY` in your environment (uses OpenAI’s transcription API). +Set `OPENAI_API_KEY` in your environment (or `voice_transcription_api_key` in config). -To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` -(for example, `http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server ignores it. -If your server requires a specific model name, set `voice_transcription_model` (for example, `whisper-1`). +To use a local OpenAI-compatible Whisper server, set `voice_transcription_base_url` +(and `voice_transcription_api_key` if the server expects one). This keeps engine +requests on their own base URL without relying on `OPENAI_BASE_URL`. If your server +requires a specific model name, set `voice_transcription_model` (for example, +`whisper-1`). ## Behavior diff --git a/docs/reference/config.md b/docs/reference/config.md index 3dfb474..d24f14a 100644 --- a/docs/reference/config.md +++ b/docs/reference/config.md @@ -51,6 +51,8 @@ If you expect to edit config while Takopi is running, set: | `voice_transcription` | bool | `false` | Enable voice note transcription. | | `voice_max_bytes` | int | `10485760` | Max voice note size (bytes). | | `voice_transcription_model` | string | `"gpt-4o-mini-transcribe"` | OpenAI transcription model name. | +| `voice_transcription_base_url` | string\|null | `null` | Override base URL for voice transcription only. | +| `voice_transcription_api_key` | string\|null | `null` | Override API key for voice transcription only. | | `session_mode` | `"stateless"`\|`"chat"` | `"stateless"` | Auto-resume mode. Onboarding sets `"chat"` for assistant/workspace. | | `show_resume_line` | bool | `true` | Show resume line in message footer. Onboarding sets `false` for assistant/workspace. | diff --git a/docs/reference/transports/telegram.md b/docs/reference/transports/telegram.md index 74bb594..294258f 100644 --- a/docs/reference/transports/telegram.md +++ b/docs/reference/transports/telegram.md @@ -33,6 +33,10 @@ Configuration (under `[transports.telegram]`): ```sh takopi config set transports.telegram.voice_transcription true takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe" + + # local OpenAI-compatible transcription server (optional) + takopi config set transports.telegram.voice_transcription_base_url "http://localhost:8000/v1" + takopi config set transports.telegram.voice_transcription_api_key "local" ``` === "toml" @@ -40,16 +44,19 @@ Configuration (under `[transports.telegram]`): ```toml voice_transcription = true voice_transcription_model = "gpt-4o-mini-transcribe" # optional + voice_transcription_base_url = "http://localhost:8000/v1" # optional + voice_transcription_api_key = "local" # optional ``` -Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key -is missing or the audio download fails, takopi replies with a short error and skips -the run. +Set `OPENAI_API_KEY` in the environment (or `voice_transcription_api_key` in config). +If transcription is enabled but no API key is available or the audio download fails, +takopi replies with a short error and skips the run. -To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` (for -example, `http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server -ignores it. If your server requires a specific model name, set -`voice_transcription_model` (for example, `whisper-1`). +To use a local OpenAI-compatible Whisper server, set `voice_transcription_base_url` +(and `voice_transcription_api_key` if the server expects one). This keeps engine +requests on their own base URL without relying on `OPENAI_BASE_URL`. If your server +requires a specific model name, set `voice_transcription_model` (for example, +`whisper-1`). ### Trigger mode (mentions-only) diff --git a/src/takopi/cli/doctor.py b/src/takopi/cli/doctor.py index f91c3c2..4f97d2a 100644 --- a/src/takopi/cli/doctor.py +++ b/src/takopi/cli/doctor.py @@ -47,9 +47,14 @@ def _doctor_file_checks(settings: TakopiSettings) -> list[DoctorCheck]: def _doctor_voice_checks(settings: TakopiSettings) -> list[DoctorCheck]: if not settings.transports.telegram.voice_transcription: return [DoctorCheck("voice transcription", "ok", "disabled")] + api_key = settings.transports.telegram.voice_transcription_api_key + if api_key: + return [ + DoctorCheck("voice transcription", "ok", "voice_transcription_api_key set") + ] if os.environ.get("OPENAI_API_KEY"): return [DoctorCheck("voice transcription", "ok", "OPENAI_API_KEY set")] - return [DoctorCheck("voice transcription", "error", "OPENAI_API_KEY not set")] + return [DoctorCheck("voice transcription", "error", "API key not set")] async def _doctor_telegram_checks( diff --git a/src/takopi/settings.py b/src/takopi/settings.py index 247db49..1669e4f 100644 --- a/src/takopi/settings.py +++ b/src/takopi/settings.py @@ -98,6 +98,8 @@ class TelegramTransportSettings(BaseModel): voice_transcription: bool = False voice_max_bytes: StrictInt = 10 * 1024 * 1024 voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe" + voice_transcription_base_url: NonEmptyStr | None = None + voice_transcription_api_key: NonEmptyStr | None = None session_mode: Literal["stateless", "chat"] = "stateless" show_resume_line: bool = True forward_coalesce_s: float = Field(default=1.0, ge=0) diff --git a/src/takopi/telegram/backend.py b/src/takopi/telegram/backend.py index 98c37b4..fa70784 100644 --- a/src/takopi/telegram/backend.py +++ b/src/takopi/telegram/backend.py @@ -138,6 +138,8 @@ class TelegramBackend(TransportBackend): voice_transcription=settings.voice_transcription, voice_max_bytes=int(settings.voice_max_bytes), voice_transcription_model=settings.voice_transcription_model, + voice_transcription_base_url=settings.voice_transcription_base_url, + voice_transcription_api_key=settings.voice_transcription_api_key, forward_coalesce_s=settings.forward_coalesce_s, media_group_debounce_s=settings.media_group_debounce_s, topics=settings.topics, diff --git a/src/takopi/telegram/bridge.py b/src/takopi/telegram/bridge.py index 3775b3f..68f821e 100644 --- a/src/takopi/telegram/bridge.py +++ b/src/takopi/telegram/bridge.py @@ -124,6 +124,8 @@ class TelegramBridgeConfig: voice_transcription: bool = False voice_max_bytes: int = 10 * 1024 * 1024 voice_transcription_model: str = "gpt-4o-mini-transcribe" + voice_transcription_base_url: str | None = None + voice_transcription_api_key: str | None = None forward_coalesce_s: float = 1.0 media_group_debounce_s: float = 1.0 files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings) diff --git a/src/takopi/telegram/loop.py b/src/takopi/telegram/loop.py index cb6458f..73e537c 100644 --- a/src/takopi/telegram/loop.py +++ b/src/takopi/telegram/loop.py @@ -1663,6 +1663,8 @@ async def run_main_loop( model=cfg.voice_transcription_model, max_bytes=cfg.voice_max_bytes, reply=reply, + base_url=cfg.voice_transcription_base_url, + api_key=cfg.voice_transcription_api_key, ) if text is None: return diff --git a/src/takopi/telegram/voice.py b/src/takopi/telegram/voice.py index 16d133e..be55951 100644 --- a/src/takopi/telegram/voice.py +++ b/src/takopi/telegram/voice.py @@ -28,10 +28,23 @@ class VoiceTranscriber(Protocol): class OpenAIVoiceTranscriber: + def __init__( + self, + *, + base_url: str | None = None, + api_key: str | None = None, + ) -> None: + self._base_url = base_url + self._api_key = api_key + async def transcribe(self, *, model: str, audio_bytes: bytes) -> str: audio_file = io.BytesIO(audio_bytes) audio_file.name = "voice.ogg" - async with AsyncOpenAI(timeout=120) as client: + async with AsyncOpenAI( + base_url=self._base_url, + api_key=self._api_key, + timeout=120, + ) as client: response = await client.audio.transcriptions.create( model=model, file=audio_file, @@ -48,6 +61,8 @@ async def transcribe_voice( max_bytes: int | None = None, reply: Callable[..., Awaitable[None]], transcriber: VoiceTranscriber | None = None, + base_url: str | None = None, + api_key: str | None = None, ) -> str | None: voice = msg.voice if voice is None: @@ -74,7 +89,7 @@ async def transcribe_voice( await reply(text="voice message is too large to transcribe.") return None if transcriber is None: - transcriber = OpenAIVoiceTranscriber() + transcriber = OpenAIVoiceTranscriber(base_url=base_url, api_key=api_key) try: return await transcriber.transcribe(model=model, audio_bytes=audio_bytes) except OpenAIError as exc: diff --git a/tests/test_cli_helpers.py b/tests/test_cli_helpers.py index 1f0ff65..b2454f6 100644 --- a/tests/test_cli_helpers.py +++ b/tests/test_cli_helpers.py @@ -135,6 +135,23 @@ def test_doctor_voice_checks(monkeypatch) -> None: monkeypatch.delenv("OPENAI_API_KEY", raising=False) checks = cli._doctor_voice_checks(settings) assert checks[0].status == "error" + assert checks[0].detail == "API key not set" + + settings_with_key = _settings( + { + "transports": { + "telegram": { + "bot_token": "token", + "chat_id": 1, + "voice_transcription": True, + "voice_transcription_api_key": "local", + } + } + } + ) + checks = cli._doctor_voice_checks(settings_with_key) + assert checks[0].status == "ok" + assert checks[0].detail == "voice_transcription_api_key set" monkeypatch.setenv("OPENAI_API_KEY", "key") checks = cli._doctor_voice_checks(settings) diff --git a/tests/test_telegram_backend.py b/tests/test_telegram_backend.py index 00058b6..4f690c1 100644 --- a/tests/test_telegram_backend.py +++ b/tests/test_telegram_backend.py @@ -143,6 +143,8 @@ def test_telegram_backend_build_and_run_wires_config( voice_transcription=True, voice_max_bytes=1234, voice_transcription_model="whisper-1", + voice_transcription_base_url="http://localhost:8000/v1", + voice_transcription_api_key="local", files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]), topics=TelegramTopicsSettings(enabled=True, scope="main"), ) @@ -161,6 +163,8 @@ def test_telegram_backend_build_and_run_wires_config( assert cfg.voice_transcription is True assert cfg.voice_max_bytes == 1234 assert cfg.voice_transcription_model == "whisper-1" + assert cfg.voice_transcription_base_url == "http://localhost:8000/v1" + assert cfg.voice_transcription_api_key == "local" assert cfg.files.enabled is True assert cfg.files.allowed_user_ids == [1, 2] assert cfg.topics.enabled is True diff --git a/tests/test_telegram_bridge.py b/tests/test_telegram_bridge.py index f0385b2..291e69a 100644 --- a/tests/test_telegram_bridge.py +++ b/tests/test_telegram_bridge.py @@ -1954,8 +1954,10 @@ async def test_run_main_loop_voice_transcript_preserves_directive( model: str, max_bytes: int | None = None, reply, + base_url: str | None = None, + api_key: str | None = None, ) -> str: - _ = bot, msg, enabled, model, max_bytes, reply + _ = bot, msg, enabled, model, max_bytes, reply, base_url, api_key return "/codex do thing" monkeypatch.setattr(telegram_loop, "transcribe_voice", _fake_transcribe)