fix(telegram): separate voice transcription client (#166)
This commit is contained in:
@@ -9,6 +9,10 @@ Enable transcription so voice notes become normal text runs.
|
||||
```sh
|
||||
takopi config set transports.telegram.voice_transcription true
|
||||
takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe"
|
||||
|
||||
# local OpenAI-compatible transcription server (optional)
|
||||
takopi config set transports.telegram.voice_transcription_base_url "http://localhost:8000/v1"
|
||||
takopi config set transports.telegram.voice_transcription_api_key "local"
|
||||
```
|
||||
|
||||
=== "toml"
|
||||
@@ -17,13 +21,17 @@ Enable transcription so voice notes become normal text runs.
|
||||
[transports.telegram]
|
||||
voice_transcription = true
|
||||
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
|
||||
voice_transcription_base_url = "http://localhost:8000/v1" # optional
|
||||
voice_transcription_api_key = "local" # optional
|
||||
```
|
||||
|
||||
Set `OPENAI_API_KEY` in your environment (uses OpenAI’s transcription API).
|
||||
Set `OPENAI_API_KEY` in your environment (or `voice_transcription_api_key` in config).
|
||||
|
||||
To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL`
|
||||
(for example, `http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server ignores it.
|
||||
If your server requires a specific model name, set `voice_transcription_model` (for example, `whisper-1`).
|
||||
To use a local OpenAI-compatible Whisper server, set `voice_transcription_base_url`
|
||||
(and `voice_transcription_api_key` if the server expects one). Transcription then uses
|
||||
its own endpoint, so `OPENAI_BASE_URL` can stay as-is for engine requests. If your server
|
||||
requires a specific model name, set `voice_transcription_model` (for example,
|
||||
`whisper-1`).
|
||||
|
||||
## Behavior
|
||||
|
||||
|
||||
@@ -51,6 +51,8 @@ If you expect to edit config while Takopi is running, set:
|
||||
| `voice_transcription` | bool | `false` | Enable voice note transcription. |
|
||||
| `voice_max_bytes` | int | `10485760` | Max voice note size (bytes). |
|
||||
| `voice_transcription_model` | string | `"gpt-4o-mini-transcribe"` | OpenAI transcription model name. |
|
||||
| `voice_transcription_base_url` | string\|null | `null` | Override base URL for voice transcription only. When `null`, the OpenAI client default (including `OPENAI_BASE_URL`, if set) is used. |
|
||||
| `voice_transcription_api_key` | string\|null | `null` | Override API key for voice transcription only. When `null`, `OPENAI_API_KEY` from the environment is used. |
|
||||
| `session_mode` | `"stateless"`\|`"chat"` | `"stateless"` | Auto-resume mode. Onboarding sets `"chat"` for assistant/workspace. |
|
||||
| `show_resume_line` | bool | `true` | Show resume line in message footer. Onboarding sets `false` for assistant/workspace. |
|
||||
|
||||
|
||||
@@ -33,6 +33,10 @@ Configuration (under `[transports.telegram]`):
|
||||
```sh
|
||||
takopi config set transports.telegram.voice_transcription true
|
||||
takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe"
|
||||
|
||||
# local OpenAI-compatible transcription server (optional)
|
||||
takopi config set transports.telegram.voice_transcription_base_url "http://localhost:8000/v1"
|
||||
takopi config set transports.telegram.voice_transcription_api_key "local"
|
||||
```
|
||||
|
||||
=== "toml"
|
||||
@@ -40,16 +44,19 @@ Configuration (under `[transports.telegram]`):
|
||||
```toml
|
||||
voice_transcription = true
|
||||
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
|
||||
voice_transcription_base_url = "http://localhost:8000/v1" # optional
|
||||
voice_transcription_api_key = "local" # optional
|
||||
```
|
||||
|
||||
Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key
|
||||
is missing or the audio download fails, takopi replies with a short error and skips
|
||||
the run.
|
||||
Set `OPENAI_API_KEY` in the environment (or `voice_transcription_api_key` in config).
|
||||
If transcription is enabled but no API key is available or the audio download fails,
|
||||
takopi replies with a short error and skips the run.
|
||||
|
||||
To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` (for
|
||||
example, `http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server
|
||||
ignores it. If your server requires a specific model name, set
|
||||
`voice_transcription_model` (for example, `whisper-1`).
|
||||
To use a local OpenAI-compatible Whisper server, set `voice_transcription_base_url`
|
||||
(and `voice_transcription_api_key` if the server expects one). Transcription then uses
|
||||
its own endpoint, so `OPENAI_BASE_URL` can stay as-is for engine requests. If your server
|
||||
requires a specific model name, set `voice_transcription_model` (for example,
|
||||
`whisper-1`).
|
||||
|
||||
### Trigger mode (mentions-only)
|
||||
|
||||
|
||||
@@ -47,9 +47,14 @@ def _doctor_file_checks(settings: TakopiSettings) -> list[DoctorCheck]:
|
||||
def _doctor_voice_checks(settings: TakopiSettings) -> list[DoctorCheck]:
|
||||
if not settings.transports.telegram.voice_transcription:
|
||||
return [DoctorCheck("voice transcription", "ok", "disabled")]
|
||||
api_key = settings.transports.telegram.voice_transcription_api_key
|
||||
if api_key:
|
||||
return [
|
||||
DoctorCheck("voice transcription", "ok", "voice_transcription_api_key set")
|
||||
]
|
||||
if os.environ.get("OPENAI_API_KEY"):
|
||||
return [DoctorCheck("voice transcription", "ok", "OPENAI_API_KEY set")]
|
||||
return [DoctorCheck("voice transcription", "error", "OPENAI_API_KEY not set")]
|
||||
return [DoctorCheck("voice transcription", "error", "API key not set")]
|
||||
|
||||
|
||||
async def _doctor_telegram_checks(
|
||||
|
||||
@@ -98,6 +98,8 @@ class TelegramTransportSettings(BaseModel):
|
||||
voice_transcription: bool = False
|
||||
voice_max_bytes: StrictInt = 10 * 1024 * 1024
|
||||
voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe"
|
||||
voice_transcription_base_url: NonEmptyStr | None = None
|
||||
voice_transcription_api_key: NonEmptyStr | None = None
|
||||
session_mode: Literal["stateless", "chat"] = "stateless"
|
||||
show_resume_line: bool = True
|
||||
forward_coalesce_s: float = Field(default=1.0, ge=0)
|
||||
|
||||
@@ -138,6 +138,8 @@ class TelegramBackend(TransportBackend):
|
||||
voice_transcription=settings.voice_transcription,
|
||||
voice_max_bytes=int(settings.voice_max_bytes),
|
||||
voice_transcription_model=settings.voice_transcription_model,
|
||||
voice_transcription_base_url=settings.voice_transcription_base_url,
|
||||
voice_transcription_api_key=settings.voice_transcription_api_key,
|
||||
forward_coalesce_s=settings.forward_coalesce_s,
|
||||
media_group_debounce_s=settings.media_group_debounce_s,
|
||||
topics=settings.topics,
|
||||
|
||||
@@ -124,6 +124,8 @@ class TelegramBridgeConfig:
|
||||
voice_transcription: bool = False
|
||||
voice_max_bytes: int = 10 * 1024 * 1024
|
||||
voice_transcription_model: str = "gpt-4o-mini-transcribe"
|
||||
voice_transcription_base_url: str | None = None
|
||||
voice_transcription_api_key: str | None = None
|
||||
forward_coalesce_s: float = 1.0
|
||||
media_group_debounce_s: float = 1.0
|
||||
files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings)
|
||||
|
||||
@@ -1663,6 +1663,8 @@ async def run_main_loop(
|
||||
model=cfg.voice_transcription_model,
|
||||
max_bytes=cfg.voice_max_bytes,
|
||||
reply=reply,
|
||||
base_url=cfg.voice_transcription_base_url,
|
||||
api_key=cfg.voice_transcription_api_key,
|
||||
)
|
||||
if text is None:
|
||||
return
|
||||
|
||||
@@ -28,10 +28,23 @@ class VoiceTranscriber(Protocol):
|
||||
|
||||
|
||||
class OpenAIVoiceTranscriber:
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
base_url: str | None = None,
|
||||
api_key: str | None = None,
|
||||
) -> None:
|
||||
self._base_url = base_url
|
||||
self._api_key = api_key
|
||||
|
||||
async def transcribe(self, *, model: str, audio_bytes: bytes) -> str:
|
||||
audio_file = io.BytesIO(audio_bytes)
|
||||
audio_file.name = "voice.ogg"
|
||||
async with AsyncOpenAI(timeout=120) as client:
|
||||
async with AsyncOpenAI(
|
||||
base_url=self._base_url,
|
||||
api_key=self._api_key,
|
||||
timeout=120,
|
||||
) as client:
|
||||
response = await client.audio.transcriptions.create(
|
||||
model=model,
|
||||
file=audio_file,
|
||||
@@ -48,6 +61,8 @@ async def transcribe_voice(
|
||||
max_bytes: int | None = None,
|
||||
reply: Callable[..., Awaitable[None]],
|
||||
transcriber: VoiceTranscriber | None = None,
|
||||
base_url: str | None = None,
|
||||
api_key: str | None = None,
|
||||
) -> str | None:
|
||||
voice = msg.voice
|
||||
if voice is None:
|
||||
@@ -74,7 +89,7 @@ async def transcribe_voice(
|
||||
await reply(text="voice message is too large to transcribe.")
|
||||
return None
|
||||
if transcriber is None:
|
||||
transcriber = OpenAIVoiceTranscriber()
|
||||
transcriber = OpenAIVoiceTranscriber(base_url=base_url, api_key=api_key)
|
||||
try:
|
||||
return await transcriber.transcribe(model=model, audio_bytes=audio_bytes)
|
||||
except OpenAIError as exc:
|
||||
|
||||
@@ -135,6 +135,23 @@ def test_doctor_voice_checks(monkeypatch) -> None:
|
||||
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
|
||||
checks = cli._doctor_voice_checks(settings)
|
||||
assert checks[0].status == "error"
|
||||
assert checks[0].detail == "API key not set"
|
||||
|
||||
settings_with_key = _settings(
|
||||
{
|
||||
"transports": {
|
||||
"telegram": {
|
||||
"bot_token": "token",
|
||||
"chat_id": 1,
|
||||
"voice_transcription": True,
|
||||
"voice_transcription_api_key": "local",
|
||||
}
|
||||
}
|
||||
}
|
||||
)
|
||||
checks = cli._doctor_voice_checks(settings_with_key)
|
||||
assert checks[0].status == "ok"
|
||||
assert checks[0].detail == "voice_transcription_api_key set"
|
||||
|
||||
monkeypatch.setenv("OPENAI_API_KEY", "key")
|
||||
checks = cli._doctor_voice_checks(settings)
|
||||
|
||||
@@ -143,6 +143,8 @@ def test_telegram_backend_build_and_run_wires_config(
|
||||
voice_transcription=True,
|
||||
voice_max_bytes=1234,
|
||||
voice_transcription_model="whisper-1",
|
||||
voice_transcription_base_url="http://localhost:8000/v1",
|
||||
voice_transcription_api_key="local",
|
||||
files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]),
|
||||
topics=TelegramTopicsSettings(enabled=True, scope="main"),
|
||||
)
|
||||
@@ -161,6 +163,8 @@ def test_telegram_backend_build_and_run_wires_config(
|
||||
assert cfg.voice_transcription is True
|
||||
assert cfg.voice_max_bytes == 1234
|
||||
assert cfg.voice_transcription_model == "whisper-1"
|
||||
assert cfg.voice_transcription_base_url == "http://localhost:8000/v1"
|
||||
assert cfg.voice_transcription_api_key == "local"
|
||||
assert cfg.files.enabled is True
|
||||
assert cfg.files.allowed_user_ids == [1, 2]
|
||||
assert cfg.topics.enabled is True
|
||||
|
||||
@@ -1954,8 +1954,10 @@ async def test_run_main_loop_voice_transcript_preserves_directive(
|
||||
model: str,
|
||||
max_bytes: int | None = None,
|
||||
reply,
|
||||
base_url: str | None = None,
|
||||
api_key: str | None = None,
|
||||
) -> str:
|
||||
_ = bot, msg, enabled, model, max_bytes, reply
|
||||
_ = bot, msg, enabled, model, max_bytes, reply, base_url, api_key
|
||||
return "/codex do thing"
|
||||
|
||||
monkeypatch.setattr(telegram_loop, "transcribe_voice", _fake_transcribe)
|
||||
|
||||
Reference in New Issue
Block a user