fix(telegram): separate voice transcription client (#166)
This commit is contained in:
@@ -9,6 +9,10 @@ Enable transcription so voice notes become normal text runs.
|
|||||||
```sh
|
```sh
|
||||||
takopi config set transports.telegram.voice_transcription true
|
takopi config set transports.telegram.voice_transcription true
|
||||||
takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe"
|
takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe"
|
||||||
|
|
||||||
|
# local OpenAI-compatible transcription server (optional)
|
||||||
|
takopi config set transports.telegram.voice_transcription_base_url "http://localhost:8000/v1"
|
||||||
|
takopi config set transports.telegram.voice_transcription_api_key "local"
|
||||||
```
|
```
|
||||||
|
|
||||||
=== "toml"
|
=== "toml"
|
||||||
@@ -17,13 +21,17 @@ Enable transcription so voice notes become normal text runs.
|
|||||||
[transports.telegram]
|
[transports.telegram]
|
||||||
voice_transcription = true
|
voice_transcription = true
|
||||||
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
|
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
|
||||||
|
voice_transcription_base_url = "http://localhost:8000/v1" # optional
|
||||||
|
voice_transcription_api_key = "local" # optional
|
||||||
```
|
```
|
||||||
|
|
||||||
Set `OPENAI_API_KEY` in your environment (uses OpenAI’s transcription API).
|
Set `OPENAI_API_KEY` in your environment (or `voice_transcription_api_key` in config).
|
||||||
|
|
||||||
To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL`
|
To use a local OpenAI-compatible Whisper server, set `voice_transcription_base_url`
|
||||||
(for example, `http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server ignores it.
|
(and `voice_transcription_api_key` if the server expects one). These overrides apply only to
|
||||||
If your server requires a specific model name, set `voice_transcription_model` (for example, `whisper-1`).
|
voice transcription, so `OPENAI_BASE_URL` does not need to change and engine requests keep their own base URL. If your server
|
||||||
|
requires a specific model name, set `voice_transcription_model` (for example,
|
||||||
|
`whisper-1`).
|
||||||
|
|
||||||
## Behavior
|
## Behavior
|
||||||
|
|
||||||
|
|||||||
@@ -51,6 +51,8 @@ If you expect to edit config while Takopi is running, set:
|
|||||||
| `voice_transcription` | bool | `false` | Enable voice note transcription. |
|
| `voice_transcription` | bool | `false` | Enable voice note transcription. |
|
||||||
| `voice_max_bytes` | int | `10485760` | Max voice note size (bytes). |
|
| `voice_max_bytes` | int | `10485760` | Max voice note size (bytes). |
|
||||||
| `voice_transcription_model` | string | `"gpt-4o-mini-transcribe"` | OpenAI transcription model name. |
|
| `voice_transcription_model` | string | `"gpt-4o-mini-transcribe"` | OpenAI transcription model name. |
|
||||||
|
| `voice_transcription_base_url` | string\|null | `null` | Override base URL for voice transcription only. |
|
||||||
|
| `voice_transcription_api_key` | string\|null | `null` | Override API key for voice transcription only. |
|
||||||
| `session_mode` | `"stateless"`\|`"chat"` | `"stateless"` | Auto-resume mode. Onboarding sets `"chat"` for assistant/workspace. |
|
| `session_mode` | `"stateless"`\|`"chat"` | `"stateless"` | Auto-resume mode. Onboarding sets `"chat"` for assistant/workspace. |
|
||||||
| `show_resume_line` | bool | `true` | Show resume line in message footer. Onboarding sets `false` for assistant/workspace. |
|
| `show_resume_line` | bool | `true` | Show resume line in message footer. Onboarding sets `false` for assistant/workspace. |
|
||||||
|
|
||||||
|
|||||||
@@ -33,6 +33,10 @@ Configuration (under `[transports.telegram]`):
|
|||||||
```sh
|
```sh
|
||||||
takopi config set transports.telegram.voice_transcription true
|
takopi config set transports.telegram.voice_transcription true
|
||||||
takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe"
|
takopi config set transports.telegram.voice_transcription_model "gpt-4o-mini-transcribe"
|
||||||
|
|
||||||
|
# local OpenAI-compatible transcription server (optional)
|
||||||
|
takopi config set transports.telegram.voice_transcription_base_url "http://localhost:8000/v1"
|
||||||
|
takopi config set transports.telegram.voice_transcription_api_key "local"
|
||||||
```
|
```
|
||||||
|
|
||||||
=== "toml"
|
=== "toml"
|
||||||
@@ -40,16 +44,19 @@ Configuration (under `[transports.telegram]`):
|
|||||||
```toml
|
```toml
|
||||||
voice_transcription = true
|
voice_transcription = true
|
||||||
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
|
voice_transcription_model = "gpt-4o-mini-transcribe" # optional
|
||||||
|
voice_transcription_base_url = "http://localhost:8000/v1" # optional
|
||||||
|
voice_transcription_api_key = "local" # optional
|
||||||
```
|
```
|
||||||
|
|
||||||
Set `OPENAI_API_KEY` in the environment. If transcription is enabled but the API key
|
Set `OPENAI_API_KEY` in the environment (or `voice_transcription_api_key` in config).
|
||||||
is missing or the audio download fails, takopi replies with a short error and skips
|
If transcription is enabled but no API key is available or the audio download fails,
|
||||||
the run.
|
takopi replies with a short error and skips the run.
|
||||||
|
|
||||||
To use a local OpenAI-compatible Whisper server, also set `OPENAI_BASE_URL` (for
|
To use a local OpenAI-compatible Whisper server, set `voice_transcription_base_url`
|
||||||
example, `http://localhost:8000/v1`) and a dummy `OPENAI_API_KEY` if your server
|
(and `voice_transcription_api_key` if the server expects one). These overrides apply only to
|
||||||
ignores it. If your server requires a specific model name, set
|
voice transcription, so `OPENAI_BASE_URL` does not need to change and engine requests keep their own base URL. If your server
|
||||||
`voice_transcription_model` (for example, `whisper-1`).
|
requires a specific model name, set `voice_transcription_model` (for example,
|
||||||
|
`whisper-1`).
|
||||||
|
|
||||||
### Trigger mode (mentions-only)
|
### Trigger mode (mentions-only)
|
||||||
|
|
||||||
|
|||||||
@@ -47,9 +47,14 @@ def _doctor_file_checks(settings: TakopiSettings) -> list[DoctorCheck]:
|
|||||||
def _doctor_voice_checks(settings: TakopiSettings) -> list[DoctorCheck]:
|
def _doctor_voice_checks(settings: TakopiSettings) -> list[DoctorCheck]:
|
||||||
if not settings.transports.telegram.voice_transcription:
|
if not settings.transports.telegram.voice_transcription:
|
||||||
return [DoctorCheck("voice transcription", "ok", "disabled")]
|
return [DoctorCheck("voice transcription", "ok", "disabled")]
|
||||||
|
api_key = settings.transports.telegram.voice_transcription_api_key
|
||||||
|
if api_key:
|
||||||
|
return [
|
||||||
|
DoctorCheck("voice transcription", "ok", "voice_transcription_api_key set")
|
||||||
|
]
|
||||||
if os.environ.get("OPENAI_API_KEY"):
|
if os.environ.get("OPENAI_API_KEY"):
|
||||||
return [DoctorCheck("voice transcription", "ok", "OPENAI_API_KEY set")]
|
return [DoctorCheck("voice transcription", "ok", "OPENAI_API_KEY set")]
|
||||||
return [DoctorCheck("voice transcription", "error", "OPENAI_API_KEY not set")]
|
return [DoctorCheck("voice transcription", "error", "API key not set")]
|
||||||
|
|
||||||
|
|
||||||
async def _doctor_telegram_checks(
|
async def _doctor_telegram_checks(
|
||||||
|
|||||||
@@ -98,6 +98,8 @@ class TelegramTransportSettings(BaseModel):
|
|||||||
voice_transcription: bool = False
|
voice_transcription: bool = False
|
||||||
voice_max_bytes: StrictInt = 10 * 1024 * 1024
|
voice_max_bytes: StrictInt = 10 * 1024 * 1024
|
||||||
voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe"
|
voice_transcription_model: NonEmptyStr = "gpt-4o-mini-transcribe"
|
||||||
|
voice_transcription_base_url: NonEmptyStr | None = None
|
||||||
|
voice_transcription_api_key: NonEmptyStr | None = None
|
||||||
session_mode: Literal["stateless", "chat"] = "stateless"
|
session_mode: Literal["stateless", "chat"] = "stateless"
|
||||||
show_resume_line: bool = True
|
show_resume_line: bool = True
|
||||||
forward_coalesce_s: float = Field(default=1.0, ge=0)
|
forward_coalesce_s: float = Field(default=1.0, ge=0)
|
||||||
|
|||||||
@@ -138,6 +138,8 @@ class TelegramBackend(TransportBackend):
|
|||||||
voice_transcription=settings.voice_transcription,
|
voice_transcription=settings.voice_transcription,
|
||||||
voice_max_bytes=int(settings.voice_max_bytes),
|
voice_max_bytes=int(settings.voice_max_bytes),
|
||||||
voice_transcription_model=settings.voice_transcription_model,
|
voice_transcription_model=settings.voice_transcription_model,
|
||||||
|
voice_transcription_base_url=settings.voice_transcription_base_url,
|
||||||
|
voice_transcription_api_key=settings.voice_transcription_api_key,
|
||||||
forward_coalesce_s=settings.forward_coalesce_s,
|
forward_coalesce_s=settings.forward_coalesce_s,
|
||||||
media_group_debounce_s=settings.media_group_debounce_s,
|
media_group_debounce_s=settings.media_group_debounce_s,
|
||||||
topics=settings.topics,
|
topics=settings.topics,
|
||||||
|
|||||||
@@ -124,6 +124,8 @@ class TelegramBridgeConfig:
|
|||||||
voice_transcription: bool = False
|
voice_transcription: bool = False
|
||||||
voice_max_bytes: int = 10 * 1024 * 1024
|
voice_max_bytes: int = 10 * 1024 * 1024
|
||||||
voice_transcription_model: str = "gpt-4o-mini-transcribe"
|
voice_transcription_model: str = "gpt-4o-mini-transcribe"
|
||||||
|
voice_transcription_base_url: str | None = None
|
||||||
|
voice_transcription_api_key: str | None = None
|
||||||
forward_coalesce_s: float = 1.0
|
forward_coalesce_s: float = 1.0
|
||||||
media_group_debounce_s: float = 1.0
|
media_group_debounce_s: float = 1.0
|
||||||
files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings)
|
files: TelegramFilesSettings = field(default_factory=TelegramFilesSettings)
|
||||||
|
|||||||
@@ -1663,6 +1663,8 @@ async def run_main_loop(
|
|||||||
model=cfg.voice_transcription_model,
|
model=cfg.voice_transcription_model,
|
||||||
max_bytes=cfg.voice_max_bytes,
|
max_bytes=cfg.voice_max_bytes,
|
||||||
reply=reply,
|
reply=reply,
|
||||||
|
base_url=cfg.voice_transcription_base_url,
|
||||||
|
api_key=cfg.voice_transcription_api_key,
|
||||||
)
|
)
|
||||||
if text is None:
|
if text is None:
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -28,10 +28,23 @@ class VoiceTranscriber(Protocol):
|
|||||||
|
|
||||||
|
|
||||||
class OpenAIVoiceTranscriber:
|
class OpenAIVoiceTranscriber:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
base_url: str | None = None,
|
||||||
|
api_key: str | None = None,
|
||||||
|
) -> None:
|
||||||
|
self._base_url = base_url
|
||||||
|
self._api_key = api_key
|
||||||
|
|
||||||
async def transcribe(self, *, model: str, audio_bytes: bytes) -> str:
|
async def transcribe(self, *, model: str, audio_bytes: bytes) -> str:
|
||||||
audio_file = io.BytesIO(audio_bytes)
|
audio_file = io.BytesIO(audio_bytes)
|
||||||
audio_file.name = "voice.ogg"
|
audio_file.name = "voice.ogg"
|
||||||
async with AsyncOpenAI(timeout=120) as client:
|
async with AsyncOpenAI(
|
||||||
|
base_url=self._base_url,
|
||||||
|
api_key=self._api_key,
|
||||||
|
timeout=120,
|
||||||
|
) as client:
|
||||||
response = await client.audio.transcriptions.create(
|
response = await client.audio.transcriptions.create(
|
||||||
model=model,
|
model=model,
|
||||||
file=audio_file,
|
file=audio_file,
|
||||||
@@ -48,6 +61,8 @@ async def transcribe_voice(
|
|||||||
max_bytes: int | None = None,
|
max_bytes: int | None = None,
|
||||||
reply: Callable[..., Awaitable[None]],
|
reply: Callable[..., Awaitable[None]],
|
||||||
transcriber: VoiceTranscriber | None = None,
|
transcriber: VoiceTranscriber | None = None,
|
||||||
|
base_url: str | None = None,
|
||||||
|
api_key: str | None = None,
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
voice = msg.voice
|
voice = msg.voice
|
||||||
if voice is None:
|
if voice is None:
|
||||||
@@ -74,7 +89,7 @@ async def transcribe_voice(
|
|||||||
await reply(text="voice message is too large to transcribe.")
|
await reply(text="voice message is too large to transcribe.")
|
||||||
return None
|
return None
|
||||||
if transcriber is None:
|
if transcriber is None:
|
||||||
transcriber = OpenAIVoiceTranscriber()
|
transcriber = OpenAIVoiceTranscriber(base_url=base_url, api_key=api_key)
|
||||||
try:
|
try:
|
||||||
return await transcriber.transcribe(model=model, audio_bytes=audio_bytes)
|
return await transcriber.transcribe(model=model, audio_bytes=audio_bytes)
|
||||||
except OpenAIError as exc:
|
except OpenAIError as exc:
|
||||||
|
|||||||
@@ -135,6 +135,23 @@ def test_doctor_voice_checks(monkeypatch) -> None:
|
|||||||
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
|
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
|
||||||
checks = cli._doctor_voice_checks(settings)
|
checks = cli._doctor_voice_checks(settings)
|
||||||
assert checks[0].status == "error"
|
assert checks[0].status == "error"
|
||||||
|
assert checks[0].detail == "API key not set"
|
||||||
|
|
||||||
|
settings_with_key = _settings(
|
||||||
|
{
|
||||||
|
"transports": {
|
||||||
|
"telegram": {
|
||||||
|
"bot_token": "token",
|
||||||
|
"chat_id": 1,
|
||||||
|
"voice_transcription": True,
|
||||||
|
"voice_transcription_api_key": "local",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
checks = cli._doctor_voice_checks(settings_with_key)
|
||||||
|
assert checks[0].status == "ok"
|
||||||
|
assert checks[0].detail == "voice_transcription_api_key set"
|
||||||
|
|
||||||
monkeypatch.setenv("OPENAI_API_KEY", "key")
|
monkeypatch.setenv("OPENAI_API_KEY", "key")
|
||||||
checks = cli._doctor_voice_checks(settings)
|
checks = cli._doctor_voice_checks(settings)
|
||||||
|
|||||||
@@ -143,6 +143,8 @@ def test_telegram_backend_build_and_run_wires_config(
|
|||||||
voice_transcription=True,
|
voice_transcription=True,
|
||||||
voice_max_bytes=1234,
|
voice_max_bytes=1234,
|
||||||
voice_transcription_model="whisper-1",
|
voice_transcription_model="whisper-1",
|
||||||
|
voice_transcription_base_url="http://localhost:8000/v1",
|
||||||
|
voice_transcription_api_key="local",
|
||||||
files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]),
|
files=TelegramFilesSettings(enabled=True, allowed_user_ids=[1, 2]),
|
||||||
topics=TelegramTopicsSettings(enabled=True, scope="main"),
|
topics=TelegramTopicsSettings(enabled=True, scope="main"),
|
||||||
)
|
)
|
||||||
@@ -161,6 +163,8 @@ def test_telegram_backend_build_and_run_wires_config(
|
|||||||
assert cfg.voice_transcription is True
|
assert cfg.voice_transcription is True
|
||||||
assert cfg.voice_max_bytes == 1234
|
assert cfg.voice_max_bytes == 1234
|
||||||
assert cfg.voice_transcription_model == "whisper-1"
|
assert cfg.voice_transcription_model == "whisper-1"
|
||||||
|
assert cfg.voice_transcription_base_url == "http://localhost:8000/v1"
|
||||||
|
assert cfg.voice_transcription_api_key == "local"
|
||||||
assert cfg.files.enabled is True
|
assert cfg.files.enabled is True
|
||||||
assert cfg.files.allowed_user_ids == [1, 2]
|
assert cfg.files.allowed_user_ids == [1, 2]
|
||||||
assert cfg.topics.enabled is True
|
assert cfg.topics.enabled is True
|
||||||
|
|||||||
@@ -1954,8 +1954,10 @@ async def test_run_main_loop_voice_transcript_preserves_directive(
|
|||||||
model: str,
|
model: str,
|
||||||
max_bytes: int | None = None,
|
max_bytes: int | None = None,
|
||||||
reply,
|
reply,
|
||||||
|
base_url: str | None = None,
|
||||||
|
api_key: str | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
_ = bot, msg, enabled, model, max_bytes, reply
|
_ = bot, msg, enabled, model, max_bytes, reply, base_url, api_key
|
||||||
return "/codex do thing"
|
return "/codex do thing"
|
||||||
|
|
||||||
monkeypatch.setattr(telegram_loop, "transcribe_voice", _fake_transcribe)
|
monkeypatch.setattr(telegram_loop, "transcribe_voice", _fake_transcribe)
|
||||||
|
|||||||
Reference in New Issue
Block a user