feat: add llm cleanup voice mode
This commit is contained in:
Executable
+49
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env bash
# Inserts a reverse-proxy `location` block for the voice LLM endpoint into an
# nginx config, immediately ahead of the existing webterm `location /` block.
#
# Environment overrides:
#   NGINX_CONF      config file to patch   (default: /etc/nginx/nginx.conf)
#   LLM_PROXY_PATH  location path to add   (default: /llm/)
#   LLM_UPSTREAM    proxy_pass target      (default: http://127.0.0.1:11435/)
#
# A timestamped backup is written next to the config before it is touched.
# If `nginx -t` rejects the resulting file, the backup is copied back so a
# failed run never leaves an invalid config on disk. The script does not
# reload nginx; it only prints the command to do so.
set -euo pipefail

NGINX_CONF="${NGINX_CONF:-/etc/nginx/nginx.conf}"
LLM_PROXY_PATH="${LLM_PROXY_PATH:-/llm/}"
LLM_UPSTREAM="${LLM_UPSTREAM:-http://127.0.0.1:11435/}"

if [[ ! -f "$NGINX_CONF" ]]; then
  echo "nginx config not found: $NGINX_CONF" >&2
  exit 1
fi

if [[ $EUID -ne 0 ]]; then
  echo "run as root: sudo $0" >&2
  exit 1
fi

backup_path="${NGINX_CONF}.webterm-llm-$(date +%Y%m%d-%H%M%S).bak"
cp "$NGINX_CONF" "$backup_path"
echo "backup created: $backup_path"

# The heredoc delimiter is quoted so bash leaves `$host`, `$scheme`, etc.
# untouched; they are nginx variables, not shell variables.
python3 - "$NGINX_CONF" "$LLM_PROXY_PATH" "$LLM_UPSTREAM" <<'PY'
from pathlib import Path
import sys

config_path = Path(sys.argv[1])
location_path = sys.argv[2]
upstream = sys.argv[3]
text = config_path.read_text()

# Idempotence: leave the file alone when the location is already configured.
if f"location {location_path}" in text:
    print(f"proxy location already present: {location_path}")
    raise SystemExit(0)

# Anchor block that the new location is inserted in front of.
# NOTE(review): the match is whitespace-exact; confirm the indentation of this
# literal against the deployed nginx.conf.
target = """ location / {\n if ($valid_origin = "0") { return 403; }\n proxy_pass http://127.0.0.1:8080;\n proxy_http_version 1.1;\n proxy_set_header Upgrade $http_upgrade;\n proxy_set_header Connection "upgrade";\n proxy_set_header Host $host;\n proxy_set_header X-Real-IP $remote_addr;\n proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n proxy_set_header X-Forwarded-Proto $scheme;\n }\n"""

# New LLM location followed by the untouched anchor block.
replacement = f""" location {location_path} {{\n if ($valid_origin = "0") {{ return 403; }}\n proxy_pass {upstream};\n proxy_http_version 1.1;\n proxy_connect_timeout 30s;\n proxy_send_timeout 300s;\n proxy_read_timeout 300s;\n proxy_set_header Host $host;\n proxy_set_header X-Real-IP $remote_addr;\n proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n proxy_set_header X-Forwarded-Proto $scheme;\n }}\n\n{target}"""

if target not in text:
    print("target webterm location block not found in nginx.conf", file=sys.stderr)
    raise SystemExit(1)

config_path.write_text(text.replace(target, replacement, 1))
print(f"inserted proxy location {location_path} -> {upstream}")
PY

# Validate explicitly instead of relying on `set -e`, so that a rejected
# config can be rolled back before the script exits.
if ! nginx -t; then
  echo "nginx config test failed; restoring backup: $backup_path" >&2
  cp "$backup_path" "$NGINX_CONF"
  exit 1
fi
echo "nginx config valid"
echo "reload when ready: sudo systemctl reload nginx"
|
||||
@@ -13,9 +13,12 @@ const (
|
||||
DefaultFontSize = 16
|
||||
DefaultTerminalWidth = 132
|
||||
DefaultTerminalHeight = 45
|
||||
DefaultVoiceLLMBaseURL = "/llm"
|
||||
|
||||
ScreenshotForceRedrawEnv = "WEBTERM_SCREENSHOT_FORCE_REDRAW"
|
||||
ScreenshotModeEnv = "WEBTERM_SCREENSHOT_MODE"
|
||||
VoiceLLMBaseURLEnv = "WEBTERM_VOICE_LLM_BASE_URL"
|
||||
VoiceLLMModelEnv = "WEBTERM_VOICE_LLM_MODEL"
|
||||
AuthUsernameEnv = "WEBTERM_AUTH_USERNAME"
|
||||
AuthPasswordEnv = "WEBTERM_AUTH_PASSWORD"
|
||||
AuthCookieSecretEnv = "WEBTERM_AUTH_COOKIE_SECRET"
|
||||
|
||||
+13
-1
@@ -1888,7 +1888,19 @@ func (s *LocalServer) handleRoot(w http.ResponseWriter, r *http.Request) {
|
||||
fontFamily = "var(--webterm-mono)"
|
||||
}
|
||||
escapedFont := strings.ReplaceAll(fontFamily, `"`, """)
|
||||
dataAttrs := fmt.Sprintf(`data-session-websocket-url="%s" data-session-route-key="%s" data-session-name="%s" data-font-size="%d" data-scrollback="1000" data-theme="%s" data-font-family="%s"`, htmlAttrEscape(wsURL), htmlAttrEscape(routeKey), htmlAttrEscape(app.Name), s.fontSize, htmlAttrEscape(theme), escapedFont)
|
||||
voiceLLMBaseURL := strings.TrimSpace(os.Getenv(VoiceLLMBaseURLEnv))
|
||||
voiceLLMModel := strings.TrimSpace(os.Getenv(VoiceLLMModelEnv))
|
||||
dataAttrs := fmt.Sprintf(
|
||||
`data-session-websocket-url="%s" data-session-route-key="%s" data-session-name="%s" data-font-size="%d" data-scrollback="1000" data-theme="%s" data-font-family="%s" data-voice-llm-base-url="%s" data-voice-llm-model="%s"`,
|
||||
htmlAttrEscape(wsURL),
|
||||
htmlAttrEscape(routeKey),
|
||||
htmlAttrEscape(app.Name),
|
||||
s.fontSize,
|
||||
htmlAttrEscape(theme),
|
||||
escapedFont,
|
||||
htmlAttrEscape(voiceLLMBaseURL),
|
||||
htmlAttrEscape(voiceLLMModel),
|
||||
)
|
||||
cacheBust := "?v=" + s.staticAssetCacheBust
|
||||
page := fmt.Sprintf(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>%s</title><link rel="stylesheet" href="/static/monospace.css%s"><style>html,body{width:100%%;height:100%%}body{background:%s;margin:0;padding:0;overflow:hidden;font-family:var(--webterm-mono);display:flex;flex-direction:column;height:100vh;height:100dvh}.webterm-terminal{width:100%%;flex:1;min-height:0;display:block;overflow:hidden}</style></head><body><div id="terminal" class="webterm-terminal" %s></div><script type="module" src="/static/js/terminal.js%s"></script></body></html>`, htmlEscape(app.Name), cacheBust, themeBG, dataAttrs, cacheBust)
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
|
||||
@@ -588,6 +588,26 @@ func TestRootTerminalPageAndSparklineValidation(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootTerminalPageIncludesVoiceLLMConfig(t *testing.T) {
|
||||
t.Setenv(VoiceLLMBaseURLEnv, "http://cachy.lan:11434")
|
||||
t.Setenv(VoiceLLMModelEnv, "llama-cleanup")
|
||||
|
||||
_, httpServer, _ := newServerForTests(t, false)
|
||||
resp, err := http.Get(httpServer.URL + "/?route_key=shell")
|
||||
if err != nil {
|
||||
t.Fatalf("root request error = %v", err)
|
||||
}
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
_ = resp.Body.Close()
|
||||
text := string(body)
|
||||
if !strings.Contains(text, `data-voice-llm-base-url="http://cachy.lan:11434"`) {
|
||||
t.Fatalf("expected voice LLM base URL in page attrs, got %q", text)
|
||||
}
|
||||
if !strings.Contains(text, `data-voice-llm-model="llama-cleanup"`) {
|
||||
t.Fatalf("expected voice LLM model in page attrs, got %q", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMarkRouteActivityBroadcastsWithoutBlockingGlobalLock(t *testing.T) {
|
||||
server := NewLocalServer(Config{}, ServerOptions{})
|
||||
ready := make(chan string, 1)
|
||||
|
||||
File diff suppressed because one or more lines are too long
+348
-10
@@ -27,6 +27,21 @@ const VOICE_PROCESSOR_BUFFER_SIZE = 4096;
|
||||
const VOICE_VAD_WINDOW_SIZE = 512;
|
||||
const VOICE_BUFFER_SIZE_SECONDS = 30;
|
||||
const VOICE_STATUS_MAX_LENGTH = 48;
|
||||
/** Base URL used for LLM requests when the page supplies no override. */
const DEFAULT_VOICE_LLM_BASE_URL = "/llm";
/** Abort deadline, in milliseconds, applied to each LLM HTTP request. */
const VOICE_LLM_TIMEOUT_MS = 180_000;
/** localStorage key under which the selected voice mode is persisted. */
const VOICE_MODE_STORAGE_KEY = "webterm:voice-mode";

/** Spoken phrases recognised at the end of a cleanup-mode draft. */
const VOICE_INSERT_COMMAND = "insert text";
const VOICE_SUBMIT_COMMAND = "submit text";
const VOICE_CANCEL_COMMAND = "cancel text";

/** "live" sends transcripts straight to the terminal; "cleanup" drafts them for LLM cleanup first. */
type VoiceMode = "live" | "cleanup";
/** How a cleaned draft is delivered: typed only, or typed and followed by Enter. */
type VoiceFinalizeAction = "insert" | "submit";
/** Every action a spoken command can request, including discarding the draft. */
type VoiceCommandAction = VoiceFinalizeAction | "cancel";
|
||||
|
||||
/** System prompt for the first (analysis) LLM pass over a raw transcript. */
const VOICE_THINKING_SYSTEM_PROMPT = `You are a helpful voice-to-task translator. You will receive a raw speech-to-text transcript that may contain filler words, false starts, and rambling. Then you will receive some instructions after which you'll analyze the text in the best way to help clean it up. Do not ask any questions. Just think out loud.`;

/** Instructions appended to the raw transcript in the analysis pass. */
const VOICE_THINKING_ANALYSIS_PROMPT = `Analyze the user's Intent and the Functional meaning of each sentence. Evaluate Correctness — are these genuine instructions or speech mistakes? Consider Efficiency of each phrase and whether there are better Alternatives or unnecessary filler words to remove.`;

/** System prompt for the second pass, which must return only the cleaned text. */
const VOICE_CLEANUP_SYSTEM_PROMPT = `You clean up raw speech-to-text transcripts into concise terminal-ready text. Remove filler words, false starts, repetitions, and obvious recognition mistakes while preserving user intent. Do not ask questions. Return only cleaned transcript text with no explanation, labels, or quotes.`;
|
||||
|
||||
type SherpaModule = {
|
||||
HEAPF32: Float32Array;
|
||||
@@ -81,6 +96,15 @@ type SherpaRuntime = {
|
||||
createVad: (module: SherpaModule, config?: unknown) => SherpaVad;
|
||||
};
|
||||
|
||||
/** One message of an OpenAI-style chat-completions request. */
type OpenAIChatMessage = {
  role: "system" | "user" | "assistant";
  content: string;
};

/** The subset of an OpenAI-style `/v1/models` response read by this file. */
type OpenAIModelsResponse = {
  data?: { id?: string }[];
};
|
||||
|
||||
declare global {
|
||||
interface Window {
|
||||
CircularBuffer?: new (capacity: number, module: SherpaModule) => SherpaCircularBuffer;
|
||||
@@ -1031,10 +1055,17 @@ class WebTerminal {
|
||||
private voiceSilentGain: GainNode | null = null;
|
||||
private voiceInputStream: MediaStream | null = null;
|
||||
private voiceAssetBase = `${getStaticJSBasePath()}${DEFAULT_SHERPA_ASSET_DIR}/`;
|
||||
private voiceLlmBaseUrl = "";
|
||||
private voiceLlmModelOverride = "";
|
||||
private voiceLlmDetectedModel: string | null = null;
|
||||
private voiceMode: VoiceMode = "live";
|
||||
private voiceSpeechDetected = false;
|
||||
private voiceReceivedAudio = false;
|
||||
private voicePendingSeparator = false;
|
||||
private voiceDraftTranscript = "";
|
||||
private voiceFinalizeToken = 0;
|
||||
private isVoiceStarting = false;
|
||||
private voiceState: "idle" | "loading" | "listening" | "error" | "unsupported" = "idle";
|
||||
private voiceState: "idle" | "loading" | "listening" | "processing" | "error" | "unsupported" = "idle";
|
||||
private voiceStartupErrorCleanup: (() => void) | null = null;
|
||||
private static sharedTextEncoder = new TextEncoder();
|
||||
|
||||
@@ -1151,6 +1182,223 @@ class WebTerminal {
|
||||
document.title = this.baseTitle;
|
||||
}
|
||||
|
||||
/**
 * Reads the persisted voice mode from localStorage.
 *
 * @returns "cleanup" only when that exact value is stored; "live" for any
 *   other value, a missing entry, or a storage access failure.
 */
private loadVoiceModePreference(): VoiceMode {
  let saved: string | null;
  try {
    saved = localStorage.getItem(VOICE_MODE_STORAGE_KEY);
  } catch {
    // Storage may be unavailable; fall back to the default mode.
    return "live";
  }
  if (saved === "cleanup") {
    return "cleanup";
  }
  return "live";
}
|
||||
|
||||
/** Saves the current voice mode to localStorage on a best-effort basis. */
private persistVoiceModePreference(): void {
  const mode = this.voiceMode;
  try {
    localStorage.setItem(VOICE_MODE_STORAGE_KEY, mode);
  } catch {
    // Storage can fail in private browsing or restricted contexts; the
    // preference is then simply not remembered.
  }
}
|
||||
|
||||
/**
 * Switches the voice mode, persists the choice, and refreshes the UI.
 *
 * A configuration problem for the new mode is reported as an error state.
 * Otherwise the "Ready" label is refreshed, but only when the voice state is
 * idle or error; any other state is left as it is.
 */
private setVoiceMode(mode: VoiceMode): void {
  this.voiceMode = mode;
  this.persistVoiceModePreference();
  this.syncModifierButtons();

  const llmProblem = this.getVoiceLlmConfigError();
  if (llmProblem) {
    this.setVoiceState("error", llmProblem);
    return;
  }

  const mayShowReady = this.voiceState === "idle" || this.voiceState === "error";
  if (!mayShowReady) {
    return;
  }
  const readyLabel = mode === "cleanup" ? "Ready: Cleanup" : "Ready: Live";
  this.setVoiceState("idle", readyLabel);
}
|
||||
|
||||
/**
 * Canonicalises a transcript for spoken-command matching: lower-cases it,
 * collapses whitespace runs to single spaces, and removes surrounding
 * whitespace and trailing sentence punctuation.
 *
 * The text is trimmed before the punctuation is stripped. Otherwise input
 * such as "insert text. " keeps its period, because the `$`-anchored pattern
 * sees the trailing space rather than the punctuation.
 */
private normalizeVoiceCommandText(value: string): string {
  return value
    .toLowerCase()
    .replace(/\s+/g, " ")
    .trim()
    .replace(/[.,!?;:]+$/, "")
    // Covers "insert text ." where a space preceded the punctuation.
    .trim();
}
|
||||
|
||||
/**
 * Detects a spoken command at the end of a transcript.
 *
 * @returns the action whose phrase is the whole normalised transcript or its
 *   final space-separated words, or null when there is none.
 */
private getVoiceCommandAction(transcript: string): VoiceCommandAction | null {
  const spoken = this.normalizeVoiceCommandText(transcript);
  if (!spoken) {
    return null;
  }

  const commands: Array<[string, VoiceCommandAction]> = [
    [VOICE_INSERT_COMMAND, "insert"],
    [VOICE_SUBMIT_COMMAND, "submit"],
    [VOICE_CANCEL_COMMAND, "cancel"],
  ];
  for (const [phrase, action] of commands) {
    if (spoken === phrase || spoken.endsWith(` ${phrase}`)) {
      return action;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Removes a trailing spoken command (insert / submit / cancel), together
 * with any punctuation or whitespace after it, from a transcript.
 *
 * Command detection collapses whitespace before matching, so the words of a
 * command may be separated by any whitespace run here as well. Requiring a
 * single literal space would leave a detected command such as "insert  text"
 * in the text that is sent on for cleanup.
 */
private stripVoiceCommandSuffix(transcript: string): string {
  let value = transcript.trim();
  const commands = [VOICE_INSERT_COMMAND, VOICE_SUBMIT_COMMAND, VOICE_CANCEL_COMMAND];
  for (const command of commands) {
    // The command phrases are plain words, so only the separators need to be
    // turned into a pattern.
    const phrase = command.trim().split(/\s+/).join("\\s+");
    const pattern = new RegExp(`(?:^|\\s)${phrase}[\\s,.!?;:]*$`, "i");
    value = value.replace(pattern, "").trim();
  }
  return value;
}
|
||||
|
||||
/**
 * Appends a recognised segment to the cleanup draft, using the same
 * separator handling as direct insertion.
 *
 * @returns the full draft after the segment has been added.
 */
private appendVoiceDraftSegment(transcript: string): string {
  const segment = this.formatVoiceTranscriptForInsert(transcript);
  const draft = this.voiceDraftTranscript + segment;
  this.voiceDraftTranscript = draft;
  return draft;
}
|
||||
|
||||
/**
 * Builds a short status-line preview of the current draft.
 *
 * @param limit maximum preview length; longer drafts are cut to `limit - 1`
 *   characters followed by an ellipsis.
 * @returns "Draft empty" when the trimmed draft has no content.
 */
private voiceDraftPreview(limit = 40): string {
  const draft = this.voiceDraftTranscript.trim();
  if (draft.length === 0) {
    return "Draft empty";
  }
  if (draft.length <= limit) {
    return draft;
  }
  return `${draft.slice(0, limit - 1)}…`;
}
|
||||
|
||||
/**
 * Checks whether the current voice mode has the configuration it needs.
 *
 * @returns an error message when cleanup mode is selected without an LLM
 *   base URL; null otherwise (live mode has no LLM requirement).
 */
private getVoiceLlmConfigError(): string | null {
  const cleanupWithoutUrl = this.voiceMode === "cleanup" && !this.voiceLlmBaseUrl;
  return cleanupWithoutUrl ? "Cleanup mode missing LLM URL" : null;
}
|
||||
|
||||
/**
 * Determines which model id to send with LLM requests.
 *
 * Resolution order: the configured override, then a previously discovered
 * model, then discovery via `GET {base}/v1/models`, taking the first entry
 * with a non-empty id and caching it for later calls.
 *
 * @throws Error when discovery returns a non-OK status or lists no usable
 *   model. The request is aborted after VOICE_LLM_TIMEOUT_MS.
 */
private async resolveVoiceLlmModel(): Promise<string> {
  if (this.voiceLlmModelOverride) {
    return this.voiceLlmModelOverride;
  }
  if (this.voiceLlmDetectedModel) {
    return this.voiceLlmDetectedModel;
  }

  const controller = new AbortController();
  const timeoutId = window.setTimeout(() => controller.abort(), VOICE_LLM_TIMEOUT_MS);
  try {
    const response = await fetch(`${this.voiceLlmBaseUrl}/v1/models`, {
      method: "GET",
      headers: {
        // This GET has no body, so it declares what it accepts rather than a
        // Content-Type. A JSON Content-Type would also turn a cross-origin
        // call into a non-simple request that needs a CORS preflight.
        Accept: "application/json",
      },
      signal: controller.signal,
    });
    if (!response.ok) {
      const body = await response.text().catch(() => "");
      throw new Error(`LLM model discovery failed: ${response.status}${body ? ` ${body}` : ""}`);
    }
    const payload = await response.json() as OpenAIModelsResponse;
    const model = payload.data?.find((entry) => typeof entry.id === "string" && entry.id.trim())?.id?.trim();
    if (!model) {
      throw new Error("LLM model discovery returned no models");
    }
    this.voiceLlmDetectedModel = model;
    return model;
  } finally {
    clearTimeout(timeoutId);
  }
}
|
||||
|
||||
/**
 * Sends a non-streaming chat-completions request and returns the first
 * choice's text.
 *
 * @param messages conversation to send, in order.
 * @returns the trimmed message content; array-form content is concatenated
 *   from each part's `text`.
 * @throws Error on a non-OK response or when the first choice has no
 *   content. The request is aborted after VOICE_LLM_TIMEOUT_MS.
 */
private async requestVoiceChatCompletion(messages: OpenAIChatMessage[]): Promise<string> {
  const model = await this.resolveVoiceLlmModel();
  const abort = new AbortController();
  const timer = window.setTimeout(() => abort.abort(), VOICE_LLM_TIMEOUT_MS);
  try {
    const requestBody = JSON.stringify({ model, messages, stream: false });
    const response = await fetch(`${this.voiceLlmBaseUrl}/v1/chat/completions`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: requestBody,
      signal: abort.signal,
    });

    if (!response.ok) {
      const detail = await response.text().catch(() => "");
      const suffix = detail ? ` ${detail}` : "";
      throw new Error(`LLM request failed: ${response.status}${suffix}`);
    }

    const payload = (await response.json()) as {
      choices?: Array<{ message?: { content?: string | Array<{ text?: string }> } }>;
    };
    const content = payload.choices?.[0]?.message?.content;

    if (Array.isArray(content)) {
      let joined = "";
      for (const part of content) {
        joined += part.text ?? "";
      }
      return joined.trim();
    }
    if (typeof content === "string") {
      return content.trim();
    }
    throw new Error("LLM response missing content");
  } finally {
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Cleans a raw transcript with two LLM calls: an analysis pass, then a
 * cleanup pass that is given the analysis exchange as prior conversation.
 *
 * @returns the text produced by the cleanup pass.
 */
private async cleanupVoiceTranscript(rawTranscript: string): Promise<string> {
  const analysisTurn: OpenAIChatMessage = {
    role: "user",
    content: `${rawTranscript}\n\n${VOICE_THINKING_ANALYSIS_PROMPT}`,
  };
  const analysis = await this.requestVoiceChatCompletion([
    { role: "system", content: VOICE_THINKING_SYSTEM_PROMPT },
    analysisTurn,
  ]);

  const conversation: OpenAIChatMessage[] = [
    { role: "system", content: VOICE_CLEANUP_SYSTEM_PROMPT },
    analysisTurn,
    { role: "assistant", content: analysis },
    {
      role: "user",
      content: `<transcript>\n${rawTranscript}\n</transcript>\n\nClean up the transcript now:`,
    },
  ];
  return this.requestVoiceChatCompletion(conversation);
}
|
||||
|
||||
/**
 * Runs LLM cleanup on the current draft and sends the result to the terminal.
 *
 * Each call takes a fresh token. If the token has changed by the time the
 * LLM answers, the result is dropped without touching any state.
 *
 * @param action "insert" types the cleaned text; "submit" also sends a
 *   carriage return after it.
 */
private async finalizeVoiceCleanup(action: VoiceFinalizeAction): Promise<void> {
  this.voiceFinalizeToken += 1;
  const myToken = this.voiceFinalizeToken;

  const draft = this.stripVoiceCommandSuffix(this.voiceDraftTranscript);
  if (!draft) {
    this.resetVoiceDraftState();
    this.setVoiceState("idle", "Ready: Cleanup");
    return;
  }

  this.setVoiceState("processing", "Cleaning...");
  const llmOutput = await this.cleanupVoiceTranscript(draft);
  const cleaned = llmOutput.trim();
  if (this.voiceFinalizeToken !== myToken) {
    // A newer finalize or a cancel ran while this request was in flight.
    return;
  }

  this.resetVoiceDraftState();
  if (!cleaned) {
    this.setVoiceState("idle", "Ready: Cleanup");
    return;
  }

  this.sendStdin(cleaned);
  if (action !== "submit") {
    this.setVoiceState("idle", `Inserted: ${cleaned}`);
    return;
  }
  this.sendStdin("\r");
  this.setVoiceState("idle", `Submitted: ${cleaned}`);
}
|
||||
|
||||
/** Discards the cleanup draft and clears the pending-separator flag. */
private resetVoiceDraftState(): void {
  this.voicePendingSeparator = false;
  this.voiceDraftTranscript = "";
}
|
||||
|
||||
private ensureErrorOverlay(): HTMLDivElement {
|
||||
if (this.errorOverlay) {
|
||||
return this.errorOverlay;
|
||||
@@ -1827,6 +2075,10 @@ class WebTerminal {
|
||||
if (assetOverride) {
|
||||
this.voiceAssetBase = assetOverride.endsWith("/") ? assetOverride : `${assetOverride}/`;
|
||||
}
|
||||
this.voiceLlmBaseUrl =
|
||||
this.element.dataset.voiceLlmBaseUrl?.trim().replace(/\/+$/, "") || DEFAULT_VOICE_LLM_BASE_URL;
|
||||
this.voiceLlmModelOverride = this.element.dataset.voiceLlmModel?.trim() ?? "";
|
||||
this.voiceMode = this.loadVoiceModePreference();
|
||||
|
||||
if (window.getComputedStyle(this.element).position === "static") {
|
||||
this.element.style.position = "relative";
|
||||
@@ -1876,7 +2128,12 @@ class WebTerminal {
|
||||
return;
|
||||
}
|
||||
|
||||
this.setVoiceState("idle", "Ready");
|
||||
const configError = this.getVoiceLlmConfigError();
|
||||
if (configError) {
|
||||
this.setVoiceState("error", configError);
|
||||
return;
|
||||
}
|
||||
this.setVoiceState("idle", this.voiceMode === "cleanup" ? "Ready: Cleanup" : "Ready: Live");
|
||||
}
|
||||
|
||||
private async toggleVoiceInput(): Promise<void> {
|
||||
@@ -1884,7 +2141,13 @@ class WebTerminal {
|
||||
return;
|
||||
}
|
||||
if (this.voiceProcessor) {
|
||||
await this.stopVoiceInput();
|
||||
await this.stopVoiceInput("insert");
|
||||
return;
|
||||
}
|
||||
|
||||
const configError = this.getVoiceLlmConfigError();
|
||||
if (configError) {
|
||||
this.setVoiceState("error", configError);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1895,11 +2158,13 @@ class WebTerminal {
|
||||
try {
|
||||
await this.startVoiceInput();
|
||||
await this.ensureVoiceRuntime();
|
||||
this.setVoiceState("listening", "Listening...");
|
||||
this.resetVoiceDraftState();
|
||||
this.setVoiceState("listening", this.voiceMode === "cleanup" ? "Listening: Cleanup" : "Listening...");
|
||||
this.focusTerminalInput();
|
||||
} catch (error) {
|
||||
console.error("[webterm] Failed to start sherpa voice input:", error);
|
||||
this.disconnectVoiceAudio();
|
||||
this.resetVoiceDraftState();
|
||||
this.setVoiceState("error", this.describeVoiceError(error));
|
||||
} finally {
|
||||
this.isVoiceStarting = false;
|
||||
@@ -2004,7 +2269,7 @@ class WebTerminal {
|
||||
await audioContext.resume().catch(() => undefined);
|
||||
}
|
||||
|
||||
private async stopVoiceInput(): Promise<void> {
|
||||
private async stopVoiceInput(finalizeAction?: VoiceFinalizeAction): Promise<void> {
|
||||
this.clearVoiceStartupErrorCapture();
|
||||
try {
|
||||
this.flushVoiceSegments();
|
||||
@@ -2012,7 +2277,22 @@ class WebTerminal {
|
||||
console.error("[webterm] Failed to flush sherpa voice segments:", error);
|
||||
}
|
||||
this.disconnectVoiceAudio();
|
||||
this.setVoiceState("idle", "Ready");
|
||||
if (this.voiceMode === "cleanup") {
|
||||
const action = finalizeAction ?? "insert";
|
||||
if (this.voiceDraftTranscript.trim()) {
|
||||
try {
|
||||
await this.finalizeVoiceCleanup(action);
|
||||
} catch (error) {
|
||||
console.error("[webterm] Failed to finalize cleanup transcript:", error);
|
||||
this.setVoiceState("error", this.describeVoiceError(error));
|
||||
}
|
||||
} else {
|
||||
this.resetVoiceDraftState();
|
||||
this.setVoiceState("idle", "Ready: Cleanup");
|
||||
}
|
||||
} else {
|
||||
this.setVoiceState("idle", "Ready: Live");
|
||||
}
|
||||
this.focusTerminalInput();
|
||||
}
|
||||
|
||||
@@ -2110,7 +2390,30 @@ class WebTerminal {
|
||||
this.setVoiceState("listening", "Listening...");
|
||||
continue;
|
||||
}
|
||||
this.sendStdin(transcript);
|
||||
if (this.voiceMode === "cleanup") {
|
||||
const fullDraft = this.appendVoiceDraftSegment(transcript);
|
||||
const command = this.getVoiceCommandAction(fullDraft);
|
||||
if (command === "cancel") {
|
||||
this.voiceFinalizeToken += 1;
|
||||
this.resetVoiceDraftState();
|
||||
this.disconnectVoiceAudio();
|
||||
this.setVoiceState("idle", "Canceled");
|
||||
this.focusTerminalInput();
|
||||
return;
|
||||
}
|
||||
if (command === "insert" || command === "submit") {
|
||||
this.disconnectVoiceAudio();
|
||||
void this.finalizeVoiceCleanup(command).catch((error) => {
|
||||
console.error("[webterm] Failed to finalize cleanup transcript:", error);
|
||||
this.setVoiceState("error", this.describeVoiceError(error));
|
||||
});
|
||||
this.focusTerminalInput();
|
||||
return;
|
||||
}
|
||||
this.setVoiceState("listening", `Draft: ${this.voiceDraftPreview()}`);
|
||||
continue;
|
||||
}
|
||||
this.sendStdin(this.formatVoiceTranscriptForInsert(transcript));
|
||||
this.setVoiceState("listening", `Sent: ${transcript}`);
|
||||
this.focusTerminalInput();
|
||||
} finally {
|
||||
@@ -2119,10 +2422,20 @@ class WebTerminal {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Prepares a recognised segment for appending after earlier segments.
 *
 * A single space is prefixed when the previous segment did not end in
 * whitespace and this one starts with neither whitespace nor one of
 * `, . ; : ! ? )`. The pending-separator flag is then updated from this
 * segment's own ending.
 */
private formatVoiceTranscriptForInsert(transcript: string): string {
  const separatorPending = this.voicePendingSeparator;
  this.voicePendingSeparator = !/\s$/.test(transcript);

  if (!separatorPending) {
    return transcript;
  }
  if (/^\s/.test(transcript) || /^[,.;:!?)]/.test(transcript)) {
    return transcript;
  }
  return ` ${transcript}`;
}
|
||||
|
||||
/**
 * Drains the voice buffer with its flag set to true, then resets the VAD and
 * buffer (when present) and clears the pending-separator flag.
 */
private flushVoiceSegments(): void {
  this.drainVoiceBuffer(true);
  if (this.voiceVad) {
    this.voiceVad.reset();
  }
  if (this.voiceBuffer) {
    this.voiceBuffer.reset();
  }
  this.voicePendingSeparator = false;
}
|
||||
|
||||
private destroyVoiceEngine(): void {
|
||||
@@ -2132,6 +2445,7 @@ class WebTerminal {
|
||||
this.voiceRecognizer = null;
|
||||
this.voiceVad = null;
|
||||
this.voiceBuffer = null;
|
||||
this.voicePendingSeparator = false;
|
||||
}
|
||||
|
||||
private downsampleBuffer(
|
||||
@@ -2169,6 +2483,9 @@ class WebTerminal {
|
||||
if (typeof error === "string" && error.trim()) {
|
||||
return error.trim();
|
||||
}
|
||||
if (error instanceof DOMException && error.name === "AbortError") {
|
||||
return `LLM request timed out after ${Math.round(VOICE_LLM_TIMEOUT_MS / 1000)}s`;
|
||||
}
|
||||
if (error instanceof Error && error.message.trim()) {
|
||||
const cause =
|
||||
"cause" in error && error.cause
|
||||
@@ -2231,7 +2548,7 @@ class WebTerminal {
|
||||
}
|
||||
|
||||
private setVoiceState(
|
||||
state: "idle" | "loading" | "listening" | "error" | "unsupported",
|
||||
state: "idle" | "loading" | "listening" | "processing" | "error" | "unsupported",
|
||||
message: string
|
||||
): void {
|
||||
this.voiceState = state;
|
||||
@@ -2255,6 +2572,10 @@ class WebTerminal {
|
||||
if (this.voiceControls) {
|
||||
this.voiceControls.style.background = "rgba(45, 10, 10, 0.78)";
|
||||
}
|
||||
} else if (state === "processing") {
|
||||
if (this.voiceControls) {
|
||||
this.voiceControls.style.background = "rgba(16, 34, 44, 0.82)";
|
||||
}
|
||||
} else if (state === "error") {
|
||||
if (this.voiceControls) {
|
||||
this.voiceControls.style.background = "rgba(46, 29, 8, 0.82)";
|
||||
@@ -2457,7 +2778,7 @@ class WebTerminal {
|
||||
if (this.voiceState === "listening") {
|
||||
return "Stop";
|
||||
}
|
||||
if (this.voiceState === "loading") {
|
||||
if (this.voiceState === "loading" || this.voiceState === "processing") {
|
||||
return "Mic...";
|
||||
}
|
||||
return "Mic";
|
||||
@@ -2542,7 +2863,7 @@ class WebTerminal {
|
||||
return this.mobileKeyboardMode === "symbol";
|
||||
}
|
||||
if (key.key.actionId === "toggle-voice") {
|
||||
return this.voiceState === "listening" || this.voiceState === "loading";
|
||||
return this.voiceState === "listening" || this.voiceState === "loading" || this.voiceState === "processing";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -2901,6 +3222,8 @@ class WebTerminal {
|
||||
<span class="keybar-label">Font</span>
|
||||
<button class="keybar-font-shrink" title="Smaller font">A−</button>
|
||||
<button class="keybar-font-grow" title="Larger font">A+</button>
|
||||
<span class="keybar-label">Voice</span>
|
||||
<button class="keybar-voice-mode" title="Toggle voice mode">Live</button>
|
||||
`;
|
||||
|
||||
keybar.appendChild(keysPanel);
|
||||
@@ -3047,6 +3370,21 @@ class WebTerminal {
|
||||
this.fit();
|
||||
});
|
||||
|
||||
const voiceModeButton = settingsPanel.querySelector(".keybar-voice-mode") as HTMLButtonElement | null;
|
||||
const updateVoiceModeButton = () => {
|
||||
if (!voiceModeButton) {
|
||||
return;
|
||||
}
|
||||
voiceModeButton.textContent = this.voiceMode === "cleanup" ? "Clean" : "Live";
|
||||
voiceModeButton.classList.toggle("active", this.voiceMode === "cleanup");
|
||||
};
|
||||
updateVoiceModeButton();
|
||||
voiceModeButton?.addEventListener("touchstart", (e) => {
|
||||
e.preventDefault();
|
||||
this.setVoiceMode(this.voiceMode === "cleanup" ? "live" : "cleanup");
|
||||
updateVoiceModeButton();
|
||||
});
|
||||
|
||||
// Handle key button presses
|
||||
keysPanel.querySelectorAll("button[data-key]").forEach((btn) => {
|
||||
btn.addEventListener("touchstart", (e) => {
|
||||
|
||||
Reference in New Issue
Block a user