From 674e62d98330f1e6b8de8d597daedb3826cfe8e2 Mon Sep 17 00:00:00 2001 From: GitHub Copilot Date: Sat, 14 Feb 2026 16:40:28 +0000 Subject: [PATCH] refactor: upgrade go-te to v0.1.0, remove C1 normalization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit go-te v0.1.0 fixes the ByteStream C1 bug (Latin-1 fallback for invalid UTF-8 bytes), so NormalizeC1Controls is no longer needed. Removed: - NormalizeC1Controls function and its tests/fuzz target - utf8Buffer field from TerminalSession and DockerExecSession - C1BUG.md (fix shipped upstream) The handleOutput pipeline now does: FilterDA → replay → tracker → connector (was: FilterDA → NormalizeC1 → replay → tracker → connector) All tests pass with -race. 13 fuzz targets remain. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- go/C1BUG.md | 169 ------------------------------ go/go.mod | 2 +- go/go.sum | 4 +- go/webterm/docker_exec_session.go | 11 +- go/webterm/normalize.go | 67 ------------ go/webterm/normalize_test.go | 58 ---------- go/webterm/terminal_session.go | 11 +- 7 files changed, 11 insertions(+), 311 deletions(-) delete mode 100644 go/C1BUG.md diff --git a/go/C1BUG.md b/go/C1BUG.md deleted file mode 100644 index d065727..0000000 --- a/go/C1BUG.md +++ /dev/null @@ -1,169 +0,0 @@ -# Bug: ByteStream in UTF-8 mode permanently stalls on C1 control bytes - -## Summary - -`ByteStream.Feed()` in UTF-8 mode (the default) permanently stalls when it encounters a raw C1 control byte (0x80–0x9F). The invalid byte remains in the internal buffer and blocks all subsequent input from being processed, effectively freezing the terminal emulator. The buffer also grows without bound as new data is appended but never consumed. - -This is despite `Stream.handleGround()` having correct and complete C1 handling at the rune level. - -## Affected code - -`pkg/te/byte_stream.go`, lines 17–29: - -```go -func (st *ByteStream) Feed(data []byte) error { - if st.useUTF8 { - st.buffer = append(st.buffer, data...) - var out []rune - for len(st.buffer) > 0 { - r, size := utf8.DecodeRune(st.buffer) - if r == utf8.RuneError && size == 1 { - break // ← BUG: stops forever, bad byte stays in buffer - } - st.buffer = st.buffer[size:] - out = append(out, r) - } - return st.Stream.Feed(string(out)) - } - // ...non-UTF8 path works correctly... -} -``` - -## Root cause - -Bytes in the range 0x80–0xBF are not valid UTF-8 lead bytes. When `utf8.DecodeRune()` encounters one, it returns `(RuneError, 1)`. The current code responds by **breaking out of the loop** without consuming the byte. On the next call to `Feed()`, new data is appended to the buffer, but the loop immediately hits the same stalled byte and breaks again. - -This creates two problems: - -1. **Permanent parser stall** — no further input is ever processed. -2. **Unbounded memory growth** — every `Feed()` call appends to the buffer but nothing is ever consumed. - -## Why C1 bytes appear in real terminal output - -C1 control codes (0x80–0x9F) are part of the ISO/IEC 6429 (ECMA-48) standard and are actively used by: - -- **tmux** — sends `0x9B` (CSI) instead of `ESC [` in certain configurations -- **screen** — similarly uses 8-bit C1 controls -- **Some SSH servers** — may emit C1 codes depending on locale/terminfo -- **Legacy applications** — programs that assume 8-bit terminal support - -The `Stream.handleGround()` function already handles these correctly: - -```go -case '\x9b': st.state = stateCSI; st.resetCSI() // CSI -case '\x9d': st.state = stateOSC; st.current = "" // OSC -case '\x90': st.state = stateDCS; st.dcsData = "" // DCS -case '\x98': st.state = stateSOS // SOS -case '\x9e': st.state = statePM // PM -case '\x9f': st.state = stateAPC // APC -case '\x9a': st.listener.ReportDeviceAttributes(...) // DECID -case '\x84': st.listener.Index() // IND -case '\x85': st.listener.LineFeed(); ...CarriageReturn // NEL -case '\x88': st.listener.SetTabStop() // HTS -case '\x8d': st.listener.ReverseIndex() // RI -case '\x96': st.listener.StartProtectedArea() // SPA -case '\x97': st.listener.EndProtectedArea() // EPA -``` - -The bug is only in the byte-to-rune conversion layer (`ByteStream`), not in the parser itself. - -## Reproduction - -```go -package main - -import ( - "fmt" - "github.com/rcarmo/go-te/pkg/te" -) - -func main() { - screen := te.NewDiffScreen(20, 3) - stream := te.NewByteStream(screen, false) - - // 0x9B is C1 CSI — equivalent to ESC [ - // This should render "A" in red (SGR 31) - stream.Feed([]byte{0x9B, '3', '1', 'm', 'A'}) - fmt.Printf("Cell[0][0]: %q\n", screen.Buffer[0][0].Data) - // Got: " " (empty) — the 0x9B stalled the parser - - // All subsequent input is silently dropped - stream.Feed([]byte("Hello World")) - fmt.Printf("Cell[0][0]: %q\n", screen.Buffer[0][0].Data) - // Got: " " (still empty) — parser is permanently frozen - - // Prove that Stream (rune-level) handles it correctly - screen2 := te.NewDiffScreen(20, 3) - stream2 := te.NewStream(screen2, false) - stream2.Feed(string([]rune{0x9B, '3', '1', 'm', 'A'})) - fmt.Printf("Stream cell: %q fg=%s\n", - screen2.Buffer[0][0].Data, - screen2.Buffer[0][0].Attr.Fg.Name) - // Got: "A" fg=red — works perfectly at rune level -} -``` - -Output: -``` -Cell[0][0]: " " -Cell[0][0]: " " -Stream cell: "A" fg=red -``` - -## Suggested fix - -When `DecodeRune` returns `RuneError`, treat the byte as a Latin-1 code point (i.e., `rune(st.buffer[0])`) and advance by one byte. This passes the raw value to `Stream.handleGround()` where the existing C1 switch cases handle it correctly. - -```go -func (st *ByteStream) Feed(data []byte) error { - if st.useUTF8 { - st.buffer = append(st.buffer, data...) - var out []rune - for len(st.buffer) > 0 { - r, size := utf8.DecodeRune(st.buffer) - if r == utf8.RuneError && size == 1 { - // Treat invalid UTF-8 byte as Latin-1 code point. - // This allows C1 controls (0x80-0x9F) to reach - // Stream.handleGround() where they are handled correctly. - out = append(out, rune(st.buffer[0])) - st.buffer = st.buffer[1:] - continue - } - st.buffer = st.buffer[size:] - out = append(out, r) - } - return st.Stream.Feed(string(out)) - } - out := make([]rune, len(data)) - for i, b := range data { - out[i] = rune(b) - } - return st.Stream.Feed(string(out)) -} -``` - -This matches the behavior of the non-UTF8 code path (line 31–34) which already does `rune(b)` for each byte, and is consistent with how real terminal emulators (xterm, VTE, etc.) handle C1 bytes in UTF-8 mode. - -## Impact - -Any application using `ByteStream` (the standard way to feed raw PTY output to go-te) will silently freeze if the child process or terminal multiplexer emits a single C1 control byte. The workaround is to pre-process all input with a C1→7-bit normalizer before calling `Feed()`, which is what we do in the webterm Go port via `NormalizeC1Controls()`. - -## Affected C1 bytes - -| Byte | Name | Stream handler | -|------|------|----------------| -| 0x84 | IND (Index) | `listener.Index()` | -| 0x85 | NEL (Next Line) | `listener.LineFeed()` + `CarriageReturn()` | -| 0x88 | HTS (Horizontal Tab Set) | `listener.SetTabStop()` | -| 0x8D | RI (Reverse Index) | `listener.ReverseIndex()` | -| 0x90 | DCS (Device Control String) | enters DCS state | -| 0x96 | SPA (Start Protected Area) | `listener.StartProtectedArea()` | -| 0x97 | EPA (End Protected Area) | `listener.EndProtectedArea()` | -| 0x98 | SOS (Start of String) | enters SOS state | -| 0x9A | DECID | `listener.ReportDeviceAttributes()` | -| 0x9B | CSI (Control Sequence Introducer) | enters CSI state | -| 0x9D | OSC (Operating System Command) | enters OSC state | -| 0x9E | PM (Privacy Message) | enters PM state | -| 0x9F | APC (Application Program Command) | enters APC state | - -All 13 of these are unreachable via `ByteStream` in UTF-8 mode despite having working handlers in `Stream`. diff --git a/go/go.mod b/go/go.mod index c7ffde0..57fbf89 100644 --- a/go/go.mod +++ b/go/go.mod @@ -6,7 +6,7 @@ require ( github.com/creack/pty v1.1.18 github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 github.com/gorilla/websocket v1.5.3 - github.com/rcarmo/go-te v0.0.0-20260214100434-edf070e453db + github.com/rcarmo/go-te v0.1.0 gopkg.in/yaml.v3 v3.0.1 ) diff --git a/go/go.sum b/go/go.sum index 0798803..0a3d51d 100644 --- a/go/go.sum +++ b/go/go.sum @@ -8,8 +8,8 @@ github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aN github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw= github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= -github.com/rcarmo/go-te v0.0.0-20260214100434-edf070e453db h1:wJrQ4ABeQey49QEDHbHHZISpXoC5OtIeihEF4WA2M14= -github.com/rcarmo/go-te v0.0.0-20260214100434-edf070e453db/go.mod h1:cLsrtroxCubS+OHHwH0riB6xeNESfntaHEeI1jPAedk= +github.com/rcarmo/go-te v0.1.0 h1:BH9Ub+A0AVBY5Q00El4QMVaWAMbycVHgMHQI2Kz8J/o= +github.com/rcarmo/go-te v0.1.0/go.mod h1:cLsrtroxCubS+OHHwH0riB6xeNESfntaHEeI1jPAedk= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= diff --git a/go/webterm/docker_exec_session.go b/go/webterm/docker_exec_session.go index 2ebb69e..3b34a56 100644 --- a/go/webterm/docker_exec_session.go +++ b/go/webterm/docker_exec_session.go @@ -43,7 +43,6 @@ type DockerExecSession struct { tracker *terminalstate.Tracker replay *ReplayBuffer escapeBuffer []byte - utf8Buffer []byte width int height int running bool @@ -145,19 +144,17 @@ func (s *DockerExecSession) handleOutput(data []byte) { s.mu.Lock() filtered, escapeBuffer := FilterDASequences(data, s.escapeBuffer) s.escapeBuffer = escapeBuffer - normalized, utf8Buffer := NormalizeC1Controls(filtered, s.utf8Buffer) - s.utf8Buffer = utf8Buffer tracker := s.tracker connector := s.connector s.mu.Unlock() - if len(normalized) == 0 { + if len(filtered) == 0 { return } - s.replay.Add(normalized) + s.replay.Add(filtered) if tracker != nil { - _ = tracker.Feed(normalized) + _ = tracker.Feed(filtered) } - connector.OnData(normalized) + connector.OnData(filtered) } func (s *DockerExecSession) createExec() (string, error) { diff --git a/go/webterm/normalize.go b/go/webterm/normalize.go index 6b42b82..277c7e3 100644 --- a/go/webterm/normalize.go +++ b/go/webterm/normalize.go @@ -10,73 +10,6 @@ var ( daPartialPattern = regexp.MustCompile(`\x1b(?:\[(?:[?>=][\d;]*)?)?$`) ) -func NormalizeC1Controls(data []byte, utf8Buffer []byte) ([]byte, []byte) { - if len(data) == 0 && len(utf8Buffer) == 0 { - return nil, nil - } - merged := append(append([]byte{}, utf8Buffer...), data...) - out := make([]byte, 0, len(merged)) - pending := make([]byte, 0, 4) - expectedContinuations := 0 - - c1Map := map[byte][]byte{ - 0x9B: []byte("\x1b["), - 0x9D: []byte("\x1b]"), - 0x9C: []byte("\x1b\\"), - 0x90: []byte("\x1bP"), - 0x98: []byte("\x1bX"), - 0x9E: []byte("\x1b^"), - 0x9F: []byte("\x1b_"), - } - - for i := 0; i < len(merged); { - b := merged[i] - if expectedContinuations > 0 { - if b >= 0x80 && b <= 0xBF { - pending = append(pending, b) - expectedContinuations-- - i++ - if expectedContinuations == 0 { - out = append(out, pending...) - pending = pending[:0] - } - continue - } - out = append(out, pending...) - pending = pending[:0] - expectedContinuations = 0 - continue - } - switch { - case b >= 0xC2 && b <= 0xDF: - pending = append(pending, b) - expectedContinuations = 1 - i++ - continue - case b >= 0xE0 && b <= 0xEF: - pending = append(pending, b) - expectedContinuations = 2 - i++ - continue - case b >= 0xF0 && b <= 0xF4: - pending = append(pending, b) - expectedContinuations = 3 - i++ - continue - } - if replacement, ok := c1Map[b]; ok { - out = append(out, replacement...) - } else { - out = append(out, b) - } - i++ - } - if len(pending) > 0 { - return out, pending - } - return out, nil -} - func FilterDASequences(data []byte, escapeBuffer []byte) ([]byte, []byte) { merged := append(append([]byte{}, escapeBuffer...), data...) if len(merged) == 0 { diff --git a/go/webterm/normalize_test.go b/go/webterm/normalize_test.go index 74235bc..7cf3bfc 100644 --- a/go/webterm/normalize_test.go +++ b/go/webterm/normalize_test.go @@ -5,32 +5,6 @@ import ( "testing" ) -func TestNormalizeC1Controls(t *testing.T) { - input := []byte{0x9B, '3', '1', 'm', 'A'} - normalized, pending := NormalizeC1Controls(input, nil) - if string(pending) != "" { - t.Fatalf("expected no pending bytes, got %q", string(pending)) - } - if string(normalized) != "\x1b[31mA" { - t.Fatalf("unexpected normalized output: %q", string(normalized)) - } -} - -func TestNormalizeC1ControlsPreservesSplitUTF8(t *testing.T) { - first := []byte{0xC3} - normalized, pending := NormalizeC1Controls(first, nil) - if len(normalized) != 0 { - t.Fatalf("expected no output for incomplete utf8") - } - second, pending2 := NormalizeC1Controls([]byte{0xA9}, pending) - if len(pending2) != 0 { - t.Fatalf("expected no pending bytes after completion") - } - if string(second) != "é" { - t.Fatalf("unexpected utf8 output: %q", string(second)) - } -} - func TestFilterDASequencesCompleteAndPartial(t *testing.T) { data := []byte("a\x1b[?1;10;0cb") filtered, buffer := FilterDASequences(data, nil) @@ -57,38 +31,6 @@ func TestFilterDASequencesCompleteAndPartial(t *testing.T) { } } -func FuzzNormalizeC1Controls(f *testing.F) { - f.Add([]byte{0x9B, '3', '1', 'm', 'A'}, []byte{}) - f.Add([]byte{0xC3, 0xA9}, []byte{}) - f.Add([]byte{0xA9}, []byte{0xC3}) - f.Add([]byte("hello world"), []byte{}) - f.Add([]byte{0xF0, 0x9F, 0x98, 0x80}, []byte{}) - f.Add([]byte{0x90, 0x98, 0x9C, 0x9D, 0x9E, 0x9F}, []byte{}) - f.Add([]byte{}, []byte{0xE0, 0xA0}) - - f.Fuzz(func(t *testing.T, data []byte, pending []byte) { - out, remaining := NormalizeC1Controls(data, pending) - // Must never panic (implicit). Output + remaining should account for all non-C1 bytes. - _ = out - // Remaining must be a valid incomplete UTF-8 prefix (0-3 bytes) - if len(remaining) > 3 { - t.Errorf("remaining too large: %d bytes", len(remaining)) - } - // No C1 control bytes should survive in output - for _, b := range out { - if b >= 0x80 && b <= 0x9F { - // Could be part of valid UTF-8 continuation byte (0x80-0xBF) - if b >= 0x90 && b <= 0x9F { - // These are C1 controls that should have been replaced, - // but only if they weren't valid UTF-8 continuation bytes. - // Since C1 replacement only happens for lead bytes (not continuations), - // we just verify no standalone C1 controls remain. - } - } - } - }) -} - func FuzzFilterDASequences(f *testing.F) { f.Add([]byte("a\x1b[?1;10;0cb"), []byte{}) f.Add([]byte("plain text"), []byte{}) diff --git a/go/webterm/terminal_session.go b/go/webterm/terminal_session.go index 4eafd7c..49d9432 100644 --- a/go/webterm/terminal_session.go +++ b/go/webterm/terminal_session.go @@ -24,7 +24,6 @@ type TerminalSession struct { tracker *terminalstate.Tracker replay *ReplayBuffer escapeBuffer []byte - utf8Buffer []byte width int height int started bool @@ -134,20 +133,18 @@ func (s *TerminalSession) handleOutput(data []byte) { s.mu.Lock() filtered, escapeBuffer := FilterDASequences(data, s.escapeBuffer) s.escapeBuffer = escapeBuffer - normalized, utf8Buffer := NormalizeC1Controls(filtered, s.utf8Buffer) - s.utf8Buffer = utf8Buffer tracker := s.tracker connector := s.connector s.mu.Unlock() - if len(normalized) == 0 { + if len(filtered) == 0 { return } - s.replay.Add(normalized) + s.replay.Add(filtered) if tracker != nil { - _ = tracker.Feed(normalized) + _ = tracker.Feed(filtered) } - connector.OnData(normalized) + connector.OnData(filtered) } func (s *TerminalSession) Close() error {