Files
codex/codex-rs/core/Cargo.toml
LIHUA 397279d46e Fix: Improve text encoding for shell output in VSCode preview (#6178) (#6182)
## 🐛 Problem

Users running commands with non-ASCII characters (like Russian text
"пример") in Windows/WSL environments experience garbled text in
VSCode's shell preview window, with Unicode replacement characters (�)
appearing instead of the actual text.

**Issue**: https://github.com/openai/codex/issues/6178

## 🔧 Root Cause

The issue was in `StreamOutput<Vec<u8>>::from_utf8_lossy()` method in
`codex-rs/core/src/exec.rs`, which used `String::from_utf8_lossy()` to
convert shell output bytes to strings. This function immediately
replaces any invalid UTF-8 byte sequences with replacement characters,
without attempting to decode using other common encodings.

In Windows/WSL environments, shell output often uses encodings like:

- Windows-1252 (common Windows encoding)
- Latin-1/ISO-8859-1 (extended ASCII)

## 🛠️ Solution

Replaced the simple `String::from_utf8_lossy()` call with intelligent
encoding detection via a new `bytes_to_string_smart()` function that
tries multiple encoding strategies:

1. **UTF-8** (fast path for valid UTF-8)
2. **Windows-1252** (handles Windows-specific characters in 0x80-0x9F
range)
3. **Latin-1** (fallback for extended ASCII)
4. **Lossy UTF-8** (final fallback, same as before)

## 📁 Changes

### New Files

- `codex-rs/core/src/text_encoding.rs` - Smart encoding detection module
- `codex-rs/core/tests/suite/text_encoding_fix.rs` - Integration tests

### Modified Files

- `codex-rs/core/src/lib.rs` - Added text_encoding module
- `codex-rs/core/src/exec.rs` - Updated StreamOutput::from_utf8_lossy()
- `codex-rs/core/tests/suite/mod.rs` - Registered new test module

##  Testing

- **5 unit tests** covering UTF-8, Windows-1252, Latin-1, and fallback
scenarios
- **2 integration tests** simulating the exact Issue #6178 scenario
- **Demonstrates improvement** over the previous
`String::from_utf8_lossy()` approach

All tests pass:

```bash
cargo test -p codex-core text_encoding
cargo test -p codex-core test_shell_output_encoding_issue_6178
```

## 🎯 Impact

-  **Eliminates garbled text** in VSCode shell preview for non-ASCII
content
-  **Supports Windows/WSL environments** with proper encoding detection
-  **Zero performance impact** for UTF-8 text (fast path)
-  **Backward compatible** - UTF-8 content works exactly as before
-  **Handles edge cases** with robust fallback mechanism

## 🧪 Test Scenarios

The fix has been tested with:

- Russian text ("пример")
- Windows-1252 quotation marks (""test")
- Latin-1 accented characters ("café")
- Mixed encoding content
- Invalid byte sequences (graceful fallback)

## 📋 Checklist

- [X] Addresses the reported issue
- [X] Includes comprehensive tests
- [X] Maintains backward compatibility
- [X] Follows project coding conventions
- [X] No breaking changes

---------

Co-authored-by: Josh McKinney <joshka@openai.com>
2025-11-20 11:04:11 -08:00

131 lines
4.1 KiB
TOML

[package]
edition = "2024"
name = "codex-core"
version = { workspace = true }
[lib]
doctest = false
name = "codex_core"
path = "src/lib.rs"
[lints]
workspace = true
[dependencies]
anyhow = { workspace = true }
askama = { workspace = true }
async-channel = { workspace = true }
async-trait = { workspace = true }
base64 = { workspace = true }
bytes = { workspace = true }
chrono = { workspace = true, features = ["serde"] }
chardetng = { workspace = true }
codex-app-server-protocol = { workspace = true }
codex-apply-patch = { workspace = true }
codex-async-utils = { workspace = true }
codex-execpolicy = { workspace = true }
codex-file-search = { workspace = true }
codex-git = { workspace = true }
codex-keyring-store = { workspace = true }
codex-otel = { workspace = true, features = ["otel"] }
codex-protocol = { workspace = true }
codex-rmcp-client = { workspace = true }
codex-utils-pty = { workspace = true }
codex-utils-readiness = { workspace = true }
codex-utils-string = { workspace = true }
codex-utils-tokenizer = { workspace = true }
codex-windows-sandbox = { package = "codex-windows-sandbox", path = "../windows-sandbox-rs" }
dirs = { workspace = true }
dunce = { workspace = true }
env-flags = { workspace = true }
encoding_rs = { workspace = true }
eventsource-stream = { workspace = true }
futures = { workspace = true }
http = { workspace = true }
indexmap = { workspace = true }
keyring = { workspace = true, features = ["crypto-rust"] }
libc = { workspace = true }
mcp-types = { workspace = true }
os_info = { workspace = true }
rand = { workspace = true }
regex-lite = { workspace = true }
reqwest = { workspace = true, features = ["json", "stream"] }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
sha1 = { workspace = true }
sha2 = { workspace = true }
shlex = { workspace = true }
similar = { workspace = true }
strum_macros = { workspace = true }
tempfile = { workspace = true }
test-case = "3.3.1"
test-log = { workspace = true }
thiserror = { workspace = true }
time = { workspace = true, features = [
"formatting",
"parsing",
"local-offset",
"macros",
] }
tokio = { workspace = true, features = [
"io-std",
"macros",
"process",
"rt-multi-thread",
"signal",
] }
tokio-util = { workspace = true, features = ["rt"] }
toml = { workspace = true }
toml_edit = { workspace = true }
tracing = { workspace = true, features = ["log"] }
tree-sitter = { workspace = true }
tree-sitter-bash = { workspace = true }
uuid = { workspace = true, features = ["serde", "v4", "v5"] }
which = { workspace = true }
wildmatch = { workspace = true }
[target.'cfg(target_os = "linux")'.dependencies]
landlock = { workspace = true }
seccompiler = { workspace = true }
keyring = { workspace = true, features = ["linux-native-async-persistent"] }
[target.'cfg(target_os = "macos")'.dependencies]
core-foundation = "0.9"
keyring = { workspace = true, features = ["apple-native"] }
# Build OpenSSL from source for musl builds.
[target.x86_64-unknown-linux-musl.dependencies]
openssl-sys = { workspace = true, features = ["vendored"] }
# Build OpenSSL from source for musl builds.
[target.aarch64-unknown-linux-musl.dependencies]
openssl-sys = { workspace = true, features = ["vendored"] }
[target.'cfg(target_os = "windows")'.dependencies]
keyring = { workspace = true, features = ["windows-native"] }
[target.'cfg(any(target_os = "freebsd", target_os = "openbsd"))'.dependencies]
keyring = { workspace = true, features = ["sync-secret-service"] }
[dev-dependencies]
assert_cmd = { workspace = true }
assert_matches = { workspace = true }
codex-arg0 = { workspace = true }
core_test_support = { workspace = true }
ctor = { workspace = true }
escargot = { workspace = true }
image = { workspace = true, features = ["jpeg", "png"] }
maplit = { workspace = true }
predicates = { workspace = true }
pretty_assertions = { workspace = true }
serial_test = { workspace = true }
tempfile = { workspace = true }
tokio-test = { workspace = true }
tracing-test = { workspace = true, features = ["no-env-filter"] }
walkdir = { workspace = true }
wiremock = { workspace = true }
[package.metadata.cargo-shear]
ignored = ["openssl-sys"]