Files
codex/codex-rs/protocol/src/exec_output.rs
Ahmed Ibrahim 6fff9955f1 extract models manager and related ownership from core (#16508)
## Summary
- split `models-manager` out of `core` and add `ModelsManagerConfig`
plus `Config::to_models_manager_config()` so model metadata paths stop
depending on `core::Config`
- move login-owned/auth-owned code out of `core` into `codex-login`,
move model provider config into `codex-model-provider-info`, move API
bridge mapping into `codex-api`, move protocol-owned types/impls into
`codex-protocol`, and move response debug helpers into a dedicated
`response-debug-context` crate
- move feedback tag emission into `codex-feedback`, relocate tests to
the crates that now own the code, and keep broad temporary re-exports so
this PR avoids a giant import-only rewrite

## Major moves and decisions
- created `codex-models-manager` as the owner for model
cache/catalog/config/model info logic, including the new
`ModelsManagerConfig` struct
- created `codex-model-provider-info` as the owner for provider config
parsing/defaults and kept temporary `codex-login`/`codex-core`
re-exports for old import paths
- moved `api_bridge` error mapping + `CoreAuthProvider` into
`codex-api`, while `codex-login::api_bridge` temporarily re-exports
those symbols and keeps the `auth_provider_from_auth` wrapper
- moved `auth_env_telemetry` and `provider_auth` ownership to
`codex-login`
- moved `CodexErr` ownership to `codex-protocol::error`, plus
`StreamOutput`, `bytes_to_string_smart`, and network policy helpers to
protocol-owned modules
- created `codex-response-debug-context` for
`extract_response_debug_context`, `telemetry_transport_error_message`,
and related response-debug plumbing instead of leaving that behavior in
`core`
- moved `FeedbackRequestTags`, `emit_feedback_request_tags`, and
`emit_feedback_request_tags_with_auth_env` to `codex-feedback`
- deferred removal of temporary re-exports and the mechanical import
rewrites to a stacked follow-up PR so this PR stays reviewable

## Test moves
- moved auth refresh coverage from `core/tests/suite/auth_refresh.rs` to
`login/tests/suite/auth_refresh.rs`
- moved text encoding coverage from
`core/tests/suite/text_encoding_fix.rs` to
`protocol/src/exec_output_tests.rs`
- moved model info override coverage from
`core/tests/suite/model_info_overrides.rs` to
`models-manager/src/model_info_overrides_tests.rs`

---------

Co-authored-by: Codex <noreply@openai.com>
2026-04-02 23:00:02 -07:00

170 lines
6.5 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Text encoding detection and conversion utilities for shell output.
//!
//! Windows users frequently run into code pages such as CP1251 or CP866 when invoking commands
//! through VS Code. Those bytes show up as invalid UTF-8 and used to be replaced with the standard
//! Unicode replacement character. We now lean on `chardetng` and `encoding_rs` so we can
//! automatically detect and decode the vast majority of legacy encodings before falling back to
//! lossy UTF-8 decoding.
use chardetng::EncodingDetector;
use encoding_rs::Encoding;
use encoding_rs::IBM866;
use encoding_rs::WINDOWS_1252;
use std::time::Duration;
#[derive(Debug, Clone)]
pub struct StreamOutput<T: Clone> {
pub text: T,
pub truncated_after_lines: Option<u32>,
}
impl StreamOutput<String> {
pub fn new(text: String) -> Self {
Self {
text,
truncated_after_lines: None,
}
}
}
impl StreamOutput<Vec<u8>> {
pub fn from_utf8_lossy(&self) -> StreamOutput<String> {
StreamOutput {
text: bytes_to_string_smart(&self.text),
truncated_after_lines: self.truncated_after_lines,
}
}
}
#[derive(Clone, Debug)]
pub struct ExecToolCallOutput {
pub exit_code: i32,
pub stdout: StreamOutput<String>,
pub stderr: StreamOutput<String>,
pub aggregated_output: StreamOutput<String>,
pub duration: Duration,
pub timed_out: bool,
}
impl Default for ExecToolCallOutput {
fn default() -> Self {
Self {
exit_code: 0,
stdout: StreamOutput::new(String::new()),
stderr: StreamOutput::new(String::new()),
aggregated_output: StreamOutput::new(String::new()),
duration: Duration::ZERO,
timed_out: false,
}
}
}
/// Attempts to convert arbitrary bytes to UTF-8 with best-effort encoding detection.
pub fn bytes_to_string_smart(bytes: &[u8]) -> String {
if bytes.is_empty() {
return String::new();
}
if let Ok(utf8_str) = std::str::from_utf8(bytes) {
return utf8_str.to_owned();
}
let encoding = detect_encoding(bytes);
decode_bytes(bytes, encoding)
}
// Windows-1252 reassigns a handful of 0x80-0x9F slots to smart punctuation (curly quotes, dashes,
// ™). CP866 uses those *same byte values* for uppercase Cyrillic letters. When chardetng sees shell
// snippets that mix these bytes with ASCII it sometimes guesses IBM866, so “smart quotes” render as
// Cyrillic garbage (“УФЦ”) in VS Code. However, CP866 uppercase tokens are perfectly valid output
// (e.g., `ПРИ test`) so we cannot flip every 0x80-0x9F byte to Windows-1252 either. The compromise
// is to only coerce IBM866 to Windows-1252 when (a) the high bytes are exclusively the punctuation
// values listed below and (b) we spot adjacent ASCII. This targets the real failure case without
// clobbering legitimate Cyrillic text. If another code page has a similar collision, introduce a
// dedicated allowlist (like this one) plus unit tests that capture the actual shell output we want
// to preserve. Windows-1252 byte values for smart punctuation.
const WINDOWS_1252_PUNCT_BYTES: [u8; 8] = [
0x91, // (left single quotation mark)
0x92, // (right single quotation mark)
0x93, // “ (left double quotation mark)
0x94, // ” (right double quotation mark)
0x95, // • (bullet)
0x96, // (en dash)
0x97, // — (em dash)
0x99, // ™ (trade mark sign)
];
fn detect_encoding(bytes: &[u8]) -> &'static Encoding {
let mut detector = EncodingDetector::new();
detector.feed(bytes, true);
let (encoding, _is_confident) = detector.guess_assess(None, true);
// chardetng occasionally reports IBM866 for short strings that only contain Windows-1252 “smart
// punctuation” bytes (0x80-0x9F) because that range maps to Cyrillic letters in IBM866. When
// those bytes show up alongside an ASCII word (typical shell output: `"“`test), we know the
// intent was likely CP1252 quotes/dashes. Prefer WINDOWS_1252 in that specific situation so we
// render the characters users expect instead of Cyrillic junk. References:
// - Windows-1252 reserving 0x80-0x9F for curly quotes/dashes:
// https://en.wikipedia.org/wiki/Windows-1252
// - CP866 mapping 0x93/0x94/0x96 to Cyrillic letters, so the same bytes show up as “УФЦ” when
// mis-decoded: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT
if encoding == IBM866 && looks_like_windows_1252_punctuation(bytes) {
return WINDOWS_1252;
}
encoding
}
fn decode_bytes(bytes: &[u8], encoding: &'static Encoding) -> String {
let (decoded, _, had_errors) = encoding.decode(bytes);
if had_errors {
return String::from_utf8_lossy(bytes).into_owned();
}
decoded.into_owned()
}
/// Detect whether the byte stream looks like Windows-1252 “smart punctuation” wrapped around
/// otherwise-ASCII text.
///
/// Context: IBM866 and Windows-1252 share the 0x80-0x9F slot range. In IBM866 these bytes decode to
/// Cyrillic letters, whereas Windows-1252 maps them to curly quotes and dashes. chardetng can guess
/// IBM866 for short snippets that only contain those bytes, which turns shell output such as
/// `“test”` into unreadable Cyrillic. To avoid that, we treat inputs comprising a handful of bytes
/// from the problematic range plus ASCII letters as CP1252 punctuation. We deliberately do *not*
/// cap how many of those punctuation bytes we accept: VS Code frequently prints several quoted
/// phrases (e.g., `"foo" - "bar"`), and truncating the count would once again mis-decode those as
/// Cyrillic. If we discover additional encodings with overlapping byte ranges, prefer adding
/// encoding-specific byte allowlists like `WINDOWS_1252_PUNCT` and tests that exercise real-world
/// shell snippets.
fn looks_like_windows_1252_punctuation(bytes: &[u8]) -> bool {
let mut saw_extended_punctuation = false;
let mut saw_ascii_word = false;
for &byte in bytes {
if byte >= 0xA0 {
return false;
}
if (0x80..=0x9F).contains(&byte) {
if !is_windows_1252_punct(byte) {
return false;
}
saw_extended_punctuation = true;
}
if byte.is_ascii_alphabetic() {
saw_ascii_word = true;
}
}
saw_extended_punctuation && saw_ascii_word
}
fn is_windows_1252_punct(byte: u8) -> bool {
WINDOWS_1252_PUNCT_BYTES.contains(&byte)
}
#[cfg(test)]
#[path = "exec_output_tests.rs"]
mod tests;