diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index a37e13bbfe..c8af30147d 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -3856,6 +3856,8 @@ version = "0.0.0" dependencies = [ "pretty_assertions", "regex-lite", + "serde", + "serde_json", ] [[package]] diff --git a/codex-rs/core/src/turn_metadata.rs b/codex-rs/core/src/turn_metadata.rs index 97902ce1e1..f6a338b9ac 100644 --- a/codex-rs/core/src/turn_metadata.rs +++ b/codex-rs/core/src/turn_metadata.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use std::sync::Mutex; use std::sync::RwLock; +use codex_utils_string::to_ascii_json_string; use serde::Serialize; use serde_json::Value; use tokio::task::JoinHandle; @@ -71,7 +72,7 @@ pub(crate) struct TurnMetadataBag { impl TurnMetadataBag { fn to_header_value(&self) -> Option { - serde_json::to_string(self).ok() + to_ascii_json_string(self).ok() } } @@ -101,7 +102,7 @@ fn merge_turn_metadata( .or_insert_with(|| Value::String(value.clone())); } } - serde_json::to_string(&metadata).ok() + to_ascii_json_string(&metadata).ok() } fn build_turn_metadata_bag( diff --git a/codex-rs/core/src/turn_metadata_tests.rs b/codex-rs/core/src/turn_metadata_tests.rs index 5201eb3560..6504eadd67 100644 --- a/codex-rs/core/src/turn_metadata_tests.rs +++ b/codex-rs/core/src/turn_metadata_tests.rs @@ -7,6 +7,7 @@ use codex_protocol::protocol::SessionSource; use codex_protocol::protocol::SubAgentSource; use core_test_support::PathBufExt; use core_test_support::PathExt; +use pretty_assertions::assert_eq; use serde_json::Value; use std::collections::HashMap; use tempfile::TempDir; @@ -15,7 +16,7 @@ use tokio::process::Command; #[tokio::test] async fn build_turn_metadata_header_includes_has_changes_for_clean_repo() { let temp_dir = TempDir::new().expect("temp dir"); - let repo_path = temp_dir.path().join("repo").abs(); + let repo_path = temp_dir.path().join("repo-東京").abs(); std::fs::create_dir_all(&repo_path).expect("create repo"); Command::new("git") @@ -54,7 +55,16 @@ async fn build_turn_metadata_header_includes_has_changes_for_clean_repo() { let header = build_turn_metadata_header(&repo_path, Some("none")) .await .expect("header"); + assert!(header.is_ascii()); + assert!(!header.contains("東京")); let parsed: Value = serde_json::from_str(&header).expect("valid json"); + let expected_repo_path = repo_path.to_string_lossy().into_owned(); + let actual_repo_path = parsed + .get("workspaces") + .and_then(Value::as_object) + .and_then(|workspaces| workspaces.keys().next()) + .expect("workspace path"); + assert_eq!(actual_repo_path, &expected_repo_path); let workspace = parsed .get("workspaces") .and_then(Value::as_object) @@ -191,6 +201,7 @@ fn turn_metadata_state_merges_client_metadata_without_replacing_reserved_fields( ); state.set_responsesapi_client_metadata(HashMap::from([ ("fiber_run_id".to_string(), "fiber-123".to_string()), + ("origin".to_string(), "東京".to_string()), ("session_id".to_string(), "client-supplied".to_string()), ("thread_source".to_string(), "client-supplied".to_string()), ( @@ -201,9 +212,12 @@ fn turn_metadata_state_merges_client_metadata_without_replacing_reserved_fields( state.set_turn_started_at_unix_ms(/*turn_started_at_unix_ms*/ 1_700_000_000_123); let header = state.current_header_value().expect("header"); + assert!(header.is_ascii()); + assert!(!header.contains("東京")); let json: Value = serde_json::from_str(&header).expect("json"); assert_eq!(json["fiber_run_id"].as_str(), Some("fiber-123")); + assert_eq!(json["origin"].as_str(), Some("東京")); assert_eq!(json["session_id"].as_str(), Some("session-a")); assert_eq!(json["thread_source"].as_str(), Some("user")); assert_eq!(json["turn_id"].as_str(), Some("turn-a")); diff --git a/codex-rs/utils/string/Cargo.toml b/codex-rs/utils/string/Cargo.toml index 3f5890cfb1..a81760e5ef 100644 --- a/codex-rs/utils/string/Cargo.toml +++ b/codex-rs/utils/string/Cargo.toml @@ -9,6 +9,8 @@ workspace = true [dependencies] regex-lite = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } [dev-dependencies] pretty_assertions = { workspace = true } diff --git a/codex-rs/utils/string/src/json.rs b/codex-rs/utils/string/src/json.rs new file mode 100644 index 0000000000..fd5e7d65bc --- /dev/null +++ b/codex-rs/utils/string/src/json.rs @@ -0,0 +1,122 @@ +//! JSON serialization helpers for output that must remain parseable as JSON +//! while staying safe for ASCII-only transports. + +use std::io; + +use serde::Serialize; + +struct AsciiJsonFormatter; + +impl serde_json::ser::Formatter for AsciiJsonFormatter { + // serde_json has no ensure_ascii flag; this formatter keeps its serializer + // in charge and only escapes non-ASCII string fragments. + fn write_string_fragment(&mut self, writer: &mut W, fragment: &str) -> io::Result<()> + where + W: ?Sized + io::Write, + { + let mut start = 0; + for (index, ch) in fragment.char_indices() { + if ch.is_ascii() { + continue; + } + + if start < index { + writer.write_all(&fragment.as_bytes()[start..index])?; + } + + let mut utf16 = [0; 2]; + for code_unit in ch.encode_utf16(&mut utf16) { + write!(writer, "\\u{code_unit:04x}")?; + } + start = index + ch.len_utf8(); + } + + if start < fragment.len() { + writer.write_all(&fragment.as_bytes()[start..])?; + } + + Ok(()) + } +} + +/// Serialize JSON while escaping non-ASCII string content as `\uXXXX`. +/// +/// This is useful when JSON needs to remain parseable as JSON but must be +/// carried through ASCII-safe transports such as HTTP headers. +pub fn to_ascii_json_string(value: &T) -> serde_json::Result +where + T: Serialize + ?Sized, +{ + let mut bytes = Vec::new(); + let mut serializer = serde_json::Serializer::with_formatter(&mut bytes, AsciiJsonFormatter); + value.serialize(&mut serializer)?; + String::from_utf8(bytes) + .map_err(|err| serde_json::Error::io(io::Error::new(io::ErrorKind::InvalidData, err))) +} + +#[cfg(test)] +mod tests { + use std::collections::BTreeMap; + + use pretty_assertions::assert_eq; + use serde::Serialize; + use serde::ser::SerializeStruct; + use serde_json::Value; + use serde_json::json; + + use super::to_ascii_json_string; + + #[test] + fn to_ascii_json_string_escapes_non_ascii_strings() { + struct TestPayload; + + impl Serialize for TestPayload { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let workspaces = BTreeMap::from([("/tmp/東京", TestWorkspace)]); + let mut state = serializer.serialize_struct("TestPayload", 1)?; + state.serialize_field("workspaces", &workspaces)?; + state.end() + } + } + + struct TestWorkspace; + + impl Serialize for TestWorkspace { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut state = serializer.serialize_struct("TestWorkspace", 2)?; + state.serialize_field("label", "Agentlarım")?; + state.serialize_field("emoji", "🚀")?; + state.end() + } + } + + let value = TestPayload; + let expected_value = json!({ + "workspaces": { + "/tmp/東京": { + "label": "Agentlarım", + "emoji": "🚀" + } + } + }); + + let serialized = to_ascii_json_string(&value).expect("serialize ascii json"); + + assert_eq!( + serialized, + r#"{"workspaces":{"/tmp/\u6771\u4eac":{"label":"Agentlar\u0131m","emoji":"\ud83d\ude80"}}}"# + ); + assert!(serialized.is_ascii()); + assert!(!serialized.contains("東京")); + assert!(!serialized.contains("Agentlarım")); + assert!(!serialized.contains("🚀")); + let parsed: Value = serde_json::from_str(&serialized).expect("serialized json"); + assert_eq!(parsed, expected_value); + } +} diff --git a/codex-rs/utils/string/src/lib.rs b/codex-rs/utils/string/src/lib.rs index d7b6f153ac..2d081f79e2 100644 --- a/codex-rs/utils/string/src/lib.rs +++ b/codex-rs/utils/string/src/lib.rs @@ -1,5 +1,7 @@ +mod json; mod truncate; +pub use json::to_ascii_json_string; pub use truncate::approx_bytes_for_tokens; pub use truncate::approx_token_count; pub use truncate::approx_tokens_from_byte_count;