Add text element metadata to protocol, app server, and core (#9331)

The second part of breaking up PR
https://github.com/openai/codex/pull/9116

Summary:

- Add `TextElement` / `ByteRange` to protocol user inputs and user
message events with defaults.
- Thread `text_elements` through app-server v1/v2 request handling and
history rebuild.
- Preserve UI metadata only in user input/events (not `ContentItem`)
while keeping local image attachments in user events for rehydration.

Details:

- Protocol: `UserInput::Text` carries `text_elements`;
`UserMessageEvent` carries `text_elements` + `local_images`.
Serialization includes empty vectors for backward compatibility.
- app-server-protocol: v1 defines `V1TextElement` / `V1ByteRange` in
camelCase with conversions; v2 uses its own camelCase wrapper.
- app-server: v1/v2 input mapping includes `text_elements`; thread
history rebuilds include them.
- Core: user event emission preserves UI metadata while model history
stays clean; history replay round-trips the metadata.
This commit is contained in:
charley-oai
2026-01-15 17:26:41 -08:00
committed by GitHub
parent 004a74940a
commit 1fa8350ae7
18 changed files with 416 additions and 46 deletions

View File

@@ -197,8 +197,12 @@ impl ThreadHistoryBuilder {
if !payload.message.trim().is_empty() {
content.push(UserInput::Text {
text: payload.message.clone(),
// TODO: Thread text element ranges into thread history. Empty keeps old behavior.
text_elements: Vec::new(),
text_elements: payload
.text_elements
.iter()
.cloned()
.map(Into::into)
.collect(),
});
}
if let Some(images) = &payload.images {
@@ -206,6 +210,9 @@ impl ThreadHistoryBuilder {
content.push(UserInput::Image { url: image.clone() });
}
}
for path in &payload.local_images {
content.push(UserInput::LocalImage { path: path.clone() });
}
content
}
}

View File

@@ -16,6 +16,8 @@ use codex_protocol::protocol::ReviewDecision;
use codex_protocol::protocol::SandboxPolicy;
use codex_protocol::protocol::SessionSource;
use codex_protocol::protocol::TurnAbortReason;
use codex_protocol::user_input::ByteRange as CoreByteRange;
use codex_protocol::user_input::TextElement as CoreTextElement;
use codex_utils_absolute_path::AbsolutePathBuf;
use schemars::JsonSchema;
use serde::Deserialize;
@@ -444,9 +446,74 @@ pub struct RemoveConversationListenerParams {
#[serde(rename_all = "camelCase")]
#[serde(tag = "type", content = "data")]
pub enum InputItem {
Text { text: String },
Image { image_url: String },
LocalImage { path: PathBuf },
Text {
text: String,
/// UI-defined spans within `text` used to render or persist special elements.
#[serde(default)]
text_elements: Vec<V1TextElement>,
},
Image {
image_url: String,
},
LocalImage {
path: PathBuf,
},
}
#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(rename = "ByteRange")]
pub struct V1ByteRange {
/// Start byte offset (inclusive) within the UTF-8 text buffer.
pub start: usize,
/// End byte offset (exclusive) within the UTF-8 text buffer.
pub end: usize,
}
impl From<CoreByteRange> for V1ByteRange {
fn from(value: CoreByteRange) -> Self {
Self {
start: value.start,
end: value.end,
}
}
}
impl From<V1ByteRange> for CoreByteRange {
fn from(value: V1ByteRange) -> Self {
Self {
start: value.start,
end: value.end,
}
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(rename = "TextElement")]
pub struct V1TextElement {
/// Byte range in the parent `text` buffer that this element occupies.
pub byte_range: V1ByteRange,
/// Optional human-readable placeholder for the element, displayed in the UI.
pub placeholder: Option<String>,
}
impl From<CoreTextElement> for V1TextElement {
fn from(value: CoreTextElement) -> Self {
Self {
byte_range: value.byte_range.into(),
placeholder: value.placeholder,
}
}
}
impl From<V1TextElement> for CoreTextElement {
fn from(value: V1TextElement) -> Self {
Self {
byte_range: value.byte_range.into(),
placeholder: value.placeholder,
}
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]

View File

@@ -30,6 +30,8 @@ use codex_protocol::protocol::SkillMetadata as CoreSkillMetadata;
use codex_protocol::protocol::SkillScope as CoreSkillScope;
use codex_protocol::protocol::TokenUsage as CoreTokenUsage;
use codex_protocol::protocol::TokenUsageInfo as CoreTokenUsageInfo;
use codex_protocol::user_input::ByteRange as CoreByteRange;
use codex_protocol::user_input::TextElement as CoreTextElement;
use codex_protocol::user_input::UserInput as CoreUserInput;
use codex_utils_absolute_path::AbsolutePathBuf;
use mcp_types::ContentBlock as McpContentBlock;
@@ -1589,6 +1591,24 @@ pub struct ByteRange {
pub end: usize,
}
impl From<CoreByteRange> for ByteRange {
fn from(value: CoreByteRange) -> Self {
Self {
start: value.start,
end: value.end,
}
}
}
impl From<ByteRange> for CoreByteRange {
fn from(value: ByteRange) -> Self {
Self {
start: value.start,
end: value.end,
}
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(rename_all = "camelCase")]
#[ts(export_to = "v2/")]
@@ -1599,6 +1619,24 @@ pub struct TextElement {
pub placeholder: Option<String>,
}
impl From<CoreTextElement> for TextElement {
fn from(value: CoreTextElement) -> Self {
Self {
byte_range: value.byte_range.into(),
placeholder: value.placeholder,
}
}
}
impl From<TextElement> for CoreTextElement {
fn from(value: TextElement) -> Self {
Self {
byte_range: value.byte_range.into(),
placeholder: value.placeholder,
}
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]
#[serde(tag = "type", rename_all = "camelCase")]
#[ts(tag = "type")]
@@ -1625,10 +1663,12 @@ pub enum UserInput {
impl UserInput {
pub fn into_core(self) -> CoreUserInput {
match self {
UserInput::Text { text, .. } => CoreUserInput::Text {
UserInput::Text {
text,
// TODO: Thread text element ranges into v2 inputs. Empty keeps old behavior.
text_elements: Vec::new(),
text_elements,
} => CoreUserInput::Text {
text,
text_elements: text_elements.into_iter().map(Into::into).collect(),
},
UserInput::Image { url } => CoreUserInput::Image { image_url: url },
UserInput::LocalImage { path } => CoreUserInput::LocalImage { path },
@@ -1640,10 +1680,12 @@ impl UserInput {
impl From<CoreUserInput> for UserInput {
fn from(value: CoreUserInput) -> Self {
match value {
CoreUserInput::Text { text, .. } => UserInput::Text {
CoreUserInput::Text {
text,
// TODO: Thread text element ranges from core into v2 inputs.
text_elements: Vec::new(),
text_elements,
} => UserInput::Text {
text,
text_elements: text_elements.into_iter().map(Into::into).collect(),
},
CoreUserInput::Image { image_url } => UserInput::Image { url: image_url },
CoreUserInput::LocalImage { path } => UserInput::LocalImage { path },