Label attached images so agent can understand in-message labels

This commit is contained in:
Charles Cunningham
2026-01-08 17:39:30 -08:00
parent 6a57d7980b
commit 9395ef9834
4 changed files with 432 additions and 700 deletions

View File

@@ -9,10 +9,7 @@ use codex_protocol::models::ReasoningItemContent;
use codex_protocol::models::ReasoningItemReasoningSummary;
use codex_protocol::models::ResponseItem;
use codex_protocol::models::WebSearchAction;
use codex_protocol::models::is_image_close_tag_text;
use codex_protocol::models::is_image_open_tag_text;
use codex_protocol::models::is_local_image_close_tag_text;
use codex_protocol::models::is_local_image_open_tag_text;
use codex_protocol::models::is_local_image_label_text;
use codex_protocol::user_input::UserInput;
use tracing::warn;
use uuid::Uuid;
@@ -39,11 +36,8 @@ fn parse_user_message(message: &[ContentItem]) -> Option<UserMessageItem> {
for (idx, content_item) in message.iter().enumerate() {
match content_item {
ContentItem::InputText { text } => {
if (is_local_image_open_tag_text(text) || is_image_open_tag_text(text))
&& (matches!(message.get(idx + 1), Some(ContentItem::InputImage { .. })))
|| (idx > 0
&& (is_local_image_close_tag_text(text) || is_image_close_tag_text(text))
&& matches!(message.get(idx - 1), Some(ContentItem::InputImage { .. })))
if is_local_image_label_text(text)
&& matches!(message.get(idx + 1), Some(ContentItem::InputImage { .. }))
{
continue;
}
@@ -148,6 +142,7 @@ mod tests {
use codex_protocol::models::ReasoningItemReasoningSummary;
use codex_protocol::models::ResponseItem;
use codex_protocol::models::WebSearchAction;
use codex_protocol::models::local_image_label_text;
use codex_protocol::user_input::UserInput;
use pretty_assertions::assert_eq;
@@ -192,7 +187,7 @@ mod tests {
#[test]
fn skips_local_image_label_text() {
let image_url = "data:image/png;base64,abc".to_string();
let label = codex_protocol::models::local_image_open_tag_text(1);
let label = local_image_label_text(std::path::Path::new("/tmp/example.png"));
let user_text = "Please review this image.".to_string();
let item = ResponseItem::Message {
@@ -203,46 +198,6 @@ mod tests {
ContentItem::InputImage {
image_url: image_url.clone(),
},
ContentItem::InputText {
text: "</image>".to_string(),
},
ContentItem::InputText {
text: user_text.clone(),
},
],
};
let turn_item = parse_turn_item(&item).expect("expected user message turn item");
match turn_item {
TurnItem::UserMessage(user) => {
let expected_content = vec![
UserInput::Image { image_url },
UserInput::Text { text: user_text },
];
assert_eq!(user.content, expected_content);
}
other => panic!("expected TurnItem::UserMessage, got {other:?}"),
}
}
#[test]
fn skips_unnamed_image_label_text() {
let image_url = "data:image/png;base64,abc".to_string();
let label = codex_protocol::models::image_open_tag_text();
let user_text = "Please review this image.".to_string();
let item = ResponseItem::Message {
id: None,
role: "user".to_string(),
content: vec![
ContentItem::InputText { text: label },
ContentItem::InputImage {
image_url: image_url.clone(),
},
ContentItem::InputText {
text: codex_protocol::models::image_close_tag_text(),
},
ContentItem::InputText {
text: user_text.clone(),
},

View File

@@ -13,7 +13,7 @@ use crate::tools::registry::ToolHandler;
use crate::tools::registry::ToolKind;
use codex_protocol::models::ContentItem;
use codex_protocol::models::ResponseInputItem;
use codex_protocol::models::local_image_content_items_with_label_number;
use codex_protocol::models::local_image_content_items;
pub struct ViewImageHandler;
@@ -65,8 +65,7 @@ impl ToolHandler for ViewImageHandler {
}
let event_path = abs_path.clone();
let content: Vec<ContentItem> =
local_image_content_items_with_label_number(&abs_path, None);
let content: Vec<ContentItem> = local_image_content_items(&abs_path, false);
let input = ResponseInputItem::Message {
role: "user".to_string(),
content,

File diff suppressed because it is too large Load Diff

View File

@@ -182,44 +182,24 @@ fn local_image_error_placeholder(
pub const VIEW_IMAGE_TOOL_NAME: &str = "view_image";
const IMAGE_OPEN_TAG: &str = "<image>";
const IMAGE_CLOSE_TAG: &str = "</image>";
const LOCAL_IMAGE_OPEN_TAG_PREFIX: &str = "<image name=";
const LOCAL_IMAGE_OPEN_TAG_SUFFIX: &str = ">";
const LOCAL_IMAGE_CLOSE_TAG: &str = IMAGE_CLOSE_TAG;
pub fn image_open_tag_text() -> String {
IMAGE_OPEN_TAG.to_string()
fn local_image_label_suffix() -> String {
format!(" follows (you can see it without using the {VIEW_IMAGE_TOOL_NAME} tool):")
}
pub fn image_close_tag_text() -> String {
IMAGE_CLOSE_TAG.to_string()
pub fn local_image_label_text(path: &std::path::Path) -> String {
format!("Image {}{}", path.display(), local_image_label_suffix())
}
pub fn local_image_label_text(label_number: usize) -> String {
format!("[Image #{label_number}]")
pub fn is_local_image_label_text(text: &str) -> bool {
let suffix = local_image_label_suffix();
let trimmed = text.trim();
trimmed.starts_with("Image ") && trimmed.ends_with(&suffix)
}
pub fn local_image_open_tag_text(label_number: usize) -> String {
let label = local_image_label_text(label_number);
format!("{LOCAL_IMAGE_OPEN_TAG_PREFIX}{label}{LOCAL_IMAGE_OPEN_TAG_SUFFIX}")
}
pub fn is_local_image_open_tag_text(text: &str) -> bool {
text.strip_prefix(LOCAL_IMAGE_OPEN_TAG_PREFIX)
.is_some_and(|rest| rest.ends_with(LOCAL_IMAGE_OPEN_TAG_SUFFIX))
}
pub fn is_local_image_close_tag_text(text: &str) -> bool {
is_image_close_tag_text(text)
}
pub fn is_image_open_tag_text(text: &str) -> bool {
text == IMAGE_OPEN_TAG
}
pub fn is_image_close_tag_text(text: &str) -> bool {
text == IMAGE_CLOSE_TAG
fn local_image_label(path: &std::path::Path) -> ContentItem {
ContentItem::InputText {
text: local_image_label_text(path),
}
}
fn invalid_image_error_placeholder(
@@ -245,26 +225,16 @@ fn unsupported_image_error_placeholder(path: &std::path::Path, mime: &str) -> Co
}
}
pub fn local_image_content_items_with_label_number(
path: &std::path::Path,
label_number: Option<usize>,
) -> Vec<ContentItem> {
pub fn local_image_content_items(path: &std::path::Path, include_label: bool) -> Vec<ContentItem> {
match load_and_resize_to_fit(path) {
Ok(image) => {
let mut items = Vec::with_capacity(3);
if let Some(label_number) = label_number {
items.push(ContentItem::InputText {
text: local_image_open_tag_text(label_number),
});
let mut items = Vec::with_capacity(2);
if include_label {
items.push(local_image_label(path));
}
items.push(ContentItem::InputImage {
image_url: image.into_data_url(),
});
if label_number.is_some() {
items.push(ContentItem::InputText {
text: LOCAL_IMAGE_CLOSE_TAG.to_string(),
});
}
items
}
Err(err) => {
@@ -385,26 +355,14 @@ pub enum ReasoningItemContent {
impl From<Vec<UserInput>> for ResponseInputItem {
fn from(items: Vec<UserInput>) -> Self {
let mut image_index = 0;
Self::Message {
role: "user".to_string(),
content: items
.into_iter()
.flat_map(|c| match c {
UserInput::Text { text } => vec![ContentItem::InputText { text }],
UserInput::Image { image_url } => vec![
ContentItem::InputText {
text: image_open_tag_text(),
},
ContentItem::InputImage { image_url },
ContentItem::InputText {
text: image_close_tag_text(),
},
],
UserInput::LocalImage { path } => {
image_index += 1;
local_image_content_items_with_label_number(&path, Some(image_index))
}
UserInput::Image { image_url } => vec![ContentItem::InputImage { image_url }],
UserInput::LocalImage { path } => local_image_content_items(&path, true),
UserInput::Skill { .. } => Vec::new(), // Skill bodies are injected later in core
})
.collect::<Vec<ContentItem>>(),
@@ -845,33 +803,6 @@ mod tests {
Ok(())
}
#[test]
fn wraps_image_user_input_with_tags() -> Result<()> {
let image_url = "data:image/png;base64,abc".to_string();
let item = ResponseInputItem::from(vec![UserInput::Image {
image_url: image_url.clone(),
}]);
match item {
ResponseInputItem::Message { content, .. } => {
let expected = vec![
ContentItem::InputText {
text: image_open_tag_text(),
},
ContentItem::InputImage { image_url },
ContentItem::InputText {
text: image_close_tag_text(),
},
];
assert_eq!(content, expected);
}
other => panic!("expected message response but got {other:?}"),
}
Ok(())
}
#[test]
fn local_image_read_error_adds_placeholder() -> Result<()> {
let dir = tempdir()?;