mirror of
https://github.com/openai/codex.git
synced 2026-04-24 14:45:27 +00:00
Label attached images so agent can understand in-message labels
This commit is contained in:
@@ -9,10 +9,7 @@ use codex_protocol::models::ReasoningItemContent;
|
||||
use codex_protocol::models::ReasoningItemReasoningSummary;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use codex_protocol::models::WebSearchAction;
|
||||
use codex_protocol::models::is_image_close_tag_text;
|
||||
use codex_protocol::models::is_image_open_tag_text;
|
||||
use codex_protocol::models::is_local_image_close_tag_text;
|
||||
use codex_protocol::models::is_local_image_open_tag_text;
|
||||
use codex_protocol::models::is_local_image_label_text;
|
||||
use codex_protocol::user_input::UserInput;
|
||||
use tracing::warn;
|
||||
use uuid::Uuid;
|
||||
@@ -39,11 +36,8 @@ fn parse_user_message(message: &[ContentItem]) -> Option<UserMessageItem> {
|
||||
for (idx, content_item) in message.iter().enumerate() {
|
||||
match content_item {
|
||||
ContentItem::InputText { text } => {
|
||||
if (is_local_image_open_tag_text(text) || is_image_open_tag_text(text))
|
||||
&& (matches!(message.get(idx + 1), Some(ContentItem::InputImage { .. })))
|
||||
|| (idx > 0
|
||||
&& (is_local_image_close_tag_text(text) || is_image_close_tag_text(text))
|
||||
&& matches!(message.get(idx - 1), Some(ContentItem::InputImage { .. })))
|
||||
if is_local_image_label_text(text)
|
||||
&& matches!(message.get(idx + 1), Some(ContentItem::InputImage { .. }))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@@ -148,6 +142,7 @@ mod tests {
|
||||
use codex_protocol::models::ReasoningItemReasoningSummary;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use codex_protocol::models::WebSearchAction;
|
||||
use codex_protocol::models::local_image_label_text;
|
||||
use codex_protocol::user_input::UserInput;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
@@ -192,7 +187,7 @@ mod tests {
|
||||
#[test]
|
||||
fn skips_local_image_label_text() {
|
||||
let image_url = "data:image/png;base64,abc".to_string();
|
||||
let label = codex_protocol::models::local_image_open_tag_text(1);
|
||||
let label = local_image_label_text(std::path::Path::new("/tmp/example.png"));
|
||||
let user_text = "Please review this image.".to_string();
|
||||
|
||||
let item = ResponseItem::Message {
|
||||
@@ -203,46 +198,6 @@ mod tests {
|
||||
ContentItem::InputImage {
|
||||
image_url: image_url.clone(),
|
||||
},
|
||||
ContentItem::InputText {
|
||||
text: "</image>".to_string(),
|
||||
},
|
||||
ContentItem::InputText {
|
||||
text: user_text.clone(),
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
let turn_item = parse_turn_item(&item).expect("expected user message turn item");
|
||||
|
||||
match turn_item {
|
||||
TurnItem::UserMessage(user) => {
|
||||
let expected_content = vec![
|
||||
UserInput::Image { image_url },
|
||||
UserInput::Text { text: user_text },
|
||||
];
|
||||
assert_eq!(user.content, expected_content);
|
||||
}
|
||||
other => panic!("expected TurnItem::UserMessage, got {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn skips_unnamed_image_label_text() {
|
||||
let image_url = "data:image/png;base64,abc".to_string();
|
||||
let label = codex_protocol::models::image_open_tag_text();
|
||||
let user_text = "Please review this image.".to_string();
|
||||
|
||||
let item = ResponseItem::Message {
|
||||
id: None,
|
||||
role: "user".to_string(),
|
||||
content: vec![
|
||||
ContentItem::InputText { text: label },
|
||||
ContentItem::InputImage {
|
||||
image_url: image_url.clone(),
|
||||
},
|
||||
ContentItem::InputText {
|
||||
text: codex_protocol::models::image_close_tag_text(),
|
||||
},
|
||||
ContentItem::InputText {
|
||||
text: user_text.clone(),
|
||||
},
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::tools::registry::ToolHandler;
|
||||
use crate::tools::registry::ToolKind;
|
||||
use codex_protocol::models::ContentItem;
|
||||
use codex_protocol::models::ResponseInputItem;
|
||||
use codex_protocol::models::local_image_content_items_with_label_number;
|
||||
use codex_protocol::models::local_image_content_items;
|
||||
|
||||
pub struct ViewImageHandler;
|
||||
|
||||
@@ -65,8 +65,7 @@ impl ToolHandler for ViewImageHandler {
|
||||
}
|
||||
let event_path = abs_path.clone();
|
||||
|
||||
let content: Vec<ContentItem> =
|
||||
local_image_content_items_with_label_number(&abs_path, None);
|
||||
let content: Vec<ContentItem> = local_image_content_items(&abs_path, false);
|
||||
let input = ResponseInputItem::Message {
|
||||
role: "user".to_string(),
|
||||
content,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -182,44 +182,24 @@ fn local_image_error_placeholder(
|
||||
|
||||
pub const VIEW_IMAGE_TOOL_NAME: &str = "view_image";
|
||||
|
||||
const IMAGE_OPEN_TAG: &str = "<image>";
|
||||
const IMAGE_CLOSE_TAG: &str = "</image>";
|
||||
const LOCAL_IMAGE_OPEN_TAG_PREFIX: &str = "<image name=";
|
||||
const LOCAL_IMAGE_OPEN_TAG_SUFFIX: &str = ">";
|
||||
const LOCAL_IMAGE_CLOSE_TAG: &str = IMAGE_CLOSE_TAG;
|
||||
|
||||
pub fn image_open_tag_text() -> String {
|
||||
IMAGE_OPEN_TAG.to_string()
|
||||
fn local_image_label_suffix() -> String {
|
||||
format!(" follows (you can see it without using the {VIEW_IMAGE_TOOL_NAME} tool):")
|
||||
}
|
||||
|
||||
pub fn image_close_tag_text() -> String {
|
||||
IMAGE_CLOSE_TAG.to_string()
|
||||
pub fn local_image_label_text(path: &std::path::Path) -> String {
|
||||
format!("Image {}{}", path.display(), local_image_label_suffix())
|
||||
}
|
||||
|
||||
pub fn local_image_label_text(label_number: usize) -> String {
|
||||
format!("[Image #{label_number}]")
|
||||
pub fn is_local_image_label_text(text: &str) -> bool {
|
||||
let suffix = local_image_label_suffix();
|
||||
let trimmed = text.trim();
|
||||
trimmed.starts_with("Image ") && trimmed.ends_with(&suffix)
|
||||
}
|
||||
|
||||
pub fn local_image_open_tag_text(label_number: usize) -> String {
|
||||
let label = local_image_label_text(label_number);
|
||||
format!("{LOCAL_IMAGE_OPEN_TAG_PREFIX}{label}{LOCAL_IMAGE_OPEN_TAG_SUFFIX}")
|
||||
}
|
||||
|
||||
pub fn is_local_image_open_tag_text(text: &str) -> bool {
|
||||
text.strip_prefix(LOCAL_IMAGE_OPEN_TAG_PREFIX)
|
||||
.is_some_and(|rest| rest.ends_with(LOCAL_IMAGE_OPEN_TAG_SUFFIX))
|
||||
}
|
||||
|
||||
pub fn is_local_image_close_tag_text(text: &str) -> bool {
|
||||
is_image_close_tag_text(text)
|
||||
}
|
||||
|
||||
pub fn is_image_open_tag_text(text: &str) -> bool {
|
||||
text == IMAGE_OPEN_TAG
|
||||
}
|
||||
|
||||
pub fn is_image_close_tag_text(text: &str) -> bool {
|
||||
text == IMAGE_CLOSE_TAG
|
||||
fn local_image_label(path: &std::path::Path) -> ContentItem {
|
||||
ContentItem::InputText {
|
||||
text: local_image_label_text(path),
|
||||
}
|
||||
}
|
||||
|
||||
fn invalid_image_error_placeholder(
|
||||
@@ -245,26 +225,16 @@ fn unsupported_image_error_placeholder(path: &std::path::Path, mime: &str) -> Co
|
||||
}
|
||||
}
|
||||
|
||||
pub fn local_image_content_items_with_label_number(
|
||||
path: &std::path::Path,
|
||||
label_number: Option<usize>,
|
||||
) -> Vec<ContentItem> {
|
||||
pub fn local_image_content_items(path: &std::path::Path, include_label: bool) -> Vec<ContentItem> {
|
||||
match load_and_resize_to_fit(path) {
|
||||
Ok(image) => {
|
||||
let mut items = Vec::with_capacity(3);
|
||||
if let Some(label_number) = label_number {
|
||||
items.push(ContentItem::InputText {
|
||||
text: local_image_open_tag_text(label_number),
|
||||
});
|
||||
let mut items = Vec::with_capacity(2);
|
||||
if include_label {
|
||||
items.push(local_image_label(path));
|
||||
}
|
||||
items.push(ContentItem::InputImage {
|
||||
image_url: image.into_data_url(),
|
||||
});
|
||||
if label_number.is_some() {
|
||||
items.push(ContentItem::InputText {
|
||||
text: LOCAL_IMAGE_CLOSE_TAG.to_string(),
|
||||
});
|
||||
}
|
||||
items
|
||||
}
|
||||
Err(err) => {
|
||||
@@ -385,26 +355,14 @@ pub enum ReasoningItemContent {
|
||||
|
||||
impl From<Vec<UserInput>> for ResponseInputItem {
|
||||
fn from(items: Vec<UserInput>) -> Self {
|
||||
let mut image_index = 0;
|
||||
Self::Message {
|
||||
role: "user".to_string(),
|
||||
content: items
|
||||
.into_iter()
|
||||
.flat_map(|c| match c {
|
||||
UserInput::Text { text } => vec![ContentItem::InputText { text }],
|
||||
UserInput::Image { image_url } => vec![
|
||||
ContentItem::InputText {
|
||||
text: image_open_tag_text(),
|
||||
},
|
||||
ContentItem::InputImage { image_url },
|
||||
ContentItem::InputText {
|
||||
text: image_close_tag_text(),
|
||||
},
|
||||
],
|
||||
UserInput::LocalImage { path } => {
|
||||
image_index += 1;
|
||||
local_image_content_items_with_label_number(&path, Some(image_index))
|
||||
}
|
||||
UserInput::Image { image_url } => vec![ContentItem::InputImage { image_url }],
|
||||
UserInput::LocalImage { path } => local_image_content_items(&path, true),
|
||||
UserInput::Skill { .. } => Vec::new(), // Skill bodies are injected later in core
|
||||
})
|
||||
.collect::<Vec<ContentItem>>(),
|
||||
@@ -845,33 +803,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn wraps_image_user_input_with_tags() -> Result<()> {
|
||||
let image_url = "data:image/png;base64,abc".to_string();
|
||||
|
||||
let item = ResponseInputItem::from(vec![UserInput::Image {
|
||||
image_url: image_url.clone(),
|
||||
}]);
|
||||
|
||||
match item {
|
||||
ResponseInputItem::Message { content, .. } => {
|
||||
let expected = vec![
|
||||
ContentItem::InputText {
|
||||
text: image_open_tag_text(),
|
||||
},
|
||||
ContentItem::InputImage { image_url },
|
||||
ContentItem::InputText {
|
||||
text: image_close_tag_text(),
|
||||
},
|
||||
];
|
||||
assert_eq!(content, expected);
|
||||
}
|
||||
other => panic!("expected message response but got {other:?}"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_image_read_error_adds_placeholder() -> Result<()> {
|
||||
let dir = tempdir()?;
|
||||
|
||||
Reference in New Issue
Block a user