mirror of
https://github.com/openai/codex.git
synced 2026-04-24 14:45:27 +00:00
Wrap Image UserInput in <image> tags as well
This commit is contained in:
@@ -9,6 +9,8 @@ use codex_protocol::models::ReasoningItemContent;
|
||||
use codex_protocol::models::ReasoningItemReasoningSummary;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use codex_protocol::models::WebSearchAction;
|
||||
use codex_protocol::models::is_image_close_tag_text;
|
||||
use codex_protocol::models::is_image_open_tag_text;
|
||||
use codex_protocol::models::is_local_image_close_tag_text;
|
||||
use codex_protocol::models::is_local_image_open_tag_text;
|
||||
use codex_protocol::user_input::UserInput;
|
||||
@@ -37,10 +39,10 @@ fn parse_user_message(message: &[ContentItem]) -> Option<UserMessageItem> {
|
||||
for (idx, content_item) in message.iter().enumerate() {
|
||||
match content_item {
|
||||
ContentItem::InputText { text } => {
|
||||
if is_local_image_open_tag_text(text)
|
||||
if (is_local_image_open_tag_text(text) || is_image_open_tag_text(text))
|
||||
&& (matches!(message.get(idx + 1), Some(ContentItem::InputImage { .. })))
|
||||
|| (idx > 0
|
||||
&& is_local_image_close_tag_text(text)
|
||||
&& (is_local_image_close_tag_text(text) || is_image_close_tag_text(text))
|
||||
&& matches!(message.get(idx - 1), Some(ContentItem::InputImage { .. })))
|
||||
{
|
||||
continue;
|
||||
@@ -224,6 +226,43 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks that the unnamed `<image>` / `</image>` wrapper texts placed around
/// an `InputImage` are dropped during parsing, so the resulting user message
/// contains only the image followed by the user's own text.
#[test]
fn skips_unnamed_image_label_text() {
    let image_url = String::from("data:image/png;base64,abc");
    let user_text = String::from("Please review this image.");

    // What the parsed message is expected to contain: no wrapper texts.
    let expected_content = vec![
        UserInput::Image {
            image_url: image_url.clone(),
        },
        UserInput::Text {
            text: user_text.clone(),
        },
    ];

    // Open tag, image, close tag, then the actual user text.
    let content = vec![
        ContentItem::InputText {
            text: codex_protocol::models::image_open_tag_text(),
        },
        ContentItem::InputImage { image_url },
        ContentItem::InputText {
            text: codex_protocol::models::image_close_tag_text(),
        },
        ContentItem::InputText { text: user_text },
    ];
    let item = ResponseItem::Message {
        id: None,
        role: String::from("user"),
        content,
    };

    match parse_turn_item(&item).expect("expected user message turn item") {
        TurnItem::UserMessage(user) => assert_eq!(user.content, expected_content),
        other => panic!("expected TurnItem::UserMessage, got {other:?}"),
    }
}
|
||||
|
||||
#[test]
|
||||
fn skips_user_instructions_and_env() {
|
||||
let items = vec![
|
||||
|
||||
@@ -182,9 +182,19 @@ fn local_image_error_placeholder(
|
||||
|
||||
/// Tool name string `"view_image"` (its consumers are outside this view).
pub const VIEW_IMAGE_TOOL_NAME: &str = "view_image";

/// Opening tag text for an image wrapper that carries no `name` attribute.
const IMAGE_OPEN_TAG: &str = "<image>";
/// Closing tag text for an image wrapper.
const IMAGE_CLOSE_TAG: &str = "</image>";
/// Leading part of a local-image opening tag (presumably followed by the
/// image name — the code that assembles the tag is not visible here).
const LOCAL_IMAGE_OPEN_TAG_PREFIX: &str = "<image name=";
/// Trailing part of a local-image opening tag.
const LOCAL_IMAGE_OPEN_TAG_SUFFIX: &str = ">";
// Defined exactly once, as an alias of `IMAGE_CLOSE_TAG`: local and plain
// image wrappers share the same closing tag, and a second definition of this
// item would be a duplicate-definition compile error.
const LOCAL_IMAGE_CLOSE_TAG: &str = IMAGE_CLOSE_TAG;
|
||||
|
||||
pub fn image_open_tag_text() -> String {
|
||||
IMAGE_OPEN_TAG.to_string()
|
||||
}
|
||||
|
||||
pub fn image_close_tag_text() -> String {
|
||||
IMAGE_CLOSE_TAG.to_string()
|
||||
}
|
||||
|
||||
pub fn local_image_label_text(label_number: usize) -> String {
|
||||
format!("[Image #{label_number}]")
|
||||
@@ -201,7 +211,15 @@ pub fn is_local_image_open_tag_text(text: &str) -> bool {
|
||||
}
|
||||
|
||||
pub fn is_local_image_close_tag_text(text: &str) -> bool {
|
||||
text == LOCAL_IMAGE_CLOSE_TAG
|
||||
is_image_close_tag_text(text)
|
||||
}
|
||||
|
||||
/// Returns `true` when `text` is exactly the unnamed image opening tag
/// (`IMAGE_OPEN_TAG`); any other text, including surrounding whitespace,
/// yields `false`.
pub fn is_image_open_tag_text(text: &str) -> bool {
    matches!(text, IMAGE_OPEN_TAG)
}
|
||||
|
||||
/// Returns `true` when `text` is exactly the image closing tag
/// (`IMAGE_CLOSE_TAG`); any other text yields `false`.
pub fn is_image_close_tag_text(text: &str) -> bool {
    matches!(text, IMAGE_CLOSE_TAG)
}
|
||||
|
||||
fn invalid_image_error_placeholder(
|
||||
@@ -375,7 +393,15 @@ impl From<Vec<UserInput>> for ResponseInputItem {
|
||||
.into_iter()
|
||||
.flat_map(|c| match c {
|
||||
UserInput::Text { text } => vec![ContentItem::InputText { text }],
|
||||
UserInput::Image { image_url } => vec![ContentItem::InputImage { image_url }],
|
||||
UserInput::Image { image_url } => vec![
|
||||
ContentItem::InputText {
|
||||
text: image_open_tag_text(),
|
||||
},
|
||||
ContentItem::InputImage { image_url },
|
||||
ContentItem::InputText {
|
||||
text: image_close_tag_text(),
|
||||
},
|
||||
],
|
||||
UserInput::LocalImage { path } => {
|
||||
image_index += 1;
|
||||
local_image_content_items_with_label_number(&path, true, image_index)
|
||||
@@ -820,6 +846,33 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Checks that converting a single `UserInput::Image` into a
/// `ResponseInputItem` yields the image surrounded by the open and close tag
/// texts, in that order.
#[test]
fn wraps_image_user_input_with_tags() -> Result<()> {
    let image_url = String::from("data:image/png;base64,abc");

    // Expected layout: open tag text, the image itself, close tag text.
    let expected = vec![
        ContentItem::InputText {
            text: image_open_tag_text(),
        },
        ContentItem::InputImage {
            image_url: image_url.clone(),
        },
        ContentItem::InputText {
            text: image_close_tag_text(),
        },
    ];

    let inputs = vec![UserInput::Image { image_url }];
    match ResponseInputItem::from(inputs) {
        ResponseInputItem::Message { content, .. } => assert_eq!(content, expected),
        other => panic!("expected message response but got {other:?}"),
    }

    Ok(())
}
|
||||
|
||||
#[test]
|
||||
fn local_image_read_error_adds_placeholder() -> Result<()> {
|
||||
let dir = tempdir()?;
|
||||
|
||||
Reference in New Issue
Block a user