Compare commits

...

1 Commits

Author SHA1 Message Date
Vivian Fang
2394ba310d [local-im] expose local image resize config 2026-05-02 01:43:57 -07:00
9 changed files with 188 additions and 71 deletions

View File

@@ -616,6 +616,8 @@ Turns attach user input (text or images) to a thread and trigger Codex generatio
- `{"type":"image","url":"https://…png"}`
- `{"type":"localImage","path":"/tmp/screenshot.png"}`
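Before Codex sends local image data to the model, local image files are processed according to the thread config keys `local_image_resize_policy` (`"resize_to_fit"` or `"original"`; defaults to `"resize_to_fit"`) and `local_image_max_dimension` (default `2048`; only applies when the policy is `"resize_to_fit"`).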
You can optionally specify config overrides on the new turn. If specified, these settings become the default for subsequent turns on the same thread. `outputSchema` applies only to the current turn. Experimental `environments` is turn-scoped: omit it to inherit the thread's sticky environments, pass `[]` to run the turn with no environments, or pass explicit environment ids to override the sticky selection for this turn only.
`approvalsReviewer` accepts:

View File

@@ -87,6 +87,14 @@ const fn default_hide_agent_reasoning() -> Option<bool> {
Some(false)
}
// Policy for how local image files are transformed before being sent to the model.
// Serialized in snake_case, so the config values are `resize_to_fit` and `original`.
// NOTE(review): plain `//` comments are used on purpose; `///` docs would presumably be
// picked up by the `JsonSchema` derive and change the generated schema - confirm before
// converting them to doc comments.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Default, PartialEq, Eq, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum LocalImageResizePolicy {
// Default policy: images larger than the configured maximum dimension are scaled down to fit.
#[default]
ResizeToFit,
// Images are never resized, whatever their dimensions.
Original,
}
/// Base config deserialized from ~/.codex/config.toml.
#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, JsonSchema)]
#[schemars(deny_unknown_fields)]
@@ -175,6 +183,14 @@ pub struct ConfigToml {
/// Compact prompt used for history compaction.
pub compact_prompt: Option<String>,
/// Controls how local image files are transformed before sending them to the model.
pub local_image_resize_policy: Option<LocalImageResizePolicy>,
/// Maximum width or height for local image files when `local_image_resize_policy` is
/// `resize_to_fit`.
#[schemars(range(min = 1))]
pub local_image_max_dimension: Option<u32>,
/// Optional commit attribution text for commit message co-author trailers.
///
/// Set to an empty string to disable automatic commit attribution.

View File

@@ -1067,6 +1067,13 @@
],
"description": "One action binding value in config.\n\nThis accepts either:\n\n1. A single key spec string (`\"ctrl-a\"`). 2. A list of key spec strings (`[\"ctrl-a\", \"alt-a\"]`).\n\nAn empty list explicitly unbinds the action in that scope. Because an explicit empty list is still a configured value, runtime resolution must not fall through to global or built-in defaults for that action."
},
"LocalImageResizePolicy": {
"enum": [
"resize_to_fit",
"original"
],
"type": "string"
},
"MarketplaceConfig": {
"additionalProperties": false,
"properties": {
@@ -4147,6 +4154,20 @@
"description": "System instructions.",
"type": "string"
},
"local_image_max_dimension": {
"description": "Maximum width or height for local image files when `local_image_resize_policy` is `resize_to_fit`.",
"format": "uint32",
"minimum": 1.0,
"type": "integer"
},
"local_image_resize_policy": {
"allOf": [
{
"$ref": "#/definitions/LocalImageResizePolicy"
}
],
"description": "Controls how local image files are transformed before sending them to the model."
},
"log_dir": {
"allOf": [
{

View File

@@ -6428,6 +6428,7 @@ async fn test_precedence_fixture_with_o3_profile() -> std::io::Result<()> {
include_skill_instructions: true,
include_environment_context: true,
compact_prompt: None,
local_image: LocalImageConfig::default(),
commit_attribution: None,
forced_chatgpt_workspace_id: None,
forced_login_method: None,
@@ -6630,6 +6631,7 @@ async fn test_precedence_fixture_with_gpt3_profile() -> std::io::Result<()> {
include_skill_instructions: true,
include_environment_context: true,
compact_prompt: None,
local_image: LocalImageConfig::default(),
commit_attribution: None,
forced_chatgpt_workspace_id: None,
forced_login_method: None,
@@ -6786,6 +6788,7 @@ async fn test_precedence_fixture_with_zdr_profile() -> std::io::Result<()> {
include_skill_instructions: true,
include_environment_context: true,
compact_prompt: None,
local_image: LocalImageConfig::default(),
commit_attribution: None,
forced_chatgpt_workspace_id: None,
forced_login_method: None,
@@ -6927,6 +6930,7 @@ async fn test_precedence_fixture_with_gpt5_profile() -> std::io::Result<()> {
include_skill_instructions: true,
include_environment_context: true,
compact_prompt: None,
local_image: LocalImageConfig::default(),
commit_attribution: None,
forced_chatgpt_workspace_id: None,
forced_login_method: None,

View File

@@ -26,6 +26,7 @@ use codex_config::ThreadConfigLoader;
use codex_config::config_toml::ConfigLockfileToml;
use codex_config::config_toml::ConfigToml;
use codex_config::config_toml::DEFAULT_PROJECT_DOC_MAX_BYTES;
use codex_config::config_toml::LocalImageResizePolicy;
use codex_config::config_toml::ProjectConfig;
use codex_config::config_toml::RealtimeAudioConfig;
use codex_config::config_toml::RealtimeConfig;
@@ -101,6 +102,8 @@ use codex_protocol::protocol::AskForApproval;
use codex_protocol::protocol::SandboxPolicy;
use codex_utils_absolute_path::AbsolutePathBuf;
use codex_utils_absolute_path::AbsolutePathBufGuard;
use codex_utils_image::MAX_DIMENSION;
use codex_utils_image::PromptImageMode;
use serde::Deserialize;
use serde::Serialize;
use std::collections::BTreeMap;
@@ -472,6 +475,9 @@ pub struct Config {
/// Compact prompt override.
pub compact_prompt: Option<String>,
/// Local image prompt serialization settings.
pub local_image: LocalImageConfig,
/// Optional commit attribution text for commit message co-author trailers.
///
/// - `None`: use default attribution (`Codex <noreply@openai.com>`)
@@ -854,6 +860,32 @@ pub struct TerminalResizeReflowConfig {
pub max_rows: TerminalResizeReflowMaxRows,
}
/// Resolved local image settings stored on `Config`.
///
/// Built from the `local_image_resize_policy` and `local_image_max_dimension` config keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LocalImageConfig {
/// Whether local images are resized to fit or sent at their original size.
pub resize_policy: LocalImageResizePolicy,
/// Maximum width or height used when `resize_policy` is `ResizeToFit`.
pub max_dimension: u32,
}
impl Default for LocalImageConfig {
    /// Returns the default resize policy paired with the shared `MAX_DIMENSION` limit.
    fn default() -> Self {
        let resize_policy = LocalImageResizePolicy::default();
        let max_dimension = MAX_DIMENSION;
        Self {
            resize_policy,
            max_dimension,
        }
    }
}
impl LocalImageConfig {
    /// Translates these settings into the `PromptImageMode` used when encoding images.
    ///
    /// `ResizeToFit` carries `max_dimension` along; `Original` ignores it.
    pub(crate) fn prompt_image_mode(self) -> PromptImageMode {
        let Self {
            resize_policy,
            max_dimension,
        } = self;
        match resize_policy {
            LocalImageResizePolicy::Original => PromptImageMode::Original,
            LocalImageResizePolicy::ResizeToFit => PromptImageMode::ResizeToFit { max_dimension },
        }
    }
}
impl AuthManagerConfig for Config {
fn codex_home(&self) -> PathBuf {
self.codex_home.to_path_buf()
@@ -1938,6 +1970,23 @@ fn resolve_terminal_resize_reflow_config(config_toml: &ConfigToml) -> TerminalRe
}
}
/// Resolves the local image settings from the parsed config file.
///
/// Missing keys fall back to the default resize policy and `MAX_DIMENSION`.
///
/// # Errors
///
/// Returns an `InvalidInput` I/O error when the resolved maximum dimension is zero.
fn resolve_local_image_config(config_toml: &ConfigToml) -> std::io::Result<LocalImageConfig> {
    let resize_policy = config_toml.local_image_resize_policy.unwrap_or_default();
    match config_toml.local_image_max_dimension.unwrap_or(MAX_DIMENSION) {
        // A zero-sized bounding box is rejected up front.
        0 => Err(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            "local_image_max_dimension must be at least 1",
        )),
        max_dimension => Ok(LocalImageConfig {
            resize_policy,
            max_dimension,
        }),
    }
}
fn multi_agent_v2_toml_config(features: Option<&FeaturesToml>) -> Option<&MultiAgentV2ConfigToml> {
match features?.multi_agent_v2.as_ref()? {
FeatureToml::Enabled(_) => None,
@@ -2477,6 +2526,7 @@ impl Config {
.unwrap_or(WebSearchMode::Cached);
let web_search_config = resolve_web_search_config(&cfg, &config_profile);
let multi_agent_v2 = resolve_multi_agent_v2_config(&cfg, &config_profile);
let local_image = resolve_local_image_config(&cfg)?;
let apps_mcp_path_override = if features.enabled(Feature::AppsMcpPathOverride) {
let base = apps_mcp_path_override_toml_config(cfg.features.as_ref());
let profile = apps_mcp_path_override_toml_config(config_profile.features.as_ref());
@@ -2920,6 +2970,7 @@ impl Config {
personality,
developer_instructions,
compact_prompt,
local_image,
commit_attribution,
include_permissions_instructions,
include_apps_instructions,

View File

@@ -84,6 +84,7 @@ use codex_protocol::models::ContentItem;
use codex_protocol::models::MessagePhase;
use codex_protocol::models::ResponseInputItem;
use codex_protocol::models::ResponseItem;
use codex_protocol::models::response_input_item_from_user_input;
use codex_protocol::protocol::AgentMessageContentDeltaEvent;
use codex_protocol::protocol::AgentReasoningSectionBreakEvent;
use codex_protocol::protocol::AskForApproval;
@@ -303,7 +304,10 @@ pub(crate) async fn run_turn(
let additional_contexts = if input.is_empty() {
Vec::new()
} else {
let initial_input_for_turn: ResponseInputItem = ResponseInputItem::from(input.clone());
let initial_input_for_turn = response_input_item_from_user_input(
input.clone(),
turn_context.config.local_image.prompt_image_mode(),
);
let response_item: ResponseItem = initial_input_for_turn.clone().into();
let user_prompt_submit_outcome = run_user_prompt_submit_hooks(
&sess,

View File

@@ -135,7 +135,7 @@ impl ToolHandler for ViewImageHandler {
let image_mode = if use_original_detail {
PromptImageMode::Original
} else {
PromptImageMode::ResizeToFit
turn.config.local_image.prompt_image_mode()
};
let image_detail = Some(if use_original_detail {
ImageDetail::Original

View File

@@ -1197,47 +1197,54 @@ pub enum ReasoningItemContent {
Text { text: String },
}
/// Builds the `"user"` message for a turn from the submitted input items.
///
/// Text items become `InputText` content. Remote images are wrapped in open/close tag text
/// items. Local images are read from disk and encoded using `local_image_mode`; if the file
/// cannot be read, an error placeholder item is emitted instead. Skill and mention items
/// contribute no content here.
pub fn response_input_item_from_user_input(
    items: Vec<UserInput>,
    local_image_mode: PromptImageMode,
) -> ResponseInputItem {
    // Running count of images seen so far; used to number local image labels.
    let mut image_index = 0;
    let mut content: Vec<ContentItem> = Vec::new();

    for item in items {
        match item {
            UserInput::Text { text, .. } => content.push(ContentItem::InputText { text }),
            UserInput::Image { image_url } => {
                image_index += 1;
                content.extend([
                    ContentItem::InputText {
                        text: image_open_tag_text(),
                    },
                    ContentItem::InputImage {
                        image_url,
                        detail: Some(DEFAULT_IMAGE_DETAIL),
                    },
                    ContentItem::InputText {
                        text: image_close_tag_text(),
                    },
                ]);
            }
            UserInput::LocalImage { path } => {
                image_index += 1;
                match std::fs::read(&path) {
                    Ok(file_bytes) => content.extend(local_image_content_items_with_label_number(
                        &path,
                        file_bytes,
                        Some(image_index),
                        local_image_mode,
                    )),
                    Err(err) => content.push(local_image_error_placeholder(&path, err)),
                }
            }
            // Tool bodies are injected later in core
            UserInput::Skill { .. } | UserInput::Mention { .. } => {}
        }
    }

    ResponseInputItem::Message {
        role: "user".to_string(),
        content,
        phase: None,
    }
}
impl From<Vec<UserInput>> for ResponseInputItem {
fn from(items: Vec<UserInput>) -> Self {
let mut image_index = 0;
Self::Message {
role: "user".to_string(),
content: items
.into_iter()
.flat_map(|c| match c {
UserInput::Text { text, .. } => vec![ContentItem::InputText { text }],
UserInput::Image { image_url } => {
image_index += 1;
vec![
ContentItem::InputText {
text: image_open_tag_text(),
},
ContentItem::InputImage {
image_url,
detail: Some(DEFAULT_IMAGE_DETAIL),
},
ContentItem::InputText {
text: image_close_tag_text(),
},
]
}
UserInput::LocalImage { path } => {
image_index += 1;
match std::fs::read(&path) {
Ok(file_bytes) => local_image_content_items_with_label_number(
&path,
file_bytes,
Some(image_index),
PromptImageMode::ResizeToFit,
),
Err(err) => vec![local_image_error_placeholder(&path, err)],
}
}
UserInput::Skill { .. } | UserInput::Mention { .. } => Vec::new(), // Tool bodies are injected later in core
})
.collect::<Vec<ContentItem>>(),
phase: None,
}
response_input_item_from_user_input(items, PromptImageMode::default())
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, JsonSchema, TS)]

View File

@@ -39,10 +39,18 @@ impl EncodedImage {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PromptImageMode {
ResizeToFit,
ResizeToFit { max_dimension: u32 },
Original,
}
impl Default for PromptImageMode {
    /// Defaults to resize-to-fit, bounded by the crate-wide `MAX_DIMENSION`.
    fn default() -> Self {
        let max_dimension = MAX_DIMENSION;
        Self::ResizeToFit { max_dimension }
    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct ImageCacheKey {
digest: [u8; 20],
@@ -77,30 +85,17 @@ pub fn load_for_prompt_bytes(
.map_err(|source| ImageProcessingError::decode_error(&path_buf, source))?;
let (width, height) = dynamic.dimensions();
let encoded = if mode == PromptImageMode::Original
|| (width <= MAX_DIMENSION && height <= MAX_DIMENSION)
{
if let Some(format) = format.filter(|format| can_preserve_source_bytes(*format)) {
let mime = format_to_mime(format);
EncodedImage {
bytes: file_bytes,
mime,
width,
height,
}
} else {
let (bytes, output_format) = encode_image(&dynamic, ImageFormat::Png)?;
let mime = format_to_mime(output_format);
EncodedImage {
bytes,
mime,
width,
height,
}
let resize_to = match mode {
PromptImageMode::ResizeToFit { max_dimension }
if width > max_dimension || height > max_dimension =>
{
Some(max_dimension)
}
} else {
let resized = dynamic.resize(MAX_DIMENSION, MAX_DIMENSION, FilterType::Triangle);
PromptImageMode::ResizeToFit { .. } | PromptImageMode::Original => None,
};
let encoded = if let Some(max_dimension) = resize_to {
let resized = dynamic.resize(max_dimension, max_dimension, FilterType::Triangle);
let target_format = format
.filter(|format| can_preserve_source_bytes(*format))
.unwrap_or(ImageFormat::Png);
@@ -112,6 +107,23 @@ pub fn load_for_prompt_bytes(
width: resized.width(),
height: resized.height(),
}
} else if let Some(format) = format.filter(|format| can_preserve_source_bytes(*format)) {
let mime = format_to_mime(format);
EncodedImage {
bytes: file_bytes,
mime,
width,
height,
}
} else {
let (bytes, output_format) = encode_image(&dynamic, ImageFormat::Png)?;
let mime = format_to_mime(output_format);
EncodedImage {
bytes,
mime,
width,
height,
}
};
Ok(encoded)
@@ -223,7 +235,7 @@ mod tests {
let encoded = load_for_prompt_bytes(
Path::new("in-memory-image"),
original_bytes.clone(),
PromptImageMode::ResizeToFit,
PromptImageMode::default(),
)
.expect("process image");
@@ -246,7 +258,7 @@ mod tests {
let processed = load_for_prompt_bytes(
Path::new("in-memory-image"),
original_bytes,
PromptImageMode::ResizeToFit,
PromptImageMode::default(),
)
.expect("process image");
@@ -272,7 +284,7 @@ mod tests {
let processed = load_for_prompt_bytes(
Path::new("in-memory-image"),
original_bytes,
PromptImageMode::ResizeToFit,
PromptImageMode::default(),
)
.expect("process image");
@@ -304,7 +316,7 @@ mod tests {
let err = load_for_prompt_bytes(
Path::new("in-memory-image"),
b"not an image".to_vec(),
PromptImageMode::ResizeToFit,
PromptImageMode::default(),
)
.expect_err("invalid image should fail");
assert!(matches!(
@@ -326,7 +338,7 @@ mod tests {
let first = load_for_prompt_bytes(
Path::new("in-memory-image"),
first_bytes,
PromptImageMode::ResizeToFit,
PromptImageMode::default(),
)
.expect("process first image");
@@ -336,7 +348,7 @@ mod tests {
let second = load_for_prompt_bytes(
Path::new("in-memory-image"),
second_bytes,
PromptImageMode::ResizeToFit,
PromptImageMode::default(),
)
.expect("process updated image");