Compare commits

...

1 Commits

Author SHA1 Message Date
Colin Young
8db0700334 app-server: expose image-context tri-state 2026-02-10 13:03:57 -08:00
20 changed files with 733 additions and 22 deletions

View File

@@ -3802,6 +3802,25 @@
],
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
{
"description": "Plain text turns and tool payloads.",
"enum": [
"text"
],
"type": "string"
},
{
"description": "Image attachments included in user turns.",
"enum": [
"image"
],
"type": "string"
}
]
},
"ItemCompletedNotification": {
"properties": {
"item": {
@@ -6010,6 +6029,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -14405,6 +14405,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/v2/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -382,6 +382,25 @@
},
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
{
"description": "Plain text turns and tool payloads.",
"enum": [
"text"
],
"type": "string"
},
{
"description": "Image attachments included in user turns.",
"enum": [
"image"
],
"type": "string"
}
]
},
"McpToolCallError": {
"properties": {
"message": {
@@ -701,6 +720,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -369,6 +369,25 @@
},
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
{
"description": "Plain text turns and tool payloads.",
"enum": [
"text"
],
"type": "string"
},
{
"description": "Image attachments included in user turns.",
"enum": [
"image"
],
"type": "string"
}
]
},
"McpToolCallError": {
"properties": {
"message": {
@@ -574,6 +593,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -369,6 +369,25 @@
},
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
{
"description": "Plain text turns and tool payloads.",
"enum": [
"text"
],
"type": "string"
},
{
"description": "Image attachments included in user turns.",
"enum": [
"image"
],
"type": "string"
}
]
},
"McpToolCallError": {
"properties": {
"message": {
@@ -574,6 +593,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -382,6 +382,25 @@
},
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
{
"description": "Plain text turns and tool payloads.",
"enum": [
"text"
],
"type": "string"
},
{
"description": "Image attachments included in user turns.",
"enum": [
"image"
],
"type": "string"
}
]
},
"McpToolCallError": {
"properties": {
"message": {
@@ -701,6 +720,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -369,6 +369,25 @@
},
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
{
"description": "Plain text turns and tool payloads.",
"enum": [
"text"
],
"type": "string"
},
{
"description": "Image attachments included in user turns.",
"enum": [
"image"
],
"type": "string"
}
]
},
"McpToolCallError": {
"properties": {
"message": {
@@ -574,6 +593,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -382,6 +382,25 @@
},
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
{
"description": "Plain text turns and tool payloads.",
"enum": [
"text"
],
"type": "string"
},
{
"description": "Image attachments included in user turns.",
"enum": [
"image"
],
"type": "string"
}
]
},
"McpToolCallError": {
"properties": {
"message": {
@@ -701,6 +720,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -369,6 +369,25 @@
},
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
{
"description": "Plain text turns and tool payloads.",
"enum": [
"text"
],
"type": "string"
},
{
"description": "Image attachments included in user turns.",
"enum": [
"image"
],
"type": "string"
}
]
},
"McpToolCallError": {
"properties": {
"message": {
@@ -574,6 +593,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -369,6 +369,25 @@
},
"type": "object"
},
"InputModality": {
"description": "Canonical user-input modality tags advertised by a model.",
"oneOf": [
{
"description": "Plain text turns and tool payloads.",
"enum": [
"text"
],
"type": "string"
},
{
"description": "Image attachments included in user turns.",
"enum": [
"image"
],
"type": "string"
}
]
},
"McpToolCallError": {
"properties": {
"message": {
@@ -574,6 +593,17 @@
"description": "Version of the CLI that created the thread.",
"type": "string"
},
"conversationModalities": {
"default": null,
"description": "Tri-state conversation modalities signal: - `None`: unknown / not yet determined - `Some([Text])`: known to be text-only - `Some([Text, Image])`: images are known to exist in context",
"items": {
"$ref": "#/definitions/InputModality"
},
"type": [
"array",
"null"
]
},
"createdAt": {
"description": "Unix timestamp (in seconds) when the thread was created.",
"format": "int64",

View File

@@ -1,6 +1,7 @@
// GENERATED CODE! DO NOT MODIFY BY HAND!
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
import type { InputModality } from "../InputModality";
import type { GitInfo } from "./GitInfo";
import type { SessionSource } from "./SessionSource";
import type { Turn } from "./Turn";
@@ -10,6 +11,13 @@ export type Thread = { id: string,
* Usually the first user message in the thread, if available.
*/
preview: string,
/**
* Tri-state conversation modalities signal:
* - `None`: unknown / not yet determined
* - `Some([Text])`: known to be text-only
* - `Some([Text, Image])`: images are known to exist in context
*/
conversationModalities: Array<InputModality> | null,
/**
* Model provider used for this thread (for example, 'openai').
*/

View File

@@ -1958,6 +1958,12 @@ pub struct Thread {
pub id: String,
/// Usually the first user message in the thread, if available.
pub preview: String,
/// Tri-state conversation modalities signal:
/// - `None`: unknown / not yet determined
/// - `Some([Text])`: known to be text-only
/// - `Some([Text, Image])`: images are known to exist in context
#[serde(default)]
pub conversation_modalities: Option<Vec<InputModality>>,
/// Model provider used for this thread (for example, 'openai').
pub model_provider: String,
/// Unix timestamp (in seconds) when the thread was created.

View File

@@ -5,6 +5,7 @@ use crate::codex_message_processor::TurnSummary;
use crate::codex_message_processor::TurnSummaryStore;
use crate::codex_message_processor::read_event_msgs_from_rollout;
use crate::codex_message_processor::read_summary_from_rollout;
use crate::codex_message_processor::fetch_state_db_conversation_modalities;
use crate::codex_message_processor::summary_to_thread;
use crate::error_code::INTERNAL_ERROR_CODE;
use crate::error_code::INVALID_REQUEST_ERROR_CODE;
@@ -46,6 +47,8 @@ use codex_app_server_protocol::PatchApplyStatus;
use codex_app_server_protocol::PatchChangeKind as V2PatchChangeKind;
use codex_app_server_protocol::PlanDeltaNotification;
use codex_app_server_protocol::RawResponseItemCompletedNotification;
use codex_protocol::openai_models::InputModality;
use codex_protocol::openai_models::input_modalities_to_mask;
use codex_app_server_protocol::ReasoningSummaryPartAddedNotification;
use codex_app_server_protocol::ReasoningSummaryTextDeltaNotification;
use codex_app_server_protocol::ReasoningTextDeltaNotification;
@@ -100,6 +103,7 @@ use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::oneshot;
use tracing::error;
use tracing::warn;
type JsonValue = serde_json::Value;
@@ -852,6 +856,21 @@ pub(crate) async fn apply_bespoke_event_handling(
.await;
}
EventMsg::RawResponseItem(raw_response_item_event) => {
if raw_response_item_event.item.has_input_image() {
let modalities = [InputModality::Text, InputModality::Image];
if let Some(ctx) = conversation.state_db()
&& let Err(err) = ctx
.set_thread_conversation_modalities(
conversation_id,
input_modalities_to_mask(&modalities),
)
.await
{
warn!(
"failed to persist conversation modalities for thread {conversation_id}: {err}"
);
}
}
maybe_emit_raw_response_item_completed(
api_version,
conversation_id,
@@ -1100,7 +1119,12 @@ pub(crate) async fn apply_bespoke_event_handling(
.await
{
Ok(summary) => {
let mut thread = summary_to_thread(summary);
let conversation_modalities = fetch_state_db_conversation_modalities(
conversation.state_db().as_ref(),
conversation_id,
)
.await;
let mut thread = summary_to_thread(summary, conversation_modalities);
match read_event_msgs_from_rollout(rollout_path.as_path()).await {
Ok(events) => {
thread.turns = build_turns_from_event_msgs(&events);

View File

@@ -200,8 +200,7 @@ use codex_core::rollout_date_parts;
use codex_core::sandboxing::SandboxPermissions;
use codex_core::skills::remote::download_remote_skill;
use codex_core::skills::remote::list_remote_skills;
use codex_core::state_db::StateDbHandle;
use codex_core::state_db::get_state_db;
use codex_core::state_db::{get_state_db, StateDbHandle};
use codex_core::windows_sandbox::WindowsSandboxLevelExt;
use codex_feedback::CodexFeedback;
use codex_login::ServerOptions as LoginServerOptions;
@@ -215,6 +214,9 @@ use codex_protocol::config_types::WindowsSandboxLevel;
use codex_protocol::dynamic_tools::DynamicToolSpec as CoreDynamicToolSpec;
use codex_protocol::items::TurnItem;
use codex_protocol::models::ResponseItem;
use codex_protocol::openai_models::InputModality;
use codex_protocol::openai_models::input_modalities_from_mask;
use codex_protocol::openai_models::input_modalities_to_mask;
use codex_protocol::protocol::AgentStatus;
use codex_protocol::protocol::GitInfo as CoreGitInfo;
use codex_protocol::protocol::McpAuthStatus as CoreMcpAuthStatus;
@@ -1824,11 +1826,44 @@ impl CodexMessageProcessor {
..
} = new_conv;
let config_snapshot = thread.config_snapshot().await;
let thread = build_thread_from_snapshot(
thread_id,
&config_snapshot,
session_configured.rollout_path.clone(),
);
let fallback_provider = self.config.model_provider_id.as_str();
let state_db_ctx = get_state_db(&self.config, None).await;
// A bit hacky, but the summary contains a lot of useful information for the thread
// that unfortunately does not get returned from thread_manager.start_thread().
let thread = match session_configured.rollout_path.as_ref() {
Some(rollout_path) => {
match read_summary_from_rollout(rollout_path.as_path(), fallback_provider)
.await
{
Ok(summary) => {
let conversation_modalities = resolve_conversation_modalities(
state_db_ctx.as_deref(),
thread_id,
None,
Some(rollout_path.as_path()),
)
.await;
summary_to_thread(summary, conversation_modalities)
}
Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
build_ephemeral_thread(thread_id, &config_snapshot)
}
Err(err) => {
self.send_internal_error(
request_id,
format!(
"failed to load rollout `{}` for thread {thread_id}: {err}",
rollout_path.display()
),
)
.await;
return;
}
}
}
None => build_ephemeral_thread(thread_id, &config_snapshot),
};
let response = ThreadStartResponse {
thread: thread.clone(),
@@ -2140,7 +2175,9 @@ impl CodexMessageProcessor {
message: format!("failed to read unarchived thread: {err}"),
data: None,
})?;
Ok(summary_to_thread(summary))
let conversation_modalities =
fetch_state_db_conversation_modalities(state_db_ctx.as_ref(), summary.conversation_id).await;
Ok(summary_to_thread(summary, conversation_modalities))
}
.await;
@@ -2291,7 +2328,13 @@ impl CodexMessageProcessor {
}
};
let data = summaries.into_iter().map(summary_to_thread).collect();
let state_db_ctx = get_state_db(&self.config, None).await;
let mut data = Vec::with_capacity(summaries.len());
for summary in summaries {
let conversation_modalities =
fetch_state_db_conversation_modalities(state_db_ctx.as_ref(), summary.conversation_id).await;
data.push(summary_to_thread(summary, conversation_modalities));
}
let response = ThreadListResponse { data, next_cursor };
self.outgoing.send_response(request_id, response).await;
}
@@ -2401,12 +2444,21 @@ impl CodexMessageProcessor {
return;
}
let state_db_ctx = get_state_db(&self.config, None).await;
let mut thread = if let Some(summary) = db_summary {
summary_to_thread(summary)
let conversation_modalities =
fetch_state_db_conversation_modalities(state_db_ctx.as_ref(), thread_uuid).await;
summary_to_thread(summary, conversation_modalities)
} else if let Some(rollout_path) = rollout_path.as_ref() {
let fallback_provider = self.config.model_provider_id.as_str();
match read_summary_from_rollout(rollout_path, fallback_provider).await {
Ok(summary) => summary_to_thread(summary),
Ok(summary) => {
let conversation_modalities =
fetch_state_db_conversation_modalities(state_db_ctx.as_ref(), thread_uuid)
.await;
summary_to_thread(summary, conversation_modalities)
}
Err(err) => {
self.send_internal_error(
request_id,
@@ -2441,7 +2493,17 @@ impl CodexMessageProcessor {
if include_turns {
rollout_path = loaded_rollout_path.clone();
}
build_thread_from_snapshot(thread_uuid, &config_snapshot, loaded_rollout_path)
let mut thread = if loaded_rollout_path.is_some() {
build_thread_from_snapshot(thread_uuid, &config_snapshot, loaded_rollout_path)
} else {
build_ephemeral_thread(thread_uuid, &config_snapshot)
};
if let Some(modalities) =
fetch_state_db_conversation_modalities(state_db_ctx.as_ref(), thread_uuid).await
{
thread.conversation_modalities = Some(modalities);
}
thread
};
if include_turns && let Some(rollout_path) = rollout_path.as_ref() {
@@ -2628,6 +2690,7 @@ impl CodexMessageProcessor {
};
let fallback_model_provider = config.model_provider_id.clone();
let state_db_ctx = get_state_db(&config, None).await;
match self
.thread_manager
@@ -2664,13 +2727,13 @@ impl CodexMessageProcessor {
);
}
let mut thread = match read_summary_from_rollout(
let summary = match read_summary_from_rollout(
rollout_path.as_path(),
fallback_model_provider.as_str(),
)
.await
{
Ok(summary) => summary_to_thread(summary),
Ok(summary) => summary,
Err(err) => {
self.send_internal_error(
request_id,
@@ -2683,6 +2746,14 @@ impl CodexMessageProcessor {
return;
}
};
let conversation_modalities = resolve_conversation_modalities(
state_db_ctx.as_deref(),
thread_id,
None,
Some(rollout_path.as_path()),
)
.await;
let mut thread = summary_to_thread(summary, conversation_modalities);
thread.turns = initial_messages
.as_deref()
.map_or_else(Vec::new, build_turns_from_event_msgs);
@@ -2872,13 +2943,13 @@ impl CodexMessageProcessor {
);
}
let mut thread = match read_summary_from_rollout(
let summary = match read_summary_from_rollout(
rollout_path.as_path(),
fallback_model_provider.as_str(),
)
.await
{
Ok(summary) => summary_to_thread(summary),
Ok(summary) => summary,
Err(err) => {
self.send_internal_error(
request_id,
@@ -2891,6 +2962,14 @@ impl CodexMessageProcessor {
return;
}
};
let conversation_modalities = resolve_conversation_modalities(
state_db_ctx.as_deref(),
thread_id,
None,
Some(rollout_path.as_path()),
)
.await;
let mut thread = summary_to_thread(summary, conversation_modalities);
thread.turns = initial_messages
.as_deref()
.map_or_else(Vec::new, build_turns_from_event_msgs);
@@ -4173,6 +4252,10 @@ impl CodexMessageProcessor {
return;
};
let has_image_input = items.iter().any(|item| {
matches!(item, WireInputItem::Image { .. } | WireInputItem::LocalImage { .. })
});
let mapped_items: Vec<CoreInputItem> = items
.into_iter()
.map(|item| match item {
@@ -4188,6 +4271,16 @@ impl CodexMessageProcessor {
})
.collect();
if has_image_input {
let modalities = conversation_modalities_for_has_image(true);
persist_conversation_modalities(
conversation.state_db(),
conversation_id,
&modalities,
)
.await;
}
// Submit user input to the conversation.
let _ = conversation
.submit(Op::UserInput {
@@ -4225,6 +4318,10 @@ impl CodexMessageProcessor {
return;
};
let has_image_input = items.iter().any(|item| {
matches!(item, WireInputItem::Image { .. } | WireInputItem::LocalImage { .. })
});
let mapped_items: Vec<CoreInputItem> = items
.into_iter()
.map(|item| match item {
@@ -4240,6 +4337,16 @@ impl CodexMessageProcessor {
})
.collect();
if has_image_input {
let modalities = conversation_modalities_for_has_image(true);
persist_conversation_modalities(
conversation.state_db(),
conversation_id,
&modalities,
)
.await;
}
let _ = conversation
.submit(Op::UserTurn {
items: mapped_items,
@@ -4933,7 +5040,13 @@ impl CodexMessageProcessor {
if let Some(rollout_path) = review_thread.rollout_path() {
match read_summary_from_rollout(rollout_path.as_path(), fallback_provider).await {
Ok(summary) => {
let thread = summary_to_thread(summary);
let state_db_ctx = get_state_db(&self.config, None).await;
let conversation_modalities = fetch_state_db_conversation_modalities(
state_db_ctx.as_ref(),
summary.conversation_id,
)
.await;
let thread = summary_to_thread(summary, conversation_modalities);
let notif = ThreadStartedNotification { thread };
self.outgoing
.send_server_notification(ServerNotification::ThreadStarted(notif))
@@ -5716,7 +5829,6 @@ pub(crate) async fn read_summary_from_rollout(
fallback_provider: &str,
) -> std::io::Result<ConversationSummary> {
let head = read_head_for_summary(path).await?;
let Some(first) = head.first() else {
return Err(IoError::other(format!(
"rollout at {} is empty",
@@ -5797,6 +5909,109 @@ pub(crate) async fn read_event_msgs_from_rollout(
.collect())
}
pub(crate) async fn fetch_state_db_conversation_modalities(
state_db_ctx: Option<&StateDbHandle>,
thread_id: ThreadId,
) -> Option<Vec<InputModality>> {
let ctx = state_db_ctx?;
match ctx.get_thread(thread_id).await {
Ok(Some(metadata)) => metadata
.conversation_modalities
.map(input_modalities_from_mask),
Ok(None) => None,
Err(err) => {
warn!("failed to read conversation modalities for thread {thread_id}: {err}");
None
}
}
}
/// Best-effort write of `conversation_modalities` for `thread_id` to the state DB.
///
/// The modalities are converted to a mask with `input_modalities_to_mask` before
/// being stored. Does nothing when no state DB handle is supplied; a failed write
/// is logged with `warn!` and not propagated to the caller.
async fn persist_conversation_modalities(
    state_db_ctx: Option<&StateDbHandle>,
    thread_id: ThreadId,
    conversation_modalities: &[InputModality],
) {
    if let Some(state_db) = state_db_ctx {
        let mask = input_modalities_to_mask(conversation_modalities);
        let outcome = state_db
            .set_thread_conversation_modalities(thread_id, mask)
            .await;
        if let Err(err) = outcome {
            warn!(
                "failed to persist conversation modalities for thread {thread_id}: {err}"
            );
        }
    }
}
/// Resolves the conversation modalities for a thread, trying each source in order:
///
/// 1. `current_modalities`, when supplied: returned as-is, and additionally
///    persisted if it contains [`InputModality::Image`];
/// 2. the value stored in the state DB;
/// 3. a scan of the rollout at `rollout_path`, whose result is persisted before
///    being returned.
///
/// Returns `None` when none of the sources yields an answer. This includes a
/// failed rollout scan, which is logged with `warn!`.
pub(crate) async fn resolve_conversation_modalities(
    state_db_ctx: Option<&StateDbHandle>,
    thread_id: ThreadId,
    current_modalities: Option<Vec<InputModality>>,
    rollout_path: Option<&Path>,
) -> Option<Vec<InputModality>> {
    if let Some(known) = current_modalities {
        let has_image = known.contains(&InputModality::Image);
        if has_image {
            persist_conversation_modalities(state_db_ctx, thread_id, &known).await;
        }
        return Some(known);
    }

    let stored = fetch_state_db_conversation_modalities(state_db_ctx, thread_id).await;
    if stored.is_some() {
        return stored;
    }

    // Without a rollout there is nothing left to inspect.
    let rollout_path = rollout_path?;
    let has_image_context = match read_rollout_has_image_context(rollout_path).await {
        Ok(flag) => flag,
        Err(err) => {
            warn!(
                "failed to determine conversation modalities for rollout {}: {err}",
                rollout_path.display()
            );
            return None;
        }
    };

    let derived = conversation_modalities_for_has_image(has_image_context);
    persist_conversation_modalities(state_db_ctx, thread_id, &derived).await;
    Some(derived)
}
/// Reports whether the rollout stored at `path` contains any image input.
///
/// The rollout history is loaded through `RolloutRecorder::get_rollout_history`
/// and scanned back-to-front. An item counts as image context when it is a
/// response item for which `has_input_image()` is true, or a user-message event
/// with a non-empty `images` or `local_images` list.
///
/// # Errors
///
/// Propagates the I/O error returned while loading the rollout history.
pub(crate) async fn read_rollout_has_image_context(path: &Path) -> std::io::Result<bool> {
    let history = RolloutRecorder::get_rollout_history(path).await?;
    let items = match history {
        InitialHistory::New => Vec::new(),
        InitialHistory::Forked(forked) => forked,
        InitialHistory::Resumed(resumed) => resumed.history,
    };

    let found = items.into_iter().rev().any(|item| {
        if let RolloutItem::ResponseItem(response_item) = &item {
            return response_item.has_input_image();
        }
        if let RolloutItem::EventMsg(EventMsg::UserMessage(user_message)) = &item {
            let has_images = user_message
                .images
                .as_ref()
                .is_some_and(|images| !images.is_empty());
            return has_images || !user_message.local_images.is_empty();
        }
        false
    });
    Ok(found)
}
/// Maps the "has image context" flag to a modality list.
///
/// Text is always present; [`InputModality::Image`] is appended only when
/// `has_image_context` is `true`.
fn conversation_modalities_for_has_image(has_image_context: bool) -> Vec<InputModality> {
    let mut modalities = vec![InputModality::Text];
    if has_image_context {
        modalities.push(InputModality::Image);
    }
    modalities
}
fn extract_conversation_summary(
path: PathBuf,
head: &[serde_json::Value],
@@ -5830,7 +6045,6 @@ fn extract_conversation_summary(
.unwrap_or_else(|| fallback_provider.to_string());
let git_info = git.map(map_git_info);
let updated_at = updated_at.or_else(|| timestamp.clone());
Some(ConversationSummary {
conversation_id,
timestamp,
@@ -5882,6 +6096,7 @@ fn build_thread_from_snapshot(
Thread {
id: thread_id.to_string(),
preview: String::new(),
conversation_modalities: None,
model_provider: config_snapshot.model_provider_id.clone(),
created_at: now,
updated_at: now,
@@ -5894,7 +6109,10 @@ fn build_thread_from_snapshot(
}
}
pub(crate) fn summary_to_thread(summary: ConversationSummary) -> Thread {
pub(crate) fn summary_to_thread(
summary: ConversationSummary,
resolved_conversation_modalities: Option<Vec<InputModality>>,
) -> Thread {
let ConversationSummary {
conversation_id,
path,
@@ -5915,10 +6133,12 @@ pub(crate) fn summary_to_thread(summary: ConversationSummary) -> Thread {
branch: info.branch,
origin_url: info.origin_url,
});
let conversation_modalities = resolved_conversation_modalities;
Thread {
id: conversation_id.to_string(),
preview,
conversation_modalities,
model_provider,
created_at: created_at.map(|dt| dt.timestamp()).unwrap_or(0),
updated_at: updated_at.map(|dt| dt.timestamp()).unwrap_or(0),
@@ -6079,4 +6299,56 @@ mod tests {
assert_eq!(summary, expected);
Ok(())
}
/// A rollout whose user-message event carries an entry in `images` must be
/// reported as having image context.
#[tokio::test]
async fn read_rollout_has_image_context_detects_images() -> Result<()> {
    use codex_protocol::protocol::EventMsg;
    use codex_protocol::protocol::RolloutItem;
    use codex_protocol::protocol::RolloutLine;
    use codex_protocol::protocol::SessionMetaLine;
    use codex_protocol::protocol::UserMessageEvent;
    use std::fs;

    let temp_dir = TempDir::new()?;
    let path = temp_dir.path().join("rollout.jsonl");
    let conversation_id = ThreadId::from_string("f3225d70-c282-4eaf-bb39-c474f8194bcb")?;
    let timestamp = "2025-09-06T10:10:10.000Z".to_string();
    let session_meta = SessionMeta {
        id: conversation_id,
        timestamp: timestamp.clone(),
        model_provider: None,
        ..SessionMeta::default()
    };

    // One session-meta line followed by a user message that carries an image.
    let lines = vec![
        RolloutLine {
            timestamp: timestamp.clone(),
            item: RolloutItem::SessionMeta(SessionMetaLine {
                meta: session_meta,
                git: None,
            }),
        },
        RolloutLine {
            timestamp: timestamp.clone(),
            item: RolloutItem::EventMsg(EventMsg::UserMessage(UserMessageEvent {
                message: "legacy image event".to_string(),
                images: Some(vec!["data:image/png;base64,abc123".to_string()]),
                local_images: Vec::new(),
                text_elements: Vec::new(),
            })),
        },
    ];

    // Serialize as JSONL: one JSON document per line.
    let mut contents = String::new();
    for line in lines {
        contents.push_str(&serde_json::to_string(&line)?);
        contents.push('\n');
    }
    fs::write(&path, contents)?;

    // Assert the boolean directly instead of comparing it against `true`.
    assert!(read_rollout_has_image_context(path.as_path()).await?);
    Ok(())
}
}

View File

@@ -72,6 +72,12 @@ pub enum ContentItem {
OutputText { text: String },
}
impl ContentItem {
pub fn is_input_image(&self) -> bool {
matches!(self, ContentItem::InputImage { .. })
}
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, JsonSchema, TS)]
#[serde(rename_all = "snake_case")]
/// Classifies an assistant message as interim commentary or final answer text.
@@ -195,6 +201,24 @@ pub enum ResponseItem {
Other,
}
impl ResponseItem {
    /// Returns `true` when this item carries at least one input image.
    ///
    /// Two variants are inspected: a `Message`, whose `content` is searched for an
    /// input-image item, and a `FunctionCallOutput`, whose structured content items
    /// (when `content_items()` yields any) are searched the same way. Every other
    /// variant returns `false`.
    pub fn has_input_image(&self) -> bool {
        match self {
            ResponseItem::Message { content, .. } => {
                content.iter().any(|part| part.is_input_image())
            }
            ResponseItem::FunctionCallOutput { output, .. } => output
                .content_items()
                .into_iter()
                .flatten()
                .any(|part| part.is_input_image()),
            _ => false,
        }
    }
}
pub const BASE_INSTRUCTIONS_DEFAULT: &str = include_str!("prompts/base_instructions/default.md");
/// Base instructions for the model in a thread. Corresponds to the `instructions` field in the ResponsesAPI.
@@ -795,6 +819,12 @@ pub enum FunctionCallOutputContentItem {
InputImage { image_url: String },
}
impl FunctionCallOutputContentItem {
pub fn is_input_image(&self) -> bool {
matches!(self, FunctionCallOutputContentItem::InputImage { .. })
}
}
/// Converts structured function-call output content into plain text for
/// human-readable surfaces.
///

View File

@@ -72,6 +72,35 @@ pub enum InputModality {
Image,
}
/// Bit set in a modality mask when [`InputModality::Text`] is present (bit 0).
pub const INPUT_MODALITY_TEXT_MASK: i64 = 1;
/// Bit set in a modality mask when [`InputModality::Image`] is present (bit 1).
pub const INPUT_MODALITY_IMAGE_MASK: i64 = 1 << 1;
pub fn input_modality_to_mask(modality: InputModality) -> i64 {
match modality {
InputModality::Text => INPUT_MODALITY_TEXT_MASK,
InputModality::Image => INPUT_MODALITY_IMAGE_MASK,
}
}
pub fn input_modalities_to_mask(modalities: &[InputModality]) -> i64 {
modalities
.iter()
.copied()
.map(input_modality_to_mask)
.fold(0, |mask, bit| mask | bit)
}
/// Expands a bitmask into the list of modalities whose bits are set.
///
/// The result is ordered text first, then image. Bits that do not correspond to
/// a known modality are ignored, and a mask with neither bit set yields an empty
/// list.
pub fn input_modalities_from_mask(mask: i64) -> Vec<InputModality> {
    const BIT_TABLE: [(i64, InputModality); 2] = [
        (INPUT_MODALITY_TEXT_MASK, InputModality::Text),
        (INPUT_MODALITY_IMAGE_MASK, InputModality::Image),
    ];

    BIT_TABLE
        .iter()
        .filter(|&&(bit, _)| mask & bit != 0)
        .map(|&(_, modality)| modality)
        .collect()
}
/// Backward-compatible default when `input_modalities` is omitted on the wire.
///
/// Legacy payloads predate modality metadata, so we conservatively assume both text and images are

View File

@@ -0,0 +1 @@
-- Bitmask of the input modalities observed in a thread's conversation.
-- The column is nullable: NULL means the modalities are unknown / not yet determined.
ALTER TABLE threads ADD COLUMN conversation_modalities INTEGER;

View File

@@ -76,12 +76,29 @@ fn apply_event_msg(metadata: &mut ThreadMetadata, event: &EventMsg) {
metadata.title = title.to_string();
}
}
if user
.images
.as_ref()
.is_some_and(|images| !images.is_empty())
|| !user.local_images.is_empty()
{
metadata.conversation_modalities = Some(
codex_protocol::openai_models::INPUT_MODALITY_TEXT_MASK
| codex_protocol::openai_models::INPUT_MODALITY_IMAGE_MASK,
);
}
}
_ => {}
}
}
fn apply_response_item(_metadata: &mut ThreadMetadata, _item: &ResponseItem) {
/// Folds a single response item into `metadata`.
///
/// An item that carries an input image marks the thread's conversation
/// modalities as text + image. Nothing else is taken from response items here:
/// title and first_user_message are derived from EventMsg::UserMessage only.
fn apply_response_item(metadata: &mut ThreadMetadata, item: &ResponseItem) {
    if !item.has_input_image() {
        return;
    }
    let text_and_image = codex_protocol::openai_models::INPUT_MODALITY_TEXT_MASK
        | codex_protocol::openai_models::INPUT_MODALITY_IMAGE_MASK;
    metadata.conversation_modalities = Some(text_and_image);
}
@@ -224,6 +241,7 @@ mod tests {
approval_mode: "on-request".to_string(),
tokens_used: 1,
first_user_message: None,
conversation_modalities: None,
archived_at: None,
git_sha: None,
git_branch: None,

View File

@@ -78,6 +78,8 @@ pub struct ThreadMetadata {
pub tokens_used: i64,
/// First user message observed for this thread, if any.
pub first_user_message: Option<String>,
/// Tri-state conversation modalities bitmask (NULL means unknown).
pub conversation_modalities: Option<i64>,
/// The archive timestamp, if the thread is archived.
pub archived_at: Option<DateTime<Utc>>,
/// The git commit SHA, if known.
@@ -119,6 +121,8 @@ pub struct ThreadMetadataBuilder {
pub git_branch: Option<String>,
/// The git origin URL, if known.
pub git_origin_url: Option<String>,
/// Tri-state conversation modalities bitmask, if known.
pub conversation_modalities: Option<i64>,
}
impl ThreadMetadataBuilder {
@@ -144,6 +148,9 @@ impl ThreadMetadataBuilder {
git_sha: None,
git_branch: None,
git_origin_url: None,
conversation_modalities: Some(
codex_protocol::openai_models::INPUT_MODALITY_TEXT_MASK,
),
}
}
@@ -174,6 +181,9 @@ impl ThreadMetadataBuilder {
approval_mode,
tokens_used: 0,
first_user_message: None,
conversation_modalities: self.conversation_modalities.or(Some(
codex_protocol::openai_models::INPUT_MODALITY_TEXT_MASK,
)),
archived_at: self.archived_at.map(canonicalize_datetime),
git_sha: self.git_sha.clone(),
git_branch: self.git_branch.clone(),
@@ -225,6 +235,9 @@ impl ThreadMetadata {
if self.first_user_message != other.first_user_message {
diffs.push("first_user_message");
}
if self.conversation_modalities != other.conversation_modalities {
diffs.push("conversation_modalities");
}
if self.archived_at != other.archived_at {
diffs.push("archived_at");
}
@@ -260,6 +273,7 @@ pub(crate) struct ThreadRow {
approval_mode: String,
tokens_used: i64,
first_user_message: String,
conversation_modalities: Option<i64>,
archived_at: Option<i64>,
git_sha: Option<String>,
git_branch: Option<String>,
@@ -282,6 +296,7 @@ impl ThreadRow {
approval_mode: row.try_get("approval_mode")?,
tokens_used: row.try_get("tokens_used")?,
first_user_message: row.try_get("first_user_message")?,
conversation_modalities: row.try_get("conversation_modalities")?,
archived_at: row.try_get("archived_at")?,
git_sha: row.try_get("git_sha")?,
git_branch: row.try_get("git_branch")?,
@@ -308,6 +323,7 @@ impl TryFrom<ThreadRow> for ThreadMetadata {
approval_mode,
tokens_used,
first_user_message,
conversation_modalities,
archived_at,
git_sha,
git_branch,
@@ -327,6 +343,7 @@ impl TryFrom<ThreadRow> for ThreadMetadata {
approval_mode,
tokens_used,
first_user_message: (!first_user_message.is_empty()).then_some(first_user_message),
conversation_modalities,
archived_at: archived_at.map(epoch_seconds_to_datetime).transpose()?,
git_sha,
git_branch,

View File

@@ -235,6 +235,7 @@ SELECT
approval_mode,
tokens_used,
first_user_message,
conversation_modalities,
archived_at,
git_sha,
git_branch,
@@ -334,6 +335,7 @@ SELECT
approval_mode,
tokens_used,
first_user_message,
conversation_modalities,
archived_at,
git_sha,
git_branch,
@@ -489,6 +491,7 @@ INSERT INTO threads (
approval_mode,
tokens_used,
first_user_message,
conversation_modalities,
archived,
archived_at,
git_sha,
@@ -508,6 +511,7 @@ ON CONFLICT(id) DO UPDATE SET
approval_mode = excluded.approval_mode,
tokens_used = excluded.tokens_used,
first_user_message = excluded.first_user_message,
conversation_modalities = excluded.conversation_modalities,
archived = excluded.archived,
archived_at = excluded.archived_at,
git_sha = excluded.git_sha,
@@ -528,6 +532,7 @@ ON CONFLICT(id) DO UPDATE SET
.bind(metadata.approval_mode.as_str())
.bind(metadata.tokens_used)
.bind(metadata.first_user_message.as_deref().unwrap_or_default())
.bind(metadata.conversation_modalities)
.bind(metadata.archived_at.is_some())
.bind(metadata.archived_at.map(datetime_to_epoch_seconds))
.bind(metadata.git_sha.as_deref())
@@ -538,6 +543,26 @@ ON CONFLICT(id) DO UPDATE SET
Ok(())
}
/// Update the persisted conversation modalities mask for a thread.
///
/// Overwrites the `conversation_modalities` column of the `threads` row whose
/// `id` equals `thread_id` (compared as its string form) with the given mask.
/// The previous value is replaced, not OR-ed with the new one.
///
/// The result of the statement is not inspected beyond success or failure, so
/// this returns `Ok(())` even when no row matched `thread_id`.
///
/// # Errors
///
/// Returns an error if executing the `UPDATE` statement fails.
pub async fn set_thread_conversation_modalities(
&self,
thread_id: ThreadId,
conversation_modalities: i64,
) -> anyhow::Result<()> {
sqlx::query(
r#"
UPDATE threads
SET conversation_modalities = ?
WHERE id = ?
"#,
)
// Bind order follows the `?` placeholders: the new mask first, then the row id.
.bind(conversation_modalities)
.bind(thread_id.to_string())
.execute(self.pool.as_ref())
.await?;
Ok(())
}
/// Persist dynamic tools for a thread if none have been stored yet.
///
/// Dynamic tools are defined at thread start and should not change afterward.