mirror of
https://github.com/openai/codex.git
synced 2026-04-24 14:45:27 +00:00
feat: switch on dying sub-agents (#11477)
[codex-generated] ## Updated PR Description (Ready To Paste) ## Problem When a sub-agent thread emits `ShutdownComplete`, the TUI switches back to the primary thread. That was also happening for user-requested exits (for example `Ctrl+C`), which could prevent a clean app exit and unexpectedly resurrect the main thread. ## Mental model The app has one primary thread and one active thread. A non-primary active thread shutting down usually means "agent died, fail back to primary," but during `ExitMode::ShutdownFirst` shutdown means "the user is exiting," not "recover this session." ## Non-goals No change to thread lifecycle, thread-manager ownership, or shutdown protocol wire format. No behavioral changes to non-shutdown events. ## Tradeoffs This adds a small local marker (`pending_shutdown_exit_thread_id`) instead of inferring intent from event timing. It is deterministic and simple, but relies on correctly setting and clearing that marker around exit. ## Architecture `App` tracks which thread is intentionally being shut down for exit. `active_non_primary_shutdown_target` centralizes failover eligibility for `ShutdownComplete` and skips failover when shutdown matches the pending-exit thread. `handle_active_thread_event` handles non-primary failover before generic forwarding and clears the pending-exit marker only when the matching active thread completes shutdown. ## Observability User-facing info/error messages continue to indicate whether failover to the main thread succeeded. The shutdown-intent path is now explicitly documented inline for easier debugging. ## Tests Added targeted tests for `active_non_primary_shutdown_target` covering non-shutdown events, primary-thread shutdown, non-primary shutdown failover, pending exit on active thread (no failover), and pending exit for another thread (still failover). Validated with: - `cargo test -p codex-tui` (pass) --------- Co-authored-by: Josh McKinney <joshka@openai.com>
This commit is contained in:
@@ -556,9 +556,21 @@ pub(crate) struct App {
|
||||
/// Set when the user confirms an update; propagated on exit.
|
||||
pub(crate) pending_update_action: Option<UpdateAction>,
|
||||
|
||||
/// Ignore the next ShutdownComplete event when we're intentionally
|
||||
/// stopping a thread (e.g., before starting a new one).
|
||||
/// One-shot guard used while switching threads.
|
||||
///
|
||||
/// We set this when intentionally stopping the current thread before moving
|
||||
/// to another one, then ignore exactly one `ShutdownComplete` so it is not
|
||||
/// misclassified as an unexpected sub-agent death.
|
||||
suppress_shutdown_complete: bool,
|
||||
/// Tracks the thread we intentionally shut down while exiting the app.
|
||||
///
|
||||
/// When this matches the active thread, its `ShutdownComplete` should lead to
|
||||
/// process exit instead of being treated as an unexpected sub-agent death that
|
||||
/// triggers failover to the primary thread.
|
||||
///
|
||||
/// This is thread-scoped state (`Option<ThreadId>`) instead of a global bool
|
||||
/// so shutdown events from other threads still take the normal failover path.
|
||||
pending_shutdown_exit_thread_id: Option<ThreadId>,
|
||||
|
||||
windows_sandbox: WindowsSandboxState,
|
||||
|
||||
@@ -933,6 +945,29 @@ impl App {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns `(closed_thread_id, primary_thread_id)` when a non-primary active
|
||||
/// thread has died and we should fail over to the primary thread.
|
||||
///
|
||||
/// A user-requested shutdown (`ExitMode::ShutdownFirst`) sets
|
||||
/// `pending_shutdown_exit_thread_id`; matching shutdown completions are ignored
|
||||
/// here so Ctrl+C-like exits don't accidentally resurrect the main thread.
|
||||
///
|
||||
/// Failover is only eligible when all of these are true:
|
||||
/// 1. the event is `ShutdownComplete`;
|
||||
/// 2. the active thread differs from the primary thread;
|
||||
/// 3. the active thread is not the pending shutdown-exit thread.
|
||||
fn active_non_primary_shutdown_target(&self, msg: &EventMsg) -> Option<(ThreadId, ThreadId)> {
|
||||
if !matches!(msg, EventMsg::ShutdownComplete) {
|
||||
return None;
|
||||
}
|
||||
let active_thread_id = self.active_thread_id?;
|
||||
let primary_thread_id = self.primary_thread_id?;
|
||||
if self.pending_shutdown_exit_thread_id == Some(active_thread_id) {
|
||||
return None;
|
||||
}
|
||||
(active_thread_id != primary_thread_id).then_some((active_thread_id, primary_thread_id))
|
||||
}
|
||||
|
||||
fn replay_thread_snapshot(&mut self, snapshot: ThreadEventSnapshot) {
|
||||
if let Some(event) = snapshot.session_configured {
|
||||
self.handle_codex_event_replay(event);
|
||||
@@ -1153,6 +1188,7 @@ impl App {
|
||||
feedback_audience,
|
||||
pending_update_action: None,
|
||||
suppress_shutdown_complete: false,
|
||||
pending_shutdown_exit_thread_id: None,
|
||||
windows_sandbox: WindowsSandboxState::default(),
|
||||
thread_event_channels: HashMap::new(),
|
||||
active_thread_id: None,
|
||||
@@ -1230,7 +1266,7 @@ impl App {
|
||||
}
|
||||
}, if app.active_thread_rx.is_some() => {
|
||||
if let Some(event) = active {
|
||||
app.handle_active_thread_event(tui, event)?;
|
||||
app.handle_active_thread_event(tui, event).await?;
|
||||
} else {
|
||||
app.clear_active_thread().await;
|
||||
}
|
||||
@@ -1582,8 +1618,15 @@ impl App {
|
||||
self.enqueue_primary_event(event).await?;
|
||||
}
|
||||
AppEvent::Exit(mode) => match mode {
|
||||
ExitMode::ShutdownFirst => self.chat_widget.submit_op(Op::Shutdown),
|
||||
ExitMode::ShutdownFirst => {
|
||||
// Mark the thread we are explicitly shutting down for exit so
|
||||
// its shutdown completion does not trigger agent failover.
|
||||
self.pending_shutdown_exit_thread_id =
|
||||
self.active_thread_id.or(self.chat_widget.thread_id());
|
||||
self.chat_widget.submit_op(Op::Shutdown);
|
||||
}
|
||||
ExitMode::Immediate => {
|
||||
self.pending_shutdown_exit_thread_id = None;
|
||||
return Ok(AppRunControl::Exit(ExitReason::UserRequested));
|
||||
}
|
||||
},
|
||||
@@ -2467,6 +2510,9 @@ impl App {
|
||||
event.msg,
|
||||
EventMsg::SessionConfigured(_) | EventMsg::TokenCount(_)
|
||||
);
|
||||
// This guard is only for intentional thread-switch shutdowns.
|
||||
// App-exit shutdowns are tracked by `pending_shutdown_exit_thread_id`
|
||||
// and resolved in `handle_active_thread_event`.
|
||||
if self.suppress_shutdown_complete && matches!(event.msg, EventMsg::ShutdownComplete) {
|
||||
self.suppress_shutdown_complete = false;
|
||||
return;
|
||||
@@ -2488,7 +2534,51 @@ impl App {
|
||||
self.chat_widget.handle_codex_event_replay(event);
|
||||
}
|
||||
|
||||
fn handle_active_thread_event(&mut self, tui: &mut tui::Tui, event: Event) -> Result<()> {
|
||||
/// Handles an event emitted by the currently active thread.
|
||||
///
|
||||
/// This function enforces shutdown intent routing: unexpected non-primary
|
||||
/// thread shutdowns fail over to the primary thread, while user-requested
|
||||
/// app exits consume only the tracked shutdown completion and then proceed.
|
||||
async fn handle_active_thread_event(&mut self, tui: &mut tui::Tui, event: Event) -> Result<()> {
|
||||
// Capture this before any potential thread switch: we only want to clear
|
||||
// the exit marker when the currently active thread acknowledges shutdown.
|
||||
let pending_shutdown_exit_completed = matches!(&event.msg, EventMsg::ShutdownComplete)
|
||||
&& self.pending_shutdown_exit_thread_id == self.active_thread_id;
|
||||
|
||||
// Processing order matters:
|
||||
//
|
||||
// 1. handle unexpected non-primary shutdown failover first;
|
||||
// 2. clear pending exit marker for matching shutdown;
|
||||
// 3. forward the event through normal handling.
|
||||
//
|
||||
// This preserves the mental model that user-requested exits do not trigger
|
||||
// failover, while true sub-agent deaths still do.
|
||||
if let Some((closed_thread_id, primary_thread_id)) =
|
||||
self.active_non_primary_shutdown_target(&event.msg)
|
||||
{
|
||||
self.thread_event_channels.remove(&closed_thread_id);
|
||||
self.select_agent_thread(tui, primary_thread_id).await?;
|
||||
if self.active_thread_id == Some(primary_thread_id) {
|
||||
self.chat_widget.add_info_message(
|
||||
format!(
|
||||
"Agent thread {closed_thread_id} closed. Switched back to main thread."
|
||||
),
|
||||
None,
|
||||
);
|
||||
} else {
|
||||
self.clear_active_thread().await;
|
||||
self.chat_widget.add_error_message(format!(
|
||||
"Agent thread {closed_thread_id} closed. Failed to switch back to main thread {primary_thread_id}.",
|
||||
));
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if pending_shutdown_exit_completed {
|
||||
// Clear only after seeing the shutdown completion for the tracked
|
||||
// thread, so unrelated shutdowns cannot consume this marker.
|
||||
self.pending_shutdown_exit_thread_id = None;
|
||||
}
|
||||
self.handle_codex_event_now(event);
|
||||
if self.backtrack_render_pending {
|
||||
tui.frame_requester().schedule_frame();
|
||||
@@ -2876,6 +2966,85 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn active_non_primary_shutdown_target_returns_none_for_non_shutdown_event() -> Result<()>
|
||||
{
|
||||
let mut app = make_test_app().await;
|
||||
app.active_thread_id = Some(ThreadId::new());
|
||||
app.primary_thread_id = Some(ThreadId::new());
|
||||
|
||||
assert_eq!(
|
||||
app.active_non_primary_shutdown_target(&EventMsg::SkillsUpdateAvailable),
|
||||
None
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn active_non_primary_shutdown_target_returns_none_for_primary_thread_shutdown()
|
||||
-> Result<()> {
|
||||
let mut app = make_test_app().await;
|
||||
let thread_id = ThreadId::new();
|
||||
app.active_thread_id = Some(thread_id);
|
||||
app.primary_thread_id = Some(thread_id);
|
||||
|
||||
assert_eq!(
|
||||
app.active_non_primary_shutdown_target(&EventMsg::ShutdownComplete),
|
||||
None
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn active_non_primary_shutdown_target_returns_ids_for_non_primary_shutdown() -> Result<()>
|
||||
{
|
||||
let mut app = make_test_app().await;
|
||||
let active_thread_id = ThreadId::new();
|
||||
let primary_thread_id = ThreadId::new();
|
||||
app.active_thread_id = Some(active_thread_id);
|
||||
app.primary_thread_id = Some(primary_thread_id);
|
||||
|
||||
assert_eq!(
|
||||
app.active_non_primary_shutdown_target(&EventMsg::ShutdownComplete),
|
||||
Some((active_thread_id, primary_thread_id))
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn active_non_primary_shutdown_target_returns_none_when_shutdown_exit_is_pending()
|
||||
-> Result<()> {
|
||||
let mut app = make_test_app().await;
|
||||
let active_thread_id = ThreadId::new();
|
||||
let primary_thread_id = ThreadId::new();
|
||||
app.active_thread_id = Some(active_thread_id);
|
||||
app.primary_thread_id = Some(primary_thread_id);
|
||||
app.pending_shutdown_exit_thread_id = Some(active_thread_id);
|
||||
|
||||
assert_eq!(
|
||||
app.active_non_primary_shutdown_target(&EventMsg::ShutdownComplete),
|
||||
None
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn active_non_primary_shutdown_target_still_switches_for_other_pending_exit_thread()
|
||||
-> Result<()> {
|
||||
let mut app = make_test_app().await;
|
||||
let active_thread_id = ThreadId::new();
|
||||
let primary_thread_id = ThreadId::new();
|
||||
app.active_thread_id = Some(active_thread_id);
|
||||
app.primary_thread_id = Some(primary_thread_id);
|
||||
app.pending_shutdown_exit_thread_id = Some(ThreadId::new());
|
||||
|
||||
assert_eq!(
|
||||
app.active_non_primary_shutdown_target(&EventMsg::ShutdownComplete),
|
||||
Some((active_thread_id, primary_thread_id))
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn make_test_app() -> App {
|
||||
let (chat_widget, app_event_tx, _rx, _op_rx) = make_chatwidget_manual_with_sender().await;
|
||||
let config = chat_widget.config_ref().clone();
|
||||
@@ -2918,6 +3087,7 @@ mod tests {
|
||||
feedback_audience: FeedbackAudience::External,
|
||||
pending_update_action: None,
|
||||
suppress_shutdown_complete: false,
|
||||
pending_shutdown_exit_thread_id: None,
|
||||
windows_sandbox: WindowsSandboxState::default(),
|
||||
thread_event_channels: HashMap::new(),
|
||||
active_thread_id: None,
|
||||
@@ -2975,6 +3145,7 @@ mod tests {
|
||||
feedback_audience: FeedbackAudience::External,
|
||||
pending_update_action: None,
|
||||
suppress_shutdown_complete: false,
|
||||
pending_shutdown_exit_thread_id: None,
|
||||
windows_sandbox: WindowsSandboxState::default(),
|
||||
thread_event_channels: HashMap::new(),
|
||||
active_thread_id: None,
|
||||
|
||||
Reference in New Issue
Block a user