feat: switch on dying sub-agents (#11477)

[codex-generated]

## Updated PR Description (Ready To Paste)

## Problem

When a sub-agent thread emits `ShutdownComplete`, the TUI switches back
to the primary thread.
That was also happening for user-requested exits (for example `Ctrl+C`),
which could prevent a
clean app exit and unexpectedly resurrect the main thread.

## Mental model

The app has one primary thread and one active thread. A non-primary
active thread shutting down
usually means "agent died, fail back to primary," but during
`ExitMode::ShutdownFirst` shutdown
means "the user is exiting," not "recover this session."

## Non-goals

No change to thread lifecycle, thread-manager ownership, or shutdown
protocol wire format.
No behavioral changes to non-shutdown events.

## Tradeoffs

This adds a small local marker (`pending_shutdown_exit_thread_id`)
instead of inferring intent
from event timing. It is deterministic and simple, but relies on
correctly setting and clearing
that marker around exit.

## Architecture

`App` tracks which thread is intentionally being shut down for exit.
`active_non_primary_shutdown_target` centralizes failover eligibility
for `ShutdownComplete` and
skips failover when shutdown matches the pending-exit thread.
`handle_active_thread_event` handles non-primary failover before generic
forwarding and clears the
pending-exit marker only when the matching active thread completes
shutdown.

## Observability

User-facing info/error messages continue to indicate whether failover to
the main thread succeeded.
The shutdown-intent path is now explicitly documented inline for easier
debugging.

## Tests

Added targeted tests for `active_non_primary_shutdown_target` covering
non-shutdown events,
primary-thread shutdown, non-primary shutdown failover, pending exit on
active thread (no failover),
and pending exit for another thread (still failover).

Validated with:
- `cargo test -p codex-tui` (pass)

---------

Co-authored-by: Josh McKinney <joshka@openai.com>
This commit is contained in:
jif-oai
2026-02-13 18:29:03 +00:00
committed by GitHub
parent 12f69b893f
commit 851fcc377b

View File

@@ -556,9 +556,21 @@ pub(crate) struct App {
/// Set when the user confirms an update; propagated on exit.
pub(crate) pending_update_action: Option<UpdateAction>,
/// Ignore the next ShutdownComplete event when we're intentionally
/// stopping a thread (e.g., before starting a new one).
/// One-shot guard used while switching threads.
///
/// We set this when intentionally stopping the current thread before moving
/// to another one, then ignore exactly one `ShutdownComplete` so it is not
/// misclassified as an unexpected sub-agent death.
suppress_shutdown_complete: bool,
/// Tracks the thread we intentionally shut down while exiting the app.
///
/// When this matches the active thread, its `ShutdownComplete` should lead to
/// process exit instead of being treated as an unexpected sub-agent death that
/// triggers failover to the primary thread.
///
/// This is thread-scoped state (`Option<ThreadId>`) instead of a global bool
/// so shutdown events from other threads still take the normal failover path.
pending_shutdown_exit_thread_id: Option<ThreadId>,
windows_sandbox: WindowsSandboxState,
@@ -933,6 +945,29 @@ impl App {
Ok(())
}
/// Returns `(closed_thread_id, primary_thread_id)` when a non-primary active
/// thread has died and we should fail over to the primary thread.
///
/// A user-requested shutdown (`ExitMode::ShutdownFirst`) sets
/// `pending_shutdown_exit_thread_id`; matching shutdown completions are ignored
/// here so Ctrl+C-like exits don't accidentally resurrect the main thread.
///
/// Failover is only eligible when all of these are true:
/// 1. the event is `ShutdownComplete`;
/// 2. the active thread differs from the primary thread;
/// 3. the active thread is not the pending shutdown-exit thread.
fn active_non_primary_shutdown_target(&self, msg: &EventMsg) -> Option<(ThreadId, ThreadId)> {
if !matches!(msg, EventMsg::ShutdownComplete) {
return None;
}
let active_thread_id = self.active_thread_id?;
let primary_thread_id = self.primary_thread_id?;
if self.pending_shutdown_exit_thread_id == Some(active_thread_id) {
return None;
}
(active_thread_id != primary_thread_id).then_some((active_thread_id, primary_thread_id))
}
fn replay_thread_snapshot(&mut self, snapshot: ThreadEventSnapshot) {
if let Some(event) = snapshot.session_configured {
self.handle_codex_event_replay(event);
@@ -1153,6 +1188,7 @@ impl App {
feedback_audience,
pending_update_action: None,
suppress_shutdown_complete: false,
pending_shutdown_exit_thread_id: None,
windows_sandbox: WindowsSandboxState::default(),
thread_event_channels: HashMap::new(),
active_thread_id: None,
@@ -1230,7 +1266,7 @@ impl App {
}
}, if app.active_thread_rx.is_some() => {
if let Some(event) = active {
app.handle_active_thread_event(tui, event)?;
app.handle_active_thread_event(tui, event).await?;
} else {
app.clear_active_thread().await;
}
@@ -1582,8 +1618,15 @@ impl App {
self.enqueue_primary_event(event).await?;
}
AppEvent::Exit(mode) => match mode {
ExitMode::ShutdownFirst => self.chat_widget.submit_op(Op::Shutdown),
ExitMode::ShutdownFirst => {
// Mark the thread we are explicitly shutting down for exit so
// its shutdown completion does not trigger agent failover.
self.pending_shutdown_exit_thread_id =
self.active_thread_id.or(self.chat_widget.thread_id());
self.chat_widget.submit_op(Op::Shutdown);
}
ExitMode::Immediate => {
self.pending_shutdown_exit_thread_id = None;
return Ok(AppRunControl::Exit(ExitReason::UserRequested));
}
},
@@ -2467,6 +2510,9 @@ impl App {
event.msg,
EventMsg::SessionConfigured(_) | EventMsg::TokenCount(_)
);
// This guard is only for intentional thread-switch shutdowns.
// App-exit shutdowns are tracked by `pending_shutdown_exit_thread_id`
// and resolved in `handle_active_thread_event`.
if self.suppress_shutdown_complete && matches!(event.msg, EventMsg::ShutdownComplete) {
self.suppress_shutdown_complete = false;
return;
@@ -2488,7 +2534,51 @@ impl App {
self.chat_widget.handle_codex_event_replay(event);
}
fn handle_active_thread_event(&mut self, tui: &mut tui::Tui, event: Event) -> Result<()> {
/// Handles an event emitted by the currently active thread.
///
/// This function enforces shutdown intent routing: unexpected non-primary
/// thread shutdowns fail over to the primary thread, while user-requested
/// app exits consume only the tracked shutdown completion and then proceed.
async fn handle_active_thread_event(&mut self, tui: &mut tui::Tui, event: Event) -> Result<()> {
// Capture this before any potential thread switch: we only want to clear
// the exit marker when the currently active thread acknowledges shutdown.
let pending_shutdown_exit_completed = matches!(&event.msg, EventMsg::ShutdownComplete)
&& self.pending_shutdown_exit_thread_id == self.active_thread_id;
// Processing order matters:
//
// 1. handle unexpected non-primary shutdown failover first;
// 2. clear pending exit marker for matching shutdown;
// 3. forward the event through normal handling.
//
// This preserves the mental model that user-requested exits do not trigger
// failover, while true sub-agent deaths still do.
if let Some((closed_thread_id, primary_thread_id)) =
self.active_non_primary_shutdown_target(&event.msg)
{
self.thread_event_channels.remove(&closed_thread_id);
self.select_agent_thread(tui, primary_thread_id).await?;
if self.active_thread_id == Some(primary_thread_id) {
self.chat_widget.add_info_message(
format!(
"Agent thread {closed_thread_id} closed. Switched back to main thread."
),
None,
);
} else {
self.clear_active_thread().await;
self.chat_widget.add_error_message(format!(
"Agent thread {closed_thread_id} closed. Failed to switch back to main thread {primary_thread_id}.",
));
}
return Ok(());
}
if pending_shutdown_exit_completed {
// Clear only after seeing the shutdown completion for the tracked
// thread, so unrelated shutdowns cannot consume this marker.
self.pending_shutdown_exit_thread_id = None;
}
self.handle_codex_event_now(event);
if self.backtrack_render_pending {
tui.frame_requester().schedule_frame();
@@ -2876,6 +2966,85 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn active_non_primary_shutdown_target_returns_none_for_non_shutdown_event() -> Result<()>
{
let mut app = make_test_app().await;
app.active_thread_id = Some(ThreadId::new());
app.primary_thread_id = Some(ThreadId::new());
assert_eq!(
app.active_non_primary_shutdown_target(&EventMsg::SkillsUpdateAvailable),
None
);
Ok(())
}
#[tokio::test]
async fn active_non_primary_shutdown_target_returns_none_for_primary_thread_shutdown()
-> Result<()> {
let mut app = make_test_app().await;
let thread_id = ThreadId::new();
app.active_thread_id = Some(thread_id);
app.primary_thread_id = Some(thread_id);
assert_eq!(
app.active_non_primary_shutdown_target(&EventMsg::ShutdownComplete),
None
);
Ok(())
}
#[tokio::test]
async fn active_non_primary_shutdown_target_returns_ids_for_non_primary_shutdown() -> Result<()>
{
let mut app = make_test_app().await;
let active_thread_id = ThreadId::new();
let primary_thread_id = ThreadId::new();
app.active_thread_id = Some(active_thread_id);
app.primary_thread_id = Some(primary_thread_id);
assert_eq!(
app.active_non_primary_shutdown_target(&EventMsg::ShutdownComplete),
Some((active_thread_id, primary_thread_id))
);
Ok(())
}
#[tokio::test]
async fn active_non_primary_shutdown_target_returns_none_when_shutdown_exit_is_pending()
-> Result<()> {
let mut app = make_test_app().await;
let active_thread_id = ThreadId::new();
let primary_thread_id = ThreadId::new();
app.active_thread_id = Some(active_thread_id);
app.primary_thread_id = Some(primary_thread_id);
app.pending_shutdown_exit_thread_id = Some(active_thread_id);
assert_eq!(
app.active_non_primary_shutdown_target(&EventMsg::ShutdownComplete),
None
);
Ok(())
}
#[tokio::test]
async fn active_non_primary_shutdown_target_still_switches_for_other_pending_exit_thread()
-> Result<()> {
let mut app = make_test_app().await;
let active_thread_id = ThreadId::new();
let primary_thread_id = ThreadId::new();
app.active_thread_id = Some(active_thread_id);
app.primary_thread_id = Some(primary_thread_id);
app.pending_shutdown_exit_thread_id = Some(ThreadId::new());
assert_eq!(
app.active_non_primary_shutdown_target(&EventMsg::ShutdownComplete),
Some((active_thread_id, primary_thread_id))
);
Ok(())
}
async fn make_test_app() -> App {
let (chat_widget, app_event_tx, _rx, _op_rx) = make_chatwidget_manual_with_sender().await;
let config = chat_widget.config_ref().clone();
@@ -2918,6 +3087,7 @@ mod tests {
feedback_audience: FeedbackAudience::External,
pending_update_action: None,
suppress_shutdown_complete: false,
pending_shutdown_exit_thread_id: None,
windows_sandbox: WindowsSandboxState::default(),
thread_event_channels: HashMap::new(),
active_thread_id: None,
@@ -2975,6 +3145,7 @@ mod tests {
feedback_audience: FeedbackAudience::External,
pending_update_action: None,
suppress_shutdown_complete: false,
pending_shutdown_exit_thread_id: None,
windows_sandbox: WindowsSandboxState::default(),
thread_event_channels: HashMap::new(),
active_thread_id: None,