Trim compaction input (#10374)

Two fixes:

1. Include trailing tool output in the total context size calculation.
Otherwise when checking whether compaction should run we ignore newly
added outputs.
2. Trim trailing tool output/tool calls until we can fit the request
into the model context size. Otherwise the compaction endpoint will fail
to compact. We only trim items that can be reproduced again by the model
(tool calls, tool call outputs).
This commit is contained in:
pakrym-oai
2026-02-02 19:03:11 -08:00
committed by GitHub
parent 7e07ec8f73
commit cbfd2a37cc
6 changed files with 354 additions and 40 deletions

View File

@@ -1051,6 +1051,11 @@ impl Session {
state.get_total_token_usage(state.server_reasoning_included())
}
async fn get_estimated_token_count(&self, turn_context: &TurnContext) -> Option<i64> {
let state = self.state.lock().await;
state.history.estimate_token_count(turn_context)
}
pub(crate) async fn get_base_instructions(&self) -> BaseInstructions {
let state = self.state.lock().await;
BaseInstructions {
@@ -3310,6 +3315,7 @@ pub(crate) async fn run_turn(
let model_info = turn_context.client.get_model_info();
let auto_compact_limit = model_info.auto_compact_token_limit().unwrap_or(i64::MAX);
let total_usage_tokens = sess.get_total_token_usage().await;
let event = EventMsg::TurnStarted(TurnStartedEvent {
model_context_window: turn_context.client.get_model_context_window(),
collaboration_mode_kind: turn_context.collaboration_mode.mode,
@@ -3465,6 +3471,19 @@ pub(crate) async fn run_turn(
let total_usage_tokens = sess.get_total_token_usage().await;
let token_limit_reached = total_usage_tokens >= auto_compact_limit;
let estimated_token_count =
sess.get_estimated_token_count(turn_context.as_ref()).await;
info!(
turn_id = %turn_context.sub_id,
total_usage_tokens,
estimated_token_count = ?estimated_token_count,
auto_compact_limit,
token_limit_reached,
needs_follow_up,
"post sampling token usage"
);
// as long as compaction works well in getting us way below the token limit, we shouldn't worry about being in an infinite loop.
if token_limit_reached && needs_follow_up {
run_auto_compact(&sess, &turn_context).await;