Add realtime silence tool (#18635)

## Summary

Adds a second realtime v2 function tool, `remain_silent`, so the
realtime model has an explicit non-speaking action when the
collaboration mode or latest context says it should not answer aloud.
This is stacked on #18597.

## Design

- Advertise `remain_silent` alongside `background_agent` in realtime v2
conversational sessions.
- Parse `remain_silent` function calls into a typed
`RealtimeEvent::NoopRequested` event.
- Have core answer that function call with an empty
`function_call_output` and deliberately avoid `response.create`, so no
follow-up realtime response is requested.
- Keep the event hidden from app-server/TUI surfaces; it is operational
plumbing, not user-visible conversation content.
This commit is contained in:
guinness-oai
2026-04-20 15:43:20 -07:00
committed by GitHub
parent a718b6fd47
commit 1029742cf7
10 changed files with 242 additions and 39 deletions

View File

@@ -1155,7 +1155,7 @@ async fn realtime_webrtc_start_emits_sdp_notification() -> Result<()> {
Some("multipart/form-data; boundary=codex-realtime-call-boundary")
);
let body = String::from_utf8(request.body).context("multipart body should be utf-8")?;
let session = r#"{"tool_choice":"auto","type":"realtime","model":"gpt-realtime-1.5","instructions":"backend prompt\n\nstartup context","output_modalities":["audio"],"audio":{"input":{"format":{"type":"audio/pcm","rate":24000},"noise_reduction":{"type":"near_field"},"transcription":{"model":"gpt-4o-mini-transcribe"},"turn_detection":{"type":"server_vad","interrupt_response":true,"create_response":true,"silence_duration_ms":500}},"output":{"format":{"type":"audio/pcm","rate":24000},"voice":"marin"}},"tools":[{"type":"function","name":"background_agent","description":"Send a user request to the background agent. Use this as the default action. Do not rephrase the user's ask or rewrite it in your own words; pass along the user's own words. If the background agent is idle, this starts a new task and returns the final result to the user. If the background agent is already working on a task, this sends the request as guidance to steer that previous task. If the user asks to do something next, later, after this, or once current work finishes, call this tool so the work is actually queued instead of merely promising to do it later.","parameters":{"type":"object","properties":{"prompt":{"type":"string","description":"The user request to delegate to the background agent."}},"required":["prompt"],"additionalProperties":false}}]}"#;
let session = r#"{"tool_choice":"auto","type":"realtime","model":"gpt-realtime-1.5","instructions":"backend prompt\n\nstartup context","output_modalities":["audio"],"audio":{"input":{"format":{"type":"audio/pcm","rate":24000},"noise_reduction":{"type":"near_field"},"transcription":{"model":"gpt-4o-mini-transcribe"},"turn_detection":{"type":"server_vad","interrupt_response":true,"create_response":true,"silence_duration_ms":500}},"output":{"format":{"type":"audio/pcm","rate":24000},"voice":"marin"}},"tools":[{"type":"function","name":"background_agent","description":"Send a user request to the background agent. Use this as the default action. Do not rephrase the user's ask or rewrite it in your own words; pass along the user's own words. If the background agent is idle, this starts a new task and returns the final result to the user. If the background agent is already working on a task, this sends the request as guidance to steer that previous task. If the user asks to do something next, later, after this, or once current work finishes, call this tool so the work is actually queued instead of merely promising to do it later.","parameters":{"type":"object","properties":{"prompt":{"type":"string","description":"The user request to delegate to the background agent."}},"required":["prompt"],"additionalProperties":false}},{"type":"function","name":"remain_silent","description":"Call this when the best response is to say nothing. Use it instead of speaking after hidden system/control messages, after background agent updates in silent modes, or whenever acknowledging aloud would be distracting. This tool has no user-visible effect.","parameters":{"type":"object","properties":{},"additionalProperties":false}}]}"#;
let session = normalized_json_string(session)?;
assert_eq!(
body,
@@ -2260,6 +2260,10 @@ fn assert_v2_session_update(request: &Value) -> Result<()> {
request["session"]["tools"][0]["name"].as_str(),
Some("background_agent")
);
assert_eq!(
request["session"]["tools"][1]["name"].as_str(),
Some("remain_silent")
);
assert_eq!(
request["session"]["audio"]["input"]["transcription"]["model"].as_str(),
Some("gpt-4o-mini-transcribe")