mirror of
https://github.com/openai/codex.git
synced 2026-04-21 21:24:51 +00:00
Compare commits
4 Commits
dev/honor-
...
codex/real
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3f8b7a987d | ||
|
|
a5420779c4 | ||
|
|
159bda93c6 | ||
|
|
f13917d50e |
@@ -5188,7 +5188,7 @@ impl CodexMessageProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
if !thread.enabled(Feature::RealtimeConversation) {
|
||||
if !thread.realtime_voice_enabled() {
|
||||
self.send_invalid_request_error(
|
||||
request_id,
|
||||
format!("thread {thread_id} does not support realtime conversation"),
|
||||
|
||||
@@ -37,6 +37,13 @@ use tokio::time::timeout;
|
||||
|
||||
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
enum RealtimeFeatureGate {
|
||||
RealtimeV2,
|
||||
RealtimeConversation,
|
||||
Disabled,
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
@@ -50,7 +57,7 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
|
||||
vec![],
|
||||
vec![
|
||||
json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 24_000,
|
||||
"channels": 1,
|
||||
@@ -77,7 +84,7 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
|
||||
codex_home.path(),
|
||||
&responses_server.uri(),
|
||||
realtime_server.uri(),
|
||||
true,
|
||||
RealtimeFeatureGate::RealtimeV2,
|
||||
)?;
|
||||
|
||||
let mut mcp = McpProcess::new(codex_home.path()).await?;
|
||||
@@ -178,6 +185,11 @@ async fn realtime_conversation_streams_v2_notifications() -> Result<()> {
|
||||
let connections = realtime_server.connections();
|
||||
assert_eq!(connections.len(), 1);
|
||||
let connection = &connections[0];
|
||||
assert_eq!(connection[0].body_json()["type"], json!("session.update"));
|
||||
assert_eq!(
|
||||
connection[0].body_json()["session"]["type"],
|
||||
json!("realtime")
|
||||
);
|
||||
assert_eq!(connection.len(), 3);
|
||||
assert_eq!(
|
||||
connection[0].body_json()["type"].as_str(),
|
||||
@@ -225,7 +237,7 @@ async fn realtime_conversation_stop_emits_closed_notification() -> Result<()> {
|
||||
codex_home.path(),
|
||||
&responses_server.uri(),
|
||||
realtime_server.uri(),
|
||||
true,
|
||||
RealtimeFeatureGate::RealtimeV2,
|
||||
)?;
|
||||
|
||||
let mut mcp = McpProcess::new(codex_home.path()).await?;
|
||||
@@ -297,7 +309,7 @@ async fn realtime_conversation_requires_feature_flag() -> Result<()> {
|
||||
codex_home.path(),
|
||||
&responses_server.uri(),
|
||||
realtime_server.uri(),
|
||||
false,
|
||||
RealtimeFeatureGate::Disabled,
|
||||
)?;
|
||||
|
||||
let mut mcp = McpProcess::new(codex_home.path()).await?;
|
||||
@@ -337,6 +349,74 @@ async fn realtime_conversation_requires_feature_flag() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn realtime_conversation_accepts_legacy_feature_flag() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let responses_server = create_mock_responses_server_sequence_unchecked(Vec::new()).await;
|
||||
let realtime_server = start_websocket_server(vec![vec![vec![json!({
|
||||
"type": "session.updated",
|
||||
"session": { "id": "sess_backend", "instructions": "backend prompt" }
|
||||
})]]])
|
||||
.await;
|
||||
|
||||
let codex_home = TempDir::new()?;
|
||||
create_config_toml(
|
||||
codex_home.path(),
|
||||
&responses_server.uri(),
|
||||
realtime_server.uri(),
|
||||
RealtimeFeatureGate::RealtimeConversation,
|
||||
)?;
|
||||
|
||||
let mut mcp = McpProcess::new(codex_home.path()).await?;
|
||||
mcp.initialize().await?;
|
||||
login_with_api_key(&mut mcp, "sk-test-key").await?;
|
||||
|
||||
let thread_start_request_id = mcp
|
||||
.send_thread_start_request(ThreadStartParams::default())
|
||||
.await?;
|
||||
let thread_start_response: JSONRPCResponse = timeout(
|
||||
DEFAULT_TIMEOUT,
|
||||
mcp.read_stream_until_response_message(RequestId::Integer(thread_start_request_id)),
|
||||
)
|
||||
.await??;
|
||||
let thread_start: ThreadStartResponse = to_response(thread_start_response)?;
|
||||
|
||||
let start_request_id = mcp
|
||||
.send_thread_realtime_start_request(ThreadRealtimeStartParams {
|
||||
thread_id: thread_start.thread.id.clone(),
|
||||
prompt: "backend prompt".to_string(),
|
||||
session_id: None,
|
||||
})
|
||||
.await?;
|
||||
let start_response: JSONRPCResponse = timeout(
|
||||
DEFAULT_TIMEOUT,
|
||||
mcp.read_stream_until_response_message(RequestId::Integer(start_request_id)),
|
||||
)
|
||||
.await??;
|
||||
let _: ThreadRealtimeStartResponse = to_response(start_response)?;
|
||||
let started =
|
||||
read_notification::<ThreadRealtimeStartedNotification>(&mut mcp, "thread/realtime/started")
|
||||
.await?;
|
||||
assert_eq!(started.thread_id, thread_start.thread.id);
|
||||
|
||||
let connections = realtime_server.connections();
|
||||
assert_eq!(connections.len(), 1);
|
||||
let connection = &connections[0];
|
||||
assert_eq!(connection[0].body_json()["type"], json!("session.update"));
|
||||
assert_eq!(
|
||||
connection[0].body_json()["session"]["type"],
|
||||
json!("quicksilver")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_server.handshakes()[0].uri(),
|
||||
"/v1/realtime?intent=quicksilver"
|
||||
);
|
||||
|
||||
realtime_server.shutdown().await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn read_notification<T: DeserializeOwned>(mcp: &mut McpProcess, method: &str) -> Result<T> {
|
||||
let notification = timeout(
|
||||
DEFAULT_TIMEOUT,
|
||||
@@ -366,13 +446,31 @@ fn create_config_toml(
|
||||
codex_home: &Path,
|
||||
responses_server_uri: &str,
|
||||
realtime_server_uri: &str,
|
||||
realtime_enabled: bool,
|
||||
realtime_feature_gate: RealtimeFeatureGate,
|
||||
) -> std::io::Result<()> {
|
||||
let realtime_feature_key = FEATURES
|
||||
let realtime_v2_feature_key = FEATURES
|
||||
.iter()
|
||||
.find(|spec| spec.id == Feature::RealtimeV2)
|
||||
.map(|spec| spec.key)
|
||||
.unwrap_or("realtime_v2");
|
||||
let realtime_conversation_feature_key = FEATURES
|
||||
.iter()
|
||||
.find(|spec| spec.id == Feature::RealtimeConversation)
|
||||
.map(|spec| spec.key)
|
||||
.unwrap_or("realtime_conversation");
|
||||
let feature_toggles = match realtime_feature_gate {
|
||||
RealtimeFeatureGate::RealtimeV2 => {
|
||||
format!("{realtime_v2_feature_key} = true\n{realtime_conversation_feature_key} = false")
|
||||
}
|
||||
RealtimeFeatureGate::RealtimeConversation => {
|
||||
format!("{realtime_v2_feature_key} = false\n{realtime_conversation_feature_key} = true")
|
||||
}
|
||||
RealtimeFeatureGate::Disabled => {
|
||||
format!(
|
||||
"{realtime_v2_feature_key} = false\n{realtime_conversation_feature_key} = false"
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
std::fs::write(
|
||||
codex_home.join("config.toml"),
|
||||
@@ -385,7 +483,7 @@ model_provider = "mock_provider"
|
||||
experimental_realtime_ws_base_url = "{realtime_server_uri}"
|
||||
|
||||
[features]
|
||||
{realtime_feature_key} = {realtime_enabled}
|
||||
{feature_toggles}
|
||||
|
||||
[model_providers.mock_provider]
|
||||
name = "Mock provider for test"
|
||||
|
||||
@@ -1,14 +1,26 @@
|
||||
use crate::endpoint::realtime_websocket::protocol::ConversationItem;
|
||||
use crate::endpoint::realtime_websocket::protocol::ConversationItemContent;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeApiMode;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeAudioFrame;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeEvent;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeOutboundMessage;
|
||||
use crate::endpoint::realtime_websocket::protocol::RealtimeSessionConfig;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudio;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioFormat;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioInput;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutput;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioInputV1;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioInputV2;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputFormat;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputV1;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioOutputV2;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioV1;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionAudioV2;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionTool;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionToolParameters;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionToolProperties;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionToolProperty;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionTurnDetection;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSession;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSessionV1;
|
||||
use crate::endpoint::realtime_websocket::protocol::SessionUpdateSessionV2;
|
||||
use crate::endpoint::realtime_websocket::protocol::parse_realtime_event;
|
||||
use crate::error::ApiError;
|
||||
use crate::provider::Provider;
|
||||
@@ -17,6 +29,7 @@ use futures::SinkExt;
|
||||
use futures::StreamExt;
|
||||
use http::HeaderMap;
|
||||
use http::HeaderValue;
|
||||
use serde_json::json;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
@@ -193,12 +206,14 @@ pub struct RealtimeWebsocketConnection {
|
||||
pub struct RealtimeWebsocketWriter {
|
||||
stream: Arc<WsStream>,
|
||||
is_closed: Arc<AtomicBool>,
|
||||
mode: RealtimeApiMode,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RealtimeWebsocketEvents {
|
||||
rx_message: Arc<Mutex<mpsc::UnboundedReceiver<Result<Message, WsError>>>>,
|
||||
is_closed: Arc<AtomicBool>,
|
||||
mode: RealtimeApiMode,
|
||||
}
|
||||
|
||||
impl RealtimeWebsocketConnection {
|
||||
@@ -220,6 +235,20 @@ impl RealtimeWebsocketConnection {
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn send_function_call_output(
|
||||
&self,
|
||||
call_id: String,
|
||||
output_text: String,
|
||||
) -> Result<(), ApiError> {
|
||||
self.writer
|
||||
.send_function_call_output(call_id, output_text)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn send_response_create(&self) -> Result<(), ApiError> {
|
||||
self.writer.send_response_create().await
|
||||
}
|
||||
|
||||
pub async fn close(&self) -> Result<(), ApiError> {
|
||||
self.writer.close().await
|
||||
}
|
||||
@@ -239,6 +268,7 @@ impl RealtimeWebsocketConnection {
|
||||
fn new(
|
||||
stream: WsStream,
|
||||
rx_message: mpsc::UnboundedReceiver<Result<Message, WsError>>,
|
||||
mode: RealtimeApiMode,
|
||||
) -> Self {
|
||||
let stream = Arc::new(stream);
|
||||
let is_closed = Arc::new(AtomicBool::new(false));
|
||||
@@ -246,10 +276,12 @@ impl RealtimeWebsocketConnection {
|
||||
writer: RealtimeWebsocketWriter {
|
||||
stream: Arc::clone(&stream),
|
||||
is_closed: Arc::clone(&is_closed),
|
||||
mode,
|
||||
},
|
||||
events: RealtimeWebsocketEvents {
|
||||
rx_message: Arc::new(Mutex::new(rx_message)),
|
||||
is_closed,
|
||||
mode,
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -262,14 +294,14 @@ impl RealtimeWebsocketWriter {
|
||||
}
|
||||
|
||||
pub async fn send_conversation_item_create(&self, text: String) -> Result<(), ApiError> {
|
||||
let kind = match self.mode {
|
||||
RealtimeApiMode::V1 => "text".to_string(),
|
||||
RealtimeApiMode::V2 => "input_text".to_string(),
|
||||
};
|
||||
self.send_json(RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem {
|
||||
kind: "message".to_string(),
|
||||
item: ConversationItem::Message {
|
||||
role: "user".to_string(),
|
||||
content: vec![ConversationItemContent {
|
||||
kind: "text".to_string(),
|
||||
text,
|
||||
}],
|
||||
content: vec![ConversationItemContent { kind, text }],
|
||||
},
|
||||
})
|
||||
.await
|
||||
@@ -280,32 +312,123 @@ impl RealtimeWebsocketWriter {
|
||||
handoff_id: String,
|
||||
output_text: String,
|
||||
) -> Result<(), ApiError> {
|
||||
self.send_json(RealtimeOutboundMessage::ConversationHandoffAppend {
|
||||
handoff_id,
|
||||
output_text,
|
||||
})
|
||||
.await
|
||||
match self.mode {
|
||||
RealtimeApiMode::V1 => {
|
||||
self.send_json(RealtimeOutboundMessage::ConversationHandoffAppend {
|
||||
handoff_id,
|
||||
output_text,
|
||||
})
|
||||
.await
|
||||
}
|
||||
RealtimeApiMode::V2 => {
|
||||
self.send_json(RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem::Message {
|
||||
role: "assistant".to_string(),
|
||||
content: vec![ConversationItemContent {
|
||||
kind: "output_text".to_string(),
|
||||
text: output_text,
|
||||
}],
|
||||
},
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn send_function_call_output(
|
||||
&self,
|
||||
call_id: String,
|
||||
output_text: String,
|
||||
) -> Result<(), ApiError> {
|
||||
match self.mode {
|
||||
RealtimeApiMode::V1 => Ok(()),
|
||||
RealtimeApiMode::V2 => {
|
||||
let output = json!({
|
||||
"content": output_text,
|
||||
})
|
||||
.to_string();
|
||||
self.send_json(RealtimeOutboundMessage::ConversationItemCreate {
|
||||
item: ConversationItem::FunctionCallOutput { call_id, output },
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn send_response_create(&self) -> Result<(), ApiError> {
|
||||
match self.mode {
|
||||
RealtimeApiMode::V1 => Ok(()),
|
||||
RealtimeApiMode::V2 => {
|
||||
self.send_json(RealtimeOutboundMessage::ResponseCreate)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn send_session_update(&self, instructions: String) -> Result<(), ApiError> {
|
||||
self.send_json(RealtimeOutboundMessage::SessionUpdate {
|
||||
session: SessionUpdateSession {
|
||||
let session = match self.mode {
|
||||
RealtimeApiMode::V1 => SessionUpdateSession::V1(SessionUpdateSessionV1 {
|
||||
kind: "quicksilver".to_string(),
|
||||
instructions,
|
||||
audio: SessionAudio {
|
||||
input: SessionAudioInput {
|
||||
audio: SessionAudioV1 {
|
||||
input: SessionAudioInputV1 {
|
||||
format: SessionAudioFormat {
|
||||
kind: "audio/pcm".to_string(),
|
||||
rate: 24_000,
|
||||
},
|
||||
},
|
||||
output: SessionAudioOutput {
|
||||
output: SessionAudioOutputV1 {
|
||||
voice: "mundo".to_string(),
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
.await
|
||||
}),
|
||||
RealtimeApiMode::V2 => SessionUpdateSession::V2(SessionUpdateSessionV2 {
|
||||
kind: "realtime".to_string(),
|
||||
instructions,
|
||||
output_modalities: vec!["audio".to_string()],
|
||||
audio: SessionAudioV2 {
|
||||
input: SessionAudioInputV2 {
|
||||
format: SessionAudioFormat {
|
||||
kind: "audio/pcm".to_string(),
|
||||
rate: 24_000,
|
||||
},
|
||||
turn_detection: SessionTurnDetection {
|
||||
kind: "semantic_vad".to_string(),
|
||||
interrupt_response: false,
|
||||
create_response: true,
|
||||
},
|
||||
},
|
||||
output: SessionAudioOutputV2 {
|
||||
format: SessionAudioOutputFormat {
|
||||
kind: "audio/pcm".to_string(),
|
||||
rate: 24_000,
|
||||
},
|
||||
voice: "marin".to_string(),
|
||||
},
|
||||
},
|
||||
tools: vec![SessionTool {
|
||||
kind: "function".to_string(),
|
||||
name: "codex".to_string(),
|
||||
description:
|
||||
"Delegate a request to Codex and return the final result to the user."
|
||||
.to_string(),
|
||||
parameters: SessionToolParameters {
|
||||
kind: "object".to_string(),
|
||||
properties: SessionToolProperties {
|
||||
prompt: SessionToolProperty {
|
||||
kind: "string".to_string(),
|
||||
description: "The user request to delegate to Codex.".to_string(),
|
||||
},
|
||||
},
|
||||
required: vec!["prompt".to_string()],
|
||||
},
|
||||
}],
|
||||
tool_choice: "auto".to_string(),
|
||||
}),
|
||||
};
|
||||
|
||||
self.send_json(RealtimeOutboundMessage::SessionUpdate { session })
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn close(&self) -> Result<(), ApiError> {
|
||||
@@ -342,6 +465,10 @@ impl RealtimeWebsocketWriter {
|
||||
}
|
||||
|
||||
impl RealtimeWebsocketEvents {
|
||||
fn mode(&self) -> RealtimeApiMode {
|
||||
self.mode
|
||||
}
|
||||
|
||||
pub async fn next_event(&self) -> Result<Option<RealtimeEvent>, ApiError> {
|
||||
if self.is_closed.load(Ordering::SeqCst) {
|
||||
return Ok(None);
|
||||
@@ -366,7 +493,7 @@ impl RealtimeWebsocketEvents {
|
||||
|
||||
match msg {
|
||||
Message::Text(text) => {
|
||||
if let Some(event) = parse_realtime_event(&text) {
|
||||
if let Some(event) = parse_realtime_event(&text, self.mode()) {
|
||||
debug!(?event, "realtime websocket parsed event");
|
||||
return Ok(Some(event));
|
||||
}
|
||||
@@ -412,6 +539,7 @@ impl RealtimeWebsocketClient {
|
||||
self.provider.base_url.as_str(),
|
||||
self.provider.query_params.as_ref(),
|
||||
config.model.as_deref(),
|
||||
config.mode,
|
||||
)?;
|
||||
|
||||
let mut request = ws_url
|
||||
@@ -439,7 +567,7 @@ impl RealtimeWebsocketClient {
|
||||
);
|
||||
|
||||
let (stream, rx_message) = WsStream::new(stream);
|
||||
let connection = RealtimeWebsocketConnection::new(stream, rx_message);
|
||||
let connection = RealtimeWebsocketConnection::new(stream, rx_message, config.mode);
|
||||
debug!(
|
||||
session_id = config.session_id.as_deref().unwrap_or("<none>"),
|
||||
"realtime websocket sending session.update"
|
||||
@@ -491,6 +619,7 @@ fn websocket_url_from_api_url(
|
||||
api_url: &str,
|
||||
query_params: Option<&HashMap<String, String>>,
|
||||
model: Option<&str>,
|
||||
mode: RealtimeApiMode,
|
||||
) -> Result<Url, ApiError> {
|
||||
let mut url = Url::parse(api_url)
|
||||
.map_err(|err| ApiError::Stream(format!("failed to parse realtime api_url: {err}")))?;
|
||||
@@ -512,13 +641,17 @@ fn websocket_url_from_api_url(
|
||||
|
||||
{
|
||||
let mut query = url.query_pairs_mut();
|
||||
query.append_pair("intent", "quicksilver");
|
||||
if mode == RealtimeApiMode::V1 {
|
||||
query.append_pair("intent", "quicksilver");
|
||||
}
|
||||
if let Some(model) = model {
|
||||
query.append_pair("model", model);
|
||||
}
|
||||
if let Some(query_params) = query_params {
|
||||
for (key, value) in query_params {
|
||||
if key == "intent" || (key == "model" && model.is_some()) {
|
||||
if (key == "model" && model.is_some())
|
||||
|| (key == "intent" && mode == RealtimeApiMode::V1)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
query.append_pair(key, value);
|
||||
@@ -579,7 +712,7 @@ mod tests {
|
||||
.to_string();
|
||||
|
||||
assert_eq!(
|
||||
parse_realtime_event(payload.as_str()),
|
||||
parse_realtime_event(payload.as_str(), RealtimeApiMode::V2),
|
||||
Some(RealtimeEvent::SessionUpdated {
|
||||
session_id: "sess_123".to_string(),
|
||||
instructions: Some("backend prompt".to_string()),
|
||||
@@ -590,7 +723,7 @@ mod tests {
|
||||
#[test]
|
||||
fn parse_audio_delta_event() {
|
||||
let payload = json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AAA=",
|
||||
"sample_rate": 48000,
|
||||
"channels": 1,
|
||||
@@ -598,7 +731,7 @@ mod tests {
|
||||
})
|
||||
.to_string();
|
||||
assert_eq!(
|
||||
parse_realtime_event(payload.as_str()),
|
||||
parse_realtime_event(payload.as_str(), RealtimeApiMode::V2),
|
||||
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
|
||||
data: "AAA=".to_string(),
|
||||
sample_rate: 48000,
|
||||
@@ -608,6 +741,25 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_audio_delta_event_defaults_audio_shape() {
|
||||
let payload = json!({
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AAA="
|
||||
})
|
||||
.to_string();
|
||||
|
||||
assert_eq!(
|
||||
parse_realtime_event(payload.as_str(), RealtimeApiMode::V2),
|
||||
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
|
||||
data: "AAA=".to_string(),
|
||||
sample_rate: 24_000,
|
||||
num_channels: 1,
|
||||
samples_per_channel: None,
|
||||
}))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_conversation_item_added_event() {
|
||||
let payload = json!({
|
||||
@@ -616,7 +768,7 @@ mod tests {
|
||||
})
|
||||
.to_string();
|
||||
assert_eq!(
|
||||
parse_realtime_event(payload.as_str()),
|
||||
parse_realtime_event(payload.as_str(), RealtimeApiMode::V2),
|
||||
Some(RealtimeEvent::ConversationItemAdded(
|
||||
json!({"type": "message", "seq": 7})
|
||||
))
|
||||
@@ -631,7 +783,7 @@ mod tests {
|
||||
})
|
||||
.to_string();
|
||||
assert_eq!(
|
||||
parse_realtime_event(payload.as_str()),
|
||||
parse_realtime_event(payload.as_str(), RealtimeApiMode::V2),
|
||||
Some(RealtimeEvent::ConversationItemDone {
|
||||
item_id: "item_123".to_string(),
|
||||
})
|
||||
@@ -641,18 +793,23 @@ mod tests {
|
||||
#[test]
|
||||
fn parse_handoff_requested_event() {
|
||||
let payload = json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_123",
|
||||
"item_id": "item_123",
|
||||
"input_transcript": "delegate this",
|
||||
"messages": [
|
||||
{"role": "user", "text": "delegate this"}
|
||||
]
|
||||
"type": "response.done",
|
||||
"response": {
|
||||
"output": [
|
||||
{
|
||||
"id": "item_123",
|
||||
"type": "function_call",
|
||||
"name": "codex",
|
||||
"call_id": "handoff_123",
|
||||
"arguments": "{\"prompt\":\"delegate this\"}"
|
||||
}
|
||||
]
|
||||
}
|
||||
})
|
||||
.to_string();
|
||||
|
||||
assert_eq!(
|
||||
parse_realtime_event(payload.as_str()),
|
||||
parse_realtime_event(payload.as_str(), RealtimeApiMode::V2),
|
||||
Some(RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
|
||||
handoff_id: "handoff_123".to_string(),
|
||||
item_id: "item_123".to_string(),
|
||||
@@ -665,6 +822,72 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_v1_handoff_requested_event() {
|
||||
let payload = json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_legacy",
|
||||
"item_id": "item_legacy",
|
||||
"input_transcript": "delegate legacy",
|
||||
"messages": [
|
||||
{"role": "user", "text": "delegate legacy"}
|
||||
]
|
||||
})
|
||||
.to_string();
|
||||
|
||||
assert_eq!(
|
||||
parse_realtime_event(payload.as_str(), RealtimeApiMode::V1),
|
||||
Some(RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
|
||||
handoff_id: "handoff_legacy".to_string(),
|
||||
item_id: "item_legacy".to_string(),
|
||||
input_transcript: "delegate legacy".to_string(),
|
||||
messages: vec![RealtimeHandoffMessage {
|
||||
role: "user".to_string(),
|
||||
text: "delegate legacy".to_string(),
|
||||
}],
|
||||
}))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_v1_audio_delta_event() {
|
||||
let payload = json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"delta": "AAA=",
|
||||
"sample_rate": 48000,
|
||||
"channels": 1,
|
||||
"samples_per_channel": 960
|
||||
})
|
||||
.to_string();
|
||||
assert_eq!(
|
||||
parse_realtime_event(payload.as_str(), RealtimeApiMode::V1),
|
||||
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
|
||||
data: "AAA=".to_string(),
|
||||
sample_rate: 48000,
|
||||
num_channels: 1,
|
||||
samples_per_channel: Some(960),
|
||||
}))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_unknown_event_as_conversation_item_added() {
|
||||
let payload = json!({
|
||||
"type": "response.output_text.delta",
|
||||
"delta": "hello",
|
||||
"response_id": "resp_1",
|
||||
})
|
||||
.to_string();
|
||||
assert_eq!(
|
||||
parse_realtime_event(payload.as_str(), RealtimeApiMode::V2),
|
||||
Some(RealtimeEvent::ConversationItemAdded(json!({
|
||||
"type": "response.output_text.delta",
|
||||
"delta": "hello",
|
||||
"response_id": "resp_1",
|
||||
})))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn merge_request_headers_matches_http_precedence() {
|
||||
let mut provider_headers = HeaderMap::new();
|
||||
@@ -701,42 +924,53 @@ mod tests {
|
||||
#[test]
|
||||
fn websocket_url_from_http_base_defaults_to_ws_path() {
|
||||
let url =
|
||||
websocket_url_from_api_url("http://127.0.0.1:8011", None, None).expect("build ws url");
|
||||
assert_eq!(
|
||||
url.as_str(),
|
||||
"ws://127.0.0.1:8011/v1/realtime?intent=quicksilver"
|
||||
);
|
||||
websocket_url_from_api_url("http://127.0.0.1:8011", None, None, RealtimeApiMode::V2)
|
||||
.expect("build ws url");
|
||||
assert_eq!(url.as_str(), "ws://127.0.0.1:8011/v1/realtime");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn websocket_url_from_ws_base_defaults_to_ws_path() {
|
||||
let url =
|
||||
websocket_url_from_api_url("wss://example.com", None, Some("realtime-test-model"))
|
||||
.expect("build ws url");
|
||||
let url = websocket_url_from_api_url(
|
||||
"wss://example.com",
|
||||
None,
|
||||
Some("realtime-test-model"),
|
||||
RealtimeApiMode::V2,
|
||||
)
|
||||
.expect("build ws url");
|
||||
assert_eq!(
|
||||
url.as_str(),
|
||||
"wss://example.com/v1/realtime?intent=quicksilver&model=realtime-test-model"
|
||||
"wss://example.com/v1/realtime?model=realtime-test-model"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn websocket_url_from_v1_base_appends_realtime_path() {
|
||||
let url = websocket_url_from_api_url("https://api.openai.com/v1", None, Some("snapshot"))
|
||||
.expect("build ws url");
|
||||
let url = websocket_url_from_api_url(
|
||||
"https://api.openai.com/v1",
|
||||
None,
|
||||
Some("snapshot"),
|
||||
RealtimeApiMode::V2,
|
||||
)
|
||||
.expect("build ws url");
|
||||
assert_eq!(
|
||||
url.as_str(),
|
||||
"wss://api.openai.com/v1/realtime?intent=quicksilver&model=snapshot"
|
||||
"wss://api.openai.com/v1/realtime?model=snapshot"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn websocket_url_from_nested_v1_base_appends_realtime_path() {
|
||||
let url =
|
||||
websocket_url_from_api_url("https://example.com/openai/v1", None, Some("snapshot"))
|
||||
.expect("build ws url");
|
||||
let url = websocket_url_from_api_url(
|
||||
"https://example.com/openai/v1",
|
||||
None,
|
||||
Some("snapshot"),
|
||||
RealtimeApiMode::V2,
|
||||
)
|
||||
.expect("build ws url");
|
||||
assert_eq!(
|
||||
url.as_str(),
|
||||
"wss://example.com/openai/v1/realtime?intent=quicksilver&model=snapshot"
|
||||
"wss://example.com/openai/v1/realtime?model=snapshot"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -744,11 +978,24 @@ mod tests {
|
||||
fn websocket_url_preserves_existing_realtime_path_and_extra_query_params() {
|
||||
let url = websocket_url_from_api_url(
|
||||
"https://example.com/v1/realtime?foo=bar",
|
||||
Some(&HashMap::from([
|
||||
("trace".to_string(), "1".to_string()),
|
||||
("intent".to_string(), "ignored".to_string()),
|
||||
])),
|
||||
Some(&HashMap::from([("trace".to_string(), "1".to_string())])),
|
||||
Some("snapshot"),
|
||||
RealtimeApiMode::V2,
|
||||
)
|
||||
.expect("build ws url");
|
||||
assert_eq!(
|
||||
url.as_str(),
|
||||
"wss://example.com/v1/realtime?foo=bar&model=snapshot&trace=1"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn websocket_url_v1_mode_adds_quicksilver_intent() {
|
||||
let url = websocket_url_from_api_url(
|
||||
"https://example.com/v1/realtime?foo=bar",
|
||||
Some(&HashMap::from([("trace".to_string(), "1".to_string())])),
|
||||
Some("snapshot"),
|
||||
RealtimeApiMode::V1,
|
||||
)
|
||||
.expect("build ws url");
|
||||
assert_eq!(
|
||||
@@ -777,12 +1024,16 @@ mod tests {
|
||||
assert_eq!(first_json["type"], "session.update");
|
||||
assert_eq!(
|
||||
first_json["session"]["type"],
|
||||
Value::String("quicksilver".to_string())
|
||||
Value::String("realtime".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["instructions"],
|
||||
Value::String("backend prompt".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["output_modalities"][0],
|
||||
Value::String("audio".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["input"]["format"]["type"],
|
||||
Value::String("audio/pcm".to_string())
|
||||
@@ -791,9 +1042,45 @@ mod tests {
|
||||
first_json["session"]["audio"]["input"]["format"]["rate"],
|
||||
Value::from(24_000)
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["input"]["turn_detection"]["type"],
|
||||
Value::String("semantic_vad".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["input"]["turn_detection"]["interrupt_response"],
|
||||
Value::Bool(false)
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["input"]["turn_detection"]["create_response"],
|
||||
Value::Bool(true)
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["output"]["format"]["type"],
|
||||
Value::String("audio/pcm".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["output"]["format"]["rate"],
|
||||
Value::from(24_000)
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["output"]["voice"],
|
||||
Value::String("mundo".to_string())
|
||||
Value::String("marin".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tool_choice"],
|
||||
Value::String("auto".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tools"][0]["type"],
|
||||
Value::String("function".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tools"][0]["name"],
|
||||
Value::String("codex".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tools"][0]["parameters"]["required"][0],
|
||||
Value::String("prompt".to_string())
|
||||
);
|
||||
|
||||
ws.send(Message::Text(
|
||||
@@ -836,13 +1123,43 @@ mod tests {
|
||||
.into_text()
|
||||
.expect("text");
|
||||
let fourth_json: Value = serde_json::from_str(&fourth).expect("json");
|
||||
assert_eq!(fourth_json["type"], "conversation.handoff.append");
|
||||
assert_eq!(fourth_json["handoff_id"], "handoff_1");
|
||||
assert_eq!(fourth_json["output_text"], "hello from codex");
|
||||
assert_eq!(fourth_json["type"], "conversation.item.create");
|
||||
assert_eq!(fourth_json["item"]["type"], "message");
|
||||
assert_eq!(fourth_json["item"]["role"], "assistant");
|
||||
assert_eq!(
|
||||
fourth_json["item"]["content"][0]["type"],
|
||||
Value::String("output_text".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
fourth_json["item"]["content"][0]["text"],
|
||||
Value::String("hello from codex".to_string())
|
||||
);
|
||||
|
||||
let fifth = ws
|
||||
.next()
|
||||
.await
|
||||
.expect("fifth msg")
|
||||
.expect("fifth msg ok")
|
||||
.into_text()
|
||||
.expect("text");
|
||||
let fifth_json: Value = serde_json::from_str(&fifth).expect("json");
|
||||
assert_eq!(fifth_json["type"], "conversation.item.create");
|
||||
assert_eq!(fifth_json["item"]["type"], "function_call_output");
|
||||
assert_eq!(fifth_json["item"]["call_id"], "handoff_1");
|
||||
|
||||
let sixth = ws
|
||||
.next()
|
||||
.await
|
||||
.expect("sixth msg")
|
||||
.expect("sixth msg ok")
|
||||
.into_text()
|
||||
.expect("text");
|
||||
let sixth_json: Value = serde_json::from_str(&sixth).expect("json");
|
||||
assert_eq!(sixth_json["type"], "response.create");
|
||||
|
||||
ws.send(Message::Text(
|
||||
json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 48000,
|
||||
"channels": 1
|
||||
@@ -855,11 +1172,18 @@ mod tests {
|
||||
|
||||
ws.send(Message::Text(
|
||||
json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_1",
|
||||
"item_id": "item_2",
|
||||
"input_transcript": "delegate now",
|
||||
"messages": [{"role": "user", "text": "delegate now"}]
|
||||
"type": "response.done",
|
||||
"response": {
|
||||
"output": [
|
||||
{
|
||||
"id": "item_2",
|
||||
"type": "function_call",
|
||||
"name": "codex",
|
||||
"call_id": "handoff_1",
|
||||
"arguments": "{\"prompt\":\"delegate now\"}"
|
||||
}
|
||||
]
|
||||
}
|
||||
})
|
||||
.to_string()
|
||||
.into(),
|
||||
@@ -889,6 +1213,7 @@ mod tests {
|
||||
instructions: "backend prompt".to_string(),
|
||||
model: Some("realtime-test-model".to_string()),
|
||||
session_id: Some("conv_1".to_string()),
|
||||
mode: RealtimeApiMode::V2,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -929,6 +1254,14 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.expect("send handoff");
|
||||
connection
|
||||
.send_function_call_output("handoff_1".to_string(), "final from codex".to_string())
|
||||
.await
|
||||
.expect("send function output");
|
||||
connection
|
||||
.send_response_create()
|
||||
.await
|
||||
.expect("send response.create");
|
||||
|
||||
let audio_event = connection
|
||||
.next_event()
|
||||
@@ -1029,6 +1362,7 @@ mod tests {
|
||||
instructions: "backend prompt".to_string(),
|
||||
model: Some("realtime-test-model".to_string()),
|
||||
session_id: Some("conv_1".to_string()),
|
||||
mode: RealtimeApiMode::V2,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
|
||||
@@ -7,4 +7,5 @@ pub use methods::RealtimeWebsocketClient;
|
||||
pub use methods::RealtimeWebsocketConnection;
|
||||
pub use methods::RealtimeWebsocketEvents;
|
||||
pub use methods::RealtimeWebsocketWriter;
|
||||
pub use protocol::RealtimeApiMode;
|
||||
pub use protocol::RealtimeSessionConfig;
|
||||
|
||||
@@ -2,15 +2,24 @@ pub use codex_protocol::protocol::RealtimeAudioFrame;
|
||||
pub use codex_protocol::protocol::RealtimeEvent;
|
||||
pub use codex_protocol::protocol::RealtimeHandoffMessage;
|
||||
pub use codex_protocol::protocol::RealtimeHandoffRequested;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
use std::string::ToString;
|
||||
use tracing::debug;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum RealtimeApiMode {
|
||||
V1,
|
||||
V2,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RealtimeSessionConfig {
|
||||
pub instructions: String,
|
||||
pub model: Option<String>,
|
||||
pub session_id: Option<String>,
|
||||
pub mode: RealtimeApiMode,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
@@ -23,6 +32,8 @@ pub(super) enum RealtimeOutboundMessage {
|
||||
handoff_id: String,
|
||||
output_text: String,
|
||||
},
|
||||
#[serde(rename = "response.create")]
|
||||
ResponseCreate,
|
||||
#[serde(rename = "session.update")]
|
||||
SessionUpdate { session: SessionUpdateSession },
|
||||
#[serde(rename = "conversation.item.create")]
|
||||
@@ -30,24 +41,54 @@ pub(super) enum RealtimeOutboundMessage {
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionUpdateSession {
|
||||
#[serde(untagged)]
|
||||
pub(super) enum SessionUpdateSession {
|
||||
V1(SessionUpdateSessionV1),
|
||||
V2(SessionUpdateSessionV2),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionUpdateSessionV1 {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) instructions: String,
|
||||
pub(super) audio: SessionAudio,
|
||||
pub(super) audio: SessionAudioV1,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudio {
|
||||
pub(super) input: SessionAudioInput,
|
||||
pub(super) output: SessionAudioOutput,
|
||||
pub(super) struct SessionUpdateSessionV2 {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) instructions: String,
|
||||
pub(super) output_modalities: Vec<String>,
|
||||
pub(super) audio: SessionAudioV2,
|
||||
pub(super) tools: Vec<SessionTool>,
|
||||
pub(super) tool_choice: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioInput {
|
||||
pub(super) struct SessionAudioV1 {
|
||||
pub(super) input: SessionAudioInputV1,
|
||||
pub(super) output: SessionAudioOutputV1,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioV2 {
|
||||
pub(super) input: SessionAudioInputV2,
|
||||
pub(super) output: SessionAudioOutputV2,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioInputV1 {
|
||||
pub(super) format: SessionAudioFormat,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioInputV2 {
|
||||
pub(super) format: SessionAudioFormat,
|
||||
pub(super) turn_detection: SessionTurnDetection,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioFormat {
|
||||
#[serde(rename = "type")]
|
||||
@@ -56,16 +97,70 @@ pub(super) struct SessionAudioFormat {
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioOutput {
|
||||
pub(super) struct SessionTurnDetection {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) interrupt_response: bool,
|
||||
pub(super) create_response: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioOutputV1 {
|
||||
pub(super) voice: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct ConversationItem {
|
||||
pub(super) struct SessionAudioOutputV2 {
|
||||
pub(super) format: SessionAudioOutputFormat,
|
||||
pub(super) voice: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionAudioOutputFormat {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) role: String,
|
||||
pub(super) content: Vec<ConversationItemContent>,
|
||||
pub(super) rate: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionTool {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) name: String,
|
||||
pub(super) description: String,
|
||||
pub(super) parameters: SessionToolParameters,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionToolParameters {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) properties: SessionToolProperties,
|
||||
pub(super) required: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionToolProperties {
|
||||
pub(super) prompt: SessionToolProperty,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub(super) struct SessionToolProperty {
|
||||
#[serde(rename = "type")]
|
||||
pub(super) kind: String,
|
||||
pub(super) description: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "type")]
|
||||
pub(super) enum ConversationItem {
|
||||
#[serde(rename = "message")]
|
||||
Message {
|
||||
role: String,
|
||||
content: Vec<ConversationItemContent>,
|
||||
},
|
||||
#[serde(rename = "function_call_output")]
|
||||
FunctionCallOutput { call_id: String, output: String },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
@@ -75,7 +170,7 @@ pub(super) struct ConversationItemContent {
|
||||
pub(super) text: String,
|
||||
}
|
||||
|
||||
pub(super) fn parse_realtime_event(payload: &str) -> Option<RealtimeEvent> {
|
||||
pub(super) fn parse_realtime_event(payload: &str, mode: RealtimeApiMode) -> Option<RealtimeEvent> {
|
||||
let parsed: Value = match serde_json::from_str(payload) {
|
||||
Ok(msg) => msg,
|
||||
Err(err) => {
|
||||
@@ -91,50 +186,20 @@ pub(super) fn parse_realtime_event(payload: &str) -> Option<RealtimeEvent> {
|
||||
return None;
|
||||
}
|
||||
};
|
||||
match mode {
|
||||
RealtimeApiMode::V1 => parse_realtime_event_v1(&parsed, message_type, payload),
|
||||
RealtimeApiMode::V2 => parse_realtime_event_v2(parsed, message_type),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_realtime_event_v1(
|
||||
parsed: &Value,
|
||||
message_type: &str,
|
||||
payload: &str,
|
||||
) -> Option<RealtimeEvent> {
|
||||
match message_type {
|
||||
"session.updated" => {
|
||||
let session_id = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
let instructions = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("instructions"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
session_id.map(|session_id| RealtimeEvent::SessionUpdated {
|
||||
session_id,
|
||||
instructions,
|
||||
})
|
||||
}
|
||||
"conversation.output_audio.delta" => {
|
||||
let data = parsed
|
||||
.get("delta")
|
||||
.and_then(Value::as_str)
|
||||
.or_else(|| parsed.get("data").and_then(Value::as_str))
|
||||
.map(str::to_string)?;
|
||||
let sample_rate = parsed
|
||||
.get("sample_rate")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok())?;
|
||||
let num_channels = parsed
|
||||
.get("channels")
|
||||
.or_else(|| parsed.get("num_channels"))
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u16::try_from(v).ok())?;
|
||||
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
|
||||
data,
|
||||
sample_rate,
|
||||
num_channels,
|
||||
samples_per_channel: parsed
|
||||
.get("samples_per_channel")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok()),
|
||||
}))
|
||||
}
|
||||
"session.updated" => parse_session_updated(parsed),
|
||||
"conversation.output_audio.delta" => parse_audio_delta(parsed, false),
|
||||
"conversation.item.added" => parsed
|
||||
.get("item")
|
||||
.cloned()
|
||||
@@ -146,53 +211,222 @@ pub(super) fn parse_realtime_event(payload: &str) -> Option<RealtimeEvent> {
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.map(|item_id| RealtimeEvent::ConversationItemDone { item_id }),
|
||||
"conversation.handoff.requested" => {
|
||||
let handoff_id = parsed
|
||||
.get("handoff_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let item_id = parsed
|
||||
.get("item_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let input_transcript = parsed
|
||||
.get("input_transcript")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let messages = parsed
|
||||
.get("messages")
|
||||
.and_then(Value::as_array)?
|
||||
.iter()
|
||||
.filter_map(|message| {
|
||||
let role = message.get("role").and_then(Value::as_str)?.to_string();
|
||||
let text = message.get("text").and_then(Value::as_str)?.to_string();
|
||||
Some(RealtimeHandoffMessage { role, text })
|
||||
})
|
||||
.collect();
|
||||
Some(RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
|
||||
handoff_id,
|
||||
item_id,
|
||||
input_transcript,
|
||||
messages,
|
||||
}))
|
||||
}
|
||||
"error" => parsed
|
||||
.get("message")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.or_else(|| {
|
||||
parsed
|
||||
.get("error")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|error| error.get("message"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
})
|
||||
.or_else(|| parsed.get("error").map(std::string::ToString::to_string))
|
||||
.map(RealtimeEvent::Error),
|
||||
"conversation.handoff.requested" => parse_handoff_requested_v1(parsed),
|
||||
"error" => parse_realtime_error(parsed),
|
||||
_ => {
|
||||
debug!("received unsupported realtime event type: {message_type}, data: {payload}");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_realtime_event_v2(parsed: Value, message_type: &str) -> Option<RealtimeEvent> {
|
||||
match message_type {
|
||||
"session.created" | "session.updated" => parse_session_updated(&parsed),
|
||||
"response.output_audio.delta" => parse_audio_delta(&parsed, true),
|
||||
"conversation.item.added" => parsed
|
||||
.get("item")
|
||||
.cloned()
|
||||
.map(RealtimeEvent::ConversationItemAdded),
|
||||
"conversation.item.done" => parsed
|
||||
.get("item")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|item| item.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.map(|item_id| RealtimeEvent::ConversationItemDone { item_id }),
|
||||
"response.done" => {
|
||||
if let Some(handoff) = parse_handoff_requested_v2(&parsed) {
|
||||
return Some(RealtimeEvent::HandoffRequested(handoff));
|
||||
}
|
||||
Some(RealtimeEvent::ConversationItemAdded(parsed))
|
||||
}
|
||||
"error" => parse_realtime_error(&parsed),
|
||||
_ => Some(RealtimeEvent::ConversationItemAdded(parsed)),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_session_updated(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
let session_id = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("id"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
let instructions = parsed
|
||||
.get("session")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|session| session.get("instructions"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string);
|
||||
session_id.map(|session_id| RealtimeEvent::SessionUpdated {
|
||||
session_id,
|
||||
instructions,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_audio_delta(parsed: &Value, default_shape: bool) -> Option<RealtimeEvent> {
|
||||
let data = parsed
|
||||
.get("delta")
|
||||
.and_then(Value::as_str)
|
||||
.or_else(|| parsed.get("data").and_then(Value::as_str))
|
||||
.map(str::to_string)?;
|
||||
let sample_rate = parsed
|
||||
.get("sample_rate")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok());
|
||||
let num_channels = parsed
|
||||
.get("channels")
|
||||
.or_else(|| parsed.get("num_channels"))
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u16::try_from(v).ok());
|
||||
Some(RealtimeEvent::AudioOut(RealtimeAudioFrame {
|
||||
data,
|
||||
sample_rate: sample_rate.or_else(|| default_shape.then_some(24_000))?,
|
||||
num_channels: num_channels.or_else(|| default_shape.then_some(1))?,
|
||||
samples_per_channel: parsed
|
||||
.get("samples_per_channel")
|
||||
.and_then(Value::as_u64)
|
||||
.and_then(|v| u32::try_from(v).ok()),
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_realtime_error(parsed: &Value) -> Option<RealtimeEvent> {
|
||||
parsed
|
||||
.get("message")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.or_else(|| {
|
||||
parsed
|
||||
.get("error")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|error| error.get("message"))
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
})
|
||||
.or_else(|| parsed.get("error").map(ToString::to_string))
|
||||
.map(RealtimeEvent::Error)
|
||||
}
|
||||
|
||||
fn parse_handoff_requested_v1(parsed: &Value) -> Option<RealtimeHandoffRequested> {
|
||||
let handoff_id = parsed
|
||||
.get("handoff_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let item_id = parsed
|
||||
.get("item_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let input_transcript = parsed
|
||||
.get("input_transcript")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let messages = parsed
|
||||
.get("messages")
|
||||
.and_then(Value::as_array)?
|
||||
.iter()
|
||||
.filter_map(|message| {
|
||||
let role = message.get("role").and_then(Value::as_str)?.to_string();
|
||||
let text = message.get("text").and_then(Value::as_str)?.to_string();
|
||||
Some(RealtimeHandoffMessage { role, text })
|
||||
})
|
||||
.collect();
|
||||
Some(RealtimeEvent::HandoffRequested(RealtimeHandoffRequested {
|
||||
handoff_id,
|
||||
item_id,
|
||||
input_transcript,
|
||||
messages,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_handoff_requested_v2(parsed: &Value) -> Option<RealtimeHandoffRequested> {
|
||||
let outputs = parsed
|
||||
.get("response")
|
||||
.and_then(Value::as_object)
|
||||
.and_then(|response| response.get("output"))
|
||||
.and_then(Value::as_array)?;
|
||||
let function_call = outputs.iter().find(|item| {
|
||||
item.get("type").and_then(Value::as_str) == Some("function_call")
|
||||
&& item.get("name").and_then(Value::as_str) == Some("codex")
|
||||
})?;
|
||||
let handoff_id = function_call
|
||||
.get("call_id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)?;
|
||||
let item_id = function_call
|
||||
.get("id")
|
||||
.and_then(Value::as_str)
|
||||
.map(str::to_string)
|
||||
.unwrap_or_else(|| handoff_id.clone());
|
||||
let arguments = function_call
|
||||
.get("arguments")
|
||||
.and_then(Value::as_str)
|
||||
.unwrap_or_default();
|
||||
let (input_transcript, messages) = parse_handoff_arguments(arguments);
|
||||
Some(RealtimeHandoffRequested {
|
||||
handoff_id,
|
||||
item_id,
|
||||
input_transcript,
|
||||
messages,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_handoff_arguments(arguments: &str) -> (String, Vec<RealtimeHandoffMessage>) {
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct HandoffArguments {
|
||||
#[serde(default)]
|
||||
prompt: Option<String>,
|
||||
#[serde(default)]
|
||||
text: Option<String>,
|
||||
#[serde(default)]
|
||||
input: Option<String>,
|
||||
#[serde(default)]
|
||||
message: Option<String>,
|
||||
#[serde(default)]
|
||||
input_transcript: Option<String>,
|
||||
#[serde(default)]
|
||||
messages: Vec<RealtimeHandoffMessage>,
|
||||
}
|
||||
|
||||
let Some(parsed) = serde_json::from_str::<HandoffArguments>(arguments).ok() else {
|
||||
return (
|
||||
arguments.to_string(),
|
||||
vec![RealtimeHandoffMessage {
|
||||
role: "user".to_string(),
|
||||
text: arguments.to_string(),
|
||||
}],
|
||||
);
|
||||
};
|
||||
let messages = parsed
|
||||
.messages
|
||||
.into_iter()
|
||||
.filter(|message| !message.text.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
for value in [
|
||||
parsed.prompt,
|
||||
parsed.text,
|
||||
parsed.input,
|
||||
parsed.message,
|
||||
parsed.input_transcript,
|
||||
]
|
||||
.into_iter()
|
||||
.flatten()
|
||||
{
|
||||
if !value.is_empty() {
|
||||
if messages.is_empty() {
|
||||
return (
|
||||
value.clone(),
|
||||
vec![RealtimeHandoffMessage {
|
||||
role: "user".to_string(),
|
||||
text: value,
|
||||
}],
|
||||
);
|
||||
}
|
||||
return (value, messages);
|
||||
}
|
||||
}
|
||||
if let Some(first_message) = messages.first() {
|
||||
return (first_message.text.clone(), messages);
|
||||
}
|
||||
(String::new(), messages)
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ pub use crate::common::create_text_param_for_request;
|
||||
pub use crate::endpoint::compact::CompactClient;
|
||||
pub use crate::endpoint::memories::MemoriesClient;
|
||||
pub use crate::endpoint::models::ModelsClient;
|
||||
pub use crate::endpoint::realtime_websocket::RealtimeApiMode;
|
||||
pub use crate::endpoint::realtime_websocket::RealtimeSessionConfig;
|
||||
pub use crate::endpoint::realtime_websocket::RealtimeWebsocketClient;
|
||||
pub use crate::endpoint::realtime_websocket::RealtimeWebsocketConnection;
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::collections::HashMap;
|
||||
use std::future::Future;
|
||||
use std::time::Duration;
|
||||
|
||||
use codex_api::RealtimeApiMode;
|
||||
use codex_api::RealtimeAudioFrame;
|
||||
use codex_api::RealtimeEvent;
|
||||
use codex_api::RealtimeSessionConfig;
|
||||
@@ -81,7 +82,7 @@ async fn realtime_ws_e2e_session_create_and_event_flow() {
|
||||
assert_eq!(first_json["type"], "session.update");
|
||||
assert_eq!(
|
||||
first_json["session"]["type"],
|
||||
Value::String("quicksilver".to_string())
|
||||
Value::String("realtime".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["instructions"],
|
||||
@@ -95,6 +96,30 @@ async fn realtime_ws_e2e_session_create_and_event_flow() {
|
||||
first_json["session"]["audio"]["input"]["format"]["rate"],
|
||||
Value::from(24_000)
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["input"]["turn_detection"]["type"],
|
||||
Value::String("semantic_vad".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["input"]["turn_detection"]["interrupt_response"],
|
||||
Value::Bool(false)
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["audio"]["input"]["turn_detection"]["create_response"],
|
||||
Value::Bool(true)
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tool_choice"],
|
||||
Value::String("auto".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tools"][0]["type"],
|
||||
Value::String("function".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
first_json["session"]["tools"][0]["name"],
|
||||
Value::String("codex".to_string())
|
||||
);
|
||||
|
||||
ws.send(Message::Text(
|
||||
json!({
|
||||
@@ -119,7 +144,7 @@ async fn realtime_ws_e2e_session_create_and_event_flow() {
|
||||
|
||||
ws.send(Message::Text(
|
||||
json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 48000,
|
||||
"channels": 1
|
||||
@@ -139,6 +164,7 @@ async fn realtime_ws_e2e_session_create_and_event_flow() {
|
||||
instructions: "backend prompt".to_string(),
|
||||
model: Some("realtime-test-model".to_string()),
|
||||
session_id: Some("conv_123".to_string()),
|
||||
mode: RealtimeApiMode::V2,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -231,6 +257,7 @@ async fn realtime_ws_e2e_send_while_next_event_waits() {
|
||||
instructions: "backend prompt".to_string(),
|
||||
model: Some("realtime-test-model".to_string()),
|
||||
session_id: Some("conv_123".to_string()),
|
||||
mode: RealtimeApiMode::V2,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -294,6 +321,7 @@ async fn realtime_ws_e2e_disconnected_emitted_once() {
|
||||
instructions: "backend prompt".to_string(),
|
||||
model: Some("realtime-test-model".to_string()),
|
||||
session_id: Some("conv_123".to_string()),
|
||||
mode: RealtimeApiMode::V2,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -311,7 +339,7 @@ async fn realtime_ws_e2e_disconnected_emitted_once() {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn realtime_ws_e2e_ignores_unknown_text_events() {
|
||||
async fn realtime_ws_e2e_forwards_unknown_text_events() {
|
||||
let (addr, server) = spawn_realtime_ws_server(|mut ws: RealtimeWsStream| async move {
|
||||
let first = ws
|
||||
.next()
|
||||
@@ -354,6 +382,7 @@ async fn realtime_ws_e2e_ignores_unknown_text_events() {
|
||||
instructions: "backend prompt".to_string(),
|
||||
model: Some("realtime-test-model".to_string()),
|
||||
session_id: Some("conv_123".to_string()),
|
||||
mode: RealtimeApiMode::V2,
|
||||
},
|
||||
HeaderMap::new(),
|
||||
HeaderMap::new(),
|
||||
@@ -361,13 +390,26 @@ async fn realtime_ws_e2e_ignores_unknown_text_events() {
|
||||
.await
|
||||
.expect("connect");
|
||||
|
||||
let event = connection
|
||||
let first_event = connection
|
||||
.next_event()
|
||||
.await
|
||||
.expect("next event")
|
||||
.expect("event");
|
||||
assert_eq!(
|
||||
event,
|
||||
first_event,
|
||||
RealtimeEvent::ConversationItemAdded(json!({
|
||||
"type": "response.created",
|
||||
"response": {"id": "resp_unknown"}
|
||||
}))
|
||||
);
|
||||
|
||||
let second_event = connection
|
||||
.next_event()
|
||||
.await
|
||||
.expect("next event")
|
||||
.expect("event");
|
||||
assert_eq!(
|
||||
second_event,
|
||||
RealtimeEvent::SessionUpdated {
|
||||
session_id: "sess_after_unknown".to_string(),
|
||||
instructions: Some("backend prompt".to_string()),
|
||||
|
||||
@@ -398,6 +398,9 @@
|
||||
"prevent_idle_sleep": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"realtime_v2": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"realtime_conversation": {
|
||||
"type": "boolean"
|
||||
},
|
||||
@@ -1789,6 +1792,9 @@
|
||||
"prevent_idle_sleep": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"realtime_v2": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"realtime_conversation": {
|
||||
"type": "boolean"
|
||||
},
|
||||
@@ -2248,4 +2254,4 @@
|
||||
},
|
||||
"title": "ConfigToml",
|
||||
"type": "object"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2447,6 +2447,9 @@ impl Session {
|
||||
if !matches!(msg, EventMsg::TurnComplete(_)) {
|
||||
return;
|
||||
}
|
||||
if let Err(err) = self.conversation.handoff_complete().await {
|
||||
debug!("failed to send final realtime handoff tool output: {err}");
|
||||
}
|
||||
self.conversation.clear_active_handoff().await;
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ use crate::codex::SteerInputError;
|
||||
use crate::config::ConstraintResult;
|
||||
use crate::error::Result as CodexResult;
|
||||
use crate::features::Feature;
|
||||
use crate::features::RealtimeVoiceMode;
|
||||
use crate::file_watcher::WatchRegistration;
|
||||
use crate::protocol::Event;
|
||||
use crate::protocol::Op;
|
||||
@@ -143,4 +144,12 @@ impl CodexThread {
|
||||
pub fn enabled(&self, feature: Feature) -> bool {
|
||||
self.codex.enabled(feature)
|
||||
}
|
||||
|
||||
pub fn realtime_voice_mode(&self) -> Option<RealtimeVoiceMode> {
|
||||
self.codex.features().realtime_voice_mode()
|
||||
}
|
||||
|
||||
pub fn realtime_voice_enabled(&self) -> bool {
|
||||
self.realtime_voice_mode().is_some()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,6 +155,8 @@ pub enum Feature {
|
||||
FastMode,
|
||||
/// Enable voice transcription in the TUI composer.
|
||||
VoiceTranscription,
|
||||
/// Enable realtime voice conversation mode in the TUI.
|
||||
RealtimeV2,
|
||||
/// Enable experimental realtime voice conversation mode in the TUI.
|
||||
RealtimeConversation,
|
||||
/// Prevent idle system sleep while a turn is actively running.
|
||||
@@ -165,6 +167,12 @@ pub enum Feature {
|
||||
ResponsesWebsocketsV2,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum RealtimeVoiceMode {
|
||||
V1,
|
||||
V2,
|
||||
}
|
||||
|
||||
impl Feature {
|
||||
pub fn key(self) -> &'static str {
|
||||
self.info().key
|
||||
@@ -237,6 +245,18 @@ impl Features {
|
||||
self.enabled.contains(&f)
|
||||
}
|
||||
|
||||
pub fn realtime_voice_mode(&self) -> Option<RealtimeVoiceMode> {
|
||||
if self.enabled(Feature::RealtimeV2) {
|
||||
return Some(RealtimeVoiceMode::V2);
|
||||
}
|
||||
self.enabled(Feature::RealtimeConversation)
|
||||
.then_some(RealtimeVoiceMode::V1)
|
||||
}
|
||||
|
||||
pub fn realtime_voice_enabled(&self) -> bool {
|
||||
self.realtime_voice_mode().is_some()
|
||||
}
|
||||
|
||||
pub fn enable(&mut self, f: Feature) -> &mut Self {
|
||||
self.enabled.insert(f);
|
||||
self
|
||||
@@ -717,6 +737,12 @@ pub const FEATURES: &[FeatureSpec] = &[
|
||||
stage: Stage::UnderDevelopment,
|
||||
default_enabled: false,
|
||||
},
|
||||
FeatureSpec {
|
||||
id: Feature::RealtimeV2,
|
||||
key: "realtime_v2",
|
||||
stage: Stage::UnderDevelopment,
|
||||
default_enabled: false,
|
||||
},
|
||||
FeatureSpec {
|
||||
id: Feature::RealtimeConversation,
|
||||
key: "realtime_conversation",
|
||||
@@ -887,4 +913,60 @@ mod tests {
|
||||
assert_eq!(feature_for_key("multi_agent"), Some(Feature::Collab));
|
||||
assert_eq!(feature_for_key("collab"), Some(Feature::Collab));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn realtime_voice_enabled_is_false_when_no_realtime_flags_are_enabled() {
|
||||
let features = Features::with_defaults();
|
||||
assert_eq!(features.realtime_voice_enabled(), false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn realtime_voice_enabled_is_true_when_realtime_v2_is_enabled() {
|
||||
let mut features = Features::with_defaults();
|
||||
features.enable(Feature::RealtimeV2);
|
||||
assert_eq!(features.realtime_voice_enabled(), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn realtime_voice_enabled_is_true_when_legacy_realtime_conversation_is_enabled() {
|
||||
let mut features = Features::with_defaults();
|
||||
features.enable(Feature::RealtimeConversation);
|
||||
assert_eq!(features.realtime_voice_enabled(), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn realtime_voice_enabled_is_true_when_both_realtime_flags_are_enabled() {
|
||||
let mut features = Features::with_defaults();
|
||||
features.enable(Feature::RealtimeV2);
|
||||
features.enable(Feature::RealtimeConversation);
|
||||
assert_eq!(features.realtime_voice_enabled(), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn realtime_voice_mode_is_none_when_no_realtime_flags_are_enabled() {
|
||||
let features = Features::with_defaults();
|
||||
assert_eq!(features.realtime_voice_mode(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn realtime_voice_mode_is_v2_when_realtime_v2_is_enabled() {
|
||||
let mut features = Features::with_defaults();
|
||||
features.enable(Feature::RealtimeV2);
|
||||
assert_eq!(features.realtime_voice_mode(), Some(RealtimeVoiceMode::V2));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn realtime_voice_mode_is_v1_when_only_legacy_realtime_flag_is_enabled() {
|
||||
let mut features = Features::with_defaults();
|
||||
features.enable(Feature::RealtimeConversation);
|
||||
assert_eq!(features.realtime_voice_mode(), Some(RealtimeVoiceMode::V1));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn realtime_voice_mode_prefers_v2_when_both_flags_are_enabled() {
|
||||
let mut features = Features::with_defaults();
|
||||
features.enable(Feature::RealtimeV2);
|
||||
features.enable(Feature::RealtimeConversation);
|
||||
assert_eq!(features.realtime_voice_mode(), Some(RealtimeVoiceMode::V2));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,10 +5,12 @@ use crate::codex::Session;
|
||||
use crate::default_client::default_headers;
|
||||
use crate::error::CodexErr;
|
||||
use crate::error::Result as CodexResult;
|
||||
use crate::features::RealtimeVoiceMode;
|
||||
use async_channel::Receiver;
|
||||
use async_channel::Sender;
|
||||
use async_channel::TrySendError;
|
||||
use codex_api::Provider as ApiProvider;
|
||||
use codex_api::RealtimeApiMode;
|
||||
use codex_api::RealtimeAudioFrame;
|
||||
use codex_api::RealtimeEvent;
|
||||
use codex_api::RealtimeSessionConfig;
|
||||
@@ -43,6 +45,7 @@ const AUDIO_IN_QUEUE_CAPACITY: usize = 256;
|
||||
const USER_TEXT_IN_QUEUE_CAPACITY: usize = 64;
|
||||
const HANDOFF_OUT_QUEUE_CAPACITY: usize = 64;
|
||||
const OUTPUT_EVENTS_QUEUE_CAPACITY: usize = 256;
|
||||
const DEFAULT_REALTIME_MODEL: &str = "gpt-realtime-1.5";
|
||||
|
||||
pub(crate) struct RealtimeConversationManager {
|
||||
state: Mutex<Option<ConversationState>>,
|
||||
@@ -52,19 +55,29 @@ pub(crate) struct RealtimeConversationManager {
|
||||
struct RealtimeHandoffState {
|
||||
output_tx: Sender<HandoffOutput>,
|
||||
active_handoff: Arc<Mutex<Option<String>>>,
|
||||
last_output_text: Arc<Mutex<Option<String>>>,
|
||||
mode: RealtimeApiMode,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
struct HandoffOutput {
|
||||
handoff_id: String,
|
||||
output_text: String,
|
||||
enum HandoffOutput {
|
||||
TextUpdate {
|
||||
handoff_id: String,
|
||||
output_text: String,
|
||||
},
|
||||
FinalToolCall {
|
||||
call_id: String,
|
||||
output_text: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl RealtimeHandoffState {
|
||||
fn new(output_tx: Sender<HandoffOutput>) -> Self {
|
||||
fn new(output_tx: Sender<HandoffOutput>, mode: RealtimeApiMode) -> Self {
|
||||
Self {
|
||||
output_tx,
|
||||
active_handoff: Arc::new(Mutex::new(None)),
|
||||
last_output_text: Arc::new(Mutex::new(None)),
|
||||
mode,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,9 +85,10 @@ impl RealtimeHandoffState {
|
||||
let Some(handoff_id) = self.active_handoff.lock().await.clone() else {
|
||||
return Ok(());
|
||||
};
|
||||
*self.last_output_text.lock().await = Some(output_text.clone());
|
||||
|
||||
self.output_tx
|
||||
.send(HandoffOutput {
|
||||
.send(HandoffOutput::TextUpdate {
|
||||
handoff_id,
|
||||
output_text,
|
||||
})
|
||||
@@ -82,6 +96,26 @@ impl RealtimeHandoffState {
|
||||
.map_err(|_| CodexErr::InvalidRequest("conversation is not running".to_string()))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn send_final_output(&self) -> CodexResult<()> {
|
||||
if self.mode == RealtimeApiMode::V1 {
|
||||
return Ok(());
|
||||
}
|
||||
let Some(call_id) = self.active_handoff.lock().await.clone() else {
|
||||
return Ok(());
|
||||
};
|
||||
let Some(output_text) = self.last_output_text.lock().await.clone() else {
|
||||
return Ok(());
|
||||
};
|
||||
self.output_tx
|
||||
.send(HandoffOutput::FinalToolCall {
|
||||
call_id,
|
||||
output_text,
|
||||
})
|
||||
.await
|
||||
.map_err(|_| CodexErr::InvalidRequest("conversation is not running".to_string()))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
@@ -89,6 +123,7 @@ struct ConversationState {
|
||||
audio_tx: Sender<RealtimeAudioFrame>,
|
||||
user_text_tx: Sender<String>,
|
||||
handoff: RealtimeHandoffState,
|
||||
mode: RealtimeApiMode,
|
||||
task: JoinHandle<()>,
|
||||
realtime_active: Arc<AtomicBool>,
|
||||
}
|
||||
@@ -114,6 +149,7 @@ impl RealtimeConversationManager {
|
||||
extra_headers: Option<HeaderMap>,
|
||||
prompt: String,
|
||||
model: Option<String>,
|
||||
mode: RealtimeApiMode,
|
||||
session_id: Option<String>,
|
||||
) -> CodexResult<(Receiver<RealtimeEvent>, Arc<AtomicBool>)> {
|
||||
let previous_state = {
|
||||
@@ -130,6 +166,7 @@ impl RealtimeConversationManager {
|
||||
instructions: prompt,
|
||||
model,
|
||||
session_id,
|
||||
mode,
|
||||
};
|
||||
let client = RealtimeWebsocketClient::new(api_provider);
|
||||
let connection = client
|
||||
@@ -153,7 +190,7 @@ impl RealtimeConversationManager {
|
||||
async_channel::bounded::<RealtimeEvent>(OUTPUT_EVENTS_QUEUE_CAPACITY);
|
||||
|
||||
let realtime_active = Arc::new(AtomicBool::new(true));
|
||||
let handoff = RealtimeHandoffState::new(handoff_output_tx);
|
||||
let handoff = RealtimeHandoffState::new(handoff_output_tx, mode);
|
||||
let task = spawn_realtime_input_task(
|
||||
writer,
|
||||
events,
|
||||
@@ -169,6 +206,7 @@ impl RealtimeConversationManager {
|
||||
audio_tx,
|
||||
user_text_tx,
|
||||
handoff,
|
||||
mode,
|
||||
task,
|
||||
realtime_active: Arc::clone(&realtime_active),
|
||||
});
|
||||
@@ -232,6 +270,17 @@ impl RealtimeConversationManager {
|
||||
handoff.send_output(output_text).await
|
||||
}
|
||||
|
||||
pub(crate) async fn handoff_complete(&self) -> CodexResult<()> {
|
||||
let handoff = {
|
||||
let guard = self.state.lock().await;
|
||||
guard.as_ref().map(|state| state.handoff.clone())
|
||||
};
|
||||
let Some(handoff) = handoff else {
|
||||
return Ok(());
|
||||
};
|
||||
handoff.send_final_output().await
|
||||
}
|
||||
|
||||
pub(crate) async fn active_handoff_id(&self) -> Option<String> {
|
||||
let handoff = {
|
||||
let guard = self.state.lock().await;
|
||||
@@ -247,6 +296,7 @@ impl RealtimeConversationManager {
|
||||
};
|
||||
if let Some(handoff) = handoff {
|
||||
*handoff.active_handoff.lock().await = None;
|
||||
*handoff.last_output_text.lock().await = None;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -282,7 +332,20 @@ pub(crate) async fn handle_start(
|
||||
.experimental_realtime_ws_backend_prompt
|
||||
.clone()
|
||||
.unwrap_or(params.prompt);
|
||||
let model = config.experimental_realtime_ws_model.clone();
|
||||
let mode = config
|
||||
.features
|
||||
.realtime_voice_mode()
|
||||
.unwrap_or(RealtimeVoiceMode::V2);
|
||||
let (realtime_api_mode, model) = match mode {
|
||||
RealtimeVoiceMode::V1 => (
|
||||
RealtimeApiMode::V1,
|
||||
config.experimental_realtime_ws_model.clone(),
|
||||
),
|
||||
RealtimeVoiceMode::V2 => (
|
||||
RealtimeApiMode::V2,
|
||||
Some(DEFAULT_REALTIME_MODEL.to_string()),
|
||||
),
|
||||
};
|
||||
|
||||
let requested_session_id = params
|
||||
.session_id
|
||||
@@ -297,6 +360,7 @@ pub(crate) async fn handle_start(
|
||||
extra_headers,
|
||||
prompt,
|
||||
model,
|
||||
realtime_api_mode,
|
||||
requested_session_id.clone(),
|
||||
)
|
||||
.await
|
||||
@@ -489,17 +553,39 @@ fn spawn_realtime_input_task(
|
||||
}
|
||||
handoff_output = handoff_output_rx.recv() => {
|
||||
match handoff_output {
|
||||
Ok(HandoffOutput {
|
||||
handoff_id,
|
||||
output_text,
|
||||
}) => {
|
||||
if let Err(err) = writer
|
||||
.send_conversation_handoff_append(handoff_id, output_text)
|
||||
.await
|
||||
{
|
||||
let mapped_error = map_api_error(err);
|
||||
warn!("failed to send handoff output: {mapped_error}");
|
||||
break;
|
||||
Ok(handoff_output) => {
|
||||
match handoff_output {
|
||||
HandoffOutput::TextUpdate {
|
||||
handoff_id,
|
||||
output_text,
|
||||
} => {
|
||||
if let Err(err) = writer
|
||||
.send_conversation_handoff_append(handoff_id, output_text)
|
||||
.await
|
||||
{
|
||||
let mapped_error = map_api_error(err);
|
||||
warn!("failed to send handoff output: {mapped_error}");
|
||||
break;
|
||||
}
|
||||
}
|
||||
HandoffOutput::FinalToolCall {
|
||||
call_id,
|
||||
output_text,
|
||||
} => {
|
||||
if let Err(err) = writer
|
||||
.send_function_call_output(call_id, output_text)
|
||||
.await
|
||||
{
|
||||
let mapped_error = map_api_error(err);
|
||||
warn!("failed to send handoff tool output: {mapped_error}");
|
||||
break;
|
||||
}
|
||||
if let Err(err) = writer.send_response_create().await {
|
||||
let mapped_error = map_api_error(err);
|
||||
warn!("failed to send handoff response.create: {mapped_error}");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => break,
|
||||
@@ -511,6 +597,7 @@ fn spawn_realtime_input_task(
|
||||
if let RealtimeEvent::HandoffRequested(handoff) = &event {
|
||||
*handoff_state.active_handoff.lock().await =
|
||||
Some(handoff.handoff_id.clone());
|
||||
*handoff_state.last_output_text.lock().await = None;
|
||||
}
|
||||
let should_stop = matches!(&event, RealtimeEvent::Error(_));
|
||||
if events_tx.send(event).await.is_err() {
|
||||
@@ -582,6 +669,7 @@ mod tests {
|
||||
use super::RealtimeHandoffState;
|
||||
use super::realtime_text_from_handoff_request;
|
||||
use async_channel::bounded;
|
||||
use codex_api::RealtimeApiMode;
|
||||
use codex_protocol::protocol::RealtimeHandoffMessage;
|
||||
use codex_protocol::protocol::RealtimeHandoffRequested;
|
||||
use pretty_assertions::assert_eq;
|
||||
@@ -637,7 +725,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn clears_active_handoff_explicitly() {
|
||||
let (tx, _rx) = bounded(1);
|
||||
let state = RealtimeHandoffState::new(tx);
|
||||
let state = RealtimeHandoffState::new(tx, RealtimeApiMode::V2);
|
||||
|
||||
*state.active_handoff.lock().await = Some("handoff_1".to_string());
|
||||
assert_eq!(
|
||||
@@ -652,7 +740,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn sends_multiple_handoff_outputs_until_cleared() {
|
||||
let (tx, rx) = bounded(4);
|
||||
let state = RealtimeHandoffState::new(tx);
|
||||
let state = RealtimeHandoffState::new(tx, RealtimeApiMode::V2);
|
||||
|
||||
state
|
||||
.send_output("ignored".to_string())
|
||||
@@ -670,7 +758,7 @@ mod tests {
|
||||
let output_1 = rx.recv().await.expect("recv");
|
||||
assert_eq!(
|
||||
output_1,
|
||||
HandoffOutput {
|
||||
HandoffOutput::TextUpdate {
|
||||
handoff_id: "handoff_1".to_string(),
|
||||
output_text: "result".to_string(),
|
||||
}
|
||||
@@ -679,7 +767,7 @@ mod tests {
|
||||
let output_2 = rx.recv().await.expect("recv");
|
||||
assert_eq!(
|
||||
output_2,
|
||||
HandoffOutput {
|
||||
HandoffOutput::TextUpdate {
|
||||
handoff_id: "handoff_1".to_string(),
|
||||
output_text: "result 2".to_string(),
|
||||
}
|
||||
@@ -692,4 +780,43 @@ mod tests {
|
||||
.expect("send");
|
||||
assert!(rx.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn sends_final_tool_call_output_for_active_handoff() {
|
||||
let (tx, rx) = bounded(4);
|
||||
let state = RealtimeHandoffState::new(tx, RealtimeApiMode::V2);
|
||||
*state.active_handoff.lock().await = Some("handoff_2".to_string());
|
||||
|
||||
state
|
||||
.send_output("final text".to_string())
|
||||
.await
|
||||
.expect("send");
|
||||
let _ = rx.recv().await.expect("recv text update");
|
||||
|
||||
state.send_final_output().await.expect("send final output");
|
||||
let final_output = rx.recv().await.expect("recv final output");
|
||||
assert_eq!(
|
||||
final_output,
|
||||
HandoffOutput::FinalToolCall {
|
||||
call_id: "handoff_2".to_string(),
|
||||
output_text: "final text".to_string(),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn does_not_send_final_tool_call_output_in_v1_mode() {
|
||||
let (tx, rx) = bounded(4);
|
||||
let state = RealtimeHandoffState::new(tx, RealtimeApiMode::V1);
|
||||
*state.active_handoff.lock().await = Some("handoff_3".to_string());
|
||||
|
||||
state
|
||||
.send_output("legacy final text".to_string())
|
||||
.await
|
||||
.expect("send");
|
||||
let _ = rx.recv().await.expect("recv text update");
|
||||
|
||||
state.send_final_output().await.expect("send final output");
|
||||
assert!(rx.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use anyhow::Result;
|
||||
use codex_core::CodexAuth;
|
||||
use codex_core::auth::OPENAI_API_KEY_ENV_VAR;
|
||||
use codex_core::features::Feature;
|
||||
use codex_protocol::protocol::CodexErrorInfo;
|
||||
use codex_protocol::protocol::ConversationAudioParams;
|
||||
use codex_protocol::protocol::ConversationStartParams;
|
||||
@@ -42,7 +43,7 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> {
|
||||
vec![],
|
||||
vec![
|
||||
json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 24000,
|
||||
"channels": 1
|
||||
@@ -141,7 +142,7 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> {
|
||||
);
|
||||
assert_eq!(
|
||||
server.handshakes()[1].uri(),
|
||||
"/v1/realtime?intent=quicksilver&model=realtime-test-model"
|
||||
"/v1/realtime?model=gpt-realtime-1.5"
|
||||
);
|
||||
let mut request_types = [
|
||||
connection[1].body_json()["type"]
|
||||
@@ -177,6 +178,95 @@ async fn conversation_start_audio_text_close_round_trip() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn conversation_start_legacy_feature_uses_v1_realtime_protocol() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
|
||||
let server = start_websocket_server(vec![
|
||||
vec![],
|
||||
vec![
|
||||
vec![json!({
|
||||
"type": "session.updated",
|
||||
"session": { "id": "sess_legacy", "instructions": "backend prompt" }
|
||||
})],
|
||||
vec![json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 24000,
|
||||
"channels": 1
|
||||
})],
|
||||
vec![],
|
||||
],
|
||||
])
|
||||
.await;
|
||||
|
||||
let mut builder = test_codex().with_config(|config| {
|
||||
config
|
||||
.features
|
||||
.enable(Feature::RealtimeConversation)
|
||||
.expect("enable legacy realtime feature");
|
||||
config
|
||||
.features
|
||||
.disable(Feature::RealtimeV2)
|
||||
.expect("disable realtime_v2 feature");
|
||||
});
|
||||
let test = builder.build_with_websocket_server(&server).await?;
|
||||
assert!(server.wait_for_handshakes(1, Duration::from_secs(2)).await);
|
||||
|
||||
test.codex
|
||||
.submit(Op::RealtimeConversationStart(ConversationStartParams {
|
||||
prompt: "backend prompt".to_string(),
|
||||
session_id: None,
|
||||
}))
|
||||
.await?;
|
||||
let _started = wait_for_event_match(&test.codex, |msg| match msg {
|
||||
EventMsg::RealtimeConversationStarted(started) => Some(Ok(started.clone())),
|
||||
EventMsg::Error(err) => Some(Err(err.clone())),
|
||||
_ => None,
|
||||
})
|
||||
.await
|
||||
.unwrap_or_else(|err: ErrorEvent| panic!("conversation start failed: {err:?}"));
|
||||
|
||||
test.codex
|
||||
.submit(Op::RealtimeConversationText(ConversationTextParams {
|
||||
text: "hello legacy".to_string(),
|
||||
}))
|
||||
.await?;
|
||||
|
||||
let audio_out = wait_for_event_match(&test.codex, |msg| match msg {
|
||||
EventMsg::RealtimeConversationRealtime(RealtimeConversationRealtimeEvent {
|
||||
payload: RealtimeEvent::AudioOut(frame),
|
||||
}) => Some(frame.clone()),
|
||||
_ => None,
|
||||
})
|
||||
.await;
|
||||
assert_eq!(audio_out.data, "AQID");
|
||||
|
||||
let connections = server.connections();
|
||||
assert_eq!(connections.len(), 2);
|
||||
let connection = &connections[1];
|
||||
assert_eq!(connection[0].body_json()["type"], json!("session.update"));
|
||||
assert_eq!(
|
||||
connection[0].body_json()["session"]["type"],
|
||||
json!("quicksilver")
|
||||
);
|
||||
assert_eq!(
|
||||
server.handshakes()[1].uri(),
|
||||
"/v1/realtime?intent=quicksilver&model=realtime-test-model"
|
||||
);
|
||||
assert_eq!(
|
||||
connection[1].body_json()["type"],
|
||||
json!("conversation.item.create")
|
||||
);
|
||||
assert_eq!(
|
||||
connection[1].body_json()["item"]["content"][0]["type"],
|
||||
json!("text")
|
||||
);
|
||||
|
||||
server.shutdown().await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn conversation_start_uses_openai_env_key_fallback_with_chatgpt_auth() -> Result<()> {
|
||||
skip_if_no_network!(Ok(()));
|
||||
@@ -387,7 +477,7 @@ async fn conversation_second_start_replaces_runtime() -> Result<()> {
|
||||
"session": { "id": "sess_new", "instructions": "new" }
|
||||
})],
|
||||
vec![json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 24000,
|
||||
"channels": 1
|
||||
@@ -600,15 +690,11 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re
|
||||
"type": "session.updated",
|
||||
"session": { "id": "sess_1", "instructions": "backend prompt" }
|
||||
}),
|
||||
json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_1",
|
||||
"item_id": "item_1",
|
||||
"input_transcript": "delegate hello",
|
||||
"messages": [{ "role": "user", "text": "delegate hello" }]
|
||||
}),
|
||||
realtime_handoff_requested_event("handoff_1", "item_1", "delegate hello"),
|
||||
],
|
||||
vec![],
|
||||
vec![],
|
||||
vec![],
|
||||
]])
|
||||
.await;
|
||||
|
||||
@@ -652,7 +738,7 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re
|
||||
let deadline = tokio::time::Instant::now() + Duration::from_secs(2);
|
||||
while tokio::time::Instant::now() < deadline {
|
||||
let connections = realtime_server.connections();
|
||||
if connections.len() == 1 && connections[0].len() >= 2 {
|
||||
if connections.len() == 1 && connections[0].len() >= 4 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
@@ -660,22 +746,46 @@ async fn conversation_mirrors_assistant_message_text_to_realtime_handoff() -> Re
|
||||
|
||||
let realtime_connections = realtime_server.connections();
|
||||
assert_eq!(realtime_connections.len(), 1);
|
||||
assert_eq!(realtime_connections[0].len(), 2);
|
||||
assert_eq!(realtime_connections[0].len(), 4);
|
||||
assert_eq!(
|
||||
realtime_connections[0][0].body_json()["type"].as_str(),
|
||||
Some("session.update")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_connections[0][1].body_json()["type"].as_str(),
|
||||
Some("conversation.handoff.append")
|
||||
Some("conversation.item.create")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_connections[0][1].body_json()["handoff_id"].as_str(),
|
||||
realtime_connections[0][1].body_json()["item"]["type"].as_str(),
|
||||
Some("message")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_connections[0][1].body_json()["item"]["role"].as_str(),
|
||||
Some("assistant")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_connections[0][1].body_json()["item"]["content"][0]["type"].as_str(),
|
||||
Some("output_text")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_connections[0][1].body_json()["item"]["content"][0]["text"].as_str(),
|
||||
Some("assistant says hi")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_connections[0][2].body_json()["type"].as_str(),
|
||||
Some("conversation.item.create")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_connections[0][2].body_json()["item"]["type"].as_str(),
|
||||
Some("function_call_output")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_connections[0][2].body_json()["item"]["call_id"].as_str(),
|
||||
Some("handoff_1")
|
||||
);
|
||||
assert_eq!(
|
||||
realtime_connections[0][1].body_json()["output_text"].as_str(),
|
||||
Some("assistant says hi")
|
||||
realtime_connections[0][3].body_json()["type"].as_str(),
|
||||
Some("response.create")
|
||||
);
|
||||
|
||||
realtime_server.shutdown().await;
|
||||
@@ -719,19 +829,15 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() ->
|
||||
"type": "session.updated",
|
||||
"session": { "id": "sess_item_done", "instructions": "backend prompt" }
|
||||
}),
|
||||
json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_item_done",
|
||||
"item_id": "item_item_done",
|
||||
"input_transcript": "delegate now",
|
||||
"messages": [{ "role": "user", "text": "delegate now" }]
|
||||
}),
|
||||
realtime_handoff_requested_event("handoff_item_done", "item_item_done", "delegate now"),
|
||||
],
|
||||
vec![json!({
|
||||
"type": "conversation.item.done",
|
||||
"item": { "id": "item_item_done" }
|
||||
})],
|
||||
vec![],
|
||||
vec![],
|
||||
vec![],
|
||||
]])
|
||||
.await;
|
||||
|
||||
@@ -769,14 +875,22 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() ->
|
||||
let first_append = realtime_server.wait_for_request(0, 1).await;
|
||||
assert_eq!(
|
||||
first_append.body_json()["type"].as_str(),
|
||||
Some("conversation.handoff.append")
|
||||
Some("conversation.item.create")
|
||||
);
|
||||
assert_eq!(
|
||||
first_append.body_json()["handoff_id"].as_str(),
|
||||
Some("handoff_item_done")
|
||||
first_append.body_json()["item"]["type"].as_str(),
|
||||
Some("message")
|
||||
);
|
||||
assert_eq!(
|
||||
first_append.body_json()["output_text"].as_str(),
|
||||
first_append.body_json()["item"]["role"].as_str(),
|
||||
Some("assistant")
|
||||
);
|
||||
assert_eq!(
|
||||
first_append.body_json()["item"]["content"][0]["type"].as_str(),
|
||||
Some("output_text")
|
||||
);
|
||||
assert_eq!(
|
||||
first_append.body_json()["item"]["content"][0]["text"].as_str(),
|
||||
Some("assistant message 1")
|
||||
);
|
||||
|
||||
@@ -793,14 +907,22 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() ->
|
||||
let second_append = realtime_server.wait_for_request(0, 2).await;
|
||||
assert_eq!(
|
||||
second_append.body_json()["type"].as_str(),
|
||||
Some("conversation.handoff.append")
|
||||
Some("conversation.item.create")
|
||||
);
|
||||
assert_eq!(
|
||||
second_append.body_json()["handoff_id"].as_str(),
|
||||
Some("handoff_item_done")
|
||||
second_append.body_json()["item"]["type"].as_str(),
|
||||
Some("message")
|
||||
);
|
||||
assert_eq!(
|
||||
second_append.body_json()["output_text"].as_str(),
|
||||
second_append.body_json()["item"]["role"].as_str(),
|
||||
Some("assistant")
|
||||
);
|
||||
assert_eq!(
|
||||
second_append.body_json()["item"]["content"][0]["type"].as_str(),
|
||||
Some("output_text")
|
||||
);
|
||||
assert_eq!(
|
||||
second_append.body_json()["item"]["content"][0]["text"].as_str(),
|
||||
Some("assistant message 2")
|
||||
);
|
||||
|
||||
@@ -816,6 +938,30 @@ async fn conversation_handoff_persists_across_item_done_until_turn_complete() ->
|
||||
})
|
||||
.await;
|
||||
|
||||
let final_tool_call = realtime_server.wait_for_request(0, 3).await;
|
||||
assert_eq!(
|
||||
final_tool_call.body_json()["type"].as_str(),
|
||||
Some("conversation.item.create")
|
||||
);
|
||||
assert_eq!(
|
||||
final_tool_call.body_json()["item"]["type"].as_str(),
|
||||
Some("function_call_output")
|
||||
);
|
||||
assert_eq!(
|
||||
final_tool_call.body_json()["item"]["call_id"].as_str(),
|
||||
Some("handoff_item_done")
|
||||
);
|
||||
assert_eq!(
|
||||
final_tool_call.body_json()["item"]["output"].as_str(),
|
||||
Some("{\"content\":\"assistant message 2\"}")
|
||||
);
|
||||
|
||||
let response_create = realtime_server.wait_for_request(0, 4).await;
|
||||
assert_eq!(
|
||||
response_create.body_json()["type"].as_str(),
|
||||
Some("response.create")
|
||||
);
|
||||
|
||||
realtime_server.shutdown().await;
|
||||
api_server.shutdown().await;
|
||||
Ok(())
|
||||
@@ -825,6 +971,23 @@ fn sse_event(event: Value) -> String {
|
||||
responses::sse(vec![event])
|
||||
}
|
||||
|
||||
fn realtime_handoff_requested_event(handoff_id: &str, item_id: &str, prompt: &str) -> Value {
|
||||
json!({
|
||||
"type": "response.done",
|
||||
"response": {
|
||||
"output": [
|
||||
{
|
||||
"id": item_id,
|
||||
"type": "function_call",
|
||||
"name": "codex",
|
||||
"call_id": handoff_id,
|
||||
"arguments": json!({ "prompt": prompt }).to_string(),
|
||||
}
|
||||
]
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn message_input_texts(body: &Value, role: &str) -> Vec<String> {
|
||||
body.get("input")
|
||||
.and_then(Value::as_array)
|
||||
@@ -859,13 +1022,7 @@ async fn inbound_handoff_request_starts_turn() -> Result<()> {
|
||||
"type": "session.updated",
|
||||
"session": { "id": "sess_inbound", "instructions": "backend prompt" }
|
||||
}),
|
||||
json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_inbound",
|
||||
"item_id": "item_inbound",
|
||||
"input_transcript": "text from realtime",
|
||||
"messages": [{ "role": "user", "text": "text from realtime" }]
|
||||
}),
|
||||
realtime_handoff_requested_event("handoff_inbound", "item_inbound", "text from realtime"),
|
||||
]]])
|
||||
.await;
|
||||
|
||||
@@ -939,15 +1096,25 @@ async fn inbound_handoff_request_uses_all_messages() -> Result<()> {
|
||||
"session": { "id": "sess_inbound_multi", "instructions": "backend prompt" }
|
||||
}),
|
||||
json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_inbound_multi",
|
||||
"item_id": "item_inbound_multi",
|
||||
"input_transcript": "ignored",
|
||||
"messages": [
|
||||
{ "role": "assistant", "text": "assistant context" },
|
||||
{ "role": "user", "text": "delegated query" },
|
||||
{ "role": "assistant", "text": "assist confirm" },
|
||||
]
|
||||
"type": "response.done",
|
||||
"response": {
|
||||
"output": [
|
||||
{
|
||||
"id": "item_inbound_multi",
|
||||
"type": "function_call",
|
||||
"name": "codex",
|
||||
"call_id": "handoff_inbound_multi",
|
||||
"arguments": json!({
|
||||
"input_transcript": "ignored",
|
||||
"messages": [
|
||||
{ "role": "assistant", "text": "assistant context" },
|
||||
{ "role": "user", "text": "delegated query" },
|
||||
{ "role": "assistant", "text": "assist confirm" },
|
||||
]
|
||||
}).to_string(),
|
||||
}
|
||||
]
|
||||
}
|
||||
}),
|
||||
]]])
|
||||
.await;
|
||||
@@ -1012,7 +1179,7 @@ async fn inbound_conversation_item_does_not_start_turn_and_still_forwards_audio(
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 24000,
|
||||
"channels": 1
|
||||
@@ -1102,13 +1269,11 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au
|
||||
"type": "session.updated",
|
||||
"session": { "id": "sess_echo_guard", "instructions": "backend prompt" }
|
||||
}),
|
||||
json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_echo_guard",
|
||||
"item_id": "item_echo_guard",
|
||||
"input_transcript": "delegate now",
|
||||
"messages": [{"role": "user", "text": "delegate now"}]
|
||||
}),
|
||||
realtime_handoff_requested_event(
|
||||
"handoff_echo_guard",
|
||||
"item_echo_guard",
|
||||
"delegate now",
|
||||
),
|
||||
],
|
||||
vec![
|
||||
json!({
|
||||
@@ -1120,7 +1285,7 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au
|
||||
}
|
||||
}),
|
||||
json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 24000,
|
||||
"channels": 1
|
||||
@@ -1168,22 +1333,30 @@ async fn delegated_turn_user_role_echo_does_not_redelegate_and_still_forwards_au
|
||||
let mirrored_request = realtime_server.wait_for_request(0, 1).await;
|
||||
let mirrored_request_body = mirrored_request.body_json();
|
||||
eprintln!(
|
||||
"[realtime test +{}ms] saw mirrored request type={:?} handoff_id={:?} text={:?}",
|
||||
"[realtime test +{}ms] saw mirrored request type={:?} role={:?} text={:?}",
|
||||
start.elapsed().as_millis(),
|
||||
mirrored_request_body["type"].as_str(),
|
||||
mirrored_request_body["handoff_id"].as_str(),
|
||||
mirrored_request_body["output_text"].as_str(),
|
||||
mirrored_request_body["item"]["role"].as_str(),
|
||||
mirrored_request_body["item"]["content"][0]["text"].as_str(),
|
||||
);
|
||||
assert_eq!(
|
||||
mirrored_request_body["type"].as_str(),
|
||||
Some("conversation.handoff.append")
|
||||
Some("conversation.item.create")
|
||||
);
|
||||
assert_eq!(
|
||||
mirrored_request_body["handoff_id"].as_str(),
|
||||
Some("handoff_echo_guard")
|
||||
mirrored_request_body["item"]["type"].as_str(),
|
||||
Some("message")
|
||||
);
|
||||
assert_eq!(
|
||||
mirrored_request_body["output_text"].as_str(),
|
||||
mirrored_request_body["item"]["role"].as_str(),
|
||||
Some("assistant")
|
||||
);
|
||||
assert_eq!(
|
||||
mirrored_request_body["item"]["content"][0]["type"].as_str(),
|
||||
Some("output_text")
|
||||
);
|
||||
assert_eq!(
|
||||
mirrored_request_body["item"]["content"][0]["text"].as_str(),
|
||||
Some("assistant says hi")
|
||||
);
|
||||
|
||||
@@ -1250,15 +1423,13 @@ async fn inbound_handoff_request_does_not_block_realtime_event_forwarding() -> R
|
||||
"type": "session.updated",
|
||||
"session": { "id": "sess_non_blocking", "instructions": "backend prompt" }
|
||||
}),
|
||||
realtime_handoff_requested_event(
|
||||
"handoff_non_blocking",
|
||||
"item_non_blocking",
|
||||
"delegate now",
|
||||
),
|
||||
json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_non_blocking",
|
||||
"item_id": "item_non_blocking",
|
||||
"input_transcript": "delegate now",
|
||||
"messages": [{"role": "user", "text": "delegate now"}]
|
||||
}),
|
||||
json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 24000,
|
||||
"channels": 1
|
||||
@@ -1377,13 +1548,11 @@ async fn inbound_handoff_request_steers_active_turn() -> Result<()> {
|
||||
"type": "session.updated",
|
||||
"session": { "id": "sess_steer", "instructions": "backend prompt" }
|
||||
})],
|
||||
vec![json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_steer",
|
||||
"item_id": "item_steer",
|
||||
"input_transcript": "steer via realtime",
|
||||
"messages": [{ "role": "user", "text": "steer via realtime" }]
|
||||
})],
|
||||
vec![realtime_handoff_requested_event(
|
||||
"handoff_steer",
|
||||
"item_steer",
|
||||
"steer via realtime",
|
||||
)],
|
||||
]])
|
||||
.await;
|
||||
|
||||
@@ -1500,15 +1669,9 @@ async fn inbound_handoff_request_starts_turn_and_does_not_block_realtime_audio()
|
||||
"type": "session.updated",
|
||||
"session": { "id": "sess_handoff_request", "instructions": "backend prompt" }
|
||||
}),
|
||||
realtime_handoff_requested_event("handoff_audio", "item_audio", delegated_text),
|
||||
json!({
|
||||
"type": "conversation.handoff.requested",
|
||||
"handoff_id": "handoff_audio",
|
||||
"item_id": "item_audio",
|
||||
"input_transcript": delegated_text,
|
||||
"messages": [{ "role": "user", "text": delegated_text }]
|
||||
}),
|
||||
json!({
|
||||
"type": "conversation.output_audio.delta",
|
||||
"type": "response.output_audio.delta",
|
||||
"delta": "AQID",
|
||||
"sample_rate": 24000,
|
||||
"channels": 1
|
||||
|
||||
@@ -926,8 +926,7 @@ enum ReplayKind {
|
||||
|
||||
impl ChatWidget {
|
||||
fn realtime_conversation_enabled(&self) -> bool {
|
||||
self.config.features.enabled(Feature::RealtimeConversation)
|
||||
&& cfg!(not(target_os = "linux"))
|
||||
self.config.features.realtime_voice_enabled() && cfg!(not(target_os = "linux"))
|
||||
}
|
||||
|
||||
fn realtime_audio_device_selection_enabled(&self) -> bool {
|
||||
@@ -7074,7 +7073,7 @@ impl ChatWidget {
|
||||
if feature == Feature::VoiceTranscription {
|
||||
self.bottom_pane.set_voice_transcription_enabled(enabled);
|
||||
}
|
||||
if feature == Feature::RealtimeConversation {
|
||||
if matches!(feature, Feature::RealtimeConversation | Feature::RealtimeV2) {
|
||||
let realtime_conversation_enabled = self.realtime_conversation_enabled();
|
||||
self.bottom_pane
|
||||
.set_realtime_conversation_enabled(realtime_conversation_enabled);
|
||||
|
||||
@@ -304,7 +304,7 @@ impl ChatWidget {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
#[cfg(all(not(target_os = "linux"), feature = "voice-input"))]
|
||||
fn start_realtime_local_audio(&mut self) {
|
||||
if self.realtime_conversation.capture_stop_flag.is_some() {
|
||||
return;
|
||||
|
||||
@@ -1882,6 +1882,63 @@ fn set_chatgpt_auth(chat: &mut ChatWidget) {
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn realtime_feature_gate_uses_dual_read_feature_flags() {
|
||||
let (mut chat, _rx, _op_rx) = make_chatwidget_manual(None).await;
|
||||
let platform_realtime_supported = cfg!(not(target_os = "linux"));
|
||||
|
||||
assert_eq!(chat.realtime_conversation_enabled(), false);
|
||||
|
||||
chat.set_feature_enabled(Feature::RealtimeV2, true);
|
||||
assert_eq!(
|
||||
chat.realtime_conversation_enabled(),
|
||||
platform_realtime_supported
|
||||
);
|
||||
|
||||
chat.set_feature_enabled(Feature::RealtimeV2, false);
|
||||
assert_eq!(chat.realtime_conversation_enabled(), false);
|
||||
|
||||
chat.set_feature_enabled(Feature::RealtimeConversation, true);
|
||||
assert_eq!(
|
||||
chat.realtime_conversation_enabled(),
|
||||
platform_realtime_supported
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn disabling_realtime_v2_resets_live_realtime_session() {
|
||||
let (mut chat, _rx, mut op_rx) = make_chatwidget_manual(None).await;
|
||||
chat.set_feature_enabled(Feature::RealtimeV2, true);
|
||||
chat.realtime_conversation.phase = super::realtime::RealtimeConversationPhase::Active;
|
||||
|
||||
chat.set_feature_enabled(Feature::RealtimeV2, false);
|
||||
|
||||
assert!(!chat.realtime_conversation.is_live());
|
||||
assert!(matches!(
|
||||
chat.realtime_conversation.phase,
|
||||
super::realtime::RealtimeConversationPhase::Inactive
|
||||
));
|
||||
assert!(matches!(
|
||||
op_rx.try_recv(),
|
||||
Ok(Op::RealtimeConversationClose)
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn legacy_realtime_feature_keeps_realtime_enabled_when_realtime_v2_is_off() {
|
||||
let (mut chat, _rx, _op_rx) = make_chatwidget_manual(None).await;
|
||||
let platform_realtime_supported = cfg!(not(target_os = "linux"));
|
||||
|
||||
chat.set_feature_enabled(Feature::RealtimeConversation, true);
|
||||
chat.set_feature_enabled(Feature::RealtimeV2, true);
|
||||
chat.set_feature_enabled(Feature::RealtimeV2, false);
|
||||
|
||||
assert_eq!(
|
||||
chat.realtime_conversation_enabled(),
|
||||
platform_realtime_supported
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn prefetch_rate_limits_is_gated_on_chatgpt_auth_provider() {
|
||||
let (mut chat, _rx, _op_rx) = make_chatwidget_manual(None).await;
|
||||
|
||||
@@ -484,7 +484,7 @@ fn convert_u16_to_i16_and_peak(input: &[u16], out: &mut Vec<i16>) -> u16 {
|
||||
|
||||
pub(crate) struct RealtimeAudioPlayer {
|
||||
_stream: cpal::Stream,
|
||||
queue: Arc<Mutex<VecDeque<i16>>>,
|
||||
queue: Arc<Mutex<OutputAudioQueue>>,
|
||||
output_sample_rate: u32,
|
||||
output_channels: u16,
|
||||
}
|
||||
@@ -495,8 +495,14 @@ impl RealtimeAudioPlayer {
|
||||
crate::audio_device::select_configured_output_device_and_config(config)?;
|
||||
let output_sample_rate = config.sample_rate().0;
|
||||
let output_channels = config.channels();
|
||||
let queue = Arc::new(Mutex::new(VecDeque::new()));
|
||||
let stream = build_output_stream(&device, &config, Arc::clone(&queue))?;
|
||||
let prebuffer_samples = output_prebuffer_samples(output_sample_rate, output_channels);
|
||||
let queue = Arc::new(Mutex::new(OutputAudioQueue::default()));
|
||||
let stream = build_output_stream(
|
||||
&device,
|
||||
&config,
|
||||
Arc::clone(&queue),
|
||||
prebuffer_samples,
|
||||
)?;
|
||||
stream
|
||||
.play()
|
||||
.map_err(|e| format!("failed to start output stream: {e}"))?;
|
||||
@@ -537,13 +543,14 @@ impl RealtimeAudioPlayer {
|
||||
.lock()
|
||||
.map_err(|_| "failed to lock output audio queue".to_string())?;
|
||||
// TODO(aibrahim): Cap or trim this queue if we observe producer bursts outrunning playback.
|
||||
guard.extend(converted);
|
||||
guard.samples.extend(converted);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn clear(&self) {
|
||||
if let Ok(mut guard) = self.queue.lock() {
|
||||
guard.clear();
|
||||
guard.samples.clear();
|
||||
guard.primed = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -551,14 +558,17 @@ impl RealtimeAudioPlayer {
|
||||
fn build_output_stream(
|
||||
device: &cpal::Device,
|
||||
config: &cpal::SupportedStreamConfig,
|
||||
queue: Arc<Mutex<VecDeque<i16>>>,
|
||||
queue: Arc<Mutex<OutputAudioQueue>>,
|
||||
prebuffer_samples: usize,
|
||||
) -> Result<cpal::Stream, String> {
|
||||
let config_any: cpal::StreamConfig = config.clone().into();
|
||||
match config.sample_format() {
|
||||
cpal::SampleFormat::F32 => device
|
||||
.build_output_stream(
|
||||
&config_any,
|
||||
move |output: &mut [f32], _| fill_output_f32(output, &queue),
|
||||
move |output: &mut [f32], _| {
|
||||
fill_output_f32(output, &queue, prebuffer_samples)
|
||||
},
|
||||
move |err| error!("audio output error: {err}"),
|
||||
None,
|
||||
)
|
||||
@@ -566,7 +576,9 @@ fn build_output_stream(
|
||||
cpal::SampleFormat::I16 => device
|
||||
.build_output_stream(
|
||||
&config_any,
|
||||
move |output: &mut [i16], _| fill_output_i16(output, &queue),
|
||||
move |output: &mut [i16], _| {
|
||||
fill_output_i16(output, &queue, prebuffer_samples)
|
||||
},
|
||||
move |err| error!("audio output error: {err}"),
|
||||
None,
|
||||
)
|
||||
@@ -574,7 +586,9 @@ fn build_output_stream(
|
||||
cpal::SampleFormat::U16 => device
|
||||
.build_output_stream(
|
||||
&config_any,
|
||||
move |output: &mut [u16], _| fill_output_u16(output, &queue),
|
||||
move |output: &mut [u16], _| {
|
||||
fill_output_u16(output, &queue, prebuffer_samples)
|
||||
},
|
||||
move |err| error!("audio output error: {err}"),
|
||||
None,
|
||||
)
|
||||
@@ -583,20 +597,67 @@ fn build_output_stream(
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_output_i16(output: &mut [i16], queue: &Arc<Mutex<VecDeque<i16>>>) {
|
||||
#[derive(Default)]
|
||||
struct OutputAudioQueue {
|
||||
samples: VecDeque<i16>,
|
||||
primed: bool,
|
||||
}
|
||||
|
||||
fn output_prebuffer_samples(sample_rate: u32, channels: u16) -> usize {
|
||||
let samples_per_second = (sample_rate as usize).saturating_mul(channels as usize);
|
||||
// 120ms jitter buffer smooths websocket burstiness without adding too much latency.
|
||||
((samples_per_second as u64) * 120 / 1_000) as usize
|
||||
}
|
||||
|
||||
fn should_output_silence(
|
||||
queue: &mut OutputAudioQueue,
|
||||
min_buffer_samples: usize,
|
||||
) -> bool {
|
||||
if !queue.primed {
|
||||
if queue.samples.len() < min_buffer_samples {
|
||||
return true;
|
||||
}
|
||||
queue.primed = true;
|
||||
}
|
||||
|
||||
if queue.samples.is_empty() {
|
||||
queue.primed = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
fn fill_output_i16(
|
||||
output: &mut [i16],
|
||||
queue: &Arc<Mutex<OutputAudioQueue>>,
|
||||
prebuffer_samples: usize,
|
||||
) {
|
||||
if let Ok(mut guard) = queue.lock() {
|
||||
if should_output_silence(&mut guard, prebuffer_samples) {
|
||||
output.fill(0);
|
||||
return;
|
||||
}
|
||||
for sample in output {
|
||||
*sample = guard.pop_front().unwrap_or(0);
|
||||
*sample = guard.samples.pop_front().unwrap_or(0);
|
||||
}
|
||||
return;
|
||||
}
|
||||
output.fill(0);
|
||||
}
|
||||
|
||||
fn fill_output_f32(output: &mut [f32], queue: &Arc<Mutex<VecDeque<i16>>>) {
|
||||
fn fill_output_f32(
|
||||
output: &mut [f32],
|
||||
queue: &Arc<Mutex<OutputAudioQueue>>,
|
||||
prebuffer_samples: usize,
|
||||
) {
|
||||
if let Ok(mut guard) = queue.lock() {
|
||||
if should_output_silence(&mut guard, prebuffer_samples) {
|
||||
output.fill(0.0);
|
||||
return;
|
||||
}
|
||||
for sample in output {
|
||||
let v = guard.pop_front().unwrap_or(0);
|
||||
let v = guard.samples.pop_front().unwrap_or(0);
|
||||
*sample = (v as f32) / (i16::MAX as f32);
|
||||
}
|
||||
return;
|
||||
@@ -604,10 +665,18 @@ fn fill_output_f32(output: &mut [f32], queue: &Arc<Mutex<VecDeque<i16>>>) {
|
||||
output.fill(0.0);
|
||||
}
|
||||
|
||||
fn fill_output_u16(output: &mut [u16], queue: &Arc<Mutex<VecDeque<i16>>>) {
|
||||
fn fill_output_u16(
|
||||
output: &mut [u16],
|
||||
queue: &Arc<Mutex<OutputAudioQueue>>,
|
||||
prebuffer_samples: usize,
|
||||
) {
|
||||
if let Ok(mut guard) = queue.lock() {
|
||||
if should_output_silence(&mut guard, prebuffer_samples) {
|
||||
output.fill(32768);
|
||||
return;
|
||||
}
|
||||
for sample in output {
|
||||
let v = guard.pop_front().unwrap_or(0);
|
||||
let v = guard.samples.pop_front().unwrap_or(0);
|
||||
*sample = (v as i32 + 32768).clamp(0, u16::MAX as i32) as u16;
|
||||
}
|
||||
return;
|
||||
|
||||
Reference in New Issue
Block a user