From bd2a7329233b5bfc1e211a0d18419ca5c697c1e1 Mon Sep 17 00:00:00 2001 From: "Adam Perry @ OpenAI" Date: Thu, 28 May 2026 08:46:30 -0700 Subject: [PATCH] Add app-server startup benchmark crate (#24651) ## Summary - Add a new `app-server-start-bench` crate to measure app-server startup performance - Wire the benchmark into the workspace and Bazel build so it can be run consistently - Update lockfiles and repo automation to account for the new package --- .github/workflows/rust-ci-full.yml | 7 + codex-rs/Cargo.lock | 11 ++ codex-rs/Cargo.toml | 1 + .../benchmarks/app-server-start/BUILD.bazel | 20 ++ .../benchmarks/app-server-start/Cargo.toml | 16 ++ .../benchmarks/app-server-start/src/main.rs | 174 ++++++++++++++++++ justfile | 22 ++- 7 files changed, 247 insertions(+), 4 deletions(-) create mode 100644 codex-rs/benchmarks/app-server-start/BUILD.bazel create mode 100644 codex-rs/benchmarks/app-server-start/Cargo.toml create mode 100644 codex-rs/benchmarks/app-server-start/src/main.rs diff --git a/.github/workflows/rust-ci-full.yml b/.github/workflows/rust-ci-full.yml index 2111ed7b0a..702ad12dbb 100644 --- a/.github/workflows/rust-ci-full.yml +++ b/.github/workflows/rust-ci-full.yml @@ -466,6 +466,13 @@ jobs: - name: cargo clippy run: cargo clippy --target ${{ matrix.target }} --tests --profile ${{ matrix.profile }} --timings -- -D warnings + - uses: taiki-e/install-action@44c6d64aa62cd779e873306675c7a58e86d6d532 # v2.62.49 + with: + tool: just + + - name: End-to-end benchmark smoke test + run: just bench-e2e-smoke + - name: Upload Cargo timings (clippy) if: always() uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index aac0d6f90c..21f8af0d6e 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -2058,6 +2058,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "codex-app-server-start-bench" +version = "0.0.0" +dependencies = [ + "anyhow", + "codex-app-server-protocol", + "divan", + "serde_json", + "tempfile", +] + [[package]] name = "codex-app-server-test-client" version = "0.0.0" diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml index b1f9e4c31a..5ef6a45fc3 100644 --- a/codex-rs/Cargo.toml +++ b/codex-rs/Cargo.toml @@ -14,6 +14,7 @@ members = [ "app-server-client", "app-server-protocol", "app-server-test-client", + "benchmarks/app-server-start", "debug-client", "apply-patch", "arg0", diff --git a/codex-rs/benchmarks/app-server-start/BUILD.bazel b/codex-rs/benchmarks/app-server-start/BUILD.bazel new file mode 100644 index 0000000000..4b8f09413e --- /dev/null +++ b/codex-rs/benchmarks/app-server-start/BUILD.bazel @@ -0,0 +1,20 @@ +load("//:defs.bzl", "codex_rust_crate", "workspace_root_test") + +codex_rust_crate( + name = "app-server-start-bench", + crate_name = "codex_app_server_start_bench", +) + +workspace_root_test( + name = "startup-bench", + # Include the Bazel-built CLI in this target's runfiles and expose its + # resolved executable path through the environment consumed by the runner. + runfile_env = { + "//codex-rs/cli:codex": "CODEX_BIN", + }, + size = "large", + tags = ["manual"], + test_bin = ":codex-app-server-start-bench", + timeout = "long", + workspace_root_marker = "//codex-rs/utils/cargo-bin:repo_root.marker", +) diff --git a/codex-rs/benchmarks/app-server-start/Cargo.toml b/codex-rs/benchmarks/app-server-start/Cargo.toml new file mode 100644 index 0000000000..edf680eeea --- /dev/null +++ b/codex-rs/benchmarks/app-server-start/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "codex-app-server-start-bench" +version.workspace = true +edition.workspace = true +license.workspace = true +publish = false + +[lints] +workspace = true + +[dependencies] +anyhow = { workspace = true } +codex-app-server-protocol = { workspace = true } +serde_json = { workspace = true } +divan = { workspace = true } +tempfile = { workspace = true } diff --git a/codex-rs/benchmarks/app-server-start/src/main.rs b/codex-rs/benchmarks/app-server-start/src/main.rs new file mode 100644 index 0000000000..5363b49ffc --- /dev/null +++ b/codex-rs/benchmarks/app-server-start/src/main.rs @@ -0,0 +1,174 @@ +use std::io::BufRead; +use std::io::BufReader; +use std::io::Write; +use std::path::Path; +use std::path::PathBuf; +use std::process::Child; +use std::process::ChildStdin; +use std::process::ChildStdout; +use std::process::Command; +use std::process::Stdio; + +use anyhow::Context; +use anyhow::Result; +use anyhow::bail; +use codex_app_server_protocol::ClientInfo; +use codex_app_server_protocol::ClientRequest; +use codex_app_server_protocol::InitializeCapabilities; +use codex_app_server_protocol::InitializeParams; +use codex_app_server_protocol::InitializeResponse; +use codex_app_server_protocol::JSONRPCMessage; +use codex_app_server_protocol::JSONRPCNotification; +use codex_app_server_protocol::JSONRPCResponse; +use codex_app_server_protocol::RequestId; +use divan::Bencher; + +fn main() { + divan::main(); +} + +// Process startup is slow enough that 30 samples keep runs practical, and +// sample size 1 lets each measured server be reaped before the next starts. +// The benchmark warms one CODEX_HOME before timing to measure normal restarts. +// This e2e runner receives its separately built Codex binary from just or Bazel. +#[divan::bench(sample_count = 30, sample_size = 1, skip_ext_time)] +#[allow(clippy::expect_used)] +fn initialize_response(bencher: Bencher) { + let codex_bin = std::env::var_os("CODEX_BIN") + .map(PathBuf::from) + .expect("CODEX_BIN must point to the codex binary; run via `just bench-e2e` or Bazel"); + let codex_home = tempfile::tempdir().expect("benchmark CODEX_HOME should be created"); + drop( + start_until_initialize_response(&codex_bin, codex_home.path()) + .expect("benchmark CODEX_HOME should be initialized"), + ); + + bencher.bench_local(|| { + start_until_initialize_response(&codex_bin, codex_home.path()) + .expect("codex app-server should return an initialize response") + }); +} + +/// A running app-server that has returned a valid `initialize` response. +/// +/// Divan drops benchmark outputs after it stops each timer interval. Returning +/// this value from the measured closure keeps the `initialized` notification +/// and forced process reaping outside startup latency. +struct InitializedAppServer { + child: Child, + stdin: Option, + stdout: Option>, + acknowledge_on_drop: bool, +} + +/// Spawn a stdio app-server and return once it responds successfully to `initialize`. +fn start_until_initialize_response( + codex_bin: &Path, + codex_home: &Path, +) -> Result { + let request_id = RequestId::Integer(0); + let child = Command::new(codex_bin) + .arg("app-server") + .arg("--listen") + .arg("stdio://") + .env("CODEX_HOME", codex_home) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()) + .spawn() + .with_context(|| format!("failed to spawn `{}` app-server", codex_bin.display()))?; + let mut server = InitializedAppServer { + child, + stdin: None, + stdout: None, + acknowledge_on_drop: false, + }; + server.stdin = Some( + server + .child + .stdin + .take() + .context("app-server stdin unavailable")?, + ); + server.stdout = Some(BufReader::new( + server + .child + .stdout + .take() + .context("app-server stdout unavailable")?, + )); + + let request = ClientRequest::Initialize { + request_id: request_id.clone(), + params: InitializeParams { + client_info: ClientInfo { + name: "codex-app-server-start-bench".to_string(), + title: Some("Codex App Server Start Benchmark".to_string()), + version: env!("CARGO_PKG_VERSION").to_string(), + }, + capabilities: Some(InitializeCapabilities { + experimental_api: false, + request_attestation: false, + opt_out_notification_methods: None, + }), + }, + }; + let stdin = server + .stdin + .as_mut() + .context("app-server stdin unavailable")?; + writeln!(stdin, "{}", serde_json::to_string(&request)?)?; + stdin + .flush() + .context("failed to flush initialize request")?; + + let mut line = String::new(); + loop { + line.clear(); + if server + .stdout + .as_mut() + .context("app-server stdout unavailable")? + .read_line(&mut line)? + == 0 + { + bail!("app-server closed stdout before returning initialize response"); + } + + match serde_json::from_str::(line.trim())? { + JSONRPCMessage::Response(JSONRPCResponse { id, result }) if id == request_id => { + let _: InitializeResponse = serde_json::from_value(result) + .context("initialize response missing expected payload")?; + server.acknowledge_on_drop = true; + return Ok(server); + } + JSONRPCMessage::Error(error) if error.id == request_id => { + bail!("initialize failed: {error:?}"); + } + JSONRPCMessage::Request(_) + | JSONRPCMessage::Response(_) + | JSONRPCMessage::Notification(_) + | JSONRPCMessage::Error(_) => {} + } + } +} + +impl Drop for InitializedAppServer { + fn drop(&mut self) { + if self.acknowledge_on_drop + && let Some(stdin) = self.stdin.as_mut() + { + let initialized = JSONRPCMessage::Notification(JSONRPCNotification { + method: "initialized".to_string(), + params: None, + }); + if let Ok(payload) = serde_json::to_string(&initialized) { + let _ = writeln!(stdin, "{payload}"); + let _ = stdin.flush(); + } + } + let _ = self.stdin.take(); + let _ = self.child.kill(); + let _ = self.child.wait(); + } +} diff --git a/justfile b/justfile index d5e9fc3a36..d061b90fda 100644 --- a/justfile +++ b/justfile @@ -2,6 +2,9 @@ set working-directory := "codex-rs" set positional-arguments rust_min_stack := "8388608" # 8 MiB +e2e_benchmark_packages := "codex-app-server-start-bench" +e2e_codex_bin := if os() == "windows" { "./target/release/codex.exe" } else { "./target/release/codex" } +e2e_smoke_codex_bin := if os() == "windows" { "./target/debug/codex.exe" } else { "./target/debug/codex" } # Display help help: @@ -30,6 +33,11 @@ app-server-test-client *args: cargo build -p codex-cli cargo run -p codex-app-server-test-client -- --codex-bin ./target/debug/codex "$@" +# Run end-to-end performance benchmarks that require a built Codex binary. +bench-e2e *args: + cargo build --release -p codex-cli --bin codex + for package in {{ e2e_benchmark_packages }}; do CODEX_BIN="{{ e2e_codex_bin }}" cargo run --release -p "$package" -- --bench "$@"; done + # Format Rust and Python SDK code. fmt: cargo fmt -- --config imports_granularity=Item 2>/dev/null @@ -55,13 +63,19 @@ test *args: RUST_MIN_STACK={{ rust_min_stack }} cargo nextest run --no-fail-fast "$@" just bench-smoke -# Run explicit workspace benchmark targets. -bench *args: +# Run unit-test-style benchmark targets managed entirely by Cargo. +bench-unit *args: cargo bench --workspace --bench '*' "$@" -# Run benchmark targets once to ensure they start successfully. +# Smoke Cargo-managed benchmarks and compile e2e runners without measured binaries. bench-smoke: - just bench -- --test + just bench-unit -- --test + for package in {{ e2e_benchmark_packages }}; do cargo build -p "$package" --bin "$package"; done + +# Run end-to-end performance benchmark targets once. +bench-e2e-smoke: + cargo build -p codex-cli --bin codex + for package in {{ e2e_benchmark_packages }}; do CODEX_BIN="{{ e2e_smoke_codex_bin }}" cargo run -p "$package" -- --test; done # Build and run Codex from source using Bazel. # Note we have to use the combination of `[no-cd]` and `--run_under="cd $PWD &&"`