Add app-server startup benchmark crate (#24651)

## Summary
- Add a new `app-server-start-bench` crate to measure app-server startup
performance
- Wire the benchmark into the workspace and Bazel build so it can be run
consistently
- Update lockfiles and repo automation to account for the new package
This commit is contained in:
Adam Perry @ OpenAI
2026-05-28 08:46:30 -07:00
committed by GitHub
parent a4ed6c5aa0
commit bd2a732923
7 changed files with 247 additions and 4 deletions

View File

@@ -466,6 +466,13 @@ jobs:
- name: cargo clippy
run: cargo clippy --target ${{ matrix.target }} --tests --profile ${{ matrix.profile }} --timings -- -D warnings
- uses: taiki-e/install-action@44c6d64aa62cd779e873306675c7a58e86d6d532 # v2.62.49
with:
tool: just
- name: End-to-end benchmark smoke test
run: just bench-e2e-smoke
- name: Upload Cargo timings (clippy)
if: always()
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0

11
codex-rs/Cargo.lock generated
View File

@@ -2058,6 +2058,17 @@ dependencies = [
"uuid",
]
[[package]]
name = "codex-app-server-start-bench"
version = "0.0.0"
dependencies = [
"anyhow",
"codex-app-server-protocol",
"divan",
"serde_json",
"tempfile",
]
[[package]]
name = "codex-app-server-test-client"
version = "0.0.0"

View File

@@ -14,6 +14,7 @@ members = [
"app-server-client",
"app-server-protocol",
"app-server-test-client",
"benchmarks/app-server-start",
"debug-client",
"apply-patch",
"arg0",

View File

@@ -0,0 +1,20 @@
load("//:defs.bzl", "codex_rust_crate", "workspace_root_test")
# Bazel targets for the app-server startup benchmark crate, declared through
# the repository's `codex_rust_crate` macro loaded from //:defs.bzl.
# `crate_name` is the underscore form of the Cargo package name
# `codex-app-server-start-bench`.
codex_rust_crate(
name = "app-server-start-bench",
crate_name = "codex_app_server_start_bench",
)
# Runs the benchmark binary as a test target. It is tagged `manual`, so Bazel
# wildcard patterns such as `//...` skip it and it must be requested by name.
# NOTE(review): `:codex-app-server-start-bench` is presumably a binary target
# generated by the `codex_rust_crate` call in this package -- confirm against
# //:defs.bzl.
workspace_root_test(
name = "startup-bench",
# Include the Bazel-built CLI in this target's runfiles and expose its
# resolved executable path through the environment consumed by the runner.
runfile_env = {
"//codex-rs/cli:codex": "CODEX_BIN",
},
size = "large",
tags = ["manual"],
test_bin = ":codex-app-server-start-bench",
timeout = "long",
workspace_root_marker = "//codex-rs/utils/cargo-bin:repo_root.marker",
)

View File

@@ -0,0 +1,16 @@
# End-to-end benchmark runner that measures `codex app-server` startup.
# Version, edition, and license are inherited from the workspace manifest.
[package]
name = "codex-app-server-start-bench"
version.workspace = true
edition.workspace = true
license.workspace = true
# Internal tooling only; never publish this crate to a registry.
publish = false
# Use the lint configuration defined by the workspace.
[lints]
workspace = true
[dependencies]
# Error propagation with context in the runner.
anyhow = { workspace = true }
# JSON-RPC message and `initialize` request/response types.
codex-app-server-protocol = { workspace = true }
# Encodes requests and decodes responses exchanged over the child's stdio.
serde_json = { workspace = true }
# Benchmark harness (`divan::main`, `#[divan::bench]`).
divan = { workspace = true }
# Temporary directory used as the benchmark's CODEX_HOME.
tempfile = { workspace = true }

View File

@@ -0,0 +1,174 @@
use std::io::BufRead;
use std::io::BufReader;
use std::io::Write;
use std::path::Path;
use std::path::PathBuf;
use std::process::Child;
use std::process::ChildStdin;
use std::process::ChildStdout;
use std::process::Command;
use std::process::Stdio;
use anyhow::Context;
use anyhow::Result;
use anyhow::bail;
use codex_app_server_protocol::ClientInfo;
use codex_app_server_protocol::ClientRequest;
use codex_app_server_protocol::InitializeCapabilities;
use codex_app_server_protocol::InitializeParams;
use codex_app_server_protocol::InitializeResponse;
use codex_app_server_protocol::JSONRPCMessage;
use codex_app_server_protocol::JSONRPCNotification;
use codex_app_server_protocol::JSONRPCResponse;
use codex_app_server_protocol::RequestId;
use divan::Bencher;
/// Binary entry point: delegates to `divan::main()`, which (per Divan's
/// documentation) runs the functions registered with `#[divan::bench]`.
fn main() {
divan::main();
}
// Benchmark: time from spawning `codex app-server` until it answers `initialize`.
//
// * 30 samples keep the total run practical, because every sample starts a
//   whole process.
// * A sample size of 1 means each measured server is reaped before the next
//   one is started.
// * One CODEX_HOME is warmed up before timing so the numbers reflect a normal
//   restart rather than a first launch.
// * The Codex binary under test is built separately and handed to this e2e
//   runner through `CODEX_BIN` by just or Bazel.
#[divan::bench(sample_count = 30, sample_size = 1, skip_ext_time)]
#[allow(clippy::expect_used)]
fn initialize_response(bencher: Bencher) {
    let codex_bin = PathBuf::from(
        std::env::var_os("CODEX_BIN")
            .expect("CODEX_BIN must point to the codex binary; run via `just bench-e2e` or Bazel"),
    );
    let home_dir = tempfile::tempdir().expect("benchmark CODEX_HOME should be created");
    let home = home_dir.path();

    // Untimed warm-up launch; dropping the server right away shuts it down
    // before any measured sample begins.
    let warmup = start_until_initialize_response(&codex_bin, home)
        .expect("benchmark CODEX_HOME should be initialized");
    drop(warmup);

    // The started server is returned from the closure so that Divan, not the
    // measured code, is responsible for dropping it.
    bencher.bench_local(|| {
        start_until_initialize_response(&codex_bin, home)
            .expect("codex app-server should return an initialize response")
    });
}
/// A running app-server that has returned a valid `initialize` response.
///
/// Divan drops benchmark outputs after it stops each timer interval. Returning
/// this value from the measured closure keeps the `initialized` notification
/// and forced process reaping outside startup latency.
///
/// `start_until_initialize_response` also builds this value right after the
/// process is spawned, before the handshake completes, so that an early error
/// return still runs `Drop` and reaps the child.
struct InitializedAppServer {
/// Handle to the spawned app-server process; killed and waited on in `Drop`.
child: Child,
/// Pipe to the child's stdin. `None` until it has been taken from `child`,
/// and again once `Drop` has released it.
stdin: Option<ChildStdin>,
/// Buffered reader over the child's stdout. `None` until it has been taken
/// from `child`.
stdout: Option<BufReader<ChildStdout>>,
/// Set once a valid `initialize` response was received; when true, `Drop`
/// sends the `initialized` notification before killing the process.
acknowledge_on_drop: bool,
}
/// Spawn a stdio app-server and return once it responds successfully to `initialize`.
///
/// `codex_bin` is the executable to launch as `<codex_bin> app-server --listen stdio://`,
/// and `codex_home` is exported to the child as `CODEX_HOME`. The child's stderr is
/// discarded.
///
/// # Errors
///
/// Fails if the process cannot be spawned, if its stdio pipes are unavailable, if the
/// request cannot be serialized or written, if stdout cannot be read or closes before
/// a response arrives, if a line is not a valid JSON-RPC message, if the server answers
/// the request with an error, or if the response payload is not an `InitializeResponse`.
/// Every failure carries context naming the step that failed. On any error the child is
/// killed and reaped, because the partially built `InitializedAppServer` is dropped.
fn start_until_initialize_response(
    codex_bin: &Path,
    codex_home: &Path,
) -> Result<InitializedAppServer> {
    let request_id = RequestId::Integer(0);
    let child = Command::new(codex_bin)
        .arg("app-server")
        .arg("--listen")
        .arg("stdio://")
        .env("CODEX_HOME", codex_home)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::null())
        .spawn()
        .with_context(|| format!("failed to spawn `{}` app-server", codex_bin.display()))?;

    // Hand the child to the guard immediately: from here on, every early return
    // drops `server`, whose `Drop` impl kills and waits on the process.
    let mut server = InitializedAppServer {
        child,
        stdin: None,
        stdout: None,
        acknowledge_on_drop: false,
    };
    server.stdin = Some(
        server
            .child
            .stdin
            .take()
            .context("app-server stdin unavailable")?,
    );
    server.stdout = Some(BufReader::new(
        server
            .child
            .stdout
            .take()
            .context("app-server stdout unavailable")?,
    ));

    let request = ClientRequest::Initialize {
        request_id: request_id.clone(),
        params: InitializeParams {
            client_info: ClientInfo {
                name: "codex-app-server-start-bench".to_string(),
                title: Some("Codex App Server Start Benchmark".to_string()),
                version: env!("CARGO_PKG_VERSION").to_string(),
            },
            capabilities: Some(InitializeCapabilities {
                experimental_api: false,
                request_attestation: false,
                opt_out_notification_methods: None,
            }),
        },
    };
    let payload =
        serde_json::to_string(&request).context("failed to serialize initialize request")?;

    // Messages are newline-delimited JSON; flush so the server sees the request now.
    let stdin = server
        .stdin
        .as_mut()
        .context("app-server stdin unavailable")?;
    writeln!(stdin, "{payload}").context("failed to write initialize request")?;
    stdin
        .flush()
        .context("failed to flush initialize request")?;

    // One buffer is reused for every line read from the server.
    let mut line = String::new();
    loop {
        line.clear();
        let bytes_read = server
            .stdout
            .as_mut()
            .context("app-server stdout unavailable")?
            .read_line(&mut line)
            .context("failed to read from app-server stdout")?;
        if bytes_read == 0 {
            bail!("app-server closed stdout before returning initialize response");
        }

        let message = serde_json::from_str::<JSONRPCMessage>(line.trim()).with_context(|| {
            format!(
                "app-server sent a line that is not a JSON-RPC message: {}",
                line.trim()
            )
        })?;
        match message {
            JSONRPCMessage::Response(JSONRPCResponse { id, result }) if id == request_id => {
                // Decode only to validate the payload shape; the value itself is unused.
                let _: InitializeResponse = serde_json::from_value(result)
                    .context("initialize response missing expected payload")?;
                server.acknowledge_on_drop = true;
                return Ok(server);
            }
            JSONRPCMessage::Error(error) if error.id == request_id => {
                bail!("initialize failed: {error:?}");
            }
            // Anything not addressed to our request id is ignored while waiting.
            JSONRPCMessage::Request(_)
            | JSONRPCMessage::Response(_)
            | JSONRPCMessage::Notification(_)
            | JSONRPCMessage::Error(_) => {}
        }
    }
}
impl Drop for InitializedAppServer {
    /// Best-effort shutdown: optionally acknowledge initialization, then close
    /// stdin, kill the child, and wait for it so no process is left behind.
    /// Every failure along the way is deliberately ignored.
    fn drop(&mut self) {
        let acknowledge = self.acknowledge_on_drop;
        // The notification is only sent after a successful handshake, and only
        // while the stdin pipe is still held.
        if let Some(writer) = self.stdin.as_mut().filter(|_| acknowledge) {
            let notification = JSONRPCMessage::Notification(JSONRPCNotification {
                method: "initialized".to_string(),
                params: None,
            });
            if let Ok(encoded) = serde_json::to_string(&notification) {
                let _ = writeln!(writer, "{encoded}");
                let _ = writer.flush();
            }
        }

        // Release our end of the stdin pipe before terminating the process.
        drop(self.stdin.take());
        let _ = self.child.kill();
        let _ = self.child.wait();
    }
}

View File

@@ -2,6 +2,9 @@ set working-directory := "codex-rs"
set positional-arguments
rust_min_stack := "8388608" # 8 MiB
e2e_benchmark_packages := "codex-app-server-start-bench"
e2e_codex_bin := if os() == "windows" { "./target/release/codex.exe" } else { "./target/release/codex" }
e2e_smoke_codex_bin := if os() == "windows" { "./target/debug/codex.exe" } else { "./target/debug/codex" }
# Display help
help:
@@ -30,6 +33,11 @@ app-server-test-client *args:
cargo build -p codex-cli
cargo run -p codex-app-server-test-client -- --codex-bin ./target/debug/codex "$@"
# Run end-to-end performance benchmarks that require a built Codex binary.
bench-e2e *args:
cargo build --release -p codex-cli --bin codex
for package in {{ e2e_benchmark_packages }}; do CODEX_BIN="{{ e2e_codex_bin }}" cargo run --release -p "$package" -- --bench "$@"; done
# Format Rust and Python SDK code.
fmt:
cargo fmt -- --config imports_granularity=Item 2>/dev/null
@@ -55,13 +63,19 @@ test *args:
RUST_MIN_STACK={{ rust_min_stack }} cargo nextest run --no-fail-fast "$@"
just bench-smoke
# Run explicit workspace benchmark targets.
bench *args:
# Run unit-test-style benchmark targets managed entirely by Cargo.
bench-unit *args:
cargo bench --workspace --bench '*' "$@"
# Run benchmark targets once to ensure they start successfully.
# Smoke Cargo-managed benchmarks and compile e2e runners without measured binaries.
bench-smoke:
just bench -- --test
just bench-unit -- --test
for package in {{ e2e_benchmark_packages }}; do cargo build -p "$package" --bin "$package"; done
# Run end-to-end performance benchmark targets once.
bench-e2e-smoke:
cargo build -p codex-cli --bin codex
for package in {{ e2e_benchmark_packages }}; do CODEX_BIN="{{ e2e_smoke_codex_bin }}" cargo run -p "$package" -- --test; done
# Build and run Codex from source using Bazel.
# Note we have to use the combination of `[no-cd]` and `--run_under="cd $PWD &&"`