Add app-server startup benchmark crate (#24651)

## Summary

- Add a new `app-server-start-bench` crate to measure app-server startup performance.
- Wire the benchmark into the workspace and Bazel build so it can be run consistently.
- Update lockfiles and repo automation to account for the new package.
2026-05-29 15:30:22 +00:00 · 2026-05-28 08:46:30 -07:00
parent a4ed6c5aa0
commit bd2a732923
7 changed files with 247 additions and 4 deletions
--- a/.github/workflows/rust-ci-full.yml
+++ b/.github/workflows/rust-ci-full.yml
@@ -466,6 +466,13 @@ jobs:
      - name: cargo clippy
        run: cargo clippy --target ${{ matrix.target }} --tests --profile ${{ matrix.profile }} --timings -- -D warnings

+      - uses: taiki-e/install-action@44c6d64aa62cd779e873306675c7a58e86d6d532 # v2.62.49
+        with:
+          tool: just
+
+      - name: End-to-end benchmark smoke test
+        run: just bench-e2e-smoke
+
      - name: Upload Cargo timings (clippy)
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -2058,6 +2058,17 @@ dependencies = [
 "uuid",
 ]

+[[package]]
+name = "codex-app-server-start-bench"
+version = "0.0.0"
+dependencies = [
+ "anyhow",
+ "codex-app-server-protocol",
+ "divan",
+ "serde_json",
+ "tempfile",
+]
+
 [[package]]
 name = "codex-app-server-test-client"
 version = "0.0.0"
--- a/codex-rs/Cargo.toml
+++ b/codex-rs/Cargo.toml
@@ -14,6 +14,7 @@ members = [
    "app-server-client",
    "app-server-protocol",
    "app-server-test-client",
+    "benchmarks/app-server-start",
    "debug-client",
    "apply-patch",
    "arg0",
--- a/codex-rs/benchmarks/app-server-start/BUILD.bazel
+++ b/codex-rs/benchmarks/app-server-start/BUILD.bazel
@@ -0,0 +1,20 @@
+load("//:defs.bzl", "codex_rust_crate", "workspace_root_test")
+
+codex_rust_crate(  # NOTE(review): presumably derives this crate's Rust targets from its Cargo layout; confirm in //:defs.bzl
+    name = "app-server-start-bench",
+    crate_name = "codex_app_server_start_bench",  # Underscored form of the Cargo package name.
+)
+
+workspace_root_test(
+    name = "startup-bench",
+    # Add the Bazel-built codex CLI to this target's runfiles and pass its resolved
+    # executable path to the benchmark as CODEX_BIN, which src/main.rs reads at startup.
+    runfile_env = {
+        "//codex-rs/cli:codex": "CODEX_BIN",
+    },
+    size = "large",
+    tags = ["manual"],  # Skipped by wildcard target patterns; must be requested explicitly.
+    test_bin = ":codex-app-server-start-bench",  # NOTE(review): presumably the binary target produced by codex_rust_crate above; confirm.
+    timeout = "long",
+    workspace_root_marker = "//codex-rs/utils/cargo-bin:repo_root.marker",
+)
--- a/codex-rs/benchmarks/app-server-start/Cargo.toml
+++ b/codex-rs/benchmarks/app-server-start/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "codex-app-server-start-bench"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+publish = false
+
+[lints]
+workspace = true
+
+[dependencies]
+anyhow = { workspace = true }
+codex-app-server-protocol = { workspace = true }
+serde_json = { workspace = true }
+divan = { workspace = true }
+tempfile = { workspace = true }
--- a/codex-rs/benchmarks/app-server-start/src/main.rs
+++ b/codex-rs/benchmarks/app-server-start/src/main.rs
@@ -0,0 +1,174 @@
+use std::io::BufRead;
+use std::io::BufReader;
+use std::io::Write;
+use std::path::Path;
+use std::path::PathBuf;
+use std::process::Child;
+use std::process::ChildStdin;
+use std::process::ChildStdout;
+use std::process::Command;
+use std::process::Stdio;
+
+use anyhow::Context;
+use anyhow::Result;
+use anyhow::bail;
+use codex_app_server_protocol::ClientInfo;
+use codex_app_server_protocol::ClientRequest;
+use codex_app_server_protocol::InitializeCapabilities;
+use codex_app_server_protocol::InitializeParams;
+use codex_app_server_protocol::InitializeResponse;
+use codex_app_server_protocol::JSONRPCMessage;
+use codex_app_server_protocol::JSONRPCNotification;
+use codex_app_server_protocol::JSONRPCResponse;
+use codex_app_server_protocol::RequestId;
+use divan::Bencher;
+
+fn main() {
+    divan::main(); // Divan runs every `#[divan::bench]` function in this binary.
+}
+
+// Measures app-server startup: spawn `$CODEX_BIN app-server` up to a valid `initialize` reply.
+// Process spawns are slow, so 30 samples of size 1 keep runs practical and let each measured
+// server be reaped before the next starts. One untimed run warms the shared temporary
+// CODEX_HOME so samples reflect normal restarts; `just` or Bazel supplies the prebuilt binary.
+#[divan::bench(sample_count = 30, sample_size = 1, skip_ext_time)]
+#[allow(clippy::expect_used)] // `expect` is deliberate: a failed setup or sample aborts the run.
+fn initialize_response(bencher: Bencher) {
+    let codex_bin = std::env::var_os("CODEX_BIN")
+        .map(PathBuf::from)
+        .expect("CODEX_BIN must point to the codex binary; run via `just bench-e2e` or Bazel");
+    let codex_home = tempfile::tempdir().expect("benchmark CODEX_HOME should be created");
+    drop( // Untimed warm-up run; dropping the handle kills and reaps that server.
+        start_until_initialize_response(&codex_bin, codex_home.path())
+            .expect("benchmark CODEX_HOME should be initialized"),
+    );
+
+    bencher.bench_local(|| { // Timed region; the returned server is the benchmark output.
+        start_until_initialize_response(&codex_bin, codex_home.path())
+            .expect("codex app-server should return an initialize response")
+    });
+}
+
+/// A spawned app-server child process that is killed and reaped when dropped.
+///
+/// Callers only receive this value after a valid `initialize` response. Divan drops benchmark
+/// outputs after it stops each timer interval, so returning it from the measured closure keeps
+/// the `initialized` notification and forced process reaping outside startup latency.
+struct InitializedAppServer {
+    child: Child, // Process handle that `Drop` kills and waits on.
+    stdin: Option<ChildStdin>, // Request pipe; `None` until taken from `child`.
+    stdout: Option<BufReader<ChildStdout>>, // Reply pipe, read one line at a time.
+    acknowledge_on_drop: bool, // Send `initialized` on drop; set once `initialize` succeeds.
+}
+
+/// Spawn a stdio app-server under `codex_home` and return once its `initialize` reply validates.
+fn start_until_initialize_response(
+    codex_bin: &Path, // Executable to launch as `<codex_bin> app-server --listen stdio://`.
+    codex_home: &Path, // Directory exported to the child as `CODEX_HOME`.
+) -> Result<InitializedAppServer> { // Err on spawn, I/O, JSON parse, EOF, or a JSON-RPC error.
+    let request_id = RequestId::Integer(0); // Id used to recognise the matching reply.
+    let child = Command::new(codex_bin)
+        .arg("app-server")
+        .arg("--listen")
+        .arg("stdio://")
+        .env("CODEX_HOME", codex_home)
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::null()) // Server stderr output is discarded.
+        .spawn()
+        .with_context(|| format!("failed to spawn `{}` app-server", codex_bin.display()))?;
+    let mut server = InitializedAppServer { // Wrap early so any error return below kills the child via `Drop`.
+        child,
+        stdin: None,
+        stdout: None,
+        acknowledge_on_drop: false,
+    };
+    server.stdin = Some(
+        server
+            .child
+            .stdin
+            .take()
+            .context("app-server stdin unavailable")?,
+    );
+    server.stdout = Some(BufReader::new(
+        server
+            .child
+            .stdout
+            .take()
+            .context("app-server stdout unavailable")?,
+    ));
+
+    let request = ClientRequest::Initialize {
+        request_id: request_id.clone(),
+        params: InitializeParams {
+            client_info: ClientInfo {
+                name: "codex-app-server-start-bench".to_string(),
+                title: Some("Codex App Server Start Benchmark".to_string()),
+                version: env!("CARGO_PKG_VERSION").to_string(),
+            },
+            capabilities: Some(InitializeCapabilities {
+                experimental_api: false,
+                request_attestation: false,
+                opt_out_notification_methods: None,
+            }),
+        },
+    };
+    let stdin = server
+        .stdin
+        .as_mut()
+        .context("app-server stdin unavailable")?;
+    writeln!(stdin, "{}", serde_json::to_string(&request)?)?; // One JSON message per line.
+    stdin
+        .flush()
+        .context("failed to flush initialize request")?;
+
+    let mut line = String::new(); // Reused buffer for each stdout line.
+    loop { // Read stdout lines until the matching response or error arrives.
+        line.clear();
+        if server
+            .stdout
+            .as_mut()
+            .context("app-server stdout unavailable")?
+            .read_line(&mut line)?
+            == 0 // Zero bytes read means the server closed stdout.
+        {
+            bail!("app-server closed stdout before returning initialize response");
+        }
+
+        match serde_json::from_str::<JSONRPCMessage>(line.trim())? { // A non-JSON line is an error.
+            JSONRPCMessage::Response(JSONRPCResponse { id, result }) if id == request_id => {
+                let _: InitializeResponse = serde_json::from_value(result) // Only checks that the payload deserializes.
+                    .context("initialize response missing expected payload")?;
+                server.acknowledge_on_drop = true;
+                return Ok(server);
+            }
+            JSONRPCMessage::Error(error) if error.id == request_id => {
+                bail!("initialize failed: {error:?}");
+            }
+            JSONRPCMessage::Request(_)
+            | JSONRPCMessage::Response(_)
+            | JSONRPCMessage::Notification(_)
+            | JSONRPCMessage::Error(_) => {} // Unrelated traffic: ignore and keep reading.
+        }
+    }
+}
+
+impl Drop for InitializedAppServer { // Best-effort shutdown: acknowledge, close stdin, kill, reap.
+    fn drop(&mut self) {
+        if self.acknowledge_on_drop // Only acknowledge servers that completed `initialize`.
+            && let Some(stdin) = self.stdin.as_mut()
+        {
+            let initialized = JSONRPCMessage::Notification(JSONRPCNotification {
+                method: "initialized".to_string(),
+                params: None,
+            });
+            if let Ok(payload) = serde_json::to_string(&initialized) {
+                let _ = writeln!(stdin, "{payload}"); // Write errors are ignored; the child is killed below.
+                let _ = stdin.flush();
+            }
+        }
+        let _ = self.stdin.take(); // Dropping the handle closes the child's stdin.
+        let _ = self.child.kill(); // Force termination; errors are ignored.
+        let _ = self.child.wait(); // Reap the process before the next sample starts.
+    }
+}
--- a/justfile
+++ b/justfile
@@ -2,6 +2,9 @@ set working-directory := "codex-rs"
 set positional-arguments

 rust_min_stack := "8388608" # 8 MiB
+e2e_benchmark_packages := "codex-app-server-start-bench"
+e2e_codex_bin := if os() == "windows" { "./target/release/codex.exe" } else { "./target/release/codex" }
+e2e_smoke_codex_bin := if os() == "windows" { "./target/debug/codex.exe" } else { "./target/debug/codex" }

 # Display help
 help:
@@ -30,6 +33,11 @@ app-server-test-client *args:
    cargo build -p codex-cli
    cargo run -p codex-app-server-test-client -- --codex-bin ./target/debug/codex "$@"

+# Run end-to-end performance benchmarks that require a built Codex binary.
+bench-e2e *args:
+    cargo build --release -p codex-cli --bin codex
+    for package in {{ e2e_benchmark_packages }}; do CODEX_BIN="{{ e2e_codex_bin }}" cargo run --release -p "$package" -- --bench "$@"; done
+
 # Format Rust and Python SDK code.
 fmt:
    cargo fmt -- --config imports_granularity=Item 2>/dev/null
@@ -55,13 +63,19 @@ test *args:
    RUST_MIN_STACK={{ rust_min_stack }} cargo nextest run --no-fail-fast "$@"
    just bench-smoke

-# Run explicit workspace benchmark targets.
-bench *args:
+# Run unit-test-style benchmark targets managed entirely by Cargo.
+bench-unit *args:
    cargo bench --workspace --bench '*' "$@"

-# Run benchmark targets once to ensure they start successfully.
+# Smoke Cargo-managed benchmarks and compile e2e runners without measured binaries.
 bench-smoke:
-    just bench -- --test
+    just bench-unit -- --test
+    for package in {{ e2e_benchmark_packages }}; do cargo build -p "$package" --bin "$package"; done
+
+# Run end-to-end performance benchmark targets once.
+bench-e2e-smoke:
+    cargo build -p codex-cli --bin codex
+    for package in {{ e2e_benchmark_packages }}; do CODEX_BIN="{{ e2e_smoke_codex_bin }}" cargo run -p "$package" -- --test; done

 # Build and run Codex from source using Bazel.
 # Note we have to use the combination of `[no-cd]` and `--run_under="cd $PWD &&"`