mirror of
https://github.com/openai/codex.git
synced 2026-05-01 01:47:18 +00:00
Hacked it so that we can run benchmarks measuring the impact of parallel tool calls.
Note that you have to first build a release binary and reference that with --codex-bin for this to work.
This commit is contained in:
526
codex-rs/scripts/codex_parallel_benchmark.py
Normal file
526
codex-rs/scripts/codex_parallel_benchmark.py
Normal file
@@ -0,0 +1,526 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Benchmark codex exec runs with and without parallel tool calls."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import shlex
|
||||
import shutil
|
||||
import statistics
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Sequence
|
||||
|
||||
|
||||
# Model slug used by both modes unless overridden via --model/--parallel-model/--serial-model.
DEFAULT_MODEL = "gpt-5-codex"
|
||||
|
||||
|
||||
@dataclass
class ModeConfig:
    """Configuration for one benchmark mode (parallel or serial tool calls)."""

    # Mode name used for output directories, progress lines, and summary keys.
    label: str
    # Model slug passed to `codex exec --model`.
    model: str
    # Extra CLI arguments appended to the codex exec command.
    extra_args: tuple[str, ...]
    # (KEY, VALUE) environment overrides applied to the subprocess environment.
    env_pairs: tuple[tuple[str, str], ...]
    # Value for `--parallel-tool-calls` ("on"/"off"); None omits the flag entirely.
    parallel_flag: str | None
    # False when the mode was disabled via --skip-parallel / --skip-serial.
    enabled: bool = True
|
||||
|
||||
|
||||
@dataclass
class RunResult:
    """Outcome of a single codex exec invocation."""

    # 1-based run number within its mode.
    index: int
    # Wall-clock duration in seconds (NaN for dry runs).
    duration_s: float
    # Subprocess exit status (0 for dry runs).
    returncode: int
    # Files where captured stdout/stderr and run metadata were written.
    stdout_path: Path
    stderr_path: Path
    metadata_path: Path
|
||||
|
||||
|
||||
@dataclass
class ModeResult:
    """All runs collected for one benchmark mode."""

    # The mode configuration these runs were executed under.
    config: ModeConfig
    # Directory holding the per-run output subdirectories for this mode.
    outputs_dir: Path
    # Per-run results, sorted by run index by the caller.
    runs: list[RunResult]

    @property
    def durations(self) -> list[float]:
        """Wall-clock durations in seconds, one per run, in list order."""
        return [run.duration_s for run in self.runs]
|
||||
|
||||
|
||||
class BenchmarkError(RuntimeError):
    """Raised for user-facing benchmark setup or configuration failures."""
|
||||
|
||||
|
||||
@dataclass
class ProgressTracker:
    """Thread-safe counter that prints overall benchmark progress."""

    total_runs: int
    completed: int = 0
    lock: threading.Lock = field(default_factory=threading.Lock, repr=False)

    def advance(self, mode_label: str, run_index: int) -> None:
        """Record one finished run and emit an updated progress line."""
        with self.lock:
            self.completed += 1
            # Guard against division by zero when no runs are scheduled.
            if self.total_runs:
                percentage = (self.completed / self.total_runs) * 100
            else:
                percentage = 100.0
            message = (
                f"[{self.completed:>3}/{self.total_runs:<3} | {percentage:5.1f}%] "
                f"mode={mode_label} run={run_index:03d}"
            )
            print(message, flush=True)
|
||||
|
||||
|
||||
def parse_key_value_pairs(pairs: Iterable[str]) -> tuple[tuple[str, str], ...]:
    """Split each ``KEY=VALUE`` string into a (key, value) tuple.

    Only the first ``=`` separates key from value, so values may contain
    further ``=`` characters. Raises BenchmarkError for entries with no ``=``.
    """
    result: list[tuple[str, str]] = []
    for entry in pairs:
        key, separator, value = entry.partition("=")
        if not separator:
            raise BenchmarkError(f"Expected KEY=VALUE format, got: {entry}")
        result.append((key, value))
    return tuple(result)
|
||||
|
||||
|
||||
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse CLI arguments for the benchmark.

    ``argv`` is the full ``sys.argv``-style vector; element 0 (the program
    name) is dropped before parsing.
    """
    parser = argparse.ArgumentParser(
        description="Run codex exec repeatedly for parallel vs serial tool call models.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("prompt", help="Prompt passed to codex exec. Use quotes to preserve spaces.")
    parser.add_argument(
        "-n",
        "--runs",
        type=int,
        default=5,
        help="Number of executions per mode (parallel and serial).",
    )
    parser.add_argument(
        "--codex-bin",
        default="codex",
        help="Path to codex binary. If relative, resolved against the working directory.",
    )
    parser.add_argument(
        "--workdir",
        # Default: the repository root one level above this script's directory.
        default=str(Path(__file__).resolve().parents[1]),
        help="Working directory passed to codex exec commands.",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help="Model slug shared by both modes when explicit overrides are not provided.",
    )
    parser.add_argument(
        "--parallel-model",
        default=None,
        help="Model slug used only for parallel runs; defaults to --model when omitted.",
    )
    parser.add_argument(
        "--serial-model",
        default=None,
        help="Model slug used only for serial runs; defaults to --model when omitted.",
    )
    parser.add_argument(
        "--parallel-extra",
        default="",
        help="Additional CLI args passed only to parallel runs (quoted string).",
    )
    parser.add_argument(
        "--serial-extra",
        default="",
        help="Additional CLI args passed only to serial runs (quoted string).",
    )
    parser.add_argument(
        "--parallel-env",
        action="append",
        default=[],
        help="Environment overrides KEY=VALUE applied to parallel runs (repeatable).",
    )
    parser.add_argument(
        "--serial-env",
        action="append",
        default=[],
        help="Environment overrides KEY=VALUE applied to serial runs (repeatable).",
    )
    parser.add_argument(
        "--output-root",
        default=str(Path(tempfile.gettempdir()) / "codex_parallel_benchmark"),
        help="Directory under which experiment outputs and plots are stored.",
    )
    parser.add_argument(
        "--label",
        # Timestamp captured when the parser is built, giving each invocation
        # a unique run directory by default.
        default=datetime.now().strftime("%Y%m%d-%H%M%S"),
        help="Label used to create a unique run directory under output-root.",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Print summary JSON in addition to the human-readable report.",
    )
    parser.add_argument(
        "--skip-parallel",
        action="store_true",
        help="Skip runs flagged as parallel (only serial runs execute).",
    )
    parser.add_argument(
        "--skip-serial",
        action="store_true",
        help="Skip runs flagged as serial (only parallel runs execute).",
    )
    parser.add_argument(
        "--parallel-runs",
        action="store_true",
        help="Execute all codex exec runs concurrently instead of sequentially.",
    )
    parser.add_argument(
        "--max-workers",
        type=int,
        default=None,
        help="Maximum number of in-flight codex exec runs when --parallel-runs is set.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print the commands that would run without executing them.",
    )
    return parser.parse_args(argv[1:])
|
||||
|
||||
|
||||
def ensure_binary(path: str) -> str:
    """Resolve *path* to an absolute codex binary location.

    An existing file path is resolved directly; otherwise the name is looked
    up on PATH. Raises BenchmarkError when neither yields a binary.
    """
    as_path = Path(path)
    if as_path.is_file():
        return str(as_path.resolve())
    found = shutil.which(path)
    if found:
        return found
    raise BenchmarkError(f"Unable to locate codex binary: {path}")
|
||||
|
||||
|
||||
def expand_args(arg_string: str) -> tuple[str, ...]:
    """Tokenize a quoted CLI argument string; blank input yields ()."""
    stripped = arg_string.strip()
    return tuple(shlex.split(stripped)) if stripped else ()
|
||||
|
||||
|
||||
def build_mode_configs(args: argparse.Namespace) -> list[ModeConfig]:
    """Build the parallel/serial mode configurations, honoring skip flags.

    Both configurations are always constructed (so malformed --*-extra or
    --*-env values are reported even for a skipped mode); only the enabled
    ones are returned. Raises BenchmarkError when every mode is skipped.
    """
    # (label, model, extra-arg string, env pairs, parallel flag, enabled)
    specs = [
        (
            "parallel_on",
            args.parallel_model or args.model,
            args.parallel_extra,
            args.parallel_env,
            "on",
            not args.skip_parallel,
        ),
        (
            "parallel_off",
            args.serial_model or args.model,
            args.serial_extra,
            args.serial_env,
            "off",
            not args.skip_serial,
        ),
    ]
    all_modes = [
        ModeConfig(
            label=label,
            model=model,
            extra_args=expand_args(extra),
            env_pairs=parse_key_value_pairs(env),
            parallel_flag=flag,
            enabled=active,
        )
        for label, model, extra, env, flag, active in specs
    ]
    enabled_modes = [mode for mode in all_modes if mode.enabled]
    if not enabled_modes:
        raise BenchmarkError("All modes skipped; enable at least one mode to run the benchmark.")
    return enabled_modes
|
||||
|
||||
|
||||
def run_command(
    codex_bin: str,
    workdir: Path,
    prompt: str,
    mode: ModeConfig,
    run_index: int,
    output_dir: Path,
    dry_run: bool,
) -> RunResult:
    """Execute one codex exec run (or print it, for dry runs) and record artifacts.

    Creates ``output_dir/<mode.label>/run_NNN/`` containing stdout.txt,
    stderr.txt, and metadata.json, and returns the corresponding RunResult.
    """
    workdir.mkdir(parents=True, exist_ok=True)
    mode_dir = output_dir / mode.label
    run_dir = mode_dir / f"run_{run_index:03d}"
    run_dir.mkdir(parents=True, exist_ok=True)

    command = [codex_bin, "exec", "--model", mode.model]
    if mode.parallel_flag:
        command.extend(["--parallel-tool-calls", mode.parallel_flag])
    command.extend((*mode.extra_args, prompt))
    # Start from the parent environment, then layer on mode-specific overrides.
    env = os.environ.copy()
    for key, value in mode.env_pairs:
        env[key] = value

    stdout_path = run_dir / "stdout.txt"
    stderr_path = run_dir / "stderr.txt"
    metadata_path = run_dir / "metadata.json"

    start_dt = datetime.now()
    if dry_run:
        # Dry runs record placeholder values: NaN duration (excluded from
        # stats by compute_stats) and a zero exit code.
        duration_s = float("nan")
        returncode = 0
        stdout = ""
        stderr = ""
    else:
        start = time.perf_counter()
        result = subprocess.run(
            command,
            cwd=str(workdir),
            capture_output=True,
            text=True,
            env=env,
            check=False,
        )
        duration_s = time.perf_counter() - start
        returncode = result.returncode
        stdout = result.stdout
        stderr = result.stderr
    stdout_path.write_text(stdout)
    stderr_path.write_text(stderr)

    metadata = {
        "command": command,
        "env_overrides": {key: value for key, value in mode.env_pairs},
        "model": mode.model,
        "label": mode.label,
        "prompt": prompt,
        "run_index": run_index,
        "duration_seconds": duration_s,
        "returncode": returncode,
        "started_at": start_dt.isoformat(),
    }
    metadata_path.write_text(json.dumps(metadata, indent=2))

    if dry_run:
        command_str = " ".join(shlex.quote(element) for element in command)
        print(f"[DRY-RUN] {command_str}")

    return RunResult(
        index=run_index,
        duration_s=duration_s,
        returncode=returncode,
        stdout_path=stdout_path,
        stderr_path=stderr_path,
        metadata_path=metadata_path,
    )
|
||||
|
||||
|
||||
def execute_runs(
    *,
    codex_bin: str,
    workdir: Path,
    prompt: str,
    modes: Sequence[ModeConfig],
    runs_per_mode: int,
    output_dir: Path,
    dry_run: bool,
    progress: ProgressTracker,
    parallel_runs: bool,
    max_workers: int | None,
) -> list[ModeResult]:
    """Run every (mode, run-index) combination and collect ModeResults.

    With ``parallel_runs`` set, all runs across all modes are submitted to a
    thread pool (default worker count: one per run); otherwise runs execute
    sequentially, mode by mode. Progress is reported after each run either way.
    """
    if not modes:
        return []
    if parallel_runs:
        total_runs = runs_per_mode * len(modes)
        worker_count = max_workers or total_runs
        if worker_count < 1:
            raise BenchmarkError("max workers must be a positive integer")
        runs_by_mode: dict[str, list[RunResult]] = {mode.label: [] for mode in modes}
        future_to_mode: dict[Future[RunResult], tuple[ModeConfig, int]] = {}
        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            for mode in modes:
                for idx in range(1, runs_per_mode + 1):
                    future = executor.submit(
                        run_command,
                        codex_bin,
                        workdir,
                        prompt,
                        mode,
                        idx,
                        output_dir,
                        dry_run,
                    )
                    future_to_mode[future] = (mode, idx)
            # Consume results in completion order so progress updates promptly.
            for future in as_completed(future_to_mode):
                mode, _ = future_to_mode[future]
                result = future.result()
                runs_by_mode[mode.label].append(result)
                progress.advance(mode.label, result.index)
        mode_results: list[ModeResult] = []
        for mode in modes:
            # Completion order is nondeterministic; restore run-index order.
            runs = sorted(runs_by_mode[mode.label], key=lambda run: run.index)
            mode_results.append(ModeResult(config=mode, outputs_dir=output_dir / mode.label, runs=runs))
        return mode_results

    # Sequential path: one run at a time, in mode then index order.
    mode_results = []
    for mode in modes:
        runs: list[RunResult] = []
        for idx in range(1, runs_per_mode + 1):
            result = run_command(
                codex_bin=codex_bin,
                workdir=workdir,
                prompt=prompt,
                mode=mode,
                run_index=idx,
                output_dir=output_dir,
                dry_run=dry_run,
            )
            runs.append(result)
            progress.advance(mode.label, idx)
        mode_results.append(ModeResult(config=mode, outputs_dir=output_dir / mode.label, runs=runs))
    return mode_results
|
||||
|
||||
|
||||
def compute_stats(values: Sequence[float]) -> dict[str, float | int]:
    """Summarize finite durations: count, min, max, mean, median, and stdev.

    Non-finite entries (NaN from dry runs, infinities) are dropped first.
    Returns ``{"count": 0}`` when nothing finite remains; ``stdev`` appears
    only with two or more samples.
    """
    finite = [v for v in values if math.isfinite(v)]
    if not finite:
        return {"count": 0}
    summary: dict[str, float | int] = {
        "count": len(finite),
        "min": min(finite),
        "max": max(finite),
        "mean": statistics.mean(finite),
        "median": statistics.median(finite),
    }
    if len(finite) >= 2:
        summary["stdev"] = statistics.stdev(finite)
    return summary
|
||||
|
||||
|
||||
def summarize(mode_results: list[ModeResult]) -> dict[str, dict[str, float | int]]:
    """Map each mode label to the duration statistics of its runs."""
    return {
        result.config.label: compute_stats(result.durations)
        for result in mode_results
    }
|
||||
|
||||
|
||||
def write_summary(
    output_dir: Path,
    summary: dict[str, dict[str, float | int]],
    mode_results: list[ModeResult],
) -> Path:
    """Write ``summary.json`` describing stats and per-run artifacts.

    Returns the path of the written file.
    """
    per_mode_runs: dict[str, list[dict[str, object]]] = {}
    for result in mode_results:
        per_mode_runs[result.config.label] = [
            {
                "index": run.index,
                "duration_seconds": run.duration_s,
                "returncode": run.returncode,
                "stdout_path": str(run.stdout_path),
                "stderr_path": str(run.stderr_path),
            }
            for run in result.runs
        ]
    payload = {
        "output_dir": str(output_dir),
        "summary": summary,
        "runs": per_mode_runs,
    }
    destination = output_dir / "summary.json"
    destination.write_text(json.dumps(payload, indent=2))
    return destination
|
||||
|
||||
|
||||
def attempt_plot(output_dir: Path, mode_results: list[ModeResult]) -> Path | None:
    """Save a box plot of run durations; return its path, or None.

    Returns None when no finite durations exist (e.g. all dry runs) or when
    matplotlib is unavailable — plotting is strictly optional.
    """
    has_finite = any(
        math.isfinite(duration)
        for result in mode_results
        for duration in result.durations
    )
    if not has_finite:
        return None
    try:
        # Imported lazily so the benchmark works without matplotlib installed.
        import matplotlib

        # Headless backend: no display is assumed.
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
    except Exception as exc:  # pragma: no cover - plotting is optional
        print(f"[WARN] Unable to create plot ({exc}); continue without chart.")
        return None

    fig, ax = plt.subplots(figsize=(8, 4))
    labels = [result.config.label for result in mode_results]
    # Drop NaN/inf per mode so the boxplot only sees real measurements.
    data = [
        [value for value in result.durations if math.isfinite(value)]
        for result in mode_results
    ]
    ax.boxplot(data, labels=labels, showmeans=True)
    ax.set_ylabel("Duration (seconds)")
    ax.set_title("codex exec durations by mode")
    ax.grid(True, axis="y", linestyle="--", alpha=0.4)
    plot_path = output_dir / "duration_boxplot.png"
    fig.tight_layout()
    fig.savefig(plot_path)
    plt.close(fig)
    return plot_path
|
||||
|
||||
|
||||
def format_report(summary: dict[str, dict[str, float | int]], output_dir: Path, plot_path: Path | None) -> str:
    """Render the human-readable benchmark report as one newline-joined string."""
    report_lines = ["Benchmark summary:"]
    for label, stats in summary.items():
        report_lines.append(f"  {label}:")
        # Alphabetical key order keeps reports stable across runs.
        for key in sorted(stats):
            value = stats[key]
            rendered = f"{value:.4f}" if isinstance(value, float) else f"{value}"
            report_lines.append(f"    {key}: {rendered}")
    report_lines.append(f"Outputs stored in: {output_dir}")
    if plot_path:
        report_lines.append(f"Plot saved to: {plot_path}")
    return "\n".join(report_lines)
|
||||
|
||||
|
||||
def main(argv: Sequence[str]) -> int:
    """Entry point: run the benchmark and report results.

    Returns 0 on success, 1 when a BenchmarkError is raised (configuration
    or setup problems); the error is printed to stderr.
    """
    args = parse_args(argv)
    try:
        codex_bin = ensure_binary(args.codex_bin)
        workdir = Path(args.workdir).resolve()
        output_root = Path(args.output_root).resolve()
        # Each invocation gets its own directory keyed by --label.
        run_dir = output_root / args.label
        run_dir.mkdir(parents=True, exist_ok=True)
        modes = build_mode_configs(args)

        total_runs = len(modes) * args.runs
        # Validate eagerly so the error surfaces before any runs start.
        if args.max_workers is not None and args.max_workers < 1:
            raise BenchmarkError("--max-workers must be a positive integer")
        progress = ProgressTracker(total_runs=total_runs)
        parallel_runs = args.parallel_runs
        mode_results = execute_runs(
            codex_bin=codex_bin,
            workdir=workdir,
            prompt=args.prompt,
            modes=modes,
            runs_per_mode=args.runs,
            output_dir=run_dir,
            dry_run=args.dry_run,
            progress=progress,
            parallel_runs=parallel_runs,
            max_workers=args.max_workers,
        )

        summary = summarize(mode_results)
        summary_path = write_summary(run_dir, summary, mode_results)
        plot_path = attempt_plot(run_dir, mode_results)
        report = format_report(summary, run_dir, plot_path)
        print(report)
        if args.json:
            # Machine-readable mirror of the report for downstream tooling.
            payload = {
                "summary": summary,
                "output_dir": str(run_dir),
                "plot_path": str(plot_path) if plot_path else None,
                "summary_path": str(summary_path),
            }
            print(json.dumps(payload, indent=2))
        return 0
    except BenchmarkError as error:
        print(f"[ERROR] {error}", file=sys.stderr)
        return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return code to the shell as the process exit status.
    raise SystemExit(main(sys.argv))
|
||||
Reference in New Issue
Block a user