# Mirror of https://github.com/openai/codex.git (synced 2026-04-25 15:15:15 +00:00)
# 534 lines, 20 KiB, Python, executable file.
#!/usr/bin/env python3
|
|
import argparse
|
|
import concurrent.futures
|
|
import json
|
|
import os
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import threading
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
|
|
def _run(cmd: List[str], input_text: Optional[str] = None) -> Tuple[int, str, str]:
|
|
proc = subprocess.run(
|
|
cmd,
|
|
input=input_text,
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.PIPE,
|
|
text=True,
|
|
check=False,
|
|
)
|
|
return proc.returncode, proc.stdout, proc.stderr
|
|
|
|
|
|
def require(cmd: str, hint: str):
    """Abort with exit code 1 unless *cmd* is available on the PATH."""
    if shutil.which(cmd) is not None:
        return
    print(f"Error: required command '{cmd}' not found. {hint}", file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
def detect_repo_root() -> Optional[str]:
    """Return the git working-tree toplevel, or None when git fails here."""
    code, out, _ = _run(["git", "rev-parse", "--show-toplevel"])
    return out.strip() if code == 0 else None
|
|
|
|
|
|
def get_current_branch() -> str:
    """Name of the checked-out branch; falls back to "HEAD" on failure."""
    code, out, _ = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    if code != 0:
        return "HEAD"
    return out.strip()
|
|
|
|
|
|
def resolve_base_ref() -> str:
    """Return the first base ref that resolves, preferring remote-tracking ones.

    Tries origin/main, then upstream/main, then local main; defaults to
    "main" when none of them verify.
    """
    candidates = ("origin/main", "upstream/main", "main")
    for candidate in candidates:
        rc, _, _ = _run(["git", "rev-parse", "--verify", candidate])
        if rc == 0:
            return candidate
    return "main"
|
|
|
|
|
|
def get_diff_text(base_ref: str, head_ref: str) -> str:
    """Return the unified diff of branch-side changes; exit(2) on git failure.

    The three-dot form diffs against the merge base, so only commits made on
    the branch appear; --no-color keeps the output machine-parseable.
    """
    code, out, err = _run(["git", "diff", "--no-color", f"{base_ref}...{head_ref}"])
    if code == 0:
        return out
    print(f"Error: failed to compute git diff: {err.strip()}", file=sys.stderr)
    sys.exit(2)
|
|
|
|
|
|
def get_changed_files_count(base_ref: str, head_ref: str) -> int:
    """Count files touched between the merge base and *head_ref* (0 on error)."""
    code, out, _ = _run(["git", "diff", "--name-only", "--no-color", f"{base_ref}...{head_ref}"])
    if code != 0:
        return 0
    names = [line for line in out.splitlines() if line.strip()]
    return len(names)
|
|
|
|
|
|
# Approximate token estimation: ~4 characters per token heuristic.
# Diffs whose estimate exceeds this budget are rejected before any model calls.
MAX_DIFF_TOKENS = 50_000
|
|
|
|
|
|
def estimate_tokens_approx(text: str) -> int:
    """Conservative token estimate for *text*.

    Takes the larger of ceil(len/4) (the ~4-chars-per-token heuristic) and
    the whitespace-separated word count.
    """
    char_based = -(-len(text) // 4)  # ceiling division
    word_based = len(text.split())
    return max(char_based, word_based)
|
|
|
|
|
|
def study_files_in_dir(base: str) -> List[str]:
    """Collect PR-<n>-study.md paths under *base*, sorted by PR number.

    Returns an empty list when *base* is not a directory.
    """
    if not os.path.isdir(base):
        return []
    matches = [
        os.path.join(base, entry)
        for entry in os.listdir(base)
        if re.match(r"PR-\d+-study\.md$", entry)
    ]

    def _pr_number(path: str) -> int:
        # Deterministic ordering: sort on the numeric PR id in the filename.
        found = re.search(r"(\d+)", os.path.basename(path))
        return int(found.group(1)) if found else 0

    return sorted(matches, key=_pr_number)
|
|
|
|
|
|
def build_prompt(studyguide: str, diff_text: str, branch: str, base_ref: str) -> str:
    """Assemble the review prompt sent to Codex for one study guide."""
    parts = [
        "You are a senior code reviewer. Evaluate the current branch diff against a study guide.\n\n",
        f"Branch: {branch}\nBase: {base_ref}\n\n",
        "STUDYGUIDE (Markdown):\n",
        studyguide,
        "\n\n",
        "DIFF (unified):\n```diff\n",
        diff_text,
        "\n```\n\n",
        "Task: Determine whether this diff adheres to the DOs and DON'Ts from the studyguide.\n",
        "- The studyguide might be irrelevant to this diff; mark that clearly.\n",
        "- If relevant and the diff violates items, list each failing point.\n",
        "- If everything passes, return a single green check.\n\n",
        "Output: Respond with EXACTLY one JSON object as RAW JSON (no Markdown, no backticks). Nothing else.\n",
        "Schema: {\n \"relevant\": boolean,\n \"passes\": boolean,\n \"failures\": [\n { \"issue\": string, \"file\": string, \"line\": number, \"excerpt\": string }\n ] // empty if passes or irrelevant\n}\n",
        "Rules:\n- Use true/false for booleans.\n- Provide best-effort file and 1-based line number from the diff; if unknown, use an empty string for file and -1 for line.\n- excerpt should be a short single-line or trimmed code snippet near the line.\n- When irrelevant, set relevant=false and passes=true and failures=[].\n- Do not wrap output in code fences.\n",
    ]
    return "".join(parts)
|
|
|
|
|
|
def run_codex_exec(prompt: str, last_message_file: Optional[str] = None) -> Tuple[int, str, str]:
    """Invoke `codex exec` with *prompt* on stdin; returns (code, stdout, stderr).

    Prefers a globally installed `codex` binary; otherwise falls back to
    building and running it through `cargo run`.
    """
    if shutil.which("codex") is not None:
        cmd = ["codex", "-c", "model_reasoning_effort=high", "exec"]
    else:
        cmd = [
            "cargo",
            "run",
            "--quiet",
            "--bin",
            "codex",
            "--",
            "-c",
            "model_reasoning_effort=high",
            "exec",
        ]
    if last_message_file:
        cmd += ["--output-last-message", last_message_file]
    return _run(cmd, input_text=prompt)
|
|
|
|
|
|
def parse_json_from_text(text: str) -> Optional[Dict]:
    """Extract a JSON object from *text*.

    Accepts either a raw JSON object or one inside a ```json fenced block;
    returns the decoded dict, or None when nothing parseable is found.
    """
    stripped = text.strip()
    # First choice: the whole payload is a raw JSON object.
    if stripped.startswith("{") and stripped.endswith("}"):
        try:
            return json.loads(stripped)
        except Exception:
            pass
    # Second choice: a fenced ```json block embedded somewhere in the text.
    fenced = re.search(r"```json\s*(\{[\s\S]*?\})\s*```", stripped, re.IGNORECASE)
    if not fenced:
        return None
    try:
        return json.loads(fenced.group(1))
    except Exception:
        return None
|
|
|
|
|
|
def _format_failure_display(item: Dict[str, Any]) -> str:
|
|
issue = str(item.get("issue", "")).strip()
|
|
file = str(item.get("file", "")).strip()
|
|
line = item.get("line")
|
|
try:
|
|
line = int(line) if line is not None else -1
|
|
except Exception:
|
|
line = -1
|
|
excerpt = str(item.get("excerpt", "")).strip()
|
|
header = f"@{file}:{line}" if file else "@"
|
|
lines: List[str] = [header]
|
|
if excerpt:
|
|
lines.append(excerpt)
|
|
if issue:
|
|
lines.append(f"> {issue}")
|
|
return "\n".join(lines).strip()
|
|
|
|
|
|
def _to_structured_failure(guide: str, item: Any) -> Dict[str, Any]:
|
|
if isinstance(item, str):
|
|
return {
|
|
"guide": guide,
|
|
"issue": item.strip(),
|
|
"file": "",
|
|
"line": -1,
|
|
"excerpt": "",
|
|
}
|
|
if isinstance(item, dict):
|
|
issue = str(item.get("issue", "")).strip()
|
|
file = str(item.get("file", "")).strip()
|
|
line = item.get("line")
|
|
try:
|
|
line = int(line) if line is not None else -1
|
|
except Exception:
|
|
line = -1
|
|
excerpt = str(item.get("excerpt", "")).strip()
|
|
return {"guide": guide, "issue": issue, "file": file, "line": line, "excerpt": excerpt}
|
|
# Fallback
|
|
return {"guide": guide, "issue": str(item).strip(), "file": "", "line": -1, "excerpt": ""}
|
|
|
|
|
|
def review_one(
    study_path: str,
    diff_text: str,
    branch: str,
    base_ref: str,
    out_dir: str,
    force: bool = False,
) -> Tuple[str, bool, List[str], List[Dict[str, Any]], Optional[str]]:
    """Review the branch diff against one study guide via Codex.

    Returns a 5-tuple (study_filename, passes, failures_display,
    failures_structured, error); *error* is None on success.
    """
    try:
        with open(study_path, "r", encoding="utf-8") as f:
            studyguide = f.read()
        prompt = build_prompt(studyguide, diff_text, branch, base_ref)

        os.makedirs(out_dir, exist_ok=True)
        tmp_outfile = os.path.join(out_dir, os.path.basename(study_path).replace("-study.md", "-review.json"))

        # Reuse cached result unless forcing a recompute
        content = None
        if (not force) and os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0:
            try:
                with open(tmp_outfile, "r", encoding="utf-8") as f:
                    content = f.read()
            except Exception:
                content = None

        if content is None:
            code, out, err = run_codex_exec(prompt, last_message_file=tmp_outfile)
            if code != 0:
                return (os.path.basename(study_path), False, [], [], f"codex exec failed (exit {code}): {err.strip()}")

            # Prefer file written by codex; fall back to captured stdout
            try:
                if os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0:
                    with open(tmp_outfile, "r", encoding="utf-8") as f:
                        content = f.read()
                else:
                    content = out
            except Exception:
                content = out

        data = parse_json_from_text(content)
        if not data:
            return (os.path.basename(study_path), False, [], [], "could not parse JSON from model output")

        # Normalize the cache file to pretty-printed raw JSON for future reuse.
        try:
            with open(tmp_outfile, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
                f.write("\n")
        except Exception:
            pass  # best-effort cache write; the in-memory result is still usable

        relevant = bool(data.get("relevant", True))
        passes = bool(data.get("passes", False))
        raw_failures = list(data.get("failures") or [])
        structured = [_to_structured_failure(os.path.basename(study_path), x) for x in raw_failures]
        failures = [_format_failure_display(x) for x in structured]

        # If irrelevant, treat as pass-by-default (per schema instructions)
        if not relevant:
            passes = True
            failures = []
            structured = []

        return (os.path.basename(study_path), passes, failures, structured, None)
    except Exception as e:
        # BUGFIX: this path previously returned a 4-tuple, which broke the
        # 5-way unpacking in main(); include the empty structured list.
        return (os.path.basename(study_path), False, [], [], str(e))
|
|
|
|
|
|
def aggregate_deduplicate(failures_all: List[Dict[str, Any]], diff_text: str, out_dir: str) -> Tuple[str, Optional[List[Dict[str, Any]]], Optional[str]]:
    """Ask Codex to merge semantically duplicate failures.

    Returns (outfile_path, dedup_list_or_none, error_or_none); an empty input
    short-circuits to ("", [], None) without any model call.
    """
    if not failures_all:
        return ("", [], None)

    out_path = os.path.join(out_dir, "aggregate-dedup.json")
    issues_json = json.dumps(failures_all, indent=2)
    prompt = (
        "You are assisting with de-duplicating code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array where each item has keys: guide, issue, file, line, excerpt):\n"
        + issues_json + "\n\n"
        "Task: Deduplicate issues that are semantically the same, ignoring differences in file, line, or excerpt.\n"
        "Keep the single most descriptive 'issue' text for each group and retain its metadata (guide, file, line, excerpt).\n"
        "Output: EXACT RAW JSON array (no Markdown, no backticks) with the same object shape as the input."
    )
    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, None, f"codex exec failed (exit {code}): {err.strip()}")

    # Prefer the file codex wrote; fall back to captured stdout.
    content = out
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
    except Exception:
        content = out

    # Validate the payload and rewrite the trace file as pretty JSON.
    try:
        data = json.loads(content)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
        return (out_path, data, None)
    except Exception as e:
        return (out_path, None, f"failed to parse dedup JSON: {e}")
|
|
|
def aggregate_rank(dedup_list: List[Dict[str, Any]], diff_text: str, out_dir: str) -> Tuple[str, Optional[str]]:
    """Ask Codex to bucket deduplicated issues by priority category.

    Returns (outfile_path, error_or_none); the ranked JSON is written to
    aggregate-ranked.json inside *out_dir*.
    """
    out_path = os.path.join(out_dir, "aggregate-ranked.json")
    issues_json = json.dumps(dedup_list, indent=2)
    prompt = (
        "You are assisting with triage and prioritization of code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array; each item has guide, issue, file, line, excerpt):\n"
        + issues_json + "\n\n"
        "Task: For each issue, assign a category: P0, P1, P2, NIT, WRONG, IRRELEVANT.\n"
        "Output: EXACT RAW JSON object mapping category -> array of issues, preserving the same fields for each issue.\n"
        "Schema: { \"P0\": Issue[], \"P1\": Issue[], \"P2\": Issue[], \"NIT\": Issue[], \"WRONG\": Issue[], \"IRRELEVANT\": Issue[] }"
    )
    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, f"codex exec failed (exit {code}): {err.strip()}")

    # Prefer the file codex wrote; fall back to captured stdout.
    content = out
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
    except Exception:
        content = out

    # Round-trip through json to validate and pretty-print the trace file.
    try:
        data = json.loads(content)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
        return (out_path, None)
    except Exception as e:
        return (out_path, f"failed to parse ranked JSON: {e}")
|
|
|
|
|
|
def print_progress(passed: int, completed: int, total: int, lock: threading.Lock):
    """Print a one-line progress bar; *lock* serializes output across threads."""
    width = 30
    if total:
        ratio = passed / total
        pct = int(ratio * 100)
        filled = int(ratio * width)
    else:
        pct = 0
        filled = 0
    bar = "#" * filled + "-" * (width - filled)
    with lock:
        print(f"[{bar}] {passed}/{total} passed ({pct}%), {completed}/{total} completed")
|
|
|
|
|
|
def main():
    """CLI entry point: run per-guide Codex reviews in parallel, then aggregate.

    Flow: parse args -> locate study guides -> compute the branch diff ->
    fan out one review per guide on a thread pool -> print a summary ->
    deduplicate and rank the collected failures via Codex.
    """
    parser = argparse.ArgumentParser(
        prog="review",
        description=(
            "Run codex checks of current branch diff against each studyguide in prs/<reviewer>/study.\n"
            "Aggregates results, prints a progress bar and a summary of failed points."
        ),
    )
    parser.add_argument("reviewer", help="GitHub login whose studyguides to use (ignored if --study-dir is set)")
    parser.add_argument("--jobs", "-j", type=int, default=10, help="Parallel jobs (default: 10)")
    parser.add_argument("--base", default=None, help="Base ref to diff against (default: auto: origin/main or main)")
    parser.add_argument(
        "--study-dir",
        "-S",
        default=None,
        help="Path to a folder containing PR-*-study.md files (overrides default prs/<reviewer>/study)",
    )
    parser.add_argument(
        "--out-dir",
        "-o",
        default=None,
        help="Directory where review JSON files should be written (default: sibling 'review' next to study-dir)",
    )
    parser.add_argument(
        "--limit",
        "-n",
        type=int,
        default=None,
        help="Use only the first N study guides after sorting (like head -n)",
    )
    parser.add_argument("--show-errors", action="store_true", help="Print per-guide errors encountered")
    parser.add_argument(
        "--force",
        action="store_true",
        help="Recompute review JSONs even if cached results exist",
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Clear the output directory (review folder) before running",
    )

    args = parser.parse_args()

    # NOTE(review): `gh` is never invoked in this script itself; the hint says
    # it is used by sibling tools — confirm before removing this check.
    require("gh", "Install GitHub CLI: https://cli.github.com (used by other tools in this repo)")

    repo_root = detect_repo_root() or os.getcwd()
    reviewer = args.reviewer
    study_dir = os.path.abspath(args.study_dir) if args.study_dir else os.path.join(repo_root, "prs", reviewer, "study")
    guides = study_files_in_dir(study_dir)
    if not guides:
        print(f"No studyguides found in {study_dir}.", file=sys.stderr)
        sys.exit(0)

    total_available = len(guides)
    if args.limit is not None:
        if args.limit <= 0:
            print("Error: --limit must be a positive integer.", file=sys.stderr)
            sys.exit(2)
        guides = guides[: args.limit]

    # Gather diff facts once; every worker shares the same diff text.
    branch = get_current_branch()
    base_ref = args.base or resolve_base_ref()
    diff_text = get_diff_text(base_ref, "HEAD")
    files_changed = get_changed_files_count(base_ref, "HEAD")
    est_tokens = estimate_tokens_approx(diff_text)
    if not diff_text.strip():
        print("Warning: empty diff vs base; all guides may be irrelevant or pass.", file=sys.stderr)

    if args.out_dir:
        out_dir = os.path.abspath(args.out_dir)
    else:
        # Default: sibling 'review' next to the study folder
        out_dir = os.path.join(os.path.dirname(study_dir), "review")
    if args.clear and os.path.isdir(out_dir):
        # Danger: delete the review folder to start fresh
        try:
            shutil.rmtree(out_dir)
        except Exception as e:
            print(f"Failed to clear output dir {out_dir}: {e}", file=sys.stderr)
            sys.exit(2)
    os.makedirs(out_dir, exist_ok=True)

    # Shared progress state, guarded by `lock` across worker completions.
    total = len(guides)
    passed = 0
    completed = 0
    lock = threading.Lock()
    failures_all: List[Dict[str, Any]] = []  # structured failures
    errors_all: List[Tuple[str, str]] = []  # (guide, error)

    print(f"Running {total} review(s) against {branch} vs {base_ref}…")
    print(f"Files changed: {files_changed}")
    print(f"Estimated diff tokens: {est_tokens} (limit {MAX_DIFF_TOKENS})")
    if est_tokens > MAX_DIFF_TOKENS:
        print(
            f"Error: diff is too large to review (estimated {est_tokens} tokens > limit {MAX_DIFF_TOKENS}).",
            file=sys.stderr,
        )
        sys.exit(2)
    print(f"Study dir: {study_dir}")
    print(f"Output dir: {out_dir}")
    if args.limit is not None and args.limit < total_available:
        print(f"Limit: using first {total} of {total_available} guides")
    print_progress(passed, completed, total, lock)

    def task(p: str):
        # One worker: review the shared diff against a single study guide.
        return review_one(p, diff_text, branch, base_ref, out_dir, force=args.force)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, args.jobs)) as ex:
        futs = [ex.submit(task, p) for p in guides]
        for fut in concurrent.futures.as_completed(futs):
            guide_name, ok, failures_display, failures_structured, err = fut.result()
            with lock:
                completed += 1
                if ok:
                    passed += 1
                else:
                    if err:
                        errors_all.append((guide_name, err))
                    for item in failures_structured:
                        failures_all.append(item)
            # Called outside the `with lock` block: print_progress acquires
            # the same (non-reentrant) lock itself.
            print_progress(passed, completed, total, lock)

    print("")
    print(f"Summary: {passed}/{total} guides passing ({int((passed/total)*100) if total else 0}%)")
    if args.show_errors and errors_all:
        print("\nErrors:")
        for g, e in errors_all:
            print(f"- {g}: {e}")

    if failures_all:
        print("\nFailed points:")
        for item in failures_all:
            print(f"- [{item.get('guide','?')}] {_format_failure_display(item)}")
    else:
        print("\nNo failed points detected.")

    # 4) Aggregate via Codex: deduplicate (optional), then rank
    if failures_all:
        print("\nAggregating failed points…")
        dedup_path = os.path.join(out_dir, "aggregate-dedup.json")
        dedup_list: List[Dict[str, Any]] = []
        if len(failures_all) == 1:
            # Skip model deduplication for a single issue; still write a trace file.
            single = failures_all[0]
            dedup_list = [single]
            try:
                with open(dedup_path, 'w', encoding='utf-8') as f:
                    json.dump(dedup_list, f, indent=2)
                    f.write("\n")
            except Exception as e:
                print(f"Failed to write dedup file: {e}", file=sys.stderr)
        else:
            path, data, dedup_err = aggregate_deduplicate(failures_all, diff_text, out_dir)
            if dedup_err:
                print(f"Dedup error: {dedup_err}", file=sys.stderr)
            else:
                # Re-read from disk rather than trusting `data`, so the
                # printed preview matches exactly what was persisted.
                dedup_path = path
                try:
                    with open(dedup_path, 'r', encoding='utf-8') as f:
                        dedup_list = json.load(f)
                except Exception as e:
                    print(f"Failed to read dedup file: {e}", file=sys.stderr)
                    dedup_list = []

        if dedup_list:
            print(f"\nDeduplicated issues written to: {dedup_path}\n")
            preview = json.dumps(dedup_list, indent=2)[:2000]
            print(preview)

            ranked_path, rank_err = aggregate_rank(dedup_list, diff_text, out_dir)
            if rank_err:
                print(f"Ranking error: {rank_err}", file=sys.stderr)
            else:
                try:
                    with open(ranked_path, 'r', encoding='utf-8') as f:
                        ranked_text = f.read()
                    print(f"\nRanked issues written to: {ranked_path}\n")
                    print(ranked_text.strip()[:2000])
                except Exception as e:
                    print(f"Failed to read ranked file: {e}", file=sys.stderr)


if __name__ == "__main__":
    main()
|