Files
codex/codex-rs/review
Daniel Edrisian a409c34c85 better agg
2025-09-02 18:34:51 -07:00

534 lines
20 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import concurrent.futures
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
from typing import Any, Dict, List, Optional, Tuple
def _run(cmd: List[str], input_text: Optional[str] = None) -> Tuple[int, str, str]:
proc = subprocess.run(
cmd,
input=input_text,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=False,
)
return proc.returncode, proc.stdout, proc.stderr
def require(cmd: str, hint: str):
    """Exit with status 1 (printing *hint* to stderr) unless *cmd* is on PATH."""
    if shutil.which(cmd) is not None:
        return
    print(f"Error: required command '{cmd}' not found. {hint}", file=sys.stderr)
    sys.exit(1)
def detect_repo_root() -> Optional[str]:
    """Return the git working-tree root, or None when not inside a repository."""
    code, out, _ = _run(["git", "rev-parse", "--show-toplevel"])
    return None if code != 0 else out.strip()
def get_current_branch() -> str:
    """Name of the checked-out branch; falls back to "HEAD" when git fails."""
    code, out, _ = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    if code != 0:
        return "HEAD"
    return out.strip()
def resolve_base_ref() -> str:
    """Choose the diff base: first resolvable of origin/main, upstream/main, main.

    Remote-tracking refs are preferred; "main" is returned as a last resort
    even when it does not resolve.
    """
    for candidate in ("origin/main", "upstream/main", "main"):
        rc, _, _ = _run(["git", "rev-parse", "--verify", candidate])
        if rc == 0:
            return candidate
    return "main"
def get_diff_text(base_ref: str, head_ref: str) -> str:
    """Return the unified diff `base_ref...head_ref`; exits(2) on git failure.

    The three-dot form diffs against the merge base, so only changes introduced
    on the branch are included; --no-color keeps the output model-parseable.
    """
    rc, out, err = _run(["git", "diff", "--no-color", f"{base_ref}...{head_ref}"])
    if rc == 0:
        return out
    print(f"Error: failed to compute git diff: {err.strip()}", file=sys.stderr)
    sys.exit(2)
def get_changed_files_count(base_ref: str, head_ref: str) -> int:
    """Count files changed between the merge base and *head_ref* (0 on git error)."""
    rc, out, _ = _run(
        ["git", "diff", "--name-only", "--no-color", f"{base_ref}...{head_ref}"]
    )
    if rc != 0:
        return 0
    names = [line for line in out.splitlines() if line.strip()]
    return len(names)
# Approximate token estimation: ~4 characters per token heuristic.
# Diffs whose estimate exceeds this limit are rejected up front in main()
# rather than being sent to the model.
MAX_DIFF_TOKENS = 50_000
def estimate_tokens_approx(text: str) -> int:
    """Rough token estimate: the larger of ceil(len/4) and the word count."""
    chars_estimate = -(-len(text) // 4)  # ceiling division by 4
    words_estimate = len(text.split())
    return max(chars_estimate, words_estimate)
def study_files_in_dir(base: str) -> List[str]:
    """List PR-<n>-study.md files under *base*, sorted by PR number.

    Returns an empty list when *base* is not a directory.
    """
    if not os.path.isdir(base):
        return []
    pattern = re.compile(r"PR-\d+-study\.md$")
    matches = [
        os.path.join(base, entry)
        for entry in os.listdir(base)
        if pattern.match(entry)
    ]

    # Deterministic ordering: sort by the numeric PR id in the filename.
    def pr_number(path: str) -> int:
        found = re.search(r"(\d+)", os.path.basename(path))
        return int(found.group(1)) if found else 0

    return sorted(matches, key=pr_number)
def build_prompt(studyguide: str, diff_text: str, branch: str, base_ref: str) -> str:
    """Assemble the per-guide review prompt sent to `codex exec` on stdin."""
    parts = [
        "You are a senior code reviewer. Evaluate the current branch diff against a study guide.\n\n",
        f"Branch: {branch}\nBase: {base_ref}\n\n",
        "STUDYGUIDE (Markdown):\n",
        studyguide,
        "\n\n",
        "DIFF (unified):\n```diff\n",
        diff_text,
        "\n```\n\n",
        "Task: Determine whether this diff adheres to the DOs and DON'Ts from the studyguide.\n",
        "- The studyguide might be irrelevant to this diff; mark that clearly.\n",
        "- If relevant and the diff violates items, list each failing point.\n",
        "- If everything passes, return a single green check.\n\n",
        "Output: Respond with EXACTLY one JSON object as RAW JSON (no Markdown, no backticks). Nothing else.\n",
        "Schema: {\n \"relevant\": boolean,\n \"passes\": boolean,\n \"failures\": [\n { \"issue\": string, \"file\": string, \"line\": number, \"excerpt\": string }\n ] // empty if passes or irrelevant\n}\n",
        "Rules:\n- Use true/false for booleans.\n- Provide best-effort file and 1-based line number from the diff; if unknown, use an empty string for file and -1 for line.\n- excerpt should be a short single-line or trimmed code snippet near the line.\n- When irrelevant, set relevant=false and passes=true and failures=[].\n- Do not wrap output in code fences.\n",
    ]
    return "".join(parts)
def run_codex_exec(prompt: str, last_message_file: Optional[str] = None) -> Tuple[int, str, str]:
    """Run `codex exec` with *prompt* on stdin; return (code, stdout, stderr).

    A globally installed `codex` binary is preferred; otherwise the tool is
    built and run through `cargo run`. When *last_message_file* is given the
    model's final message is also written to that path.
    """
    if shutil.which("codex") is not None:
        cmd = ["codex", "-c", "model_reasoning_effort=high", "exec"]
    else:
        cmd = [
            "cargo",
            "run",
            "--quiet",
            "--bin",
            "codex",
            "--",
            "-c",
            "model_reasoning_effort=high",
            "exec",
        ]
    if last_message_file:
        cmd += ["--output-last-message", last_message_file]
    return _run(cmd, input_text=prompt)
def parse_json_from_text(text: str) -> Optional[Dict]:
    """Extract a JSON object from raw text or a ```json fenced block.

    Raw JSON (text that starts with '{' and ends with '}') is tried first;
    a fenced block is the fallback. Returns None when nothing parses.
    """
    stripped = text.strip()
    if stripped.startswith("{") and stripped.endswith("}"):
        try:
            return json.loads(stripped)
        except Exception:
            pass  # not valid raw JSON; fall through to the fenced form
    fenced = re.search(r"```json\s*(\{[\s\S]*?\})\s*```", stripped, re.IGNORECASE)
    if not fenced:
        return None
    try:
        return json.loads(fenced.group(1))
    except Exception:
        return None
def _format_failure_display(item: Dict[str, Any]) -> str:
issue = str(item.get("issue", "")).strip()
file = str(item.get("file", "")).strip()
line = item.get("line")
try:
line = int(line) if line is not None else -1
except Exception:
line = -1
excerpt = str(item.get("excerpt", "")).strip()
header = f"@{file}:{line}" if file else "@"
lines: List[str] = [header]
if excerpt:
lines.append(excerpt)
if issue:
lines.append(f"> {issue}")
return "\n".join(lines).strip()
def _to_structured_failure(guide: str, item: Any) -> Dict[str, Any]:
if isinstance(item, str):
return {
"guide": guide,
"issue": item.strip(),
"file": "",
"line": -1,
"excerpt": "",
}
if isinstance(item, dict):
issue = str(item.get("issue", "")).strip()
file = str(item.get("file", "")).strip()
line = item.get("line")
try:
line = int(line) if line is not None else -1
except Exception:
line = -1
excerpt = str(item.get("excerpt", "")).strip()
return {"guide": guide, "issue": issue, "file": file, "line": line, "excerpt": excerpt}
# Fallback
return {"guide": guide, "issue": str(item).strip(), "file": "", "line": -1, "excerpt": ""}
def review_one(
    study_path: str,
    diff_text: str,
    branch: str,
    base_ref: str,
    out_dir: str,
    force: bool = False,
) -> Tuple[str, bool, List[str], List[Dict[str, Any]], Optional[str]]:
    """Review the branch diff against one study guide via `codex exec`.

    Returns a 5-tuple (study_filename, passes, display_failures,
    structured_failures, error); `error` is None on success. Results are
    cached as <PR>-review.json in *out_dir* and reused unless *force*.
    """
    try:
        with open(study_path, "r", encoding="utf-8") as f:
            studyguide = f.read()
        prompt = build_prompt(studyguide, diff_text, branch, base_ref)
        os.makedirs(out_dir, exist_ok=True)
        tmp_outfile = os.path.join(
            out_dir,
            os.path.basename(study_path).replace("-study.md", "-review.json"),
        )
        # Reuse cached result unless forcing a recompute.
        content = None
        if (not force) and os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0:
            try:
                with open(tmp_outfile, "r", encoding="utf-8") as f:
                    content = f.read()
            except Exception:
                content = None  # unreadable cache -> recompute below
        if content is None:
            code, out, err = run_codex_exec(prompt, last_message_file=tmp_outfile)
            if code != 0:
                return (
                    os.path.basename(study_path),
                    False,
                    [],
                    [],
                    f"codex exec failed (exit {code}): {err.strip()}",
                )
            # Prefer the file written by codex; fall back to captured stdout.
            try:
                if os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0:
                    with open(tmp_outfile, "r", encoding="utf-8") as f:
                        content = f.read()
                else:
                    content = out
            except Exception:
                content = out
        data = parse_json_from_text(content)
        if not data:
            return (
                os.path.basename(study_path),
                False,
                [],
                [],
                "could not parse JSON from model output",
            )
        # Normalize the cache file to pretty-printed raw JSON for future reuse.
        try:
            with open(tmp_outfile, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
                f.write("\n")
        except Exception:
            pass  # best-effort cache normalization; review result still stands
        relevant = bool(data.get("relevant", True))
        passes = bool(data.get("passes", False))
        raw_failures = list(data.get("failures") or [])
        structured = [
            _to_structured_failure(os.path.basename(study_path), x) for x in raw_failures
        ]
        failures = [_format_failure_display(x) for x in structured]
        # An irrelevant guide is treated as pass-by-default (per schema instructions).
        if not relevant:
            passes = True
            failures = []
            structured = []
        return (os.path.basename(study_path), passes, failures, structured, None)
    except Exception as e:
        # BUGFIX: this path previously returned a 4-tuple, which crashed the
        # 5-way unpacking in main() whenever a guide raised (e.g. unreadable
        # study file). The structured-failures slot was missing.
        return (os.path.basename(study_path), False, [], [], str(e))
def aggregate_deduplicate(failures_all: List[Dict[str, Any]], diff_text: str, out_dir: str) -> Tuple[str, Optional[List[Dict[str, Any]]], Optional[str]]:
    """Run Codex to deduplicate failures.

    Returns (outfile_path, dedup_list_or_none, error_or_none). An empty input
    short-circuits to ("", [], None) without invoking the model.
    """
    if not failures_all:
        return ("", [], None)
    out_path = os.path.join(out_dir, "aggregate-dedup.json")
    prompt = (
        "You are assisting with de-duplicating code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array where each item has keys: guide, issue, file, line, excerpt):\n"
        + json.dumps(failures_all, indent=2) + "\n\n"
        "Task: Deduplicate issues that are semantically the same, ignoring differences in file, line, or excerpt.\n"
        "Keep the single most descriptive 'issue' text for each group and retain its metadata (guide, file, line, excerpt).\n"
        "Output: EXACT RAW JSON array (no Markdown, no backticks) with the same object shape as the input."
    )
    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, None, f"codex exec failed (exit {code}): {err.strip()}")
    # Prefer the file codex wrote; fall back to captured stdout.
    content = out
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
    except Exception:
        content = out
    # Parse, then rewrite the trace file as pretty-printed JSON for reuse.
    try:
        data = json.loads(content)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
    except Exception as e:
        return (out_path, None, f"failed to parse dedup JSON: {e}")
    return (out_path, data, None)
def aggregate_rank(dedup_list: List[Dict[str, Any]], diff_text: str, out_dir: str) -> Tuple[str, Optional[str]]:
    """Ask Codex to bucket deduplicated issues by priority category.

    Writes the ranked JSON to aggregate-ranked.json in *out_dir* and returns
    (outfile_path, error_or_none).
    """
    out_path = os.path.join(out_dir, "aggregate-ranked.json")
    prompt = (
        "You are assisting with triage and prioritization of code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array; each item has guide, issue, file, line, excerpt):\n"
        + json.dumps(dedup_list, indent=2) + "\n\n"
        "Task: For each issue, assign a category: P0, P1, P2, NIT, WRONG, IRRELEVANT.\n"
        "Output: EXACT RAW JSON object mapping category -> array of issues, preserving the same fields for each issue.\n"
        "Schema: { \"P0\": Issue[], \"P1\": Issue[], \"P2\": Issue[], \"NIT\": Issue[], \"WRONG\": Issue[], \"IRRELEVANT\": Issue[] }"
    )
    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, f"codex exec failed (exit {code}): {err.strip()}")
    # Prefer the file codex wrote; fall back to captured stdout.
    content = out
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
    except Exception:
        content = out
    # Parse, then rewrite the trace file as pretty-printed JSON.
    try:
        data = json.loads(content)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
    except Exception as e:
        return (out_path, f"failed to parse ranked JSON: {e}")
    return (out_path, None)
def print_progress(passed: int, completed: int, total: int, lock: threading.Lock):
    """Print a one-line progress bar; *lock* serializes output across threads."""
    bar_width = 30
    if total:
        fraction = passed / total
        pct = int(fraction * 100)
        filled = int(fraction * bar_width)
    else:
        pct = 0
        filled = 0
    bar = "".join(["#" * filled, "-" * (bar_width - filled)])
    with lock:
        print(f"[{bar}] {passed}/{total} passed ({pct}%), {completed}/{total} completed")
def main():
    """CLI entry point: review the current branch diff against every study guide.

    Flow: parse args -> locate study guides -> compute diff vs base ->
    fan out one codex review per guide on a thread pool -> print a summary ->
    optionally deduplicate and rank aggregated failures via codex.
    """
    parser = argparse.ArgumentParser(
        prog="review",
        description=(
            "Run codex checks of current branch diff against each studyguide in prs/<reviewer>/study.\n"
            "Aggregates results, prints a progress bar and a summary of failed points."
        ),
    )
    parser.add_argument("reviewer", help="GitHub login whose studyguides to use (ignored if --study-dir is set)")
    parser.add_argument("--jobs", "-j", type=int, default=10, help="Parallel jobs (default: 10)")
    parser.add_argument("--base", default=None, help="Base ref to diff against (default: auto: origin/main or main)")
    parser.add_argument(
        "--study-dir",
        "-S",
        default=None,
        help="Path to a folder containing PR-*-study.md files (overrides default prs/<reviewer>/study)",
    )
    parser.add_argument(
        "--out-dir",
        "-o",
        default=None,
        help="Directory where review JSON files should be written (default: sibling 'review' next to study-dir)",
    )
    parser.add_argument(
        "--limit",
        "-n",
        type=int,
        default=None,
        help="Use only the first N study guides after sorting (like head -n)",
    )
    parser.add_argument("--show-errors", action="store_true", help="Print per-guide errors encountered")
    parser.add_argument(
        "--force",
        action="store_true",
        help="Recompute review JSONs even if cached results exist",
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Clear the output directory (review folder) before running",
    )
    args = parser.parse_args()
    require("gh", "Install GitHub CLI: https://cli.github.com (used by other tools in this repo)")
    repo_root = detect_repo_root() or os.getcwd()
    reviewer = args.reviewer
    # --study-dir overrides the default prs/<reviewer>/study location.
    study_dir = os.path.abspath(args.study_dir) if args.study_dir else os.path.join(repo_root, "prs", reviewer, "study")
    guides = study_files_in_dir(study_dir)
    if not guides:
        print(f"No studyguides found in {study_dir}.", file=sys.stderr)
        sys.exit(0)
    total_available = len(guides)
    if args.limit is not None:
        if args.limit <= 0:
            print("Error: --limit must be a positive integer.", file=sys.stderr)
            sys.exit(2)
        guides = guides[: args.limit]
    branch = get_current_branch()
    base_ref = args.base or resolve_base_ref()
    diff_text = get_diff_text(base_ref, "HEAD")
    files_changed = get_changed_files_count(base_ref, "HEAD")
    est_tokens = estimate_tokens_approx(diff_text)
    if not diff_text.strip():
        print("Warning: empty diff vs base; all guides may be irrelevant or pass.", file=sys.stderr)
    if args.out_dir:
        out_dir = os.path.abspath(args.out_dir)
    else:
        # Default: sibling 'review' next to the study folder
        out_dir = os.path.join(os.path.dirname(study_dir), "review")
    if args.clear and os.path.isdir(out_dir):
        # Danger: delete the review folder to start fresh
        try:
            shutil.rmtree(out_dir)
        except Exception as e:
            print(f"Failed to clear output dir {out_dir}: {e}", file=sys.stderr)
            sys.exit(2)
    os.makedirs(out_dir, exist_ok=True)
    total = len(guides)
    passed = 0
    completed = 0
    lock = threading.Lock()
    failures_all: List[Dict[str, Any]] = []  # structured failures
    errors_all: List[Tuple[str, str]] = []  # (guide, error)
    print(f"Running {total} review(s) against {branch} vs {base_ref}")
    print(f"Files changed: {files_changed}")
    print(f"Estimated diff tokens: {est_tokens} (limit {MAX_DIFF_TOKENS})")
    if est_tokens > MAX_DIFF_TOKENS:
        print(
            f"Error: diff is too large to review (estimated {est_tokens} tokens > limit {MAX_DIFF_TOKENS}).",
            file=sys.stderr,
        )
        sys.exit(2)
    print(f"Study dir: {study_dir}")
    print(f"Output dir: {out_dir}")
    if args.limit is not None and args.limit < total_available:
        print(f"Limit: using first {total} of {total_available} guides")
    print_progress(passed, completed, total, lock)

    def task(p: str):
        # Each worker reviews the shared diff against one study guide.
        return review_one(p, diff_text, branch, base_ref, out_dir, force=args.force)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, args.jobs)) as ex:
        futs = [ex.submit(task, p) for p in guides]
        for fut in concurrent.futures.as_completed(futs):
            guide_name, ok, failures_display, failures_structured, err = fut.result()
            # Counters are shared with no other writer, but the lock keeps the
            # mutation and subsequent progress print coherent.
            with lock:
                completed += 1
                if ok:
                    passed += 1
                else:
                    if err:
                        errors_all.append((guide_name, err))
                    for item in failures_structured:
                        failures_all.append(item)
            # NOTE: must stay outside `with lock` -- print_progress re-acquires
            # the (non-reentrant) lock itself.
            print_progress(passed, completed, total, lock)
    print("")
    print(f"Summary: {passed}/{total} guides passing ({int((passed/total)*100) if total else 0}%)")
    if args.show_errors and errors_all:
        print("\nErrors:")
        for g, e in errors_all:
            print(f"- {g}: {e}")
    if failures_all:
        print("\nFailed points:")
        for item in failures_all:
            print(f"- [{item.get('guide','?')}] {_format_failure_display(item)}")
    else:
        print("\nNo failed points detected.")
    # 4) Aggregate via Codex: deduplicate (optional), then rank
    if failures_all:
        print("\nAggregating failed points…")
        dedup_path = os.path.join(out_dir, "aggregate-dedup.json")
        dedup_list: List[Dict[str, Any]] = []
        if len(failures_all) == 1:
            # Skip model deduplication for a single issue; still write a trace file.
            single = failures_all[0]
            dedup_list = [single]
            try:
                with open(dedup_path, 'w', encoding='utf-8') as f:
                    json.dump(dedup_list, f, indent=2)
                    f.write("\n")
            except Exception as e:
                print(f"Failed to write dedup file: {e}", file=sys.stderr)
        else:
            path, data, dedup_err = aggregate_deduplicate(failures_all, diff_text, out_dir)
            if dedup_err:
                print(f"Dedup error: {dedup_err}", file=sys.stderr)
            else:
                dedup_path = path
                # Re-read the normalized file rather than trusting `data`.
                try:
                    with open(dedup_path, 'r', encoding='utf-8') as f:
                        dedup_list = json.load(f)
                except Exception as e:
                    print(f"Failed to read dedup file: {e}", file=sys.stderr)
                    dedup_list = []
        if dedup_list:
            print(f"\nDeduplicated issues written to: {dedup_path}\n")
            preview = json.dumps(dedup_list, indent=2)[:2000]
            print(preview)
            ranked_path, rank_err = aggregate_rank(dedup_list, diff_text, out_dir)
            if rank_err:
                print(f"Ranking error: {rank_err}", file=sys.stderr)
            else:
                try:
                    with open(ranked_path, 'r', encoding='utf-8') as f:
                        ranked_text = f.read()
                    print(f"\nRanked issues written to: {ranked_path}\n")
                    print(ranked_text.strip()[:2000])
                except Exception as e:
                    print(f"Failed to read ranked file: {e}", file=sys.stderr)
# Entry-point guard: keeps the module importable without side effects.
if __name__ == "__main__":
    main()