Files
codex/codex-rs/review
Daniel Edrisian a409c34c85 better agg
2025-09-02 18:34:51 -07:00

534 lines
20 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import concurrent.futures
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
from typing import Any, Dict, List, Optional, Tuple
def _run(cmd: List[str], input_text: Optional[str] = None) -> Tuple[int, str, str]:
proc = subprocess.run(
cmd,
input=input_text,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=False,
)
return proc.returncode, proc.stdout, proc.stderr
def require(cmd: str, hint: str):
    """Exit with status 1 (printing *hint* to stderr) unless *cmd* is on PATH."""
    if shutil.which(cmd) is not None:
        return
    print(f"Error: required command '{cmd}' not found. {hint}", file=sys.stderr)
    sys.exit(1)
def detect_repo_root() -> Optional[str]:
    """Return the git working-tree root, or None when not inside a repository."""
    code, out, _ = _run(["git", "rev-parse", "--show-toplevel"])
    return None if code != 0 else out.strip()
def get_current_branch() -> str:
    """Name of the checked-out branch; falls back to "HEAD" when git fails."""
    code, out, _ = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    if code != 0:
        return "HEAD"
    return out.strip()
def resolve_base_ref() -> str:
    """Choose the diff base: first resolvable of origin/main, upstream/main, main.

    Remote-tracking refs are preferred; "main" is returned as a last resort
    even when it does not resolve.
    """
    for candidate in ("origin/main", "upstream/main", "main"):
        rc, _, _ = _run(["git", "rev-parse", "--verify", candidate])
        if rc == 0:
            return candidate
    return "main"
def get_diff_text(base_ref: str, head_ref: str) -> str:
    """Return the unified diff `base_ref...head_ref`; exits(2) on git failure.

    The three-dot form diffs against the merge base, so only changes introduced
    on the branch are included; --no-color keeps the output model-parseable.
    """
    rc, out, err = _run(["git", "diff", "--no-color", f"{base_ref}...{head_ref}"])
    if rc == 0:
        return out
    print(f"Error: failed to compute git diff: {err.strip()}", file=sys.stderr)
    sys.exit(2)
def get_changed_files_count(base_ref: str, head_ref: str) -> int:
    """Count files changed between the merge base and *head_ref* (0 on git error)."""
    rc, out, _ = _run(
        ["git", "diff", "--name-only", "--no-color", f"{base_ref}...{head_ref}"]
    )
    if rc != 0:
        return 0
    names = [line for line in out.splitlines() if line.strip()]
    return len(names)
# Approximate token estimation: ~4 characters per token heuristic.
# Diffs whose estimate exceeds this limit are rejected up front in main()
# rather than being sent to the model.
MAX_DIFF_TOKENS = 50_000
def estimate_tokens_approx(text: str) -> int:
    """Rough token estimate: the larger of ceil(len/4) and the word count."""
    chars_estimate = -(-len(text) // 4)  # ceiling division by 4
    words_estimate = len(text.split())
    return max(chars_estimate, words_estimate)
def study_files_in_dir(base: str) -> List[str]:
    """List PR-<n>-study.md files under *base*, sorted by PR number.

    Returns an empty list when *base* is not a directory.
    """
    if not os.path.isdir(base):
        return []
    pattern = re.compile(r"PR-\d+-study\.md$")
    matches = [
        os.path.join(base, entry)
        for entry in os.listdir(base)
        if pattern.match(entry)
    ]

    # Deterministic ordering: sort by the numeric PR id in the filename.
    def pr_number(path: str) -> int:
        found = re.search(r"(\d+)", os.path.basename(path))
        return int(found.group(1)) if found else 0

    return sorted(matches, key=pr_number)
def build_prompt(studyguide: str, diff_text: str, branch: str, base_ref: str) -> str:
    """Assemble the per-guide review prompt sent to `codex exec` on stdin."""
    parts = [
        "You are a senior code reviewer. Evaluate the current branch diff against a study guide.\n\n",
        f"Branch: {branch}\nBase: {base_ref}\n\n",
        "STUDYGUIDE (Markdown):\n",
        studyguide,
        "\n\n",
        "DIFF (unified):\n```diff\n",
        diff_text,
        "\n```\n\n",
        "Task: Determine whether this diff adheres to the DOs and DON'Ts from the studyguide.\n",
        "- The studyguide might be irrelevant to this diff; mark that clearly.\n",
        "- If relevant and the diff violates items, list each failing point.\n",
        "- If everything passes, return a single green check.\n\n",
        "Output: Respond with EXACTLY one JSON object as RAW JSON (no Markdown, no backticks). Nothing else.\n",
        "Schema: {\n \"relevant\": boolean,\n \"passes\": boolean,\n \"failures\": [\n { \"issue\": string, \"file\": string, \"line\": number, \"excerpt\": string }\n ] // empty if passes or irrelevant\n}\n",
        "Rules:\n- Use true/false for booleans.\n- Provide best-effort file and 1-based line number from the diff; if unknown, use an empty string for file and -1 for line.\n- excerpt should be a short single-line or trimmed code snippet near the line.\n- When irrelevant, set relevant=false and passes=true and failures=[].\n- Do not wrap output in code fences.\n",
    ]
    return "".join(parts)
def run_codex_exec(prompt: str, last_message_file: Optional[str] = None) -> Tuple[int, str, str]:
    """Run `codex exec` with *prompt* on stdin; return (code, stdout, stderr).

    A globally installed `codex` binary is preferred; otherwise the tool is
    built and run through `cargo run`. When *last_message_file* is given the
    model's final message is also written to that path.
    """
    if shutil.which("codex") is not None:
        cmd = ["codex", "-c", "model_reasoning_effort=high", "exec"]
    else:
        cmd = [
            "cargo",
            "run",
            "--quiet",
            "--bin",
            "codex",
            "--",
            "-c",
            "model_reasoning_effort=high",
            "exec",
        ]
    if last_message_file:
        cmd += ["--output-last-message", last_message_file]
    return _run(cmd, input_text=prompt)
def parse_json_from_text(text: str) -> Optional[Dict]:
    """Extract a JSON object from raw text or a ```json fenced block.

    Raw JSON (text that starts with '{' and ends with '}') is tried first;
    a fenced block is the fallback. Returns None when nothing parses.
    """
    stripped = text.strip()
    if stripped.startswith("{") and stripped.endswith("}"):
        try:
            return json.loads(stripped)
        except Exception:
            pass  # not valid raw JSON; fall through to the fenced form
    fenced = re.search(r"```json\s*(\{[\s\S]*?\})\s*```", stripped, re.IGNORECASE)
    if not fenced:
        return None
    try:
        return json.loads(fenced.group(1))
    except Exception:
        return None
def _format_failure_display(item: Dict[str, Any]) -> str:
issue = str(item.get("issue", "")).strip()
file = str(item.get("file", "")).strip()
line = item.get("line")
try:
line = int(line) if line is not None else -1
except Exception:
line = -1
excerpt = str(item.get("excerpt", "")).strip()
header = f"@{file}:{line}" if file else "@"
lines: List[str] = [header]
if excerpt:
lines.append(excerpt)
if issue:
lines.append(f"> {issue}")
return "\n".join(lines).strip()
def _to_structured_failure(guide: str, item: Any) -> Dict[str, Any]:
if isinstance(item, str):
return {
"guide": guide,
"issue": item.strip(),
"file": "",
"line": -1,
"excerpt": "",
}
if isinstance(item, dict):
issue = str(item.get("issue", "")).strip()
file = str(item.get("file", "")).strip()
line = item.get("line")
try:
line = int(line) if line is not None else -1
except Exception:
line = -1
excerpt = str(item.get("excerpt", "")).strip()
return {"guide": guide, "issue": issue, "file": file, "line": line, "excerpt": excerpt}
# Fallback
return {"guide": guide, "issue": str(item).strip(), "file": "", "line": -1, "excerpt": ""}
def review_one(
    study_path: str,
    diff_text: str,
    branch: str,
    base_ref: str,
    out_dir: str,
    force: bool = False,
) -> Tuple[str, bool, List[str], List[Dict[str, Any]], Optional[str]]:
    """Review the branch diff against one study guide via `codex exec`.

    Returns a 5-tuple (study_filename, passes, display_failures,
    structured_failures, error); `error` is None on success. Results are
    cached as <PR>-review.json in *out_dir* and reused unless *force*.
    """
    try:
        with open(study_path, "r", encoding="utf-8") as f:
            studyguide = f.read()
        prompt = build_prompt(studyguide, diff_text, branch, base_ref)
        os.makedirs(out_dir, exist_ok=True)
        tmp_outfile = os.path.join(
            out_dir,
            os.path.basename(study_path).replace("-study.md", "-review.json"),
        )
        # Reuse cached result unless forcing a recompute.
        content = None
        if (not force) and os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0:
            try:
                with open(tmp_outfile, "r", encoding="utf-8") as f:
                    content = f.read()
            except Exception:
                content = None  # unreadable cache -> recompute below
        if content is None:
            code, out, err = run_codex_exec(prompt, last_message_file=tmp_outfile)
            if code != 0:
                return (
                    os.path.basename(study_path),
                    False,
                    [],
                    [],
                    f"codex exec failed (exit {code}): {err.strip()}",
                )
            # Prefer the file written by codex; fall back to captured stdout.
            try:
                if os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0:
                    with open(tmp_outfile, "r", encoding="utf-8") as f:
                        content = f.read()
                else:
                    content = out
            except Exception:
                content = out
        data = parse_json_from_text(content)
        if not data:
            return (
                os.path.basename(study_path),
                False,
                [],
                [],
                "could not parse JSON from model output",
            )
        # Normalize the cache file to pretty-printed raw JSON for future reuse.
        try:
            with open(tmp_outfile, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
                f.write("\n")
        except Exception:
            pass  # best-effort cache normalization; review result still stands
        relevant = bool(data.get("relevant", True))
        passes = bool(data.get("passes", False))
        raw_failures = list(data.get("failures") or [])
        structured = [
            _to_structured_failure(os.path.basename(study_path), x) for x in raw_failures
        ]
        failures = [_format_failure_display(x) for x in structured]
        # An irrelevant guide is treated as pass-by-default (per schema instructions).
        if not relevant:
            passes = True
            failures = []
            structured = []
        return (os.path.basename(study_path), passes, failures, structured, None)
    except Exception as e:
        # BUGFIX: this path previously returned a 4-tuple, which crashed the
        # 5-way unpacking in main() whenever a guide raised (e.g. unreadable
        # study file). The structured-failures slot was missing.
        return (os.path.basename(study_path), False, [], [], str(e))
def aggregate_deduplicate(failures_all: List[Dict[str, Any]], diff_text: str, out_dir: str) -> Tuple[str, Optional[List[Dict[str, Any]]], Optional[str]]:
    """Run Codex to deduplicate failures.

    Returns (outfile_path, dedup_list_or_none, error_or_none). An empty input
    short-circuits to ("", [], None) without invoking the model.
    """
    if not failures_all:
        return ("", [], None)
    out_path = os.path.join(out_dir, "aggregate-dedup.json")
    prompt = (
        "You are assisting with de-duplicating code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array where each item has keys: guide, issue, file, line, excerpt):\n"
        + json.dumps(failures_all, indent=2) + "\n\n"
        "Task: Deduplicate issues that are semantically the same, ignoring differences in file, line, or excerpt.\n"
        "Keep the single most descriptive 'issue' text for each group and retain its metadata (guide, file, line, excerpt).\n"
        "Output: EXACT RAW JSON array (no Markdown, no backticks) with the same object shape as the input."
    )
    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, None, f"codex exec failed (exit {code}): {err.strip()}")
    # Prefer the file codex wrote; fall back to captured stdout.
    content = out
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
    except Exception:
        content = out
    # Parse, then rewrite the trace file as pretty-printed JSON for reuse.
    try:
        data = json.loads(content)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
    except Exception as e:
        return (out_path, None, f"failed to parse dedup JSON: {e}")
    return (out_path, data, None)
def aggregate_rank(dedup_list: List[Dict[str, Any]], diff_text: str, out_dir: str) -> Tuple[str, Optional[str]]:
    """Ask Codex to bucket deduplicated issues by priority category.

    Writes the ranked JSON to aggregate-ranked.json in *out_dir* and returns
    (outfile_path, error_or_none).
    """
    out_path = os.path.join(out_dir, "aggregate-ranked.json")
    prompt = (
        "You are assisting with triage and prioritization of code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array; each item has guide, issue, file, line, excerpt):\n"
        + json.dumps(dedup_list, indent=2) + "\n\n"
        "Task: For each issue, assign a category: P0, P1, P2, NIT, WRONG, IRRELEVANT.\n"
        "Output: EXACT RAW JSON object mapping category -> array of issues, preserving the same fields for each issue.\n"
        "Schema: { \"P0\": Issue[], \"P1\": Issue[], \"P2\": Issue[], \"NIT\": Issue[], \"WRONG\": Issue[], \"IRRELEVANT\": Issue[] }"
    )
    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, f"codex exec failed (exit {code}): {err.strip()}")
    # Prefer the file codex wrote; fall back to captured stdout.
    content = out
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
    except Exception:
        content = out
    # Parse, then rewrite the trace file as pretty-printed JSON.
    try:
        data = json.loads(content)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
    except Exception as e:
        return (out_path, f"failed to parse ranked JSON: {e}")
    return (out_path, None)
def print_progress(passed: int, completed: int, total: int, lock: threading.Lock):
    """Print a one-line progress bar; *lock* serializes output across threads."""
    bar_width = 30
    if total:
        fraction = passed / total
        pct = int(fraction * 100)
        filled = int(fraction * bar_width)
    else:
        pct = 0
        filled = 0
    bar = "".join(["#" * filled, "-" * (bar_width - filled)])
    with lock:
        print(f"[{bar}] {passed}/{total} passed ({pct}%), {completed}/{total} completed")
def main():
    """CLI entry point: review the current branch diff against every study guide.

    Flow: parse args -> locate study guides -> compute diff vs base ->
    fan out one codex review per guide on a thread pool -> print a summary ->
    optionally deduplicate and rank aggregated failures via codex.
    """
    parser = argparse.ArgumentParser(
        prog="review",
        description=(
            "Run codex checks of current branch diff against each studyguide in prs/<reviewer>/study.\n"
            "Aggregates results, prints a progress bar and a summary of failed points."
        ),
    )
    parser.add_argument("reviewer", help="GitHub login whose studyguides to use (ignored if --study-dir is set)")
    parser.add_argument("--jobs", "-j", type=int, default=10, help="Parallel jobs (default: 10)")
    parser.add_argument("--base", default=None, help="Base ref to diff against (default: auto: origin/main or main)")
    parser.add_argument(
        "--study-dir",
        "-S",
        default=None,
        help="Path to a folder containing PR-*-study.md files (overrides default prs/<reviewer>/study)",
    )
    parser.add_argument(
        "--out-dir",
        "-o",
        default=None,
        help="Directory where review JSON files should be written (default: sibling 'review' next to study-dir)",
    )
    parser.add_argument(
        "--limit",
        "-n",
        type=int,
        default=None,
        help="Use only the first N study guides after sorting (like head -n)",
    )
    parser.add_argument("--show-errors", action="store_true", help="Print per-guide errors encountered")
    parser.add_argument(
        "--force",
        action="store_true",
        help="Recompute review JSONs even if cached results exist",
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Clear the output directory (review folder) before running",
    )
    args = parser.parse_args()
    require("gh", "Install GitHub CLI: https://cli.github.com (used by other tools in this repo)")
    repo_root = detect_repo_root() or os.getcwd()
    reviewer = args.reviewer
    # --study-dir overrides the default prs/<reviewer>/study location.
    study_dir = os.path.abspath(args.study_dir) if args.study_dir else os.path.join(repo_root, "prs", reviewer, "study")
    guides = study_files_in_dir(study_dir)
    if not guides:
        print(f"No studyguides found in {study_dir}.", file=sys.stderr)
        sys.exit(0)
    total_available = len(guides)
    if args.limit is not None:
        if args.limit <= 0:
            print("Error: --limit must be a positive integer.", file=sys.stderr)
            sys.exit(2)
        guides = guides[: args.limit]
    branch = get_current_branch()
    base_ref = args.base or resolve_base_ref()
    diff_text = get_diff_text(base_ref, "HEAD")
    files_changed = get_changed_files_count(base_ref, "HEAD")
    est_tokens = estimate_tokens_approx(diff_text)
    if not diff_text.strip():
        print("Warning: empty diff vs base; all guides may be irrelevant or pass.", file=sys.stderr)
    if args.out_dir:
        out_dir = os.path.abspath(args.out_dir)
    else:
        # Default: sibling 'review' next to the study folder
        out_dir = os.path.join(os.path.dirname(study_dir), "review")
    if args.clear and os.path.isdir(out_dir):
        # Danger: delete the review folder to start fresh
        try:
            shutil.rmtree(out_dir)
        except Exception as e:
            print(f"Failed to clear output dir {out_dir}: {e}", file=sys.stderr)
            sys.exit(2)
    os.makedirs(out_dir, exist_ok=True)
    total = len(guides)
    passed = 0
    completed = 0
    lock = threading.Lock()
    failures_all: List[Dict[str, Any]] = []  # structured failures
    errors_all: List[Tuple[str, str]] = []  # (guide, error)
    print(f"Running {total} review(s) against {branch} vs {base_ref}")
    print(f"Files changed: {files_changed}")
    print(f"Estimated diff tokens: {est_tokens} (limit {MAX_DIFF_TOKENS})")
    if est_tokens > MAX_DIFF_TOKENS:
        print(
            f"Error: diff is too large to review (estimated {est_tokens} tokens > limit {MAX_DIFF_TOKENS}).",
            file=sys.stderr,
        )
        sys.exit(2)
    print(f"Study dir: {study_dir}")
    print(f"Output dir: {out_dir}")
    if args.limit is not None and args.limit < total_available:
        print(f"Limit: using first {total} of {total_available} guides")
    print_progress(passed, completed, total, lock)

    def task(p: str):
        # Each worker reviews the shared diff against one study guide.
        return review_one(p, diff_text, branch, base_ref, out_dir, force=args.force)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, args.jobs)) as ex:
        futs = [ex.submit(task, p) for p in guides]
        for fut in concurrent.futures.as_completed(futs):
            guide_name, ok, failures_display, failures_structured, err = fut.result()
            # Counters are shared with no other writer, but the lock keeps the
            # mutation and subsequent progress print coherent.
            with lock:
                completed += 1
                if ok:
                    passed += 1
                else:
                    if err:
                        errors_all.append((guide_name, err))
                    for item in failures_structured:
                        failures_all.append(item)
            # NOTE: must stay outside `with lock` -- print_progress re-acquires
            # the (non-reentrant) lock itself.
            print_progress(passed, completed, total, lock)
    print("")
    print(f"Summary: {passed}/{total} guides passing ({int((passed/total)*100) if total else 0}%)")
    if args.show_errors and errors_all:
        print("\nErrors:")
        for g, e in errors_all:
            print(f"- {g}: {e}")
    if failures_all:
        print("\nFailed points:")
        for item in failures_all:
            print(f"- [{item.get('guide','?')}] {_format_failure_display(item)}")
    else:
        print("\nNo failed points detected.")
    # 4) Aggregate via Codex: deduplicate (optional), then rank
    if failures_all:
        print("\nAggregating failed points…")
        dedup_path = os.path.join(out_dir, "aggregate-dedup.json")
        dedup_list: List[Dict[str, Any]] = []
        if len(failures_all) == 1:
            # Skip model deduplication for a single issue; still write a trace file.
            single = failures_all[0]
            dedup_list = [single]
            try:
                with open(dedup_path, 'w', encoding='utf-8') as f:
                    json.dump(dedup_list, f, indent=2)
                    f.write("\n")
            except Exception as e:
                print(f"Failed to write dedup file: {e}", file=sys.stderr)
        else:
            path, data, dedup_err = aggregate_deduplicate(failures_all, diff_text, out_dir)
            if dedup_err:
                print(f"Dedup error: {dedup_err}", file=sys.stderr)
            else:
                dedup_path = path
                # Re-read the normalized file rather than trusting `data`.
                try:
                    with open(dedup_path, 'r', encoding='utf-8') as f:
                        dedup_list = json.load(f)
                except Exception as e:
                    print(f"Failed to read dedup file: {e}", file=sys.stderr)
                    dedup_list = []
        if dedup_list:
            print(f"\nDeduplicated issues written to: {dedup_path}\n")
            preview = json.dumps(dedup_list, indent=2)[:2000]
            print(preview)
            ranked_path, rank_err = aggregate_rank(dedup_list, diff_text, out_dir)
            if rank_err:
                print(f"Ranking error: {rank_err}", file=sys.stderr)
            else:
                try:
                    with open(ranked_path, 'r', encoding='utf-8') as f:
                        ranked_text = f.read()
                    print(f"\nRanked issues written to: {ranked_path}\n")
                    print(ranked_text.strip()[:2000])
                except Exception as e:
                    print(f"Failed to read ranked file: {e}", file=sys.stderr)
# Entry-point guard: keeps the module importable without side effects.
if __name__ == "__main__":
    main()