From a409c34c856e39ea6b2ac8eb1af7fdf8758c481e Mon Sep 17 00:00:00 2001 From: Daniel Edrisian Date: Tue, 2 Sep 2025 18:34:51 -0700 Subject: [PATCH] better agg --- codex-rs/review | 228 +++++++++++++++++++++++++++++++----------------- 1 file changed, 149 insertions(+), 79 deletions(-) diff --git a/codex-rs/review b/codex-rs/review index de1beda507..9cc61b7ed6 100755 --- a/codex-rs/review +++ b/codex-rs/review @@ -124,7 +124,7 @@ def run_codex_exec(prompt: str, last_message_file: Optional[str] = None) -> Tupl "codex", "--", "-c", - "model_reasoning_effort=low", + "model_reasoning_effort=high", "exec", ] if last_message_file: @@ -151,10 +151,33 @@ def parse_json_from_text(text: str) -> Optional[Dict]: return None -def _format_failure_item(item: Any) -> str: +def _format_failure_display(item: Dict[str, Any]) -> str: + issue = str(item.get("issue", "")).strip() + file = str(item.get("file", "")).strip() + line = item.get("line") + try: + line = int(line) if line is not None else -1 + except Exception: + line = -1 + excerpt = str(item.get("excerpt", "")).strip() + header = f"@{file}:{line}" if file else "@" + lines: List[str] = [header] + if excerpt: + lines.append(excerpt) + if issue: + lines.append(f"> {issue}") + return "\n".join(lines).strip() + + +def _to_structured_failure(guide: str, item: Any) -> Dict[str, Any]: if isinstance(item, str): - # Keep backward-compat strings - return item.strip() + return { + "guide": guide, + "issue": item.strip(), + "file": "", + "line": -1, + "excerpt": "", + } if isinstance(item, dict): issue = str(item.get("issue", "")).strip() file = str(item.get("file", "")).strip() @@ -164,17 +187,19 @@ def _format_failure_item(item: Any) -> str: except Exception: line = -1 excerpt = str(item.get("excerpt", "")).strip() - header = f"@{file}:{line}" if file else "@" - lines: List[str] = [header] - if excerpt: - lines.append(excerpt) - if issue: - lines.append(f"> {issue}") - return "\n".join(lines).strip() - return str(item).strip() + return 
{"guide": guide, "issue": issue, "file": file, "line": line, "excerpt": excerpt} + # Fallback + return {"guide": guide, "issue": str(item).strip(), "file": "", "line": -1, "excerpt": ""} -def review_one(study_path: str, diff_text: str, branch: str, base_ref: str, out_dir: str) -> Tuple[str, bool, List[str], Optional[str]]: +def review_one( + study_path: str, + diff_text: str, + branch: str, + base_ref: str, + out_dir: str, + force: bool = False, +) -> Tuple[str, bool, List[str], List[Dict[str, Any]], Optional[str]]: # Returns (study_filename, passes, failures, error) try: with open(study_path, "r", encoding="utf-8") as f: @@ -183,107 +208,133 @@ def review_one(study_path: str, diff_text: str, branch: str, base_ref: str, out_ os.makedirs(out_dir, exist_ok=True) tmp_outfile = os.path.join(out_dir, os.path.basename(study_path).replace("-study.md", "-review.json")) - code, out, err = run_codex_exec(prompt, last_message_file=tmp_outfile) - if code != 0: - return (os.path.basename(study_path), False, [], f"codex exec failed (exit {code}): {err.strip()}") - # Prefer file written by codex; fall back to captured stdout + # Reuse cached result unless forcing a recompute content = None - try: - if os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0: + if (not force) and os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0: + try: with open(tmp_outfile, "r", encoding="utf-8") as f: content = f.read() - except Exception: - pass + except Exception: + content = None + if content is None: - content = out + code, out, err = run_codex_exec(prompt, last_message_file=tmp_outfile) + if code != 0: + return (os.path.basename(study_path), False, [], [], f"codex exec failed (exit {code}): {err.strip()}") + + # Prefer file written by codex; fall back to captured stdout + try: + if os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0: + with open(tmp_outfile, "r", encoding="utf-8") as f: + content = f.read() + else: + content = out + except Exception: + 
def aggregate_deduplicate(
    failures_all: List[Dict[str, Any]],
    diff_text: str,
    out_dir: str,
) -> Tuple[str, Optional[List[Dict[str, Any]]], Optional[str]]:
    """Run Codex to deduplicate structured failures.

    Args:
        failures_all: Structured failure dicts (keys: guide, issue, file,
            line, excerpt), possibly containing semantic duplicates.
        diff_text: Unified diff the issues refer to, passed as model context.
        out_dir: Directory that receives the ``aggregate-dedup.json`` trace.

    Returns:
        (outfile_path, dedup_list_or_none, error_or_none). On success the
        deduplicated list is also written, pretty-printed, to outfile_path.
    """
    if not failures_all:
        # Nothing to deduplicate; no trace file is produced.
        return ("", [], None)

    out_path = os.path.join(out_dir, "aggregate-dedup.json")
    issues_json = json.dumps(failures_all, indent=2)
    prompt = (
        "You are assisting with de-duplicating code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array where each item has keys: guide, issue, file, line, excerpt):\n"
        + issues_json + "\n\n"
        "Task: Deduplicate issues that are semantically the same, ignoring differences in file, line, or excerpt.\n"
        "Keep the single most descriptive 'issue' text for each group and retain its metadata (guide, file, line, excerpt).\n"
        "Output: EXACT RAW JSON array (no Markdown, no backticks) with the same object shape as the input."
    )
    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, None, f"codex exec failed (exit {code}): {err.strip()}")

    # Prefer the file codex wrote; fall back to captured stdout.
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
        else:
            content = out
    except OSError:
        content = out

    # Parse before re-writing so a write failure is not misreported as a
    # parse failure (and parsed data is not silently discarded).
    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        return (out_path, None, f"failed to parse dedup JSON: {e}")
    if not isinstance(data, list):
        # Downstream (ranking, preview) assumes a JSON array of issue dicts.
        return (out_path, None, "failed to parse dedup JSON: expected a JSON array")

    # Normalize the trace file to pretty-printed JSON for future reuse.
    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
    except OSError as e:
        return (out_path, None, f"failed to write dedup JSON: {e}")
    return (out_path, data, None)
def aggregate_rank(dedup_list: List[Dict[str, Any]], diff_text: str, out_dir: str) -> Tuple[str, Optional[str]]:
    """Run Codex to rank deduplicated issues into priority buckets.

    Args:
        dedup_list: Deduplicated structured issues (guide, issue, file,
            line, excerpt).
        diff_text: Unified diff the issues refer to, passed as model context.
        out_dir: Directory that receives the ``aggregate-ranked.json`` trace.

    Returns:
        (outfile_path, error_or_none). On success the category -> issues
        mapping is written, pretty-printed, to outfile_path; the parsed data
        itself is not returned (callers read the file).
    """
    out_path = os.path.join(out_dir, "aggregate-ranked.json")
    issues_json = json.dumps(dedup_list, indent=2)
    prompt = (
        "You are assisting with triage and prioritization of code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array; each item has guide, issue, file, line, excerpt):\n"
        + issues_json + "\n\n"
        "Task: For each issue, assign a category: P0, P1, P2, NIT, WRONG, IRRELEVANT.\n"
        "- P0: Must-fix to prevent breakage/security/data loss.\n"
        "- P1: Strongly recommended for correctness/perf/maintainability.\n"
        "- P2: Nice-to-have improvements or polish.\n"
        "- NIT: Stylistic nitpick.\n"
        "- WRONG: The issue is incorrect given the diff.\n"
        "- IRRELEVANT: Not applicable to this diff.\n\n"
        "Output: EXACT RAW JSON object mapping category -> array of issues, preserving the same fields for each issue.\n"
        "Schema: { \"P0\": Issue[], \"P1\": Issue[], \"P2\": Issue[], \"NIT\": Issue[], \"WRONG\": Issue[], \"IRRELEVANT\": Issue[] }"
    )
    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, f"codex exec failed (exit {code}): {err.strip()}")

    # Prefer the file codex wrote; fall back to captured stdout.
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
        else:
            content = out
    except OSError:
        content = out

    # Parse before re-writing so a write failure is not misreported as a
    # parse failure.
    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        return (out_path, f"failed to parse ranked JSON: {e}")

    # Normalize the trace file to pretty-printed JSON.
    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
    except OSError as e:
        return (out_path, f"failed to write ranked JSON: {e}")
    return (out_path, None)
parser.parse_args() @@ -359,13 +420,20 @@ def main(): else: # Default: sibling 'review' next to the study folder out_dir = os.path.join(os.path.dirname(study_dir), "review") + if args.clear and os.path.isdir(out_dir): + # Danger: delete the review folder to start fresh + try: + shutil.rmtree(out_dir) + except Exception as e: + print(f"Failed to clear output dir {out_dir}: {e}", file=sys.stderr) + sys.exit(2) os.makedirs(out_dir, exist_ok=True) total = len(guides) passed = 0 completed = 0 lock = threading.Lock() - failures_all: List[Tuple[str, str]] = [] # (guide, failure) + failures_all: List[Dict[str, Any]] = [] # structured failures errors_all: List[Tuple[str, str]] = [] # (guide, error) print(f"Running {total} review(s) against {branch} vs {base_ref}…") @@ -384,12 +452,12 @@ def main(): print_progress(passed, completed, total, lock) def task(p: str): - return review_one(p, diff_text, branch, base_ref, out_dir) + return review_one(p, diff_text, branch, base_ref, out_dir, force=args.force) with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, args.jobs)) as ex: futs = [ex.submit(task, p) for p in guides] for fut in concurrent.futures.as_completed(futs): - guide_name, ok, failures, err = fut.result() + guide_name, ok, failures_display, failures_structured, err = fut.result() with lock: completed += 1 if ok: @@ -397,8 +465,8 @@ def main(): else: if err: errors_all.append((guide_name, err)) - for f in failures: - failures_all.append((guide_name, f)) + for item in failures_structured: + failures_all.append(item) print_progress(passed, completed, total, lock) print("") @@ -410,43 +478,45 @@ def main(): if failures_all: print("\nFailed points:") - for g, f in failures_all: - print(f"- [{g}] {f}") + for item in failures_all: + print(f"- [{item.get('guide','?')}] {_format_failure_display(item)}") else: print("\nNo failed points detected.") # 4) Aggregate via Codex: deduplicate (optional), then rank if failures_all: print("\nAggregating failed points…") - 
dedup_text = '' - dedup_path = os.path.join(out_dir, "aggregate-dedup.md") + dedup_path = os.path.join(out_dir, "aggregate-dedup.json") + dedup_list: List[Dict[str, Any]] = [] if len(failures_all) == 1: # Skip model deduplication for a single issue; still write a trace file. single = failures_all[0] - dedup_text = f"- [{single[0]}] {single[1]}\n" + dedup_list = [single] try: with open(dedup_path, 'w', encoding='utf-8') as f: - f.write(dedup_text) + json.dump(dedup_list, f, indent=2) + f.write("\n") except Exception as e: print(f"Failed to write dedup file: {e}", file=sys.stderr) else: - path, dedup_err = aggregate_deduplicate(failures_all, diff_text, out_dir) + path, data, dedup_err = aggregate_deduplicate(failures_all, diff_text, out_dir) if dedup_err: print(f"Dedup error: {dedup_err}", file=sys.stderr) else: dedup_path = path try: with open(dedup_path, 'r', encoding='utf-8') as f: - dedup_text = f.read() + dedup_list = json.load(f) except Exception as e: print(f"Failed to read dedup file: {e}", file=sys.stderr) - dedup_text = '' + dedup_list = [] - if dedup_text.strip(): + if dedup_list: print(f"\nDeduplicated issues written to: {dedup_path}\n") - print(dedup_text.strip()[:2000]) + preview = json.dumps(dedup_list, indent=2)[:2000] + print(preview) - ranked_path, rank_err = aggregate_rank(dedup_text, diff_text, out_dir) + ranked_path, rank_err = aggregate_rank(dedup_list, diff_text, out_dir) if rank_err: print(f"Ranking error: {rank_err}", file=sys.stderr) else: