better agg

This commit is contained in:
Daniel Edrisian
2025-09-02 18:34:51 -07:00
parent a95b23f9b7
commit a409c34c85

View File

@@ -124,7 +124,7 @@ def run_codex_exec(prompt: str, last_message_file: Optional[str] = None) -> Tupl
"codex", "codex",
"--", "--",
"-c", "-c",
"model_reasoning_effort=low", "model_reasoning_effort=high",
"exec", "exec",
] ]
if last_message_file: if last_message_file:
@@ -151,10 +151,33 @@ def parse_json_from_text(text: str) -> Optional[Dict]:
return None return None
def _format_failure_item(item: Any) -> str: def _format_failure_display(item: Dict[str, Any]) -> str:
issue = str(item.get("issue", "")).strip()
file = str(item.get("file", "")).strip()
line = item.get("line")
try:
line = int(line) if line is not None else -1
except Exception:
line = -1
excerpt = str(item.get("excerpt", "")).strip()
header = f"@{file}:{line}" if file else "@"
lines: List[str] = [header]
if excerpt:
lines.append(excerpt)
if issue:
lines.append(f"> {issue}")
return "\n".join(lines).strip()
def _to_structured_failure(guide: str, item: Any) -> Dict[str, Any]:
if isinstance(item, str): if isinstance(item, str):
# Keep backward-compat strings return {
return item.strip() "guide": guide,
"issue": item.strip(),
"file": "",
"line": -1,
"excerpt": "",
}
if isinstance(item, dict): if isinstance(item, dict):
issue = str(item.get("issue", "")).strip() issue = str(item.get("issue", "")).strip()
file = str(item.get("file", "")).strip() file = str(item.get("file", "")).strip()
@@ -164,17 +187,19 @@ def _format_failure_item(item: Any) -> str:
except Exception: except Exception:
line = -1 line = -1
excerpt = str(item.get("excerpt", "")).strip() excerpt = str(item.get("excerpt", "")).strip()
header = f"@{file}:{line}" if file else "@" return {"guide": guide, "issue": issue, "file": file, "line": line, "excerpt": excerpt}
lines: List[str] = [header] # Fallback
if excerpt: return {"guide": guide, "issue": str(item).strip(), "file": "", "line": -1, "excerpt": ""}
lines.append(excerpt)
if issue:
lines.append(f"> {issue}")
return "\n".join(lines).strip()
return str(item).strip()
def review_one(study_path: str, diff_text: str, branch: str, base_ref: str, out_dir: str) -> Tuple[str, bool, List[str], Optional[str]]: def review_one(
study_path: str,
diff_text: str,
branch: str,
base_ref: str,
out_dir: str,
force: bool = False,
) -> Tuple[str, bool, List[str], List[Dict[str, Any]], Optional[str]]:
# Returns (study_filename, passes, failures, error) # Returns (study_filename, passes, failures_display, failures_structured, error)
try: try:
with open(study_path, "r", encoding="utf-8") as f: with open(study_path, "r", encoding="utf-8") as f:
@@ -183,107 +208,133 @@ def review_one(study_path: str, diff_text: str, branch: str, base_ref: str, out_
os.makedirs(out_dir, exist_ok=True) os.makedirs(out_dir, exist_ok=True)
tmp_outfile = os.path.join(out_dir, os.path.basename(study_path).replace("-study.md", "-review.json")) tmp_outfile = os.path.join(out_dir, os.path.basename(study_path).replace("-study.md", "-review.json"))
code, out, err = run_codex_exec(prompt, last_message_file=tmp_outfile)
if code != 0:
return (os.path.basename(study_path), False, [], f"codex exec failed (exit {code}): {err.strip()}")
# Prefer file written by codex; fall back to captured stdout # Reuse cached result unless forcing a recompute
content = None content = None
try: if (not force) and os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0:
if os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0: try:
with open(tmp_outfile, "r", encoding="utf-8") as f: with open(tmp_outfile, "r", encoding="utf-8") as f:
content = f.read() content = f.read()
except Exception: except Exception:
pass content = None
if content is None: if content is None:
content = out code, out, err = run_codex_exec(prompt, last_message_file=tmp_outfile)
if code != 0:
return (os.path.basename(study_path), False, [], [], f"codex exec failed (exit {code}): {err.strip()}")
# Prefer file written by codex; fall back to captured stdout
try:
if os.path.isfile(tmp_outfile) and os.path.getsize(tmp_outfile) > 0:
with open(tmp_outfile, "r", encoding="utf-8") as f:
content = f.read()
else:
content = out
except Exception:
content = out
data = parse_json_from_text(content) data = parse_json_from_text(content)
if not data: if not data:
return (os.path.basename(study_path), False, [], "could not parse JSON from model output") return (os.path.basename(study_path), False, [], [], "could not parse JSON from model output")
# Normalize file on disk to pretty-printed raw JSON for future reuse. # Normalize file on disk to pretty-printed raw JSON for future reuse.
# Normalize cache file to pretty JSON
try: try:
with open(tmp_outfile, "w", encoding="utf-8") as f: with open(tmp_outfile, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2) json.dump(data, f, indent=2)
f.write("\n") f.write("\n")
except Exception: except Exception:
# Non-fatal
pass pass
relevant = bool(data.get("relevant", True)) relevant = bool(data.get("relevant", True))
passes = bool(data.get("passes", False)) passes = bool(data.get("passes", False))
raw_failures = list(data.get("failures") or []) raw_failures = list(data.get("failures") or [])
failures = [_format_failure_item(x) for x in raw_failures] structured = [_to_structured_failure(os.path.basename(study_path), x) for x in raw_failures]
failures = [_format_failure_display(x) for x in structured]
# If irrelevant, treat as pass-by-default (per schema instructions) # If irrelevant, treat as pass-by-default (per schema instructions)
if not relevant: if not relevant:
passes = True passes = True
failures = [] failures = []
structured = []
return (os.path.basename(study_path), passes, failures, None) return (os.path.basename(study_path), passes, failures, structured, None)
except Exception as e: except Exception as e:
return (os.path.basename(study_path), False, [], str(e)) return (os.path.basename(study_path), False, [], str(e))
def aggregate_deduplicate(failures_all: List[Dict[str, Any]], diff_text: str, out_dir: str) -> Tuple[str, Optional[List[Dict[str, Any]]], Optional[str]]:
    """Run Codex to deduplicate failures.

    Args:
        failures_all: Structured failures; the prompt describes each item as
            having the keys guide, issue, file, line and excerpt.
        diff_text: Unified diff embedded in the prompt as context.
        out_dir: Directory in which ``aggregate-dedup.json`` is written.

    Returns:
        ``(outfile_path, dedup_list_or_none, error_or_none)``. An empty
        input short-circuits to ``("", [], None)`` without calling Codex.
        On failure the list is ``None`` and the error is a message string.
    """
    if not failures_all:
        return ("", [], None)

    out_path = os.path.join(out_dir, "aggregate-dedup.json")

    # Drop any result left behind by an earlier run. Otherwise, if this run
    # produces no file, the stale content would be parsed instead of falling
    # back to stdout below. Best effort: a missing or undeletable file must
    # not stop the run.
    try:
        os.remove(out_path)
    except OSError:
        pass

    issues_json = json.dumps(failures_all, indent=2)
    prompt = (
        "You are assisting with de-duplicating code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array where each item has keys: guide, issue, file, line, excerpt):\n"
        + issues_json + "\n\n"
        "Task: Deduplicate issues that are semantically the same, ignoring differences in file, line, or excerpt.\n"
        "Keep the single most descriptive 'issue' text for each group and retain its metadata (guide, file, line, excerpt).\n"
        "Output: EXACT RAW JSON array (no Markdown, no backticks) with the same object shape as the input."
    )

    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, None, f"codex exec failed (exit {code}): {err.strip()}")

    # Read result (file or stdout): prefer the output file when it exists and
    # is non-empty, otherwise use the captured stdout.
    content = out
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
    except (OSError, UnicodeError):
        content = out

    try:
        data = json.loads(content)
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        return (out_path, None, f"failed to parse dedup JSON: {e}")
    if not isinstance(data, list):
        # The prompt asks for a JSON array; anything else would break the
        # declared return type.
        return (
            out_path,
            None,
            f"failed to parse dedup JSON: expected a JSON array, got {type(data).__name__}",
        )

    # Normalize the file on disk to pretty-printed JSON. A failure here is a
    # write problem, so it gets its own message rather than a parse error.
    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
    except OSError as e:
        return (out_path, None, f"failed to write dedup output: {e}")
    return (out_path, data, None)
def aggregate_rank(dedup_list: List[Dict[str, Any]], diff_text: str, out_dir: str) -> Tuple[str, Optional[str]]:
    """Run Codex to sort deduplicated issues into priority categories.

    Args:
        dedup_list: Issues to rank; the prompt describes each item as having
            the keys guide, issue, file, line and excerpt.
        diff_text: Unified diff embedded in the prompt as context.
        out_dir: Directory in which ``aggregate-ranked.json`` is written.

    Returns:
        ``(outfile_path, error_or_none)``. On success the file holds the
        pretty-printed JSON object returned by the model.
    """
    out_path = os.path.join(out_dir, "aggregate-ranked.json")

    # Drop any result left behind by an earlier run. Otherwise, if this run
    # produces no file, the stale content would be parsed instead of falling
    # back to stdout below. Best effort: a missing or undeletable file must
    # not stop the run.
    try:
        os.remove(out_path)
    except OSError:
        pass

    issues_json = json.dumps(dedup_list, indent=2)
    prompt = (
        "You are assisting with triage and prioritization of code review issues.\n\n"
        "DIFF (unified):\n```diff\n" + diff_text + "\n```\n\n"
        "Issues (JSON array; each item has guide, issue, file, line, excerpt):\n"
        + issues_json + "\n\n"
        "Task: For each issue, assign a category: P0, P1, P2, NIT, WRONG, IRRELEVANT.\n"
        "Output: EXACT RAW JSON object mapping category -> array of issues, preserving the same fields for each issue.\n"
        "Schema: { \"P0\": Issue[], \"P1\": Issue[], \"P2\": Issue[], \"NIT\": Issue[], \"WRONG\": Issue[], \"IRRELEVANT\": Issue[] }"
    )

    code, out, err = run_codex_exec(prompt, last_message_file=out_path)
    if code != 0:
        return (out_path, f"codex exec failed (exit {code}): {err.strip()}")

    # Prefer the output file when it exists and is non-empty, otherwise use
    # the captured stdout.
    content = out
    try:
        if os.path.isfile(out_path) and os.path.getsize(out_path) > 0:
            with open(out_path, "r", encoding="utf-8") as f:
                content = f.read()
    except (OSError, UnicodeError):
        content = out

    try:
        data = json.loads(content)
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        return (out_path, f"failed to parse ranked JSON: {e}")
    if not isinstance(data, dict):
        # The prompt asks for an object mapping category -> issues.
        return (
            out_path,
            f"failed to parse ranked JSON: expected a JSON object, got {type(data).__name__}",
        )

    # Normalize the file on disk to pretty-printed JSON. A failure here is a
    # write problem, so it gets its own message rather than a parse error.
    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
            f.write("\n")
    except OSError as e:
        return (out_path, f"failed to write ranked output: {e}")
    return (out_path, None)
def print_progress(passed: int, completed: int, total: int, lock: threading.Lock): def print_progress(passed: int, completed: int, total: int, lock: threading.Lock):
@@ -326,6 +377,16 @@ def main():
help="Use only the first N study guides after sorting (like head -n)", help="Use only the first N study guides after sorting (like head -n)",
) )
parser.add_argument("--show-errors", action="store_true", help="Print per-guide errors encountered") parser.add_argument("--show-errors", action="store_true", help="Print per-guide errors encountered")
parser.add_argument(
"--force",
action="store_true",
help="Recompute review JSONs even if cached results exist",
)
parser.add_argument(
"--clear",
action="store_true",
help="Clear the output directory (review folder) before running",
)
args = parser.parse_args() args = parser.parse_args()
@@ -359,13 +420,20 @@ def main():
else: else:
# Default: sibling 'review' next to the study folder # Default: sibling 'review' next to the study folder
out_dir = os.path.join(os.path.dirname(study_dir), "review") out_dir = os.path.join(os.path.dirname(study_dir), "review")
if args.clear and os.path.isdir(out_dir):
# Danger: delete the review folder to start fresh
try:
shutil.rmtree(out_dir)
except Exception as e:
print(f"Failed to clear output dir {out_dir}: {e}", file=sys.stderr)
sys.exit(2)
os.makedirs(out_dir, exist_ok=True) os.makedirs(out_dir, exist_ok=True)
total = len(guides) total = len(guides)
passed = 0 passed = 0
completed = 0 completed = 0
lock = threading.Lock() lock = threading.Lock()
failures_all: List[Tuple[str, str]] = [] # (guide, failure) failures_all: List[Dict[str, Any]] = [] # structured failures
errors_all: List[Tuple[str, str]] = [] # (guide, error) errors_all: List[Tuple[str, str]] = [] # (guide, error)
print(f"Running {total} review(s) against {branch} vs {base_ref}…") print(f"Running {total} review(s) against {branch} vs {base_ref}…")
@@ -384,12 +452,12 @@ def main():
print_progress(passed, completed, total, lock) print_progress(passed, completed, total, lock)
def task(p: str): def task(p: str):
return review_one(p, diff_text, branch, base_ref, out_dir) return review_one(p, diff_text, branch, base_ref, out_dir, force=args.force)
with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, args.jobs)) as ex: with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, args.jobs)) as ex:
futs = [ex.submit(task, p) for p in guides] futs = [ex.submit(task, p) for p in guides]
for fut in concurrent.futures.as_completed(futs): for fut in concurrent.futures.as_completed(futs):
guide_name, ok, failures, err = fut.result() guide_name, ok, failures_display, failures_structured, err = fut.result()
with lock: with lock:
completed += 1 completed += 1
if ok: if ok:
@@ -397,8 +465,8 @@ def main():
else: else:
if err: if err:
errors_all.append((guide_name, err)) errors_all.append((guide_name, err))
for f in failures: for item in failures_structured:
failures_all.append((guide_name, f)) failures_all.append(item)
print_progress(passed, completed, total, lock) print_progress(passed, completed, total, lock)
print("") print("")
@@ -410,43 +478,45 @@ def main():
if failures_all: if failures_all:
print("\nFailed points:") print("\nFailed points:")
for g, f in failures_all: for item in failures_all:
print(f"- [{g}] {f}") print(f"- [{item.get('guide','?')}] {_format_failure_display(item)}")
else: else:
print("\nNo failed points detected.") print("\nNo failed points detected.")
# 4) Aggregate via Codex: deduplicate (optional), then rank # 4) Aggregate via Codex: deduplicate (optional), then rank
if failures_all: if failures_all:
print("\nAggregating failed points…") print("\nAggregating failed points…")
dedup_text = '' dedup_path = os.path.join(out_dir, "aggregate-dedup.json")
dedup_path = os.path.join(out_dir, "aggregate-dedup.md") dedup_list: List[Dict[str, Any]] = []
if len(failures_all) == 1: if len(failures_all) == 1:
# Skip model deduplication for a single issue; still write a trace file. # Skip model deduplication for a single issue; still write a trace file.
single = failures_all[0] single = failures_all[0]
dedup_text = f"- [{single[0]}] {single[1]}\n" dedup_list = [single]
try: try:
with open(dedup_path, 'w', encoding='utf-8') as f: with open(dedup_path, 'w', encoding='utf-8') as f:
f.write(dedup_text) json.dump(dedup_list, f, indent=2)
f.write("\n")
except Exception as e: except Exception as e:
print(f"Failed to write dedup file: {e}", file=sys.stderr) print(f"Failed to write dedup file: {e}", file=sys.stderr)
else: else:
path, dedup_err = aggregate_deduplicate(failures_all, diff_text, out_dir) path, data, dedup_err = aggregate_deduplicate(failures_all, diff_text, out_dir)
if dedup_err: if dedup_err:
print(f"Dedup error: {dedup_err}", file=sys.stderr) print(f"Dedup error: {dedup_err}", file=sys.stderr)
else: else:
dedup_path = path dedup_path = path
try: try:
with open(dedup_path, 'r', encoding='utf-8') as f: with open(dedup_path, 'r', encoding='utf-8') as f:
dedup_text = f.read() dedup_list = json.load(f)
except Exception as e: except Exception as e:
print(f"Failed to read dedup file: {e}", file=sys.stderr) print(f"Failed to read dedup file: {e}", file=sys.stderr)
dedup_text = '' dedup_list = []
if dedup_text.strip(): if dedup_list:
print(f"\nDeduplicated issues written to: {dedup_path}\n") print(f"\nDeduplicated issues written to: {dedup_path}\n")
print(dedup_text.strip()[:2000]) preview = json.dumps(dedup_list, indent=2)[:2000]
print(preview)
ranked_path, rank_err = aggregate_rank(dedup_text, diff_text, out_dir) ranked_path, rank_err = aggregate_rank(dedup_list, diff_text, out_dir)
if rank_err: if rank_err:
print(f"Ranking error: {rank_err}", file=sys.stderr) print(f"Ranking error: {rank_err}", file=sys.stderr)
else: else: