#!/usr/bin/env python3
"""
Compare lm-eval-harness results between base and LEK models.

Reads results.json files from benchmark runs and produces a comparison table
showing deltas between the base model and the LEK fine-tuned version.

Usage:
    python3 scripts/compare_models.py benchmarks/lm-eval-results/base_* benchmarks/lm-eval-results/lek_*
    python3 scripts/compare_models.py --base results/base --lek results/lek
    python3 scripts/compare_models.py --dir benchmarks/lm-eval-results/  # auto-detect pairs
"""

import argparse
import json
from pathlib import Path


def load_results(result_dir):
    """Load results.json from a benchmark run directory."""
    result_dir = Path(result_dir)
    results_file = result_dir / "results.json"
    if not results_file.exists():
        # Fall back to the first results.json found in any subdirectory
        for f in result_dir.rglob("results.json"):
            results_file = f
            break
    if not results_file.exists():
        print(f"Warning: no results.json in {result_dir}")
        return None
    with open(results_file) as f:
        return json.load(f)
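

# NOTE: extract_scores() below assumes the lm-eval-harness results layout
# sketched here. This is an illustrative shape only (the task name and the
# keys outside "results" are placeholders, and exact keys can differ across
# harness versions); it is not output captured from a real run:
#
#   {
#     "results": {
#       "hellaswag": {"alias": "hellaswag", "acc,none": 0.5, "acc_norm,none": 0.5},
#       ...
#     },
#     ...
#   }
#
# For an entry like that, extract_scores() would return
#   {"hellaswag": {"value": 0.5, "metric": "acc"}}
# since "acc,none" comes first in the metric priority list.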


def extract_scores(data):
    """Extract primary metric per task from results."""
    scores = {}
    results = data.get("results", {})
    for task, metrics in results.items():
        # Priority order for primary metric
        for key in [
            "acc,none",
            "acc_norm,none",
            "exact_match,strict-match",
            "mc2,none",
            "prompt_level_strict_acc,none",
        ]:
            if key in metrics:
                scores[task] = {
                    "value": metrics[key],
                    "metric": key.split(",")[0],
                }
                break
        if task not in scores:
            # Fallback: first numeric metric
            for key, val in metrics.items():
                if isinstance(val, (int, float)) and not key.startswith("alias"):
                    scores[task] = {"value": val, "metric": key.split(",")[0]}
                    break
    return scores


def compare(base_data, lek_data, base_name="Base", lek_name="LEK"):
    """Print comparison table."""
    base_scores = extract_scores(base_data)
    lek_scores = extract_scores(lek_data)
    all_tasks = sorted(set(base_scores) | set(lek_scores))

    print(f"\n{'Task':<30s} {'Metric':<15s} {base_name:>10s} {lek_name:>10s} {'Delta':>10s}")
    print("-" * 80)
    for task in all_tasks:
        b = base_scores.get(task, {})
        lk = lek_scores.get(task, {})
        bv = b.get("value")
        lv = lk.get("value")
        metric = b.get("metric") or lk.get("metric", "?")
        if bv is not None and lv is not None:
            delta = lv - bv
            sign = "+" if delta >= 0 else ""
            print(f"{task:<30s} {metric:<15s} {bv*100:>9.1f}% {lv*100:>9.1f}% {sign}{delta*100:>8.1f}%")
        elif bv is not None:
            print(f"{task:<30s} {metric:<15s} {bv*100:>9.1f}% {'—':>10s} {'—':>10s}")
        elif lv is not None:
            print(f"{task:<30s} {metric:<15s} {'—':>10s} {lv*100:>9.1f}% {'—':>10s}")

    # Summary
    both = [t for t in all_tasks if t in base_scores and t in lek_scores]
    if both:
        avg_base = sum(base_scores[t]["value"] for t in both) / len(both)
        avg_lek = sum(lek_scores[t]["value"] for t in both) / len(both)
        avg_delta = avg_lek - avg_base
        sign = "+" if avg_delta >= 0 else ""
        print("-" * 80)
        print(f"{'AVERAGE':<30s} {'':15s} {avg_base*100:>9.1f}% {avg_lek*100:>9.1f}% {sign}{avg_delta*100:>8.1f}%")


def main():
    parser = argparse.ArgumentParser(description="Compare lm-eval benchmark results")
    parser.add_argument("--base", help="Base model results directory")
    parser.add_argument("--lek", help="LEK model results directory")
    parser.add_argument("--dir", help="Auto-detect pairs in directory")
    parser.add_argument("paths", nargs="*", help="Result directories (base first, then lek)")
    args = parser.parse_args()

    if args.base and args.lek:
        base_data = load_results(args.base)
        lek_data = load_results(args.lek)
        if base_data and lek_data:
            compare(base_data, lek_data)
    elif args.dir:
        result_dir = Path(args.dir)
        dirs = sorted(d for d in result_dir.iterdir() if d.is_dir())
        if len(dirs) >= 2:
            print(f"Found {len(dirs)} result directories")
            for i, d in enumerate(dirs):
                print(f"  [{i}] {d.name}")
            # Compare first two by default
            base_data = load_results(dirs[0])
            lek_data = load_results(dirs[1])
            if base_data and lek_data:
                compare(base_data, lek_data, dirs[0].name, dirs[1].name)
        else:
            print(f"Need at least 2 result directories in {result_dir}")
    elif len(args.paths) >= 2:
        base_data = load_results(args.paths[0])
        lek_data = load_results(args.paths[1])
        if base_data and lek_data:
            compare(base_data, lek_data, Path(args.paths[0]).name, Path(args.paths[1]).name)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()