# Source: LEM/scripts/compare_models.py (forked from lthn/LEM)
#!/usr/bin/env python3
"""
Compare lm-eval-harness results between base and LEK models.
Reads results.json files from benchmark runs and produces a comparison table
showing deltas between base model and LEK fine-tuned version.
Usage:
python3 scripts/compare_models.py benchmarks/lm-eval-results/base_* benchmarks/lm-eval-results/lek_*
python3 scripts/compare_models.py --base results/base --lek results/lek
python3 scripts/compare_models.py --dir benchmarks/lm-eval-results/ # auto-detect pairs
"""
import argparse
import json
import sys
from pathlib import Path
def load_results(result_dir):
    """Load results.json from a benchmark run directory.

    Looks for results.json directly in *result_dir*, falling back to the
    first results.json found anywhere under the directory tree. Returns
    the parsed JSON dict, or None (after printing a warning) when no
    results file exists.
    """
    root = Path(result_dir)
    candidate = root / "results.json"
    if not candidate.exists():
        # Recursive fallback: take the first match, keep the (missing)
        # default when the tree has none.
        candidate = next(root.rglob("results.json"), candidate)
    if not candidate.exists():
        print(f"Warning: no results.json in {root}")
        return None
    with open(candidate) as fh:
        return json.load(fh)
def extract_scores(data):
    """Extract one primary metric per task from an lm-eval results dict.

    Args:
        data: Parsed results.json content; task metrics live under
            data["results"][task].

    Returns:
        dict mapping task name to {"value": <number>, "metric": <name>},
        where the metric name has its ",filter" suffix stripped.
    """
    # Preferred metrics, most informative first.
    primary_keys = (
        "acc,none", "acc_norm,none", "exact_match,strict-match",
        "mc2,none", "prompt_level_strict_acc,none",
    )
    scores = {}
    for task, metrics in data.get("results", {}).items():
        for key in primary_keys:
            if key in metrics:
                scores[task] = {
                    "value": metrics[key],
                    "metric": key.split(",")[0],
                }
                break
        else:
            # Fallback: first numeric metric. Skip alias entries, stderr
            # estimates (noise, not a score — previously these could be
            # picked when ordered before the real metric), and bools
            # (which isinstance treats as int).
            for key, val in metrics.items():
                if (isinstance(val, (int, float))
                        and not isinstance(val, bool)
                        and not key.startswith("alias")
                        and "stderr" not in key):
                    scores[task] = {"value": val, "metric": key.split(",")[0]}
                    break
    return scores
def compare(base_data, lek_data, base_name="Base", lek_name="LEK"):
    """Print a per-task comparison table and an average over shared tasks.

    Tasks present in only one run are shown with the other column blank;
    the AVERAGE row covers only tasks scored in both runs.
    """
    base_scores = extract_scores(base_data)
    lek_scores = extract_scores(lek_data)

    rule = "-" * 80
    print(f"\n{'Task':<30s} {'Metric':<15s} {base_name:>10s} {lek_name:>10s} {'Delta':>10s}")
    print(rule)

    for task in sorted(set(base_scores) | set(lek_scores)):
        base_entry = base_scores.get(task, {})
        lek_entry = lek_scores.get(task, {})
        base_val = base_entry.get("value")
        lek_val = lek_entry.get("value")
        metric = base_entry.get("metric") or lek_entry.get("metric", "?")
        if base_val is None and lek_val is None:
            continue
        if lek_val is None:
            print(f"{task:<30s} {metric:<15s} {base_val*100:>9.1f}% {'':>10s} {'':>10s}")
        elif base_val is None:
            print(f"{task:<30s} {metric:<15s} {'':>10s} {lek_val*100:>9.1f}% {'':>10s}")
        else:
            delta = lek_val - base_val
            prefix = "+" if delta >= 0 else ""
            print(f"{task:<30s} {metric:<15s} {base_val*100:>9.1f}% {lek_val*100:>9.1f}% {prefix}{delta*100:>8.1f}%")

    # Summary over tasks scored in both runs.
    shared = sorted(set(base_scores) & set(lek_scores))
    if shared:
        mean_base = sum(base_scores[t]["value"] for t in shared) / len(shared)
        mean_lek = sum(lek_scores[t]["value"] for t in shared) / len(shared)
        mean_delta = mean_lek - mean_base
        prefix = "+" if mean_delta >= 0 else ""
        print(rule)
        print(f"{'AVERAGE':<30s} {'':15s} {mean_base*100:>9.1f}% {mean_lek*100:>9.1f}% {prefix}{mean_delta*100:>8.1f}%")
def main():
    """CLI entry point: pick the two result dirs to compare, then compare.

    Three invocation modes, tried in order: explicit --base/--lek dirs,
    --dir auto-detection (compares the first two subdirectories), or two
    positional paths. With none of these, prints usage help.
    """
    parser = argparse.ArgumentParser(description="Compare lm-eval benchmark results")
    parser.add_argument("--base", help="Base model results directory")
    parser.add_argument("--lek", help="LEK model results directory")
    parser.add_argument("--dir", help="Auto-detect pairs in directory")
    parser.add_argument("paths", nargs="*", help="Result directories (base first, then lek)")
    args = parser.parse_args()

    if args.base and args.lek:
        base_data = load_results(args.base)
        lek_data = load_results(args.lek)
        if base_data and lek_data:
            compare(base_data, lek_data)
        return

    if args.dir:
        root = Path(args.dir)
        subdirs = sorted(entry for entry in root.iterdir() if entry.is_dir())
        if len(subdirs) < 2:
            print(f"Need at least 2 result directories in {root}")
            return
        print(f"Found {len(subdirs)} result directories")
        for idx, entry in enumerate(subdirs):
            print(f" [{idx}] {entry.name}")
        # Compare first two by default
        base_data = load_results(subdirs[0])
        lek_data = load_results(subdirs[1])
        if base_data and lek_data:
            compare(base_data, lek_data, subdirs[0].name, subdirs[1].name)
        return

    if len(args.paths) >= 2:
        base_data = load_results(args.paths[0])
        lek_data = load_results(args.paths[1])
        if base_data and lek_data:
            compare(base_data, lek_data,
                    Path(args.paths[0]).name, Path(args.paths[1]).name)
        return

    parser.print_help()
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()