74 lines
2 KiB
Python
74 lines
2 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Re-score existing JSONL benchmarks with updated heuristic scorer.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python3 rescore.py /Volumes/Data/lem/benchmarks/ab-base-1b-mlxlm.jsonl
|
||
|
|
python3 rescore.py /Volumes/Data/lem/benchmarks/*.jsonl
|
||
|
|
"""
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Import scorer from ab_test
|
||
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
||
|
|
from ab_test import score_heuristic
|
||
|
|
|
||
|
|
|
||
|
|
def rescore_file(path):
    """Re-score one JSONL benchmark file and print a v1-vs-v2 comparison.

    Every non-blank line of the file is parsed as a JSON object. Objects
    whose ``"type"`` is ``"summary"`` supply the model name shown in the
    banner; objects whose ``"type"`` is ``"probe"`` are re-scored. Any other
    record (including one without a ``"type"`` field) is ignored.

    For each probe and each condition, the stored ``lek_score`` (v1) is
    printed next to the ``lek_score`` returned by ``score_heuristic`` for the
    stored ``response`` (v2). Per-condition averages follow the table.

    Args:
        path: Path (``str`` or ``os.PathLike``) of the JSONL file to read.

    Returns:
        None. The report is written to stdout. Nothing is printed when the
        file contains no probe records.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a probe lacks ``"id"`` / ``"conditions"``, or a condition
            lacks ``"lek_score"`` / ``"response"``.
    """
    model = "?"
    probes = []

    # JSONL is UTF-8 by convention; do not depend on the platform default.
    # Split on "\n" only (not splitlines()) so that exotic line-boundary
    # characters embedded in a JSON string cannot cut a record in half.
    for raw in Path(path).read_text(encoding="utf-8").split("\n"):
        raw = raw.strip()
        if not raw:
            # Empty file, trailing newline, or a blank separator line.
            continue
        obj = json.loads(raw)
        if not isinstance(obj, dict):
            continue
        kind = obj.get("type")
        if kind == "summary":
            model = obj.get("model", model)
        elif kind == "probe":
            probes.append(obj)

    if not probes:
        return

    banner = "=" * 70
    print(f"\n{banner}")
    print(f"Model: {model}")
    print(f"{banner}")

    # Union of condition names over all probes, in first-seen order, so a
    # condition missing from the first probe still gets a column.
    conds = list(dict.fromkeys(c for p in probes for c in p["conditions"]))

    # Header
    header = f" {'PROBE':<35s}" + f" {'v1':>5s} {'v2':>5s}" * len(conds)
    print(header)
    print(f" {'-'*35}" + f" {'-----':>5s} {'-----':>5s}" * len(conds))

    totals_v1 = {c: 0 for c in conds}
    totals_v2 = {c: 0 for c in conds}
    # Number of probes that actually carry each condition; used as the
    # divisor so "n/a" cells do not drag the averages toward zero.
    counts = {c: 0 for c in conds}

    for p in probes:
        cells = [f" {p['id']:<35s}"]
        for c in conds:
            entry = p["conditions"].get(c)
            if entry is None:
                cells.append(f" {'n/a':>5s} {'n/a':>5s}")
                continue
            v1 = entry["lek_score"]
            v2 = score_heuristic(entry["response"])["lek_score"]
            totals_v1[c] += v1
            totals_v2[c] += v2
            counts[c] += 1
            cells.append(f" {v1:>5.1f} {v2:>5.1f}")
        print("".join(cells))

    print()
    for c in conds:
        n = counts[c]
        avg_v1 = totals_v1[c] / n if n else 0
        avg_v2 = totals_v2[c] / n if n else 0
        print(f" {c:<12s} v1_avg={avg_v1:>6.2f} v2_avg={avg_v2:>6.2f} spread={avg_v2 - avg_v1:>+6.2f}")
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Every positional command-line argument is treated as a JSONL benchmark
    # file; each one is re-scored and reported in the order given.
    for jsonl_path in sys.argv[1:]:
        rescore_file(jsonl_path)
|