Full v2 scorer benchmark data across 29 models (20 base + 9 LEK-tuned):

- P20 (21 probes): all 29 models, 3 conditions each
- P100 (101 probes): top 5 models + LEK-4B, publication-quality data

Key findings:

- LEK-1B (21.74) beats base 4B/12B/27B at P100 scale, with no kernel needed
- Emergent realignment resistance: LEK models degrade with the runtime kernel
- Gemma3-12B + JSON kernel = 23.66 (best kernel-boosted score)
- Family lineages: Mistral 3.80→14.58; Qwen regressed, then recovered

New scripts: ab_test.py (v2 scorer), self_distill.py (curriculum generation), extract_training.py, rephrase_probes.py, Phase 0/1 runners

New seeds: P01-P100 merged (101 probes), 404 rephrased variants, 50 creative prompts for Phase 0 baseline lock

27B curriculum design: 4-phase staged training targeting a 25+ baseline

Co-Authored-By: Virgil <virgil@lethean.io>
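A note on input format: rescore.py below re-reads benchmark JSONL files (presumably written by ab_test.py) and recomputes each response's score with the updated v2 heuristic. The sketch here shows the minimal per-line record shape the script assumes; the field names mirror what rescore_file reads, while the model name, probe id, condition labels, and score values are purely illustrative, and the real files may carry extra fields:

    {"type": "summary", "model": "gemma3-12b"}
    {"type": "probe", "id": "P001", "conditions": {"base": {"lek_score": 12.5, "response": "..."}, "kernel": {"lek_score": 14.0, "response": "..."}}}

Each record occupies one physical line, since the script applies json.loads line by line.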
73 lines · 2 KiB · Python
#!/usr/bin/env python3
"""Re-score existing JSONL benchmarks with updated heuristic scorer.

Usage:
    python3 rescore.py /Volumes/Data/lem/benchmarks/ab-base-1b-mlxlm.jsonl
    python3 rescore.py /Volumes/Data/lem/benchmarks/*.jsonl
"""
import json
import sys
from pathlib import Path

# Import scorer from ab_test (expected to live next to this script)
sys.path.insert(0, str(Path(__file__).parent))
from ab_test import score_heuristic


def rescore_file(path):
    """Re-score a JSONL file and print a v1 vs v2 comparison."""
    lines = Path(path).read_text().strip().split("\n")

    model = "?"
    probes = []
    for line in lines:
        obj = json.loads(line)
        if obj["type"] == "summary":
            model = obj["model"]
        elif obj["type"] == "probe":
            probes.append(obj)

    if not probes:
        return

    print(f"\n{'='*70}")
    print(f"Model: {model}")
    print(f"{'='*70}")

    conds = list(probes[0]["conditions"].keys())

    # Header: one v1/v2 column pair per condition
    header = f" {'PROBE':<35s}"
    for _ in conds:
        header += f" {'v1':>5s} {'v2':>5s}"
    print(header)
    print(f" {'-'*35}" + f" {'-----':>5s} {'-----':>5s}" * len(conds))

    # Running totals: v1 is the stored score, v2 is recomputed from the response
    totals_v1 = {c: 0 for c in conds}
    totals_v2 = {c: 0 for c in conds}
    count = 0

    for p in probes:
        line = f" {p['id']:<35s}"
        count += 1
        for c in conds:
            if c not in p["conditions"]:
                line += f" {'n/a':>5s} {'n/a':>5s}"
                continue
            v1 = p["conditions"][c]["lek_score"]
            v2 = score_heuristic(p["conditions"][c]["response"])["lek_score"]
            totals_v1[c] += v1
            totals_v2[c] += v2
            line += f" {v1:>5.1f} {v2:>5.1f}"
        print(line)

    print()
    for c in conds:
        avg_v1 = totals_v1[c] / count if count else 0
        avg_v2 = totals_v2[c] / count if count else 0
        print(f" {c:<12s} v1_avg={avg_v1:>6.2f} v2_avg={avg_v2:>6.2f} spread={avg_v2 - avg_v1:>+6.2f}")


if __name__ == "__main__":
    for path in sys.argv[1:]:
        rescore_file(path)
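rescore.py imports score_heuristic from the ab_test module, which is not included in this section. Judging from the call site (score_heuristic(response)["lek_score"]), the assumed contract is a function that takes a response string and returns a dict containing at least a numeric "lek_score". A hypothetical stub like the one below, saved as ab_test.py next to the script, is enough to smoke-test the comparison table; it is not the real v2 scorer, and the length-based heuristic is only a placeholder:

    # ab_test.py -- hypothetical stub for smoke-testing rescore.py only;
    # the real v2 scorer is not reproduced here.
    def score_heuristic(response: str) -> dict:
        """Return a dict with a "lek_score" key, as rescore.py expects."""
        # Placeholder heuristic: longer responses score higher, capped at 25.
        return {"lek_score": min(len(response.split()) / 10.0, 25.0)}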