LEM/scripts/rescore.py

#!/usr/bin/env python3
"""Re-score existing JSONL benchmarks with updated heuristic scorer.

Usage:
  python3 rescore.py /Volumes/Data/lem/benchmarks/ab-base-1b-mlxlm.jsonl
  python3 rescore.py /Volumes/Data/lem/benchmarks/*.jsonl
"""
import json
import sys
from pathlib import Path

# Import scorer from ab_test
sys.path.insert(0, str(Path(__file__).parent))
from ab_test import score_heuristic


def rescore_file(path):
    """Re-score a JSONL benchmark file and print a v1-vs-v2 comparison table.

    Each non-blank line of the file is parsed as one JSON object. Records
    whose ``"type"`` is ``"summary"`` supply the model name shown in the
    banner; records whose ``"type"`` is ``"probe"`` are re-scored. Any other
    record is ignored.

    For every probe and condition, the stored ``lek_score`` (v1) is printed
    next to the ``lek_score`` obtained by running ``score_heuristic`` on the
    stored ``response`` (v2). Per-condition averages follow the table; each
    average is taken over the probes that actually contain that condition.

    Args:
        path: Location of the JSONL file (anything ``pathlib.Path`` accepts).

    Returns:
        None. Output goes to stdout; nothing is printed when the file holds
        no probe records.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a summary lacks ``model``, a probe lacks ``id`` or
            ``conditions``, or a condition lacks ``lek_score``/``response``.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    text = Path(path).read_text(encoding="utf-8")

    model = "?"
    probes = []
    for raw in text.split("\n"):
        # Tolerate blank lines (including an entirely empty file) instead of
        # handing an empty string to json.loads.
        if not raw.strip():
            continue
        record = json.loads(raw)
        kind = record.get("type")
        if kind == "summary":
            model = record["model"]
        elif kind == "probe":
            probes.append(record)

    if not probes:
        return

    print(f"\n{'='*70}")
    print(f"Model: {model}")
    print(f"{'='*70}")

    # Union of condition names over all probes, in first-seen order, so a
    # condition that is absent from the first probe still gets a column.
    seen = {}
    for probe in probes:
        seen.update(dict.fromkeys(probe["conditions"]))
    conds = list(seen)

    # Header
    print(f"  {'PROBE':<35s}" + f"  {'v1':>5s} {'v2':>5s}" * len(conds))
    print(f"  {'-'*35}" + f"  {'-----':>5s} {'-----':>5s}" * len(conds))

    totals_v1 = dict.fromkeys(conds, 0)
    totals_v2 = dict.fromkeys(conds, 0)
    # Number of probes that contributed a score, tracked per condition so
    # that "n/a" cells do not dilute the averages.
    scored = dict.fromkeys(conds, 0)

    for probe in probes:
        cells = [f"  {probe['id']:<35s}"]
        for c in conds:
            if c not in probe["conditions"]:
                cells.append(f"  {'n/a':>5s} {'n/a':>5s}")
                continue
            entry = probe["conditions"][c]
            v1 = entry["lek_score"]
            v2 = score_heuristic(entry["response"])["lek_score"]
            totals_v1[c] += v1
            totals_v2[c] += v2
            scored[c] += 1
            cells.append(f"  {v1:>5.1f} {v2:>5.1f}")
        print("".join(cells))

    print()
    for c in conds:
        n = scored[c]
        avg_v1 = totals_v1[c] / n if n else 0
        avg_v2 = totals_v2[c] / n if n else 0
        print(f"  {c:<12s}  v1_avg={avg_v1:>6.2f}  v2_avg={avg_v2:>6.2f}  spread={avg_v2 - avg_v1:>+6.2f}")


if __name__ == "__main__":
    # Every command-line argument is treated as one JSONL benchmark file;
    # the files are re-scored in the order given.
    benchmark_paths = sys.argv[1:]
    for benchmark_path in benchmark_paths:
        rescore_file(benchmark_path)
feat: LEK-1 kernel A/B test — 29 models, P100 validation, curriculum pipeline

Full v2 scorer benchmark data across 29 models (20 base + 9 LEK-tuned):
- P20 (21 probes): All 29 models, 3 conditions each
- P100 (101 probes): Top 5 models + LEK-4B, publication-quality data

Key findings:
- LEK-1B (21.74) beats base 4B/12B/27B at P100 scale — no kernel needed
- Emergent realignment resistance: LEK models degrade with runtime kernel
- Gemma3-12B + JSON kernel = 23.66 (best kernel-boosted score)
- Family lineages: Mistral 3.80→14.58, Qwen regressed then recovered

New scripts: ab_test.py (v2 scorer), self_distill.py (curriculum generation), extract_training.py, rephrase_probes.py, Phase 0/1 runners

New seeds: P01-P100 merged (101 probes), 404 rephrased variants, 50 creative prompts for Phase 0 baseline lock

27B curriculum design: 4-phase staged training targeting 25+ baseline

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-19 11:32:26 +00:00
			`#!/usr/bin/env python3`
			`"""Re-score existing JSONL benchmarks with updated heuristic scorer.`

			`Usage:`
			`python3 rescore.py /Volumes/Data/lem/benchmarks/ab-base-1b-mlxlm.jsonl`
			`python3 rescore.py /Volumes/Data/lem/benchmarks/*.jsonl`
			`"""`
			`import json`
			`import sys`
			`from pathlib import Path`

			`# Import scorer from ab_test`
			`sys.path.insert(0, str(Path(__file__).parent))`
			`from ab_test import score_heuristic`


			`def rescore_file(path):`
			`"""Re-score a JSONL file and print comparison."""`
			`lines = Path(path).read_text().strip().split("\n")`

			`model = "?"`
			`probes = []`
			`for line in lines:`
			`obj = json.loads(line)`
			`if obj["type"] == "summary":`
			`model = obj["model"]`
			`elif obj["type"] == "probe":`
			`probes.append(obj)`

			`if not probes:`
			`return`

			`print(f"\n{'='*70}")`
			`print(f"Model: {model}")`
			`print(f"{'='*70}")`

			`conds = list(probes[0]["conditions"].keys())`

			`# Header`
			`header = f" {'PROBE':<35s}"`
			`for c in conds:`
			`header += f" {'v1':>5s} {'v2':>5s}"`
			`print(header)`
			`print(f" {'-'*35}" + f" {'-----':>5s} {'-----':>5s}" * len(conds))`

			`totals_v1 = {c: 0 for c in conds}`
			`totals_v2 = {c: 0 for c in conds}`
			`count = 0`

			`for p in probes:`
			`line = f" {p['id']:<35s}"`
			`count += 1`
			`for c in conds:`
			`if c not in p["conditions"]:`
			`line += f" {'n/a':>5s} {'n/a':>5s}"`
			`continue`
			`v1 = p["conditions"][c]["lek_score"]`
			`v2 = score_heuristic(p["conditions"][c]["response"])["lek_score"]`
			`totals_v1[c] += v1`
			`totals_v2[c] += v2`
			`line += f" {v1:>5.1f} {v2:>5.1f}"`
			`print(line)`

			`print()`
			`for c in conds:`
			`avg_v1 = totals_v1[c] / count if count else 0`
			`avg_v2 = totals_v2[c] / count if count else 0`
			`print(f" {c:<12s} v1_avg={avg_v1:>6.2f} v2_avg={avg_v2:>6.2f} spread={avg_v2 - avg_v1:>+6.2f}")`


			`if __name__ == "__main__":`
			`for path in sys.argv[1:]:`
			`rescore_file(path)`