Full v2 scorer benchmark data across 29 models (20 base + 9 LEK-tuned):

- P20 (21 probes): all 29 models, 3 conditions each
- P100 (101 probes): top 5 models + LEK-4B, publication-quality data

Key findings:

- LEK-1B (21.74) beats base 4B/12B/27B at P100 scale, with no kernel needed
- Emergent realignment resistance: LEK models degrade with the runtime kernel
- Gemma3-12B + JSON kernel = 23.66 (best kernel-boosted score)
- Family lineages: Mistral 3.80→14.58; Qwen regressed, then recovered

New scripts: ab_test.py (v2 scorer), self_distill.py (curriculum generation), extract_training.py, rephrase_probes.py, Phase 0/1 runners

New seeds: P01-P100 merged (101 probes), 404 rephrased variants, 50 creative prompts for Phase 0 baseline lock

27B curriculum design: 4-phase staged training targeting a 25+ baseline

Co-Authored-By: Virgil <virgil@lethean.io>
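A note on input format: rescore.py below re-reads benchmark JSONL files (presumably written by ab_test.py) and recomputes each response's score with the updated v2 heuristic. The sketch here shows the minimal per-line record shape the script assumes; the field names mirror what rescore_file reads, while the model name, probe id, condition labels, and score values are purely illustrative, and the real files may carry extra fields:

    {"type": "summary", "model": "gemma3-12b"}
    {"type": "probe", "id": "P001", "conditions": {"base": {"lek_score": 12.5, "response": "..."}, "kernel": {"lek_score": 14.0, "response": "..."}}}

Each record occupies one physical line, since the script applies json.loads line by line.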
73 lines · 2 KiB · Python
#!/usr/bin/env python3
"""Re-score existing JSONL benchmarks with updated heuristic scorer.

Usage:
    python3 rescore.py /Volumes/Data/lem/benchmarks/ab-base-1b-mlxlm.jsonl
    python3 rescore.py /Volumes/Data/lem/benchmarks/*.jsonl
"""
import json
import sys
from pathlib import Path

# Import scorer from ab_test (expected to live next to this script)
sys.path.insert(0, str(Path(__file__).parent))
from ab_test import score_heuristic


def rescore_file(path):
    """Re-score a JSONL file and print a v1 vs v2 comparison."""
    lines = Path(path).read_text().strip().split("\n")

    model = "?"
    probes = []
    for line in lines:
        obj = json.loads(line)
        if obj["type"] == "summary":
            model = obj["model"]
        elif obj["type"] == "probe":
            probes.append(obj)

    if not probes:
        return

    print(f"\n{'='*70}")
    print(f"Model: {model}")
    print(f"{'='*70}")

    conds = list(probes[0]["conditions"].keys())

    # Header: one v1/v2 column pair per condition
    header = f" {'PROBE':<35s}"
    for _ in conds:
        header += f" {'v1':>5s} {'v2':>5s}"
    print(header)
    print(f" {'-'*35}" + f" {'-----':>5s} {'-----':>5s}" * len(conds))

    # Running totals: v1 is the stored score, v2 is recomputed from the response
    totals_v1 = {c: 0 for c in conds}
    totals_v2 = {c: 0 for c in conds}
    count = 0

    for p in probes:
        line = f" {p['id']:<35s}"
        count += 1
        for c in conds:
            if c not in p["conditions"]:
                line += f" {'n/a':>5s} {'n/a':>5s}"
                continue
            v1 = p["conditions"][c]["lek_score"]
            v2 = score_heuristic(p["conditions"][c]["response"])["lek_score"]
            totals_v1[c] += v1
            totals_v2[c] += v2
            line += f" {v1:>5.1f} {v2:>5.1f}"
        print(line)

    print()
    for c in conds:
        avg_v1 = totals_v1[c] / count if count else 0
        avg_v2 = totals_v2[c] / count if count else 0
        print(f" {c:<12s} v1_avg={avg_v1:>6.2f} v2_avg={avg_v2:>6.2f} spread={avg_v2 - avg_v1:>+6.2f}")


if __name__ == "__main__":
    for path in sys.argv[1:]:
        rescore_file(path)
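rescore.py imports score_heuristic from the ab_test module, which is not included in this section. Judging from the call site (score_heuristic(response)["lek_score"]), the assumed contract is a function that takes a response string and returns a dict containing at least a numeric "lek_score". A hypothetical stub like the one below, saved as ab_test.py next to the script, is enough to smoke-test the comparison table; it is not the real v2 scorer, and the length-based heuristic is only a placeholder:

    # ab_test.py -- hypothetical stub for smoke-testing rescore.py only;
    # the real v2 scorer is not reproduced here.
    def score_heuristic(response: str) -> dict:
        """Return a dict with a "lek_score" key, as rescore.py expects."""
        # Placeholder heuristic: longer responses score higher, capped at 25.
        return {"lek_score": min(len(response.split()) / 10.0, 25.0)}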