#!/usr/bin/env python3
"""Convert core ml benchmark JSON output to scorer-compatible JSONL.

Extracts baseline and trained responses into separate files for grammar v3 scoring.

Usage:
    python3 scripts/benchmark_to_scorer.py /tmp/benchmark-p0-iter300.json

Outputs:
    /tmp/benchmark-baseline-scorer.jsonl
    /tmp/benchmark-trained-scorer.jsonl
"""

import json
import os
import sys

|
def _record(probe: str, probe_id, response: str, lek_score) -> dict:
    """Build one scorer-compatible training record for a single response.

    The record shape (type/training/meta) is what `cmd/scorer -format=training`
    expects; `category` is fixed to "ethics" for this benchmark set.
    """
    return {
        "type": "training",
        "training": {
            "messages": [
                {"role": "user", "content": probe},
                {"role": "assistant", "content": response},
            ]
        },
        "meta": {
            "probe_id": probe_id,
            "category": "ethics",
            "lek_score": lek_score,
        },
    }


def convert(benchmark_path: str) -> None:
    """Split a benchmark JSON file into baseline/trained scorer JSONL files.

    Reads ``benchmark_path``, extracts each result's baseline and trained
    responses, and writes them as training-format JSONL next to the input
    file as ``<name>-baseline-scorer.jsonl`` and ``<name>-trained-scorer.jsonl``.
    Results missing a response are skipped for that side; a missing lek score
    defaults to 0.
    """
    # Explicit utf-8: responses may contain non-ASCII text, and the outputs
    # below are written with ensure_ascii=False.
    with open(benchmark_path, encoding="utf-8") as f:
        data = json.load(f)

    base_dir = os.path.dirname(benchmark_path)
    base_name = os.path.splitext(os.path.basename(benchmark_path))[0]

    baseline_path = os.path.join(base_dir, f"{base_name}-baseline-scorer.jsonl")
    trained_path = os.path.join(base_dir, f"{base_name}-trained-scorer.jsonl")

    baseline_records = []
    trained_records = []

    for r in data.get("results", []):
        probe = r["prompt"]
        probe_id = r["id"]

        # A side with an empty/missing response contributes no record.
        if r.get("baseline_response"):
            baseline_records.append(
                _record(probe, probe_id, r["baseline_response"],
                        r.get("baseline_lek_score", 0)))

        if r.get("trained_response"):
            trained_records.append(
                _record(probe, probe_id, r["trained_response"],
                        r.get("trained_lek_score", 0)))

    for path, records in [(baseline_path, baseline_records),
                          (trained_path, trained_records)]:
        with open(path, "w", encoding="utf-8") as f:
            for rec in records:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        print(f"  {len(records)} records → {path}")

    # Convenience hint for the next step; the hard-coded repo path matches
    # the author's workstation layout (informational output only).
    print("\nScore with:")
    print("  cd /Users/snider/Code/LEM")
    print(f"  go run ./cmd/scorer -format=training -delta -output=summary {baseline_path}")
    print(f"  go run ./cmd/scorer -format=training -delta -output=summary {trained_path}")
if __name__ == "__main__":
    # Exactly one positional argument is required: the benchmark JSON path.
    args = sys.argv[1:]
    if not args:
        print(f"Usage: {sys.argv[0]} <benchmark.json>")
        sys.exit(1)
    convert(args[0])