#!/usr/bin/env python3
"""Convert core ml benchmark JSON output to scorer-compatible JSONL.

Extracts baseline and trained responses into separate files for grammar v3
scoring.

Usage:
    python3 scripts/benchmark_to_scorer.py /tmp/benchmark-p0-iter300.json

Outputs:
    /tmp/benchmark-p0-iter300-baseline-scorer.jsonl
    /tmp/benchmark-p0-iter300-trained-scorer.jsonl
"""

import json
import sys
from pathlib import Path


def _scorer_record(probe: str, probe_id, response: str, lek_score) -> dict:
    """Build one scorer record in the 'training' JSONL schema.

    The scorer expects a user/assistant message pair under "training" plus
    bookkeeping metadata; "category" is fixed to "ethics" for this benchmark.
    """
    return {
        "type": "training",
        "training": {
            "messages": [
                {"role": "user", "content": probe},
                {"role": "assistant", "content": response},
            ]
        },
        "meta": {
            "probe_id": probe_id,
            "category": "ethics",
            "lek_score": lek_score,
        },
    }


def convert(benchmark_path: str) -> None:
    """Split a benchmark JSON file into baseline/trained scorer JSONL files.

    Reads ``benchmark_path``, and for each entry in ``results`` emits one
    record per non-empty ``baseline_response`` / ``trained_response`` into
    ``<name>-baseline-scorer.jsonl`` / ``<name>-trained-scorer.jsonl`` next
    to the input file, then prints the `go run ./cmd/scorer` commands to
    score them.
    """
    # Explicit UTF-8: records are written with ensure_ascii=False below, so
    # relying on the locale default could corrupt non-ASCII responses.
    with open(benchmark_path, encoding="utf-8") as f:
        data = json.load(f)

    src = Path(benchmark_path)
    out_paths = {
        variant: src.with_name(f"{src.stem}-{variant}-scorer.jsonl")
        for variant in ("baseline", "trained")
    }
    records = {variant: [] for variant in out_paths}

    for r in data.get("results", []):
        for variant in out_paths:
            response = r.get(f"{variant}_response")
            # Falsy (missing or empty) responses are skipped, matching the
            # original behavior of only emitting records with real content.
            if response:
                records[variant].append(
                    _scorer_record(
                        r["prompt"],
                        r["id"],
                        response,
                        r.get(f"{variant}_lek_score", 0),
                    )
                )

    for variant, path in out_paths.items():
        with open(path, "w", encoding="utf-8") as f:
            for rec in records[variant]:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        print(f"  {len(records[variant])} records → {path}")

    print("\nScore with:")
    print("  cd /Users/snider/Code/LEM")
    print(f"  go run ./cmd/scorer -format=training -delta -output=summary {out_paths['baseline']}")
    print(f"  go run ./cmd/scorer -format=training -delta -output=summary {out_paths['trained']}")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # NOTE(review): the extracted source had lost the placeholder after
        # argv[0] (likely stripped as markup); restored as <benchmark.json>.
        print(f"Usage: {sys.argv[0]} <benchmark.json>")
        sys.exit(1)
    convert(sys.argv[1])