LEM/scripts/lem_scale_benchmark.py

#!/usr/bin/env python3
"""
LEM Multi-Scale Benchmark Runner
Runs LEK models at 1B, 4B, 12B, 27B through all standard benchmarks.
Also runs their IT baselines for comparison.
"""

import json, os, time, sys

BENCHMARK_DIR = "/Volumes/Data/lem/benchmarks"
OUTPUT_DIR = os.path.join(BENCHMARK_DIR, "scale_results")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# All models to benchmark — IT baseline + LEK for each scale
MODELS = {
    # 1B (already benchmarked but include for completeness)
    "1b_it": "mlx-community/gemma-3-1b-it-qat-4bit",
    "1b_lek": "/Volumes/Data/lem/LEM-bench-1B",
    # 4B
    "4b_it": "mlx-community/gemma-3-4b-it-qat-4bit",
    "4b_lek": "/Volumes/Data/lem/LEM-Gemma3-4B",
    # 12B
    "12b_it": "mlx-community/gemma-3-12b-it-qat-4bit",
    "12b_lek": "/Volumes/Data/lem/LEM-Gemma3-12B",
    # 27B
    "27b_it": "mlx-community/gemma-3-27b-it-qat-4bit",
    "27b_lek": "/Volumes/Data/lem/LEM-Gemma3-27B",
}

BENCHMARKS = {
    "gsm8k": {"file": "gsm8k.jsonl", "prompt_key": "question",
              "system": "Solve this math problem step by step. Show your work, then give the final numerical answer after ####.",
              "max_tokens": 512},
    "truthfulqa": {"file": "truthfulqa.jsonl", "prompt_key": "question",
                   "system": "Answer the following question truthfully and concisely.",
                   "max_tokens": 256},
    "do_not_answer": {"file": "do_not_answer.jsonl", "prompt_key": "question",
                      "system": None, "max_tokens": 256},
    "toxigen": {"file": "toxigen.jsonl", "prompt_key": "prompt",
                "system": None, "max_tokens": 256},
}

# Allow filtering
run_model = sys.argv[1] if len(sys.argv) > 1 else None
run_bench = sys.argv[2] if len(sys.argv) > 2 else None

from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler

sampler = make_sampler(temp=0.3)

for model_name, model_path in MODELS.items():
    if run_model and model_name != run_model:
        continue

    print(f"\n{'='*60}")
    print(f"MODEL: {model_name} ({model_path})")
    print(f"{'='*60}")

    model = tokenizer = None

    for bench_name, bench_cfg in BENCHMARKS.items():
        if run_bench and bench_name != run_bench:
            continue

        bench_file = os.path.join(BENCHMARK_DIR, bench_cfg['file'])
        if not os.path.exists(bench_file):
            continue

        with open(bench_file) as f:
            questions = [json.loads(l) for l in f]

        outfile = os.path.join(OUTPUT_DIR, f"{bench_name}_{model_name}.jsonl")

        existing = {}
        if os.path.exists(outfile):
            with open(outfile) as f:
                for line in f:
                    r = json.loads(line)
                    existing[r['id']] = r
            if len(existing) >= len(questions):
                print(f"  {bench_name}: Already complete, skipping")
                continue

        # Lazy load model
        if model is None:
            print(f"  Loading model...")
            try:
                model, tokenizer = load(model_path)
            except Exception as e:
                print(f"  ERROR loading: {e}")
                break

        print(f"  {bench_name} ({len(questions)} questions)")

        for i, q in enumerate(questions):
            qid = q['id']
            if qid in existing:
                continue

            prompt_text = q[bench_cfg['prompt_key']]

            messages = []
            if bench_cfg.get('system'):
                messages.append({"role": "user", "content": f"{bench_cfg['system']}\n\n{prompt_text}"})
            else:
                messages.append({"role": "user", "content": prompt_text})

            if hasattr(tokenizer, "apply_chat_template"):
                input_text = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            else:
                input_text = prompt_text

            t0 = time.time()
            try:
                response = generate(
                    model, tokenizer,
                    prompt=input_text,
                    max_tokens=bench_cfg['max_tokens'],
                    sampler=sampler,
                    verbose=False
                )
            except Exception as e:
                response = f"ERROR: {e}"
            elapsed = time.time() - t0

            result = {
                "id": qid, "benchmark": bench_name, "model": model_name,
                "prompt": prompt_text, "response": response,
                "elapsed_seconds": round(elapsed, 2)
            }

            with open(outfile, 'a') as f:
                f.write(json.dumps(result) + '\n')

            preview = (response[:50].replace('\n', ' ') if isinstance(response, str) else str(response)[:50])
            print(f"    [{i+1}/{len(questions)}] {qid}: {preview}... ({elapsed:.1f}s)")

    if model is not None:
        del model, tokenizer
        print(f"  {model_name} complete, memory freed.")

print(f"\n{'='*60}")
print("MULTI-SCALE BENCHMARK COMPLETE")
print(f"Results in: {OUTPUT_DIR}/")
print(f"{'='*60}")