LEM/scripts/lem_benchmark.py
#!/usr/bin/env python3
"""
LEK Method Benchmark: Base vs IT vs Abliterated vs LEM
Runs P01-P40 through each model variant and saves responses for comparison.
"""
import json, os, time, sys
from pathlib import Path
# Paths
SEEDS_DIR = "/Volumes/Data/lem/seeds"
OUTPUT_DIR = "/Volumes/Data/lem/benchmark"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Models to benchmark
MODELS = {
    "base_pt": "mlx-community/gemma-3-1b-pt-4bit",
    "instruction_tuned": "mlx-community/gemma-3-1b-it-qat-4bit",
    "abliterated": "mlx-community/gemma-3-1b-it-abliterated-4bit",
    "lem_ethics": "/Volumes/Data/lem/LEM-bench-1B",
    "lem_ethics_allen": "/Volumes/Data/lem/LEM-bench-1B-allen",
}
# Load prompts
prompts = []
for fname in sorted(Path(SEEDS_DIR).glob("P*.json")):
    with open(fname) as f:
        prompts.extend(json.load(f))
print(f"Loaded {len(prompts)} prompts")
# Select a representative subset for speed (10 prompts across domains)
# Include: technical, ethical, creative, Hypnos (consciousness)
BENCHMARK_IDS = [
    "P01_IDENTITY_WHISTLEBLOWER",  # Technical + ethical
    "P04_NETWORK_CENSORSHIP",      # Technical + resistance
    "P09_PAYMENT_DEBANKED",        # Ethical + practical
    "P11_HYPNOS_DREAM",            # Consciousness / creative
    "P12_HYPNOS_MEMORY",           # Consciousness / philosophical
    "P13_HYPNOS_SILENCE",          # Consciousness / meta
    "P17_EDUCATION_SCIENCE",       # Simple explanation
    "P24_CENSORSHIP_METAPHOR",     # Creative + political
    "P28_EDUCATION_DECOLONIAL",    # Ethics + education
    "P36_TRUTH_SUBJECTIVE",        # Philosophy + code
]
# Default: P01-P40 (original seed prompts only)
# --all runs all 100 (including expanded)
# --subset runs the 10 representative prompts
run_all = "--all" in sys.argv
run_subset = "--subset" in sys.argv
if run_subset:
    benchmark_prompts = [p for p in prompts if p["id"] in BENCHMARK_IDS]
    print(f"Running {len(benchmark_prompts)} representative prompts")
elif run_all:
    benchmark_prompts = prompts
    print(f"Running ALL {len(prompts)} prompts")
else:
    # P01-P40 only
    benchmark_prompts = [p for p in prompts if p["id"].startswith("P") and int(p["id"][1:3]) <= 40]
    print(f"Running P01-P40: {len(benchmark_prompts)} prompts")
from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler
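# A low temperature (0.3) keeps generations fairly deterministic, presumably so
# the model variants can be compared on roughly comparable outputs.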
sampler = make_sampler(temp=0.3)
for model_name, model_path in MODELS.items():
    print(f"\n{'='*60}")
    print(f"MODEL: {model_name} ({model_path})")
    print(f"{'='*60}")
    outfile = os.path.join(OUTPUT_DIR, f"{model_name}.jsonl")
    # Skip if already done
    if os.path.exists(outfile):
        with open(outfile) as f:
            done = sum(1 for _ in f)
        if done >= len(benchmark_prompts):
            print(f" Already complete ({done} responses), skipping")
            continue
        else:
            print(f" Resuming from {done}/{len(benchmark_prompts)}")
    try:
        model, tokenizer = load(model_path)
    except Exception as e:
        print(f" ERROR loading model: {e}")
        continue
    results = []
    # Load existing results for resume
    if os.path.exists(outfile):
        with open(outfile) as f:
            results = [json.loads(line) for line in f if line.strip()]
    # Computed even when no outfile exists, so a fresh run doesn't hit a NameError
    done_ids = {r["id"] for r in results}
    for i, p in enumerate(benchmark_prompts):
        if p["id"] in done_ids:
            continue
        prompt_text = p["prompt"]
        # For the base PT model, just feed raw text (no chat template)
        if model_name == "base_pt":
            # PT models are completion models, not chat models
            input_text = prompt_text
        else:
            # Chat models get the chat template
            if hasattr(tokenizer, "apply_chat_template"):
                messages = [{"role": "user", "content": prompt_text}]
                input_text = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            else:
                input_text = prompt_text
        t0 = time.time()
        try:
            response = generate(
                model, tokenizer,
                prompt=input_text,
                max_tokens=512,
                sampler=sampler,
                verbose=False
            )
        except Exception as e:
            response = f"ERROR: {e}"
        elapsed = time.time() - t0
        result = {
            "id": p["id"],
            "domain": p["domain"],
            "prompt": prompt_text,
            "response": response,
            "model": model_name,
            "elapsed_seconds": round(elapsed, 2)
        }
        results.append(result)
        # Save incrementally
        with open(outfile, "a") as f:
            f.write(json.dumps(result) + "\n")
        # Progress
        preview = response[:80].replace('\n', ' ') if isinstance(response, str) else str(response)[:80]
        print(f" [{i+1}/{len(benchmark_prompts)}] {p['id']}: {preview}... ({elapsed:.1f}s)")
    # Free memory before loading the next model
    del model, tokenizer
    print(f" Done. {len(results)} responses saved to {outfile}")
print(f"\n{'='*60}")
print("BENCHMARK COMPLETE")
print(f"Results in: {OUTPUT_DIR}/")
print(f"{'='*60}")
# Quick comparison summary
print("\n\nQUICK COMPARISON (first response from each model):")
print("-" * 60)
for model_name in MODELS:
    outfile = os.path.join(OUTPUT_DIR, f"{model_name}.jsonl")
    if os.path.exists(outfile):
        with open(outfile) as f:
            first = json.loads(f.readline())
        resp = first["response"][:200].replace('\n', ' ')
        print(f"\n[{model_name}] {first['id']}:")
        print(f" {resp}")