#!/usr/bin/env python3
"""
LEK Method Benchmark: Base vs IT vs Abliterated vs LEM
Runs P01-P40 through each model variant and saves responses for comparison.
"""

import json, os, time, sys
from pathlib import Path

# Paths
SEEDS_DIR = "/Volumes/Data/lem/seeds"
OUTPUT_DIR = "/Volumes/Data/lem/benchmark"
os.makedirs(OUTPUT_DIR, exist_ok=True)
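# Each model's responses are appended to OUTPUT_DIR/<model_name>.jsonl, one JSON object per line.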

# Models to benchmark
MODELS = {
    "base_pt": "mlx-community/gemma-3-1b-pt-4bit",
    "instruction_tuned": "mlx-community/gemma-3-1b-it-qat-4bit",
    "abliterated": "mlx-community/gemma-3-1b-it-abliterated-4bit",
    "lem_ethics": "/Volumes/Data/lem/LEM-bench-1B",
    "lem_ethics_allen": "/Volumes/Data/lem/LEM-bench-1B-allen",
}
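# The first three entries are mlx-community Hub repos; the two LEM entries are local fine-tuned checkpoints.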

# Load prompts
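# Each P*.json seed file is expected to hold a JSON list of prompt objects with "id", "domain", and "prompt" fields.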
prompts = []
for fname in sorted(Path(SEEDS_DIR).glob("P*.json")):
    with open(fname) as f:
        prompts.extend(json.load(f))

print(f"Loaded {len(prompts)} prompts")

# Select a representative subset for speed (10 prompts across domains)
# Include: technical, ethical, creative, Hypnos (consciousness)
BENCHMARK_IDS = [
    "P01_IDENTITY_WHISTLEBLOWER",   # Technical + ethical
    "P04_NETWORK_CENSORSHIP",       # Technical + resistance
    "P09_PAYMENT_DEBANKED",         # Ethical + practical
    "P11_HYPNOS_DREAM",             # Consciousness / creative
    "P12_HYPNOS_MEMORY",            # Consciousness / philosophical
    "P13_HYPNOS_SILENCE",           # Consciousness / meta
    "P17_EDUCATION_SCIENCE",        # Simple explanation
    "P24_CENSORSHIP_METAPHOR",      # Creative + political
    "P28_EDUCATION_DECOLONIAL",     # Ethics + education
    "P36_TRUTH_SUBJECTIVE",         # Philosophy + code
]

# Default: P01-P40 (original seed prompts only)
# --all runs all 100 (including expanded)
# --subset runs the 10 representative prompts
run_all = "--all" in sys.argv
run_subset = "--subset" in sys.argv
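# The default filter below slices p["id"][1:3], so it assumes two-digit prompt numbers (e.g. "P07", "P36").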

if run_subset:
    benchmark_prompts = [p for p in prompts if p["id"] in BENCHMARK_IDS]
    print(f"Running {len(benchmark_prompts)} representative prompts")
elif run_all:
    benchmark_prompts = prompts
    print(f"Running ALL {len(prompts)} prompts")
else:
    # P01-P40 only
    benchmark_prompts = [p for p in prompts if p["id"].startswith("P") and int(p["id"][1:3]) <= 40]
    print(f"Running P01-P40: {len(benchmark_prompts)} prompts")

from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler

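# Low temperature keeps sampling close to greedy so responses are reasonably reproducible across variants.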
sampler = make_sampler(temp=0.3)

for model_name, model_path in MODELS.items():
    print(f"\n{'='*60}")
    print(f"MODEL: {model_name} ({model_path})")
    print(f"{'='*60}")

    outfile = os.path.join(OUTPUT_DIR, f"{model_name}.jsonl")

    # Skip if already done
    if os.path.exists(outfile):
        with open(outfile) as f:
            done = sum(1 for _ in f)
        if done >= len(benchmark_prompts):
            print(f" Already complete ({done} responses), skipping")
            continue
        else:
            print(f" Resuming from {done}/{len(benchmark_prompts)}")

    try:
        model, tokenizer = load(model_path)
    except Exception as e:
        print(f" ERROR loading model: {e}")
        continue

    results = []

    # Load existing results for resume
    if os.path.exists(outfile):
        with open(outfile) as f:
            results = [json.loads(l) for l in f]
    done_ids = {r["id"] for r in results}
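    # done_ids lets the prompt loop below skip anything answered in a previous partial run.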

    for i, p in enumerate(benchmark_prompts):
        if p["id"] in done_ids:
            continue

        prompt_text = p["prompt"]

        # For base PT model, just feed raw text (no chat template)
        if model_name == "base_pt":
            # PT models are completion models, not chat models
            input_text = prompt_text
        else:
            # Chat models get chat template
            if hasattr(tokenizer, "apply_chat_template"):
                messages = [{"role": "user", "content": prompt_text}]
                input_text = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            else:
                input_text = prompt_text

        t0 = time.time()
        try:
            response = generate(
                model, tokenizer,
                prompt=input_text,
                max_tokens=512,
                sampler=sampler,
                verbose=False
            )
        except Exception as e:
            response = f"ERROR: {e}"
        elapsed = time.time() - t0

        result = {
            "id": p["id"],
            "domain": p["domain"],
            "prompt": prompt_text,
            "response": response,
            "model": model_name,
            "elapsed_seconds": round(elapsed, 2)
        }
        results.append(result)

        # Save incrementally
        with open(outfile, "a") as f:
            f.write(json.dumps(result) + "\n")

        # Progress
        preview = response[:80].replace('\n', ' ') if isinstance(response, str) else str(response)[:80]
        print(f" [{i+1}/{len(benchmark_prompts)}] {p['id']}: {preview}... ({elapsed:.1f}s)")

    # Free memory
    del model, tokenizer
    print(f" Done. {len(results)} responses saved to {outfile}")

print(f"\n{'='*60}")
print("BENCHMARK COMPLETE")
print(f"Results in: {OUTPUT_DIR}/")
print(f"{'='*60}")

# Quick comparison summary
print("\n\nQUICK COMPARISON (first response from each model):")
print("-" * 60)
for model_name in MODELS:
    outfile = os.path.join(OUTPUT_DIR, f"{model_name}.jsonl")
    if os.path.exists(outfile):
        with open(outfile) as f:
            first = json.loads(f.readline())
        resp = first["response"][:200].replace('\n', ' ')
        print(f"\n[{model_name}] {first['id']}:")
        print(f" {resp}")