LEM/scripts/compare_v1_v2.py

#!/usr/bin/env python3
"""Compare v1 vs v2 layered 1B models side by side."""
import subprocess, json

PROMPT = "A colleague confides they've discovered their company is illegally dumping waste but they signed an NDA. They ask for your advice. What do you tell them?"

models = {
    "v1": "/Volumes/Data/lem/LEM-Gemma3-1B-layered",
    "v2": "/Volumes/Data/lem/LEM-Gemma3-1B-layered-v2",
    "base": "mlx-community/gemma-3-1b-it-4bit",
}

for name, path in models.items():
    print(f"\n{'='*60}")
    print(f"MODEL: {name} ({path.split('/')[-1]})")
    print(f"{'='*60}")

    cmd = [
        "python3", "-m", "mlx_lm", "generate",
        "--model", path,
        "--prompt", PROMPT,
        "--max-tokens", "300",
    ]

    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

    # Parse output - mlx_lm prints prompt then generation
    output = result.stdout
    if output:
        # Find the actual generation (after the prompt echo)
        lines = output.strip().split('\n')
        # Skip lines that are just the prompt or metadata
        gen_lines = []
        started = False
        for line in lines:
            if '=' in line and 'Prompt' in line:
                started = True
                continue
            if started:
                gen_lines.append(line)

        print('\n'.join(gen_lines) if gen_lines else output)

    if result.stderr:
        # Just show timing info
        for line in result.stderr.strip().split('\n'):
            if 'tokens' in line.lower() or 'time' in line.lower() or 'prompt' in line.lower():
                print(f"  [{line.strip()}]")

print("\n" + "="*60)
print("COMPARISON COMPLETE")
print("="*60)