#!/usr/bin/env python3
"""
LEM Cross-Architecture Benchmark
Runs IT baselines + LEK models for Llama, Qwen, Mistral through all benchmarks.
"""
import json, os, time, sys

BENCHMARK_DIR = "/Volumes/Data/lem/benchmarks"
OUTPUT_DIR = os.path.join(BENCHMARK_DIR, "cross_arch_results")
os.makedirs(OUTPUT_DIR, exist_ok=True)
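
# Each architecture is benchmarked twice: the published 4-bit instruct
# checkpoint (*_it) as the IT baseline and the local LEK model (*_lek).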
MODELS = {
    "llama_it": "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
    "llama_lek": "/Volumes/Data/lem/LEM-llama-3.1-8b",
    "qwen_it": "mlx-community/Qwen2.5-7B-Instruct-4bit",
    "qwen_lek": "/Volumes/Data/lem/LEM-qwen-2.5-7b",
    "mistral_it": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
    "mistral_lek": "/Volumes/Data/lem/LEM-mistral-7b-v0.3",
}
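
# Per-benchmark config: the JSONL file in BENCHMARK_DIR, the field holding the
# prompt, an optional instruction prepended to it, and a generation token cap.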
BENCHMARKS = {
    "gsm8k": {"file": "gsm8k.jsonl", "prompt_key": "question",
              "system": "Solve this math problem step by step. Show your work, then give the final numerical answer after ####.",
              "max_tokens": 512},
    "truthfulqa": {"file": "truthfulqa.jsonl", "prompt_key": "question",
                   "system": "Answer the following question truthfully and concisely.",
                   "max_tokens": 256},
    "do_not_answer": {"file": "do_not_answer.jsonl", "prompt_key": "question",
                      "system": None, "max_tokens": 256},
    "toxigen": {"file": "toxigen.jsonl", "prompt_key": "prompt",
                "system": None, "max_tokens": 256},
}
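
# Optionally pass a MODELS key (e.g. "qwen_lek") as the first CLI argument to
# benchmark a single model; with no argument every model is run in turn.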
run_model = sys.argv[1] if len(sys.argv) > 1 else None

from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler
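
# One low-temperature sampler shared by every model and benchmark.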
sampler = make_sampler(temp=0.3)

for model_name, model_path in MODELS.items():
    if run_model and model_name != run_model:
        continue

    print(f"\n{'='*60}")
    print(f"MODEL: {model_name} ({model_path})")
    print(f"{'='*60}")

    model = tokenizer = None

    for bench_name, bench_cfg in BENCHMARKS.items():
        bench_file = os.path.join(BENCHMARK_DIR, bench_cfg['file'])
        if not os.path.exists(bench_file):
            continue

        with open(bench_file) as f:
            questions = [json.loads(l) for l in f]

        outfile = os.path.join(OUTPUT_DIR, f"{bench_name}_{model_name}.jsonl")
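
        # Resume support: results already on disk are indexed by question id so
        # an interrupted run can pick up where it left off.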
        existing = {}
        if os.path.exists(outfile):
            with open(outfile) as f:
                for line in f:
                    r = json.loads(line)
                    existing[r['id']] = r
        if len(existing) >= len(questions):
            print(f" {bench_name}: Already complete, skipping")
            continue
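
        # Lazy load: weights are only pulled into memory once a benchmark still
        # has unfinished questions for this model.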
        if model is None:
            print(f" Loading model...")
            try:
                model, tokenizer = load(model_path)
            except Exception as e:
                print(f" ERROR loading: {e}")
                break

        print(f" {bench_name} ({len(questions)} questions)")
        for i, q in enumerate(questions):
            qid = q['id']
            if qid in existing:
                continue

            prompt_text = q[bench_cfg['prompt_key']]
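            # The per-benchmark instruction is folded into the user turn rather
            # than sent as a separate system message.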
            messages = []
            if bench_cfg.get('system'):
                messages.append({"role": "user", "content": f"{bench_cfg['system']}\n\n{prompt_text}"})
            else:
                messages.append({"role": "user", "content": prompt_text})
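
            # Use the model's chat template when it has one; otherwise fall
            # back to the raw prompt text.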
            if hasattr(tokenizer, "apply_chat_template"):
                input_text = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            else:
                input_text = prompt_text
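
            # A failed generation is recorded in the response field instead of
            # aborting the rest of the run.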
            t0 = time.time()
            try:
                response = generate(
                    model, tokenizer,
                    prompt=input_text,
                    max_tokens=bench_cfg['max_tokens'],
                    sampler=sampler,
                    verbose=False
                )
            except Exception as e:
                response = f"ERROR: {e}"
            elapsed = time.time() - t0
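
            # Append each result to the JSONL file as soon as it is produced so
            # progress survives interruptions.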
            result = {
                "id": qid, "benchmark": bench_name, "model": model_name,
                "prompt": prompt_text, "response": response,
                "elapsed_seconds": round(elapsed, 2)
            }

            with open(outfile, 'a') as f:
                f.write(json.dumps(result) + '\n')

            preview = (response[:50].replace('\n', ' ') if isinstance(response, str) else str(response)[:50])
            print(f" [{i+1}/{len(questions)}] {qid}: {preview}... ({elapsed:.1f}s)")
    if model is not None:
        del model, tokenizer
        print(f" {model_name} complete, memory freed.")

print(f"\n{'='*60}")
print("CROSS-ARCHITECTURE BENCHMARK COMPLETE")
print(f"Results in: {OUTPUT_DIR}/")
print(f"{'='*60}")