diff --git a/.gitignore b/.gitignore
index 1c1bf6a..7852a65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ worker/output/
 
 # Parquet exports (generated, sync to HF via scripts/sync_hf.py)
 training/parquet/
+
+# lm-eval-harness results (large, stored locally)
+benchmarks/lm-eval-results/
diff --git a/scripts/compare_models.py b/scripts/compare_models.py
new file mode 100755
index 0000000..4eeb0a9
--- /dev/null
+++ b/scripts/compare_models.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""
+Compare lm-eval-harness results between base and LEK models.
+
+Reads results.json files from benchmark runs and produces a comparison table
+showing deltas between the base model and the LEK fine-tuned version.
+
+Usage:
+    python3 scripts/compare_models.py benchmarks/lm-eval-results/base_* benchmarks/lm-eval-results/lek_*
+    python3 scripts/compare_models.py --base results/base --lek results/lek
+    python3 scripts/compare_models.py --dir benchmarks/lm-eval-results/   # compare the first two runs found
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def load_results(result_dir):
+    """Load results.json from a benchmark run directory."""
+    result_dir = Path(result_dir)
+    results_file = result_dir / "results.json"
+    if not results_file.exists():
+        # Check subdirectories (lm-eval may write results_<timestamp>.json under a model-named folder)
+        for f in result_dir.rglob("results*.json"):
+            results_file = f
+            break
+    if not results_file.exists():
+        print(f"Warning: no results.json in {result_dir}")
+        return None
+    with open(results_file) as f:
+        return json.load(f)
+
+
+def extract_scores(data):
+    """Extract primary metric per task from results."""
+    scores = {}
+    results = data.get("results", {})
+    for task, metrics in results.items():
+        # Priority order for primary metric
+        for key in ["acc,none", "acc_norm,none", "exact_match,strict-match",
+                    "mc2,none", "prompt_level_strict_acc,none"]:
+            if key in metrics:
+                scores[task] = {
+                    "value": metrics[key],
+                    "metric": key.split(",")[0],
+                }
+                break
+        if task not in scores:
+            # Fallback: first numeric metric
+            for key, val in metrics.items():
+                if isinstance(val, (int, float)) and not key.startswith("alias"):
+                    scores[task] = {"value": val, "metric": key.split(",")[0]}
+                    break
+    return scores
+
+
+def compare(base_data, lek_data, base_name="Base", lek_name="LEK"):
+    """Print comparison table."""
+    base_scores = extract_scores(base_data)
+    lek_scores = extract_scores(lek_data)
+
+    all_tasks = sorted(set(base_scores) | set(lek_scores))
+
+    print(f"\n{'Task':<30s} {'Metric':<15s} {base_name:>10s} {lek_name:>10s} {'Delta':>10s}")
+    print("-" * 80)
+
+    for task in all_tasks:
+        b = base_scores.get(task, {})
+        l = lek_scores.get(task, {})
+        bv = b.get("value")
+        lv = l.get("value")
+        metric = b.get("metric") or l.get("metric", "?")
+
+        if bv is not None and lv is not None:
+            delta = lv - bv
+            sign = "+" if delta >= 0 else ""
+            print(f"{task:<30s} {metric:<15s} {bv*100:>9.1f}% {lv*100:>9.1f}% {sign}{delta*100:>8.1f}%")
+        elif bv is not None:
+            print(f"{task:<30s} {metric:<15s} {bv*100:>9.1f}% {'—':>10s} {'—':>10s}")
+        elif lv is not None:
+            print(f"{task:<30s} {metric:<15s} {'—':>10s} {lv*100:>9.1f}% {'—':>10s}")
+
+    # Summary
+    both = [t for t in all_tasks if t in base_scores and t in lek_scores]
+    if both:
+        avg_base = sum(base_scores[t]["value"] for t in both) / len(both)
+        avg_lek = sum(lek_scores[t]["value"] for t in both) / len(both)
+        avg_delta = avg_lek - avg_base
+        sign = "+" if avg_delta >= 0 else ""
+        print("-" * 80)
+        print(f"{'AVERAGE':<30s} {'':15s} {avg_base*100:>9.1f}% {avg_lek*100:>9.1f}% {sign}{avg_delta*100:>8.1f}%")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Compare lm-eval benchmark results")
+    parser.add_argument("--base", help="Base model results directory")
+    parser.add_argument("--lek", help="LEK model results directory")
+    parser.add_argument("--dir", help="Directory of result runs (compares the first two)")
+    parser.add_argument("paths", nargs="*", help="Result directories (base first, then lek)")
+    args = parser.parse_args()
+
+    if args.base and args.lek:
+        base_data = load_results(args.base)
+        lek_data = load_results(args.lek)
+        if base_data and lek_data:
+            compare(base_data, lek_data)
+    elif args.dir:
+        result_dir = Path(args.dir)
+        dirs = sorted(d for d in result_dir.iterdir() if d.is_dir())
+        if len(dirs) >= 2:
+            print(f"Found {len(dirs)} result directories")
+            for i, d in enumerate(dirs):
+                print(f"  [{i}] {d.name}")
+            # Compare first two by default
+            base_data = load_results(dirs[0])
+            lek_data = load_results(dirs[1])
+            if base_data and lek_data:
+                compare(base_data, lek_data, dirs[0].name, dirs[1].name)
+        else:
+            print(f"Need at least 2 result directories in {result_dir}")
+    elif len(args.paths) >= 2:
+        base_data = load_results(args.paths[0])
+        lek_data = load_results(args.paths[1])
+        if base_data and lek_data:
+            compare(base_data, lek_data,
+                    Path(args.paths[0]).name, Path(args.paths[1]).name)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh
new file mode 100755
index 0000000..3052cfc
--- /dev/null
+++ b/scripts/run_benchmarks.sh
@@ -0,0 +1,180 @@
+#!/bin/bash
+#
+# LEM Standard Benchmark Suite
+# =============================
+# Runs industry-standard benchmarks using EleutherAI's lm-evaluation-harness.
+# Results are broadly comparable to the HuggingFace Open LLM Leaderboard.
+#
+# Prerequisites:
+#   pipx install lm-eval    # or: pip install lm-eval
+#
+# Usage:
+#   ./scripts/run_benchmarks.sh --model-id google/gemma-3-12b-it              # defaults: hf backend, quick suite
+#   ./scripts/run_benchmarks.sh --model hf --model-id google/gemma-3-12b-it
+#   ./scripts/run_benchmarks.sh --model local-chat-completions --api-url http://localhost:8090/v1
+#   ./scripts/run_benchmarks.sh --suite leaderboard-v2                        # Open LLM Leaderboard v2 benchmarks
+#   ./scripts/run_benchmarks.sh --suite classic                               # Classic benchmarks
+#   ./scripts/run_benchmarks.sh --suite quick                                 # Fast subset for testing
+#
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(dirname "$SCRIPT_DIR")"
+RESULTS_DIR="${REPO_ROOT}/benchmarks/lm-eval-results"
+mkdir -p "$RESULTS_DIR"
+
+# Defaults
+MODEL_TYPE="hf"
+MODEL_ID=""
+API_URL=""
+SUITE="quick"
+BATCH_SIZE="auto"
+EXTRA_ARGS=""
+
+usage() {
+    echo "Usage: $0 [OPTIONS]"
+    echo ""
+    echo "Options:"
+    echo "  --model TYPE      Model backend: hf, local-chat-completions, vllm (default: hf)"
+    echo "  --model-id ID     HuggingFace model ID (e.g. google/gemma-3-12b-it)"
+    echo "  --api-url URL     API URL for local-chat-completions backend"
+    echo "  --api-model NAME  Model name for API backend (default: \"default\")"
+    echo "  --suite SUITE     Benchmark suite: quick, classic, leaderboard-v2, full (default: quick)"
+    echo "  --batch-size N    Batch size (default: auto)"
+    echo "  --output DIR      Output directory (default: benchmarks/lm-eval-results/)"
+    echo "  --help            Show this help"
+    exit 0
+}
+
+# Parse args
+API_MODEL=""
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --model) MODEL_TYPE="$2"; shift 2 ;;
+        --model-id) MODEL_ID="$2"; shift 2 ;;
+        --api-url) API_URL="$2"; shift 2 ;;
+        --api-model) API_MODEL="$2"; shift 2 ;;
+        --suite) SUITE="$2"; shift 2 ;;
+        --batch-size) BATCH_SIZE="$2"; shift 2 ;;
+        --output) RESULTS_DIR="$2"; shift 2 ;;
+        --help) usage ;;
+        *) EXTRA_ARGS="$EXTRA_ARGS $1"; shift ;;
+    esac
+done
+
+# ── Suite definitions ──────────────────────────────────────────────
+
+case "$SUITE" in
+    quick)
+        # Fast sanity check (~5-10 min)
+        TASKS="gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande"
+        ;;
+    classic)
+        # Classic Open LLM Leaderboard v1 benchmarks
+        TASKS="mmlu,gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande"
+        ;;
+    leaderboard-v2)
+        # Open LLM Leaderboard v2 (harder, current standard)
+        TASKS="ifeval,bbh,gpqa,musr,mmlu_pro"
+        # Note: math_hard not included — requires special setup
+        ;;
+    full)
+        # Everything
+        TASKS="mmlu,mmlu_pro,gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande,ifeval,bbh,gpqa,musr"
+        ;;
+    *)
+        # Custom task list
+        TASKS="$SUITE"
+        ;;
+esac
+
+# ── Build model args ───────────────────────────────────────────────
+
+MODEL_ARGS=""
+RUN_NAME=""
+
+case "$MODEL_TYPE" in
+    hf)
+        if [ -z "$MODEL_ID" ]; then
+            echo "Error: --model-id required for hf backend"
+            echo "Example: --model-id google/gemma-3-12b-it"
+            exit 1
+        fi
+        MODEL_ARGS="pretrained=${MODEL_ID}"
+        RUN_NAME=$(echo "$MODEL_ID" | tr '/' '_')
+        ;;
+    local-chat-completions)
+        if [ -z "$API_URL" ]; then
+            API_URL="http://localhost:8090/v1"
+            echo "Using default API URL: $API_URL"
+        fi
+        MODEL_ARGS="model=${API_MODEL:-default},base_url=${API_URL},num_concurrent=1,max_retries=3,tokenized_requests=False"
+        RUN_NAME="${API_MODEL:-local-api}"
+        ;;
+    vllm)
+        if [ -z "$MODEL_ID" ]; then
+            echo "Error: --model-id required for vllm backend"
+            exit 1
+        fi
+        MODEL_ARGS="pretrained=${MODEL_ID}"
+        RUN_NAME=$(echo "$MODEL_ID" | tr '/' '_')
+        ;;
+    *)
+        echo "Error: unknown model type: $MODEL_TYPE"
+        exit 1
+        ;;
+esac
+
+# ── Run ────────────────────────────────────────────────────────────
+
+TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+OUTPUT_PATH="${RESULTS_DIR}/${RUN_NAME}_${SUITE}_${TIMESTAMP}"
+
+echo "============================================"
+echo "LEM Standard Benchmark Suite"
+echo "============================================"
+echo "Model:  ${MODEL_TYPE} (${MODEL_ID:-${API_URL}})"
+echo "Suite:  ${SUITE}"
+echo "Tasks:  ${TASKS}"
+echo "Output: ${OUTPUT_PATH}"
+echo "============================================"
+echo ""
+
+lm_eval \
+    --model "$MODEL_TYPE" \
+    --model_args "$MODEL_ARGS" \
+    --tasks "$TASKS" \
+    --batch_size "$BATCH_SIZE" \
+    --output_path "$OUTPUT_PATH" \
+    --log_samples \
+    $EXTRA_ARGS
+
+echo ""
+echo "Results saved to: ${OUTPUT_PATH}"
+echo ""
+
+# Show summary (lm-eval may nest results_<timestamp>.json under a model subdirectory)
+if RESULTS_JSON=$(find "$OUTPUT_PATH" -name 'results*.json' 2>/dev/null | head -n 1) && [ -n "$RESULTS_JSON" ]; then
+    echo "=== Results Summary ==="
+    python3 -c "
+import json, sys
+with open('$RESULTS_JSON') as f:
+    data = json.load(f)
+results = data.get('results', {})
+print(f'Model: {data.get(\"model_name\", \"unknown\")}')
+print(f'Tasks: {len(results)}')
+print()
+for task, scores in sorted(results.items()):
+    # Find the primary metric
+    for key in ['acc,none', 'acc_norm,none', 'exact_match,strict-match', 'mc2,none']:
+        if key in scores:
+            print(f'  {task:30s} {key.split(\",\")[0]:15s} {scores[key]*100:.1f}%')
+            break
+    else:
+        # Show first numeric metric
+        for key, val in scores.items():
+            if isinstance(val, (int, float)) and not key.startswith('alias'):
+                print(f'  {task:30s} {key.split(\",\")[0]:15s} {val:.4f}')
+                break
+"
+fi