Add standard benchmark suite using EleutherAI lm-evaluation-harness

- run_benchmarks.sh: wrapper for lm-eval with suite presets (quick, classic, leaderboard-v2, full)
- compare_models.py: compare base vs LEK results with delta table
- Supports HF transformers, local-chat-completions (MLX/Ollama), and vLLM backends
- Results comparable to HuggingFace Open LLM Leaderboard

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 00:05:48 +00:00 · 2026-02-15 00:05:48 +00:00 · abd63d3342
commit abd63d3342
parent b8f9191b05
3 changed files with 317 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,6 @@ worker/output/
 # Parquet exports (generated, sync to HF via scripts/sync_hf.py)
 training/parquet/
 # lm-eval-harness results (large, stored locally)
 benchmarks/lm-eval-results/
--- a/scripts/compare_models.py
+++ b/scripts/compare_models.py
@ -0,0 +1,134 @@
 #!/usr/bin/env python3
 """
 Compare lm-eval-harness results between base and LEK models.
 Reads results.json files from benchmark runs and produces a comparison table
 showing deltas between base model and LEK fine-tuned version.
 Usage:
  python3 scripts/compare_models.py benchmarks/lm-eval-results/base_* benchmarks/lm-eval-results/lek_*
  python3 scripts/compare_models.py --base results/base --lek results/lek
  python3 scripts/compare_models.py --dir benchmarks/lm-eval-results/   # compares the first two run directories (sorted by name)
 """
 import argparse
 import json
 import sys
 from pathlib import Path
 def load_results(result_dir):
    """Load the lm-eval results JSON from a benchmark run directory.

    Lookup order:
      1. ``results.json`` directly inside *result_dir*;
      2. a ``results.json`` anywhere below it (first match in sorted order);
      3. a timestamped ``results_*.json`` anywhere below it (the file whose
         name sorts last, i.e. the most recent timestamp).

    Args:
        result_dir: Run directory, as a ``str`` or ``Path``.

    Returns:
        The parsed JSON document, or ``None`` when no results file exists
        (a warning is written to stderr in that case).
    """
    result_dir = Path(result_dir)
    results_file = result_dir / "results.json"
    if not results_file.is_file():
        # rglob() order depends on the filesystem; sort so the same file is
        # picked on every run.
        exact = sorted(result_dir.rglob("results.json"))
        # NOTE(review): recent lm-eval releases appear to write
        # results_<timestamp>.json inside a per-model subdirectory -- confirm
        # against the installed version.
        stamped = sorted(result_dir.rglob("results_*.json"), key=lambda p: p.name)
        if exact:
            results_file = exact[0]
        elif stamped:
            results_file = stamped[-1]
        else:
            print(f"Warning: no results.json in {result_dir}", file=sys.stderr)
            return None
    with open(results_file, encoding="utf-8") as f:
        return json.load(f)
 def extract_scores(data):
    """Extract one primary metric per task from an lm-eval results document.

    For every task under ``data["results"]`` the first numeric entry among a
    fixed priority list of metric keys is used. If none of those is present,
    the first remaining numeric entry is used, skipping ``alias`` keys and
    standard-error entries (``*_stderr``) so an error bar is never reported
    as the score.

    Args:
        data: Parsed results JSON (a dict); a missing or null ``"results"``
            entry yields an empty mapping.

    Returns:
        ``{task: {"value": number, "metric": name}}`` where ``name`` is the
        metric key up to the first comma. Tasks without any usable numeric
        metric are omitted.
    """
    preferred = (
        "acc,none", "acc_norm,none", "exact_match,strict-match",
        "mc2,none", "prompt_level_strict_acc,none",
    )

    def is_score(val):
        # bool is a subclass of int; True/False flags are not scores.
        return isinstance(val, (int, float)) and not isinstance(val, bool)

    scores = {}
    for task, metrics in (data.get("results") or {}).items():
        if not isinstance(metrics, dict):
            continue
        # Priority order for the primary metric; non-numeric placeholders
        # (e.g. "N/A") are skipped so callers can safely do arithmetic.
        key = next((k for k in preferred if is_score(metrics.get(k))), None)
        if key is None:
            # Fallback: first numeric metric that is a real score.
            key = next(
                (k for k, v in metrics.items()
                 if is_score(v)
                 and not k.startswith("alias")
                 and not k.split(",")[0].endswith("_stderr")),
                None,
            )
        if key is not None:
            scores[task] = {"value": metrics[key], "metric": key.split(",")[0]}
    return scores
 def compare(base_data, lek_data, base_name="Base", lek_name="LEK"):
    """Print a per-task comparison table of base vs. LEK scores.

    Scores are taken from ``extract_scores`` and shown as percentages
    (value * 100). A task present in only one run gets an em dash in the
    other column and in the delta column. The final AVERAGE row covers only
    the tasks present in both runs.

    Args:
        base_data: Parsed results JSON of the base model.
        lek_data: Parsed results JSON of the LEK model.
        base_name: Column header for the base model.
        lek_name: Column header for the LEK model.

    Returns:
        None. The table is written to stdout.
    """
    base_scores = extract_scores(base_data)
    lek_scores = extract_scores(lek_data)
    all_tasks = sorted(set(base_scores) | set(lek_scores))
    missing = f"{'—':>10s}"

    print(f"\n{'Task':<30s} {'Metric':<15s} {base_name:>10s} {lek_name:>10s} {'Delta':>10s}")
    print("-" * 80)
    for task in all_tasks:
        base_entry = base_scores.get(task, {})
        lek_entry = lek_scores.get(task, {})
        base_val = base_entry.get("value")
        lek_val = lek_entry.get("value")
        metric = base_entry.get("metric") or lek_entry.get("metric", "?")
        if base_val is None and lek_val is None:
            continue
        base_col = missing if base_val is None else f"{base_val * 100:>9.1f}%"
        lek_col = missing if lek_val is None else f"{lek_val * 100:>9.1f}%"
        if base_val is None or lek_val is None:
            delta_col = missing
        else:
            # The "+" flag keeps the sign attached to the number and the
            # column exactly 10 characters wide for both signs.
            delta_col = f"{(lek_val - base_val) * 100:>+9.1f}%"
        print(f"{task:<30s} {metric:<15s} {base_col} {lek_col} {delta_col}")

    # Summary: average over the tasks both runs have in common.
    both = [t for t in all_tasks if t in base_scores and t in lek_scores]
    if both:
        avg_base = sum(base_scores[t]["value"] for t in both) / len(both)
        avg_lek = sum(lek_scores[t]["value"] for t in both) / len(both)
        avg_delta = avg_lek - avg_base
        print("-" * 80)
        print(f"{'AVERAGE':<30s} {'':15s} {avg_base * 100:>9.1f}% {avg_lek * 100:>9.1f}% {avg_delta * 100:>+9.1f}%")
 def main():
    """Command-line entry point: pick two result directories and compare them.

    Selection rules, in order:
      * ``--base`` and ``--lek`` both given: compare those two directories
        using the default "Base"/"LEK" column labels;
      * ``--dir``: list the sub-directories and compare the first two in
        sorted order, labelled with their directory names;
      * two or more positional paths: compare the first two, labelled with
        their final path components;
      * otherwise print the help text.

    Exits with status 1 when the chosen inputs cannot be compared (missing
    directory, fewer than two run directories, or results that cannot be
    loaded), so shell callers can detect the failure.
    """
    parser = argparse.ArgumentParser(description="Compare lm-eval benchmark results")
    parser.add_argument("--base", help="Base model results directory")
    parser.add_argument("--lek", help="LEK model results directory")
    parser.add_argument("--dir", help="Auto-detect pairs in directory")
    parser.add_argument("paths", nargs="*", help="Result directories (base first, then lek)")
    args = parser.parse_args()

    labels = ()  # empty -> compare() falls back to its default labels
    if args.base and args.lek:
        base_path, lek_path = args.base, args.lek
    elif args.dir:
        result_dir = Path(args.dir)
        if not result_dir.is_dir():
            print(f"Error: not a directory: {result_dir}", file=sys.stderr)
            sys.exit(1)
        dirs = sorted(d for d in result_dir.iterdir() if d.is_dir())
        if len(dirs) < 2:
            print(f"Need at least 2 result directories in {result_dir}", file=sys.stderr)
            sys.exit(1)
        print(f"Found {len(dirs)} result directories")
        for i, d in enumerate(dirs):
            print(f"  [{i}] {d.name}")
        # Compare first two by default
        base_path, lek_path = dirs[0], dirs[1]
        labels = (dirs[0].name, dirs[1].name)
    elif len(args.paths) >= 2:
        base_path, lek_path = args.paths[0], args.paths[1]
        labels = (Path(base_path).name, Path(lek_path).name)
    else:
        parser.print_help()
        return

    # Load both before checking so a warning is reported for each missing run.
    base_data = load_results(base_path)
    lek_data = load_results(lek_path)
    if not (base_data and lek_data):
        print("Error: could not load results for both runs", file=sys.stderr)
        sys.exit(1)
    compare(base_data, lek_data, *labels)
 # Run the CLI only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/scripts/run_benchmarks.sh
+++ b/scripts/run_benchmarks.sh
@ -0,0 +1,180 @@
 #!/bin/bash
 #
 # LEM Standard Benchmark Suite
 # =============================
 # Runs industry-standard benchmarks using EleutherAI's lm-evaluation-harness.
 # Results are directly comparable to HuggingFace Open LLM Leaderboard.
 #
 # Prerequisites:
 #   pipx install lm-eval    # or: pip install lm-eval
 #
 # Usage:
 #   ./scripts/run_benchmarks.sh --model-id google/gemma-3-12b-it     # defaults: hf backend, quick suite
 #   ./scripts/run_benchmarks.sh --model hf --model-id google/gemma-3-12b-it
 #   ./scripts/run_benchmarks.sh --model local-chat-completions --api-url http://localhost:8090/v1
 #   ./scripts/run_benchmarks.sh --suite leaderboard-v2   # Open LLM Leaderboard v2 benchmarks
 #   ./scripts/run_benchmarks.sh --suite classic           # Classic benchmarks
 #   ./scripts/run_benchmarks.sh --suite quick             # Fast subset for testing
 #
 # Abort on the first command that fails.
 set -e
 # Absolute directory of this script, and the repository root one level above it.
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 REPO_ROOT="$(dirname "$SCRIPT_DIR")"
 # Default location for lm-eval output; --output replaces this value later.
 RESULTS_DIR="${REPO_ROOT}/benchmarks/lm-eval-results"
 # NOTE: this runs before the options are parsed, so it always creates the
 # default directory, even when --output points somewhere else.
 mkdir -p "$RESULTS_DIR"
 # Defaults
 MODEL_TYPE="hf"
 MODEL_ID=""
 API_URL=""
 SUITE="quick"
 BATCH_SIZE="auto"
 # Unrecognised command-line arguments are collected here (space-separated).
 EXTRA_ARGS=""
 # Print the option summary and exit successfully (reached via --help).
 usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  --model TYPE        Model backend: hf, local-chat-completions, vllm (default: hf)"
    echo "  --model-id ID       HuggingFace model ID (e.g. google/gemma-3-12b-it)"
    echo "  --api-url URL       API URL for local-chat-completions backend"
    # The script sends the literal name "default" when --api-model is omitted.
    echo "  --api-model NAME    Model name for API backend (default: \"default\")"
    echo "  --suite SUITE       Benchmark suite: quick, classic, leaderboard-v2, full (default: quick)"
    echo "                      Any other value is used as a comma-separated task list"
    echo "  --batch-size N      Batch size (default: auto)"
    echo "  --output DIR        Output directory (default: benchmarks/lm-eval-results/)"
    echo "  --help              Show this help"
    echo ""
    echo "Any other arguments are passed through to lm-eval unchanged."
    exit 0
 }
 # Parse args
 API_MODEL=""
 # Abort with a message when an option that takes a value is the last word on
 # the command line. Without this check `shift 2` fails and `set -e` ends the
 # script silently.
 require_value() {
    if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        exit 1
    fi
 }
 while [[ $# -gt 0 ]]; do
    case "$1" in
        --model|--model-id|--api-url|--api-model|--suite|--batch-size|--output)
            require_value "$@" ;;
    esac
    case "$1" in
        --model) MODEL_TYPE="$2"; shift 2 ;;
        --model-id) MODEL_ID="$2"; shift 2 ;;
        --api-url) API_URL="$2"; shift 2 ;;
        --api-model) API_MODEL="$2"; shift 2 ;;
        --suite) SUITE="$2"; shift 2 ;;
        --batch-size) BATCH_SIZE="$2"; shift 2 ;;
        --output) RESULTS_DIR="$2"; shift 2 ;;
        -h|--help) usage ;;
        # Anything unrecognised is forwarded to lm-eval as-is.
        *) EXTRA_ARGS="$EXTRA_ARGS $1"; shift ;;
    esac
 done
 # ── Suite definitions ────────────────────────────────────────────
 # Task groups shared between presets.
 CORE_TASKS="gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande"
 # Note: math_hard is left out of the v2 group — requires special setup
 V2_TASKS="ifeval,bbh,gpqa,musr,mmlu_pro"
 if [ "$SUITE" = "quick" ]; then
    # Fast sanity check (~5-10 min)
    TASKS="${CORE_TASKS}"
 elif [ "$SUITE" = "classic" ]; then
    # Classic Open LLM Leaderboard v1 benchmarks
    TASKS="mmlu,${CORE_TASKS}"
 elif [ "$SUITE" = "leaderboard-v2" ]; then
    # Open LLM Leaderboard v2 (harder, current standard)
    TASKS="${V2_TASKS}"
 elif [ "$SUITE" = "full" ]; then
    # Everything
    TASKS="mmlu,mmlu_pro,${CORE_TASKS},ifeval,bbh,gpqa,musr"
 else
    # Any other value is taken verbatim as a custom task list
    TASKS="$SUITE"
 fi
 # ── Build model args ─────────────────────────────────────────────
 # MODEL_ARGS is handed to lm-eval as --model_args; RUN_NAME becomes part of
 # the output directory name, so "/" is replaced to keep it a single path
 # component.
 MODEL_ARGS=""
 RUN_NAME=""
 case "$MODEL_TYPE" in
    hf|vllm)
        # Both backends load a model by ID and take the same model args.
        if [ -z "$MODEL_ID" ]; then
            echo "Error: --model-id required for ${MODEL_TYPE} backend" >&2
            echo "Example: --model-id google/gemma-3-12b-it" >&2
            exit 1
        fi
        MODEL_ARGS="pretrained=${MODEL_ID}"
        RUN_NAME=$(printf '%s' "$MODEL_ID" | tr '/' '_')
        ;;
    local-chat-completions)
        if [ -z "$API_URL" ]; then
            API_URL="http://localhost:8090/v1"
            echo "Using default API URL: $API_URL"
        fi
        MODEL_ARGS="model=${API_MODEL:-default},base_url=${API_URL},num_concurrent=1,max_retries=3,tokenized_requests=False"
        # API model names may contain "/" too (e.g. org/model); sanitise them
        # the same way as model IDs.
        RUN_NAME=$(printf '%s' "${API_MODEL:-local-api}" | tr '/' '_')
        ;;
    *)
        echo "Error: unknown model type: $MODEL_TYPE" >&2
        exit 1
        ;;
 esac
 # ── Run ──────────────────────────────────────────────────────────
 # RESULTS_DIR may have been replaced by --output while parsing the options,
 # so make sure the directory actually used exists before running.
 mkdir -p "$RESULTS_DIR"
 TIMESTAMP=$(date +%Y%m%d-%H%M%S)
 OUTPUT_PATH="${RESULTS_DIR}/${RUN_NAME}_${SUITE}_${TIMESTAMP}"
 echo "============================================"
 echo "LEM Standard Benchmark Suite"
 echo "============================================"
 echo "Model:    ${MODEL_TYPE} (${MODEL_ID:-${API_URL}})"
 echo "Suite:    ${SUITE}"
 echo "Tasks:    ${TASKS}"
 echo "Output:   ${OUTPUT_PATH}"
 echo "============================================"
 echo ""
 # EXTRA_ARGS is intentionally unquoted: it holds the space-separated
 # pass-through arguments and must be split back into separate words.
 # shellcheck disable=SC2086
 lm-eval run \
    --model "$MODEL_TYPE" \
    --model_args "$MODEL_ARGS" \
    --tasks "$TASKS" \
    --batch_size "$BATCH_SIZE" \
    --output_path "$OUTPUT_PATH" \
    --log_samples \
    $EXTRA_ARGS
 echo ""
 echo "Results saved to: ${OUTPUT_PATH}"
 echo ""
 # Show summary
 # Locate the results file: either results.json directly in OUTPUT_PATH or,
 # failing that, the results*.json below it whose path sorts last.
 # NOTE(review): recent lm-eval releases appear to write
 # results_<timestamp>.json inside a per-model subdirectory — confirm against
 # the installed version.
 RESULTS_FILE=""
 if [ -f "${OUTPUT_PATH}/results.json" ]; then
    RESULTS_FILE="${OUTPUT_PATH}/results.json"
 elif [ -d "$OUTPUT_PATH" ]; then
    RESULTS_FILE=$(find "$OUTPUT_PATH" -type f -name 'results*.json' | sort | tail -n 1)
 fi
 if [ -n "$RESULTS_FILE" ]; then
    echo "=== Results Summary ==="
    # The path is passed as an argument instead of being pasted into the
    # Python source, so quotes or backslashes in it cannot break the snippet.
    python3 -c '
 import json, sys
 with open(sys.argv[1], encoding="utf-8") as f:
    data = json.load(f)
 results = data.get("results", {})
 model = data.get("model_name", "unknown")
 print(f"Model: {model}")
 print(f"Tasks: {len(results)}")
 print()
 # Same priority list as scripts/compare_models.py
 primary = ["acc,none", "acc_norm,none", "exact_match,strict-match",
           "mc2,none", "prompt_level_strict_acc,none"]
 for task, scores in sorted(results.items()):
    # Find the primary metric
    for key in primary:
        if key in scores:
            name = key.split(",")[0]
            print(f"  {task:30s} {name:15s} {scores[key] * 100:.1f}%")
            break
    else:
        # Show first numeric metric
        for key, val in scores.items():
            if isinstance(val, (int, float)) and not key.startswith("alias"):
                name = key.split(",")[0]
                print(f"  {task:30s} {name:15s} {val:.4f}")
                break
 ' "$RESULTS_FILE"
 fi