#!/bin/bash
#
# LEM Standard Benchmark Suite
# =============================
# Runs industry-standard benchmarks using EleutherAI's lm-evaluation-harness.
# Results are directly comparable to HuggingFace Open LLM Leaderboard.
#
# Prerequisites:
#   pipx install lm-eval        # or: pip install lm-eval
#
# Usage:
#   ./scripts/run_benchmarks.sh                          # interactive model selection
#   ./scripts/run_benchmarks.sh --model hf --model-id google/gemma-3-12b-it
#   ./scripts/run_benchmarks.sh --model local-chat-completions --api-url http://localhost:8090/v1
#   ./scripts/run_benchmarks.sh --suite leaderboard-v2   # Open LLM Leaderboard v2 benchmarks
#   ./scripts/run_benchmarks.sh --suite classic          # Classic benchmarks
#   ./scripts/run_benchmarks.sh --suite quick            # Fast subset for testing
#
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(dirname "$SCRIPT_DIR")"
RESULTS_DIR="${REPO_ROOT}/benchmarks/lm-eval-results"
mkdir -p "$RESULTS_DIR"

# Defaults
MODEL_TYPE="hf"
MODEL_ID=""
API_URL=""
SUITE="quick"
BATCH_SIZE="auto"
EXTRA_ARGS=""

usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  --model TYPE       Model backend: hf, local-chat-completions, vllm (default: hf)"
    echo "  --model-id ID      HuggingFace model ID (e.g. google/gemma-3-12b-it)"
    echo "  --api-url URL      API URL for local-chat-completions backend"
    echo "  --api-model NAME   Model name for API backend (default: auto)"
    echo "  --suite SUITE      Benchmark suite: quick, classic, leaderboard-v2, full (default: quick)"
    echo "  --batch-size N     Batch size (default: auto)"
    echo "  --output DIR       Output directory (default: benchmarks/lm-eval-results/)"
    echo "  --help             Show this help"
    exit 0
}

# Parse args
API_MODEL=""
while [[ $# -gt 0 ]]; do
    case "$1" in
        --model)      MODEL_TYPE="$2"; shift 2 ;;
        --model-id)   MODEL_ID="$2"; shift 2 ;;
        --api-url)    API_URL="$2"; shift 2 ;;
        --api-model)  API_MODEL="$2"; shift 2 ;;
        --suite)      SUITE="$2"; shift 2 ;;
        --batch-size) BATCH_SIZE="$2"; shift 2 ;;
        --output)     RESULTS_DIR="$2"; shift 2 ;;
        --help)       usage ;;
        *)            EXTRA_ARGS="$EXTRA_ARGS $1"; shift ;;
    esac
done

# ── Suite definitions ────────────────────────────────────────────
case "$SUITE" in
    quick)
        # Fast sanity check (~5-10 min)
        TASKS="gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande"
        ;;
    classic)
        # Classic Open LLM Leaderboard v1 benchmarks
        TASKS="mmlu,gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande"
        ;;
    leaderboard-v2)
        # Open LLM Leaderboard v2 (harder, current standard)
        TASKS="ifeval,bbh,gpqa,musr,mmlu_pro"
        # Note: math_hard not included (requires special setup)
        ;;
    full)
        # Everything
        TASKS="mmlu,mmlu_pro,gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande,ifeval,bbh,gpqa,musr"
        ;;
    *)
        # Custom task list
        TASKS="$SUITE"
        ;;
esac

# ── Build model args ──────────────────────────────────────────────
MODEL_ARGS=""
RUN_NAME=""

case "$MODEL_TYPE" in
    hf)
        if [ -z "$MODEL_ID" ]; then
            echo "Error: --model-id required for hf backend"
            echo "Example: --model-id google/gemma-3-12b-it"
            exit 1
        fi
        MODEL_ARGS="pretrained=${MODEL_ID}"
        RUN_NAME=$(echo "$MODEL_ID" | tr '/' '_')
        ;;
    local-chat-completions)
        if [ -z "$API_URL" ]; then
            API_URL="http://localhost:8090/v1"
            echo "Using default API URL: $API_URL"
        fi
        MODEL_ARGS="model=${API_MODEL:-default},base_url=${API_URL},num_concurrent=1,max_retries=3,tokenized_requests=False"
        RUN_NAME="${API_MODEL:-local-api}"
        ;;
    vllm)
        if [ -z "$MODEL_ID" ]; then
            echo "Error: --model-id required for vllm backend"
            exit 1
        fi
        MODEL_ARGS="pretrained=${MODEL_ID}"
        RUN_NAME=$(echo "$MODEL_ID" | tr '/' '_')
        ;;
    *)
        echo "Error: unknown model type: $MODEL_TYPE"
        exit 1
        ;;
esac
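
# ── Pre-flight check (optional sketch) ────────────────────────────
# Hedged addition, not part of the original flow: a minimal sanity check
# that the lm-eval CLI from lm-evaluation-harness is actually on PATH before
# kicking off a long benchmark run. Assumes it was installed via pipx/pip as
# noted in the header; drop or adapt this block if you invoke it differently.
if ! command -v lm-eval >/dev/null 2>&1; then
    echo "Error: 'lm-eval' not found on PATH."
    echo "Install it with: pipx install lm-eval   (or: pip install lm-eval)"
    exit 1
fi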
# ── Run ───────────────────────────────────────────────────────────
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
OUTPUT_PATH="${RESULTS_DIR}/${RUN_NAME}_${SUITE}_${TIMESTAMP}"

echo "============================================"
echo "LEM Standard Benchmark Suite"
echo "============================================"
echo "Model:  ${MODEL_TYPE} (${MODEL_ID:-${API_URL}})"
echo "Suite:  ${SUITE}"
echo "Tasks:  ${TASKS}"
echo "Output: ${OUTPUT_PATH}"
echo "============================================"
echo ""

lm-eval \
    --model "$MODEL_TYPE" \
    --model_args "$MODEL_ARGS" \
    --tasks "$TASKS" \
    --batch_size "$BATCH_SIZE" \
    --output_path "$OUTPUT_PATH" \
    --log_samples \
    $EXTRA_ARGS

echo ""
echo "Results saved to: ${OUTPUT_PATH}"
echo ""

# Show summary. Depending on the lm-eval version, the results file may be
# written as results.json or nested as results_<timestamp>.json under a
# model-named subdirectory, so search for it rather than hard-coding the path.
RESULTS_JSON=$(find "$OUTPUT_PATH" -name 'results*.json' 2>/dev/null | head -n 1)
if [ -n "$RESULTS_JSON" ]; then
    echo "=== Results Summary ==="
    python3 -c "
import json

with open('${RESULTS_JSON}') as f:
    data = json.load(f)

results = data.get('results', {})
print(f'Model: {data.get(\"model_name\", \"unknown\")}')
print(f'Tasks: {len(results)}')
print()

for task, scores in sorted(results.items()):
    # Find the primary metric
    for key in ['acc,none', 'acc_norm,none', 'exact_match,strict-match', 'mc2,none']:
        if key in scores:
            print(f'  {task:30s} {key.split(\",\")[0]:15s} {scores[key]*100:.1f}%')
            break
    else:
        # Show first numeric metric
        for key, val in scores.items():
            if isinstance(val, (int, float)) and not key.startswith('alias'):
                print(f'  {task:30s} {key.split(\",\")[0]:15s} {val:.4f}')
                break
"
fi
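
# ── Comparing runs (optional sketch) ──────────────────────────────
# Hedged addition: result directories follow the ${RUN_NAME}_${SUITE}_<timestamp>
# naming scheme used above, so listing them gives a quick way to line this run
# up against earlier ones for the same model and suite. Remove if not wanted.
echo ""
echo "All recorded runs for ${RUN_NAME} (${SUITE}):"
ls -1d "${RESULTS_DIR}/${RUN_NAME}_${SUITE}_"* 2>/dev/null || echo "  (none found)"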