LEM/scripts/run_benchmarks.sh
Charon abd63d3342
Add standard benchmark suite using EleutherAI lm-evaluation-harness
- run_benchmarks.sh: wrapper for lm-eval with suite presets (quick, classic, leaderboard-v2, full)
- compare_models.py: compare base vs LEK results with delta table
- Supports HF transformers, local-chat-completions (MLX/Ollama), and vLLM backends
- Results comparable to HuggingFace Open LLM Leaderboard

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 00:05:48 +00:00

180 lines
6 KiB
Bash
Executable file

#!/bin/bash
#
# LEM Standard Benchmark Suite
# =============================
# Runs industry-standard benchmarks using EleutherAI's lm-evaluation-harness.
# Results are directly comparable to HuggingFace Open LLM Leaderboard.
#
# Prerequisites:
# pipx install lm-eval # or: pip install lm-eval
#
# Usage:
# ./scripts/run_benchmarks.sh # interactive model selection
# ./scripts/run_benchmarks.sh --model hf --model-id google/gemma-3-12b-it
# ./scripts/run_benchmarks.sh --model local-chat-completions --api-url http://localhost:8090/v1
# ./scripts/run_benchmarks.sh --suite leaderboard-v2 # Open LLM Leaderboard v2 benchmarks
# ./scripts/run_benchmarks.sh --suite classic # Classic benchmarks
# ./scripts/run_benchmarks.sh --suite quick # Fast subset for testing
#
# Strict mode: abort on errors (-e), unset variables (-u), and failures
# anywhere in a pipeline (pipefail) — plain `set -e` misses the last two.
set -euo pipefail

# Resolve repo-relative paths regardless of the invocation directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(dirname "$SCRIPT_DIR")"
RESULTS_DIR="${REPO_ROOT}/benchmarks/lm-eval-results"
mkdir -p "$RESULTS_DIR"

# Defaults — each can be overridden by a CLI flag (see usage()).
MODEL_TYPE="hf"
MODEL_ID=""
API_URL=""
SUITE="quick"
BATCH_SIZE="auto"
EXTRA_ARGS=""
# Print CLI help to stdout and exit successfully.
usage() {
  cat <<EOF
Usage: $0 [OPTIONS]

Options:
 --model TYPE Model backend: hf, local-chat-completions, vllm (default: hf)
 --model-id ID HuggingFace model ID (e.g. google/gemma-3-12b-it)
 --api-url URL API URL for local-chat-completions backend
 --api-model NAME Model name for API backend (default: auto)
 --suite SUITE Benchmark suite: quick, classic, leaderboard-v2, full (default: quick)
 --batch-size N Batch size (default: auto)
 --output DIR Output directory (default: benchmarks/lm-eval-results/)
 --help Show this help
EOF
  exit 0
}
# ── Argument parsing ─────────────────────────────────────────────
API_MODEL=""

# Fail loudly when a value-taking flag is given without its value.
# Previously `shift 2` would just kill the script under set -e with no
# message. $1 = flag name, $2 = number of arguments still on the line.
require_arg() {
  if [ "$2" -lt 2 ]; then
    echo "Error: $1 requires an argument" >&2
    exit 1
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --model)      require_arg "$1" $#; MODEL_TYPE="$2"; shift 2 ;;
    --model-id)   require_arg "$1" $#; MODEL_ID="$2"; shift 2 ;;
    --api-url)    require_arg "$1" $#; API_URL="$2"; shift 2 ;;
    --api-model)  require_arg "$1" $#; API_MODEL="$2"; shift 2 ;;
    --suite)      require_arg "$1" $#; SUITE="$2"; shift 2 ;;
    --batch-size) require_arg "$1" $#; BATCH_SIZE="$2"; shift 2 ;;
    --output)     require_arg "$1" $#; RESULTS_DIR="$2"; shift 2 ;;
    --help)       usage ;;
    # Anything unrecognized is forwarded to lm-eval verbatim.
    *)            EXTRA_ARGS="$EXTRA_ARGS $1"; shift ;;
  esac
done
# ── Suite definitions ────────────────────────────────────────────
# Resolve the suite preset into the comma-separated task list that
# lm-eval expects. An unrecognized value is passed through untouched,
# which lets callers supply a custom task list directly.
if [[ "$SUITE" == "quick" ]]; then
  # Fast sanity check (~5-10 min)
  TASKS="gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande"
elif [[ "$SUITE" == "classic" ]]; then
  # Classic Open LLM Leaderboard v1 benchmarks
  TASKS="mmlu,gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande"
elif [[ "$SUITE" == "leaderboard-v2" ]]; then
  # Open LLM Leaderboard v2 (harder, current standard)
  # math_hard is deliberately excluded — it requires special setup
  TASKS="ifeval,bbh,gpqa,musr,mmlu_pro"
elif [[ "$SUITE" == "full" ]]; then
  # Everything
  TASKS="mmlu,mmlu_pro,gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande,ifeval,bbh,gpqa,musr"
else
  # Custom task list
  TASKS="$SUITE"
fi
# ── Build model args ─────────────────────────────────────────────
# Translate the selected backend into the lm-eval --model_args string and
# a filesystem-safe RUN_NAME used to label the output directory.
MODEL_ARGS=""
RUN_NAME=""
case "$MODEL_TYPE" in
  hf|vllm)
    # Both backends load a HuggingFace checkpoint by ID, so the arms
    # are merged; error messages now go to stderr.
    if [ -z "$MODEL_ID" ]; then
      echo "Error: --model-id required for ${MODEL_TYPE} backend" >&2
      if [ "$MODEL_TYPE" = "hf" ]; then
        echo "Example: --model-id google/gemma-3-12b-it" >&2
      fi
      exit 1
    fi
    MODEL_ARGS="pretrained=${MODEL_ID}"
    # Replace every '/' with '_' (same result as `echo | tr '/' '_'`,
    # without spawning a subshell).
    RUN_NAME="${MODEL_ID//\//_}"
    ;;
  local-chat-completions)
    # OpenAI-compatible local server (MLX, Ollama, ...); falls back to
    # the default port used elsewhere in this repo.
    if [ -z "$API_URL" ]; then
      API_URL="http://localhost:8090/v1"
      echo "Using default API URL: $API_URL"
    fi
    MODEL_ARGS="model=${API_MODEL:-default},base_url=${API_URL},num_concurrent=1,max_retries=3,tokenized_requests=False"
    RUN_NAME="${API_MODEL:-local-api}"
    ;;
  *)
    echo "Error: unknown model type: $MODEL_TYPE" >&2
    exit 1
    ;;
esac
# ── Run ──────────────────────────────────────────────────────────
TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
OUTPUT_PATH="${RESULTS_DIR}/${RUN_NAME}_${SUITE}_${TIMESTAMP}"

# Banner (one heredoc instead of a run of echo calls; same text,
# including the trailing blank line).
cat <<EOF
============================================
LEM Standard Benchmark Suite
============================================
Model: ${MODEL_TYPE} (${MODEL_ID:-${API_URL}})
Suite: ${SUITE}
Tasks: ${TASKS}
Output: ${OUTPUT_PATH}
============================================

EOF

# shellcheck disable=SC2086 -- EXTRA_ARGS is intentionally word-split
# so pass-through flags reach lm-eval as separate arguments.
lm-eval run \
  --model "$MODEL_TYPE" \
  --model_args "$MODEL_ARGS" \
  --tasks "$TASKS" \
  --batch_size "$BATCH_SIZE" \
  --output_path "$OUTPUT_PATH" \
  --log_samples \
  $EXTRA_ARGS

printf '\n%s\n\n' "Results saved to: ${OUTPUT_PATH}"
# Show summary: pretty-print the primary metric per task from the
# lm-eval results JSON, if it exists.
# NOTE(review): recent lm-eval versions may nest results under a model
# subdirectory (results_<ts>.json) — confirm this path against the
# installed harness version.
print_summary() {
  # $1 = lm-eval output directory; silently a no-op when no results file.
  local results_file="$1/results.json"
  [ -f "$results_file" ] || return 0
  echo "=== Results Summary ==="
  # The path is passed via argv rather than interpolated into the Python
  # source, so quotes/spaces in the path cannot break or inject code.
  python3 - "$results_file" <<'PY'
import json, sys

with open(sys.argv[1]) as f:
    data = json.load(f)

results = data.get('results', {})
print(f'Model: {data.get("model_name", "unknown")}')
print(f'Tasks: {len(results)}')
print()

# Preferred metrics, in priority order, matching lm-eval's key format.
PRIMARY_METRICS = ['acc,none', 'acc_norm,none', 'exact_match,strict-match', 'mc2,none']
for task, scores in sorted(results.items()):
    for key in PRIMARY_METRICS:
        if key in scores:
            print(f' {task:30s} {key.split(",")[0]:15s} {scores[key]*100:.1f}%')
            break
    else:
        # Fall back to the first numeric metric reported for the task
        # ('alias' entries are labels, not scores).
        for key, val in scores.items():
            if isinstance(val, (int, float)) and not key.startswith('alias'):
                print(f' {task:30s} {key.split(",")[0]:15s} {val:.4f}')
                break
PY
}

print_summary "$OUTPUT_PATH"