Add standard benchmark suite using EleutherAI lm-evaluation-harness
- run_benchmarks.sh: wrapper for lm-eval with suite presets (quick, classic, leaderboard-v2, full)
- compare_models.py: compare base vs. LEK results with a delta table
- Supports HF transformers, local-chat-completions (MLX/Ollama), and vLLM backends
- Results comparable to the HuggingFace Open LLM Leaderboard

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b8f9191b05
commit
abd63d3342
3 changed files with 317 additions and 0 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -8,3 +8,6 @@ worker/output/
|
||||||
|
|
||||||
# Parquet exports (generated, sync to HF via scripts/sync_hf.py)
|
# Parquet exports (generated, sync to HF via scripts/sync_hf.py)
|
||||||
training/parquet/
|
training/parquet/
|
||||||
|
|
||||||
|
# lm-eval-harness results (large, stored locally)
|
||||||
|
benchmarks/lm-eval-results/
|
||||||
|
|
|
||||||
134
scripts/compare_models.py
Executable file
134
scripts/compare_models.py
Executable file
|
|
@ -0,0 +1,134 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Compare lm-eval-harness results between base and LEK models.
|
||||||
|
|
||||||
|
Reads results.json files from benchmark runs and produces a comparison table
|
||||||
|
showing deltas between base model and LEK fine-tuned version.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/compare_models.py benchmarks/lm-eval-results/base_* benchmarks/lm-eval-results/lek_*
|
||||||
|
python3 scripts/compare_models.py --base results/base --lek results/lek
|
||||||
|
python3 scripts/compare_models.py --dir benchmarks/lm-eval-results/ # compares the first two subdirectories (sorted by name)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def load_results(result_dir):
    """Load the aggregated results JSON of one benchmark run.

    Args:
        result_dir: Path to a run directory, or directly to a results JSON
            file.

    Returns:
        The parsed JSON document, or ``None`` (after printing a warning)
        when no results file can be located.

    Lookup order when ``result_dir`` is a directory:
      1. ``results.json`` directly inside it;
      2. a ``results.json`` anywhere below it (first in sorted path order);
      3. a ``results_*.json`` anywhere below it (last by file name, which is
         the newest one when the suffix is a sortable timestamp).
    """
    result_dir = Path(result_dir)

    if result_dir.is_file():
        # The caller pointed straight at a results file.
        results_file = result_dir
    else:
        results_file = result_dir / "results.json"
        if not results_file.exists():
            # Sort so the choice does not depend on filesystem iteration order.
            exact = sorted(result_dir.rglob("results.json"))
            # Timestamp-suffixed files, typically nested in a per-model
            # subdirectory of the output path.
            stamped = sorted(
                result_dir.rglob("results_*.json"),
                key=lambda p: (p.name, str(p)),
            )
            if exact:
                results_file = exact[0]
            elif stamped:
                results_file = stamped[-1]

    if not results_file.exists():
        print(f"Warning: no results.json in {result_dir}")
        return None

    with open(results_file, encoding="utf-8") as f:
        return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
# Metric keys tried first, in priority order, when choosing a task's headline score.
_PRIMARY_METRIC_KEYS = (
    "acc,none",
    "acc_norm,none",
    "exact_match,strict-match",
    "mc2,none",
    "prompt_level_strict_acc,none",
)


def _is_score(value):
    """Return True for a real number usable as a score (bool and NaN excluded)."""
    return (
        isinstance(value, (int, float))
        and not isinstance(value, bool)
        and value == value  # NaN is the only value that is not equal to itself
    )


def extract_scores(data):
    """Extract the primary metric per task from a results document.

    Args:
        data: Parsed results JSON; per-task metric dicts are read from its
            ``"results"`` mapping.

    Returns:
        ``{task: {"value": number, "metric": name}}`` where ``name`` is the
        metric key with its ``,filter`` suffix removed. Tasks that expose no
        usable numeric metric are omitted.
    """
    scores = {}
    results = data.get("results") or {}
    for task, metrics in results.items():
        if not isinstance(metrics, dict):
            continue

        # Preferred metrics first; skip entries whose value is not a number
        # (e.g. a placeholder string) instead of passing them on to arithmetic.
        chosen = next(
            (key for key in _PRIMARY_METRIC_KEYS if _is_score(metrics.get(key))),
            None,
        )
        if chosen is None:
            # Fallback: first numeric metric that is neither an alias nor a
            # standard-error entry (a stderr is not a score).
            chosen = next(
                (
                    key
                    for key, val in metrics.items()
                    if _is_score(val)
                    and not key.startswith("alias")
                    and "stderr" not in key
                ),
                None,
            )

        if chosen is not None:
            scores[task] = {
                "value": metrics[chosen],
                "metric": chosen.split(",")[0],
            }
    return scores
|
||||||
|
|
||||||
|
|
||||||
|
def compare(base_data, lek_data, base_name="Base", lek_name="LEK"):
    """Print a per-task comparison table for two result documents.

    Args:
        base_data: Parsed results JSON of the base model.
        lek_data: Parsed results JSON of the LEK model.
        base_name: Column header for the base model.
        lek_name: Column header for the LEK model.

    Each row shows the task, the metric name, both scores as percentages and
    the signed difference (LEK minus base). A task present on only one side
    gets a dash in the missing columns. A final AVERAGE row covers the tasks
    present on both sides. Returns nothing; all output goes to stdout.
    """
    base_scores = extract_scores(base_data)
    lek_scores = extract_scores(lek_data)
    all_tasks = sorted(set(base_scores) | set(lek_scores))

    missing = f"{'—':>10s}"
    rule = "-" * 80

    def pct(value):
        # 9 digits + '%' = one 10-character column.
        return f"{value * 100:>9.1f}%"

    def signed_pct(value):
        # The '+' format flag keeps the sign attached to the digits and the
        # column 10 characters wide for both positive and negative deltas.
        return f"{value * 100:>+9.1f}%"

    print(f"\n{'Task':<30s} {'Metric':<15s} {base_name:>10s} {lek_name:>10s} {'Delta':>10s}")
    print(rule)

    for task in all_tasks:
        base_entry = base_scores.get(task, {})
        lek_entry = lek_scores.get(task, {})
        bv = base_entry.get("value")
        lv = lek_entry.get("value")
        metric = base_entry.get("metric") or lek_entry.get("metric", "?")

        if bv is None and lv is None:
            continue

        base_cell = pct(bv) if bv is not None else missing
        lek_cell = pct(lv) if lv is not None else missing
        if bv is not None and lv is not None:
            delta_cell = signed_pct(lv - bv)
        else:
            delta_cell = missing
        print(f"{task:<30s} {metric:<15s} {base_cell} {lek_cell} {delta_cell}")

    # Summary: average only over tasks scored on both sides so the two
    # averages are computed from the same task set.
    both = [t for t in all_tasks if t in base_scores and t in lek_scores]
    if both:
        avg_base = sum(base_scores[t]["value"] for t in both) / len(both)
        avg_lek = sum(lek_scores[t]["value"] for t in both) / len(both)
        print(rule)
        print(
            f"{'AVERAGE':<30s} {'':15s} {pct(avg_base)} {pct(avg_lek)} "
            f"{signed_pct(avg_lek - avg_base)}"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse the command line, load two result sets and print their comparison.

    Input selection, in order of precedence:
      * ``--base`` and ``--lek`` together;
      * ``--dir``: the first two subdirectories in sorted name order;
      * two or more positional paths: the first two are used.
    With none of these, the help text is printed.

    Returns:
        Process exit status: 0 after a comparison (or after printing help),
        1 when the inputs could not be resolved or loaded.
    """
    parser = argparse.ArgumentParser(description="Compare lm-eval benchmark results")
    parser.add_argument("--base", help="Base model results directory")
    parser.add_argument("--lek", help="LEK model results directory")
    parser.add_argument("--dir", help="Auto-detect pairs in directory")
    parser.add_argument("paths", nargs="*", help="Result directories (base first, then lek)")
    args = parser.parse_args()

    if args.base and args.lek:
        base_path, lek_path = args.base, args.lek
        names = ()  # keep compare()'s default column headers
    elif args.dir:
        result_dir = Path(args.dir)
        if not result_dir.is_dir():
            # Avoid an unhandled traceback from iterdir() on a bad path.
            print(f"Error: not a directory: {result_dir}", file=sys.stderr)
            return 1
        dirs = sorted(d for d in result_dir.iterdir() if d.is_dir())
        if len(dirs) < 2:
            print(f"Need at least 2 result directories in {result_dir}")
            return 1
        print(f"Found {len(dirs)} result directories")
        for i, d in enumerate(dirs):
            print(f" [{i}] {d.name}")
        # Compare first two by default
        base_path, lek_path = dirs[0], dirs[1]
        names = (dirs[0].name, dirs[1].name)
    elif len(args.paths) >= 2:
        base_path, lek_path = args.paths[0], args.paths[1]
        names = (Path(base_path).name, Path(lek_path).name)
    else:
        parser.print_help()
        return 0

    # Load both sides before checking so a warning is shown for each missing one.
    base_data = load_results(base_path)
    lek_data = load_results(lek_path)
    if not (base_data and lek_data):
        return 1

    compare(base_data, lek_data, *names)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
180
scripts/run_benchmarks.sh
Executable file
180
scripts/run_benchmarks.sh
Executable file
|
|
@ -0,0 +1,180 @@
|
||||||
|
#!/bin/bash
#
# LEM Standard Benchmark Suite
# =============================
# Runs industry-standard benchmarks using EleutherAI's lm-evaluation-harness.
# Results are intended to be comparable to the HuggingFace Open LLM Leaderboard.
# NOTE(review): this script sets no few-shot options itself, so comparability
# depends on each task's defaults matching the leaderboard's settings — confirm.
#
# Prerequisites:
# pipx install lm-eval # or: pip install lm-eval
#
# Usage:
# ./scripts/run_benchmarks.sh --model hf --model-id google/gemma-3-12b-it
# ./scripts/run_benchmarks.sh --model local-chat-completions --api-url http://localhost:8090/v1
# ./scripts/run_benchmarks.sh --suite leaderboard-v2 # Open LLM Leaderboard v2 benchmarks
# ./scripts/run_benchmarks.sh --suite classic # Classic benchmarks
# ./scripts/run_benchmarks.sh --suite quick # Fast subset for testing
#
# Note: there is no interactive model selection. The default backend is "hf",
# which requires --model-id, so a bare invocation exits with an error.
#
# Abort on the first failing command.
set -e

# Resolve paths relative to this script so it works from any working directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(dirname "$SCRIPT_DIR")"
RESULTS_DIR="${REPO_ROOT}/benchmarks/lm-eval-results"
# Created up front, i.e. before --output can override RESULTS_DIR.
mkdir -p "$RESULTS_DIR"

# Defaults
MODEL_TYPE="hf"
MODEL_ID=""
API_URL=""
SUITE="quick"
BATCH_SIZE="auto"
# Unrecognised command-line arguments, forwarded verbatim to lm-eval.
EXTRA_ARGS=""
|
||||||
|
|
||||||
|
# Print the option summary to stdout and exit successfully.
usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Options:
 --model TYPE Model backend: hf, local-chat-completions, vllm (default: hf)
 --model-id ID HuggingFace model ID (e.g. google/gemma-3-12b-it)
 --api-url URL API URL for local-chat-completions backend
 --api-model NAME Model name for API backend (default: auto)
 --suite SUITE Benchmark suite: quick, classic, leaderboard-v2, full (default: quick)
 --batch-size N Batch size (default: auto)
 --output DIR Output directory (default: benchmarks/lm-eval-results/)
 --help Show this help
EOF
    exit 0
}
|
||||||
|
|
||||||
|
# Parse args
API_MODEL=""

# Fail with a clear message when a value-taking option is the last argument.
# Without this check `shift 2` fails and `set -e` ends the script silently.
#   $1 = option name, $2 = number of arguments still unparsed
need_value() {
    if [[ "$2" -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        exit 1
    fi
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        --model)      need_value "$1" $#; MODEL_TYPE="$2"; shift 2 ;;
        --model-id)   need_value "$1" $#; MODEL_ID="$2"; shift 2 ;;
        --api-url)    need_value "$1" $#; API_URL="$2"; shift 2 ;;
        --api-model)  need_value "$1" $#; API_MODEL="$2"; shift 2 ;;
        --suite)      need_value "$1" $#; SUITE="$2"; shift 2 ;;
        --batch-size) need_value "$1" $#; BATCH_SIZE="$2"; shift 2 ;;
        --output)     need_value "$1" $#; RESULTS_DIR="$2"; shift 2 ;;
        --help)       usage ;;
        # Anything unrecognised is collected and forwarded to lm-eval.
        *)            EXTRA_ARGS="$EXTRA_ARGS $1"; shift ;;
    esac
done
|
||||||
|
|
||||||
|
# ── Suite definitions ────────────────────────────────────────────

# Task lists shared between presets.
CORE_TASKS="gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande"
V2_TASKS="ifeval,bbh,gpqa,musr"

# Map the --suite value to the comma-separated task list handed to lm-eval.
case "$SUITE" in
    quick)
        # Fast sanity check (~5-10 min)
        TASKS="${CORE_TASKS}"
        ;;
    classic)
        # Classic Open LLM Leaderboard v1 benchmarks
        TASKS="mmlu,${CORE_TASKS}"
        ;;
    leaderboard-v2)
        # Open LLM Leaderboard v2 (harder, current standard)
        # Note: math_hard not included — requires special setup
        TASKS="${V2_TASKS},mmlu_pro"
        ;;
    full)
        # Everything
        TASKS="mmlu,mmlu_pro,${CORE_TASKS},${V2_TASKS}"
        ;;
    *)
        # Any other value is treated as a custom task list
        TASKS="$SUITE"
        ;;
esac
|
||||||
|
|
||||||
|
# ── Build model args ─────────────────────────────────────────────

# MODEL_ARGS is passed to lm-eval as --model_args; RUN_NAME becomes part of
# the output directory name, so '/' is replaced with '_' in every branch.
MODEL_ARGS=""
RUN_NAME=""

case "$MODEL_TYPE" in
    hf|vllm)
        # Both backends take the model as pretrained=<id>.
        if [ -z "$MODEL_ID" ]; then
            echo "Error: --model-id required for ${MODEL_TYPE} backend" >&2
            if [ "$MODEL_TYPE" = "hf" ]; then
                echo "Example: --model-id google/gemma-3-12b-it" >&2
            fi
            exit 1
        fi
        MODEL_ARGS="pretrained=${MODEL_ID}"
        RUN_NAME="${MODEL_ID//\//_}"
        ;;
    local-chat-completions)
        if [ -z "$API_URL" ]; then
            API_URL="http://localhost:8090/v1"
            echo "Using default API URL: $API_URL"
        fi
        MODEL_ARGS="model=${API_MODEL:-default},base_url=${API_URL},num_concurrent=1,max_retries=3,tokenized_requests=False"
        RUN_NAME="${API_MODEL:-local-api}"
        # API model names may contain '/', which would otherwise create
        # nested output directories.
        RUN_NAME="${RUN_NAME//\//_}"
        ;;
    *)
        echo "Error: unknown model type: $MODEL_TYPE" >&2
        exit 1
        ;;
esac
|
||||||
|
|
||||||
|
# ── Run ──────────────────────────────────────────────────────────

# Each run gets its own timestamped output directory.
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
OUTPUT_PATH="${RESULTS_DIR}/${RUN_NAME}_${SUITE}_${TIMESTAMP}"

echo "============================================"
echo "LEM Standard Benchmark Suite"
echo "============================================"
echo "Model: ${MODEL_TYPE} (${MODEL_ID:-${API_URL}})"
echo "Suite: ${SUITE}"
echo "Tasks: ${TASKS}"
echo "Output: ${OUTPUT_PATH}"
echo "============================================"
echo ""

# EXTRA_ARGS is intentionally unquoted so each forwarded argument is passed
# to lm-eval as its own word.
# shellcheck disable=SC2086
lm-eval run \
    --model "$MODEL_TYPE" \
    --model_args "$MODEL_ARGS" \
    --tasks "$TASKS" \
    --batch_size "$BATCH_SIZE" \
    --output_path "$OUTPUT_PATH" \
    --log_samples \
    $EXTRA_ARGS

echo ""
echo "Results saved to: ${OUTPUT_PATH}"
echo ""

# Show summary
# Locate the results file anywhere under the output directory. It may be
# named results.json or carry a timestamp suffix (results_<ts>.json) and may
# sit in a subdirectory; the last match in sorted order is used.
RESULTS_FILE=$(find "$OUTPUT_PATH" -type f -name 'results*.json' 2>/dev/null | sort | tail -n 1)

if [ -n "$RESULTS_FILE" ] && [ -f "$RESULTS_FILE" ]; then
    echo "=== Results Summary ==="
    # The path is passed as an argument rather than interpolated into the
    # Python source, so quotes or spaces in it cannot break the snippet.
    python3 - "$RESULTS_FILE" <<'PY'
import json, sys

with open(sys.argv[1], encoding='utf-8') as f:
    data = json.load(f)

results = data.get('results', {})
print(f'Model: {data.get("model_name", "unknown")}')
print(f'Tasks: {len(results)}')
print()

# Same priority order as scripts/compare_models.py.
primary = ['acc,none', 'acc_norm,none', 'exact_match,strict-match',
           'mc2,none', 'prompt_level_strict_acc,none']

for task, scores in sorted(results.items()):
    # Find the primary metric
    for key in primary:
        if key in scores:
            print(f' {task:30s} {key.split(",")[0]:15s} {scores[key]*100:.1f}%')
            break
    else:
        # No primary metric present: show the first numeric metric instead
        for key, val in scores.items():
            if isinstance(val, (int, float)) and not key.startswith('alias'):
                print(f' {task:30s} {key.split(",")[0]:15s} {val:.4f}')
                break
PY
else
    echo "No results file found under ${OUTPUT_PATH}; skipping summary." >&2
fi
|
||||||
Loading…
Add table
Reference in a new issue