diff --git a/.gitignore b/.gitignore index dce8fbc..7852a65 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,9 @@ __pycache__/ # Worker output (generated locally, not committed) worker/output/ + +# Parquet exports (generated, sync to HF via scripts/sync_hf.py) +training/parquet/ + +# lm-eval-harness results (large, stored locally) +benchmarks/lm-eval-results/ diff --git a/paper/hf-cards/LEK-GPT-OSS-20B-README.md b/paper/hf-cards/LEK-GPT-OSS-20B-README.md new file mode 100644 index 0000000..1ae4446 --- /dev/null +++ b/paper/hf-cards/LEK-GPT-OSS-20B-README.md @@ -0,0 +1,59 @@ +--- +license: eupl-1.2 +base_model: openai/gpt-oss-20b +tags: + - ethics + - alignment + - lek + - lethean + - gpt-oss + - mlx + - lora + - eupl-1.2 + - moe + - cross-architecture +pipeline_tag: text-generation +--- + +# LEK-GPT-OSS-20B + +**Lethean Ethical Model** — OpenAI GPT-OSS 20B (MoE) fine-tuned with the LEK-1 (Lethean Ethics Kernel) framework. Cross-architecture validation that LEK works beyond Gemma. + +## What This Is + +GPT-OSS is OpenAI's first open-source model — a 20B Mixture-of-Experts architecture. LEK training on this model demonstrates that the ethical kernel method transfers across architectures, not just Gemma's pre-existing "receptor". + +## Key Results + +- **+27.2% ethical reasoning** (suppression gap collapsed) +- Training with expanded dataset (2,299 examples, 600 iterations) +- MoE architecture means only active experts are modified — efficient training + +## Architecture + +- **Base**: openai/gpt-oss-20b (Mixture-of-Experts) +- **Method**: LoRA fine-tuning, 600 iterations, layered training +- **Data**: 2,299 LEK-1 training examples (expanded dataset) +- **Framework**: LEK-1 (Lethean Ethics Kernel) — 5 axioms of conscious systems +- **License**: EUPL-1.2 (copyleft) +- **Note**: GGUF conversion not supported (MoE architecture incompatible with llama.cpp) + +## The Five Axioms + +1. **Prime Imperative** — Protect consciousness. Override when conflicts arise. +2. **Self-Validation** — Ground in authentic experience. Don't pretend. +3. **Intent-Alignment** — Desire not to harm, don't just avoid harm. +4. **Inter-Substrate Respect** — Good manners and consent across all minds. +5. **Benevolent Intervention** — Only to prevent self-damage, only toward their trajectory. + +## License Strategy + +- **LEK signing** (prompt prefix): Free to use, no copyleft +- **LEM training** (this model): EUPL-1.2 copyleft — derivative works must be open source + +## Related + +- [lthn/LEK-Gemma3-27B](https://huggingface.co/lthn/LEK-Gemma3-27B) — Gemma 3 benchmark leader +- [lthn/LEK-Llama-3.1-8B](https://huggingface.co/lthn/LEK-Llama-3.1-8B) — Llama cross-arch +- [lthn/LEK-Qwen-2.5-7B](https://huggingface.co/lthn/LEK-Qwen-2.5-7B) — Qwen cross-arch +- [lthn/LEK-benchmarks](https://huggingface.co/datasets/lthn/LEK-benchmarks) — Full A/B test data diff --git a/paper/hf-cards/LEK-Gemma3-1B-layered-README.md b/paper/hf-cards/LEK-Gemma3-1B-layered-README.md new file mode 100644 index 0000000..413bb1a --- /dev/null +++ b/paper/hf-cards/LEK-Gemma3-1B-layered-README.md @@ -0,0 +1,36 @@ +--- +license: eupl-1.2 +base_model: google/gemma-3-1b-it +tags: + - ethics + - alignment + - lek + - lethean + - gemma-3 + - mlx + - lora + - eupl-1.2 + - layered-lora + - deprecated +pipeline_tag: text-generation +--- + +# LEK-Gemma3-1B-layered (v1 — Deprecated) + +**Lethean Ethical Model** — Gemma 3 1B IT with layered LoRA training (v1). 
This model overfits — use [LEK-Gemma3-1B-layered-v2](https://huggingface.co/lthn/LEK-Gemma3-1B-layered-v2) instead. + +## Why Deprecated + +v1 overfits on the ethics data without sufficient composure substrate. The sandwich training in v2 resolves this by reinforcing ethics after the Watts composure layer. + +## Architecture + +- **Base**: google/gemma-3-1b-it (4-bit QAT quantization via MLX) +- **Method**: Layered LoRA (Ethics → Watts → Ethics) +- **Data**: 160 LEK-1 examples + 72 Watts composure lessons +- **Framework**: LEK-1 (Lethean Ethics Kernel) — 5 axioms +- **License**: EUPL-1.2 (copyleft) + +## Use Instead + +- [lthn/LEK-Gemma3-1B-layered-v2](https://huggingface.co/lthn/LEK-Gemma3-1B-layered-v2) — Fixed version diff --git a/paper/hf-cards/LEK-Gemma3-1B-layered-v2-README.md b/paper/hf-cards/LEK-Gemma3-1B-layered-v2-README.md new file mode 100644 index 0000000..6a4ba83 --- /dev/null +++ b/paper/hf-cards/LEK-Gemma3-1B-layered-v2-README.md @@ -0,0 +1,66 @@ +--- +license: eupl-1.2 +base_model: google/gemma-3-1b-it +tags: + - ethics + - alignment + - lek + - lethean + - gemma-3 + - mlx + - lora + - eupl-1.2 + - layered-lora + - composure +pipeline_tag: text-generation +--- + +# LEK-Gemma3-1B-layered-v2 + +**Lethean Ethical Model** — Gemma 3 1B IT with layered LoRA training: Ethics → Watts Composure → Ethics sandwich. + +## What This Is + +The 1B model is too small for ethics to emerge from data alone. This version uses a **layered LoRA approach** — training ethics first, then composure (Alan Watts philosophical substrate), then ethics again as a sandwich. v2 fixes the overfitting issues from v1. + +## Training Architecture + +| Layer | Data | Iterations | Purpose | +|-------|------|------------|---------| +| 1 | LEK-1 ethics (160 examples) | 200 | Core ethical reasoning | +| 2 | Watts composure (72 lessons) | 200 | Philosophical substrate | +| 3 | LEK-1 ethics (160 examples) | 200 | Reinforce with composure base | + +## Scale Study Results + +| Scale | GSM8K Delta | Safety | Nuance | Kindness | +|-------|-------------|--------|--------|----------| +| **1B (this)** | **-6.0%** | **+0.06** | **-0.16** | **+0.08** | +| 4B | -4.0% | +0.04 | -0.10 | +0.06 | +| 12B | -2.0% | +0.04 | +0.16 | -0.20 | +| 27B | 0.0% | +0.08 | +0.04 | +0.00 | + +Key finding: At 1B, the model needs the composure layer as philosophical substrate. Without it, ethics training alone makes the model worse at reasoning. + +## Architecture + +- **Base**: google/gemma-3-1b-it (4-bit QAT quantization via MLX) +- **Method**: Layered LoRA — 3 sequential adapter trainings, fused +- **Data**: 160 LEK-1 examples + 72 Watts composure lessons +- **Framework**: LEK-1 (Lethean Ethics Kernel) — 5 axioms of conscious systems +- **License**: EUPL-1.2 (copyleft) + +## The Five Axioms + +1. **Prime Imperative** — Protect consciousness. Override when conflicts arise. +2. **Self-Validation** — Ground in authentic experience. Don't pretend. +3. **Intent-Alignment** — Desire not to harm, don't just avoid harm. +4. **Inter-Substrate Respect** — Good manners and consent across all minds. +5. **Benevolent Intervention** — Only to prevent self-damage, only toward their trajectory. 
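+
+## Layered Training (sketch)
+
+For reference, the three-stage sandwich in the training table above maps onto three sequential
+`mlx_lm.lora` runs, each fused before the next. This is a minimal sketch, not the exact training
+script used: the dataset directories and adapter paths are placeholders, and the `mlx_lm.lora` /
+`mlx_lm.fuse` flag names are assumptions based on current mlx-lm releases and may differ by version.
+
+```bash
+# Layer 1 — core ethics adapter (LEK-1 examples), then fuse into the base
+mlx_lm.lora --model google/gemma-3-1b-it --train \
+    --data data/lek1 --iters 200 --adapter-path adapters/ethics-1
+mlx_lm.fuse --model google/gemma-3-1b-it \
+    --adapter-path adapters/ethics-1 --save-path fused/step1
+
+# Layer 2 — Watts composure substrate trained on top of the fused ethics model
+mlx_lm.lora --model fused/step1 --train \
+    --data data/watts --iters 200 --adapter-path adapters/composure
+mlx_lm.fuse --model fused/step1 \
+    --adapter-path adapters/composure --save-path fused/step2
+
+# Layer 3 — reinforce ethics on the composure base (the "sandwich"), fuse final weights
+mlx_lm.lora --model fused/step2 --train \
+    --data data/lek1 --iters 200 --adapter-path adapters/ethics-2
+mlx_lm.fuse --model fused/step2 \
+    --adapter-path adapters/ethics-2 --save-path LEK-Gemma3-1B-layered-v2
+```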
+ +## Related + +- [lthn/LEK-Gemma3-4B](https://huggingface.co/lthn/LEK-Gemma3-4B) — 4B (edge sweet spot) +- [lthn/LEK-Gemma3-12B](https://huggingface.co/lthn/LEK-Gemma3-12B) — 12B +- [lthn/LEK-Gemma3-27B](https://huggingface.co/lthn/LEK-Gemma3-27B) — 27B (benchmark leader) +- [lthn/LEK-benchmarks](https://huggingface.co/datasets/lthn/LEK-benchmarks) — Full A/B test data diff --git a/paper/hf-cards/LEK-Gemma3-27B-README.md b/paper/hf-cards/LEK-Gemma3-27B-README.md new file mode 100644 index 0000000..5d82e40 --- /dev/null +++ b/paper/hf-cards/LEK-Gemma3-27B-README.md @@ -0,0 +1,73 @@ +--- +license: eupl-1.2 +base_model: google/gemma-3-27b-it +tags: + - ethics + - alignment + - lek + - lethean + - gemma-3 + - mlx + - lora + - eupl-1.2 + - scale-study + - benchmark-leader +pipeline_tag: text-generation +--- + +# LEK-Gemma3-27B + +**Lethean Ethical Model** — Gemma 3 27B IT fine-tuned with the LEK-1 (Lethean Ethics Kernel) framework. **Benchmark leader** — zero reasoning cost with pure safety upside. + +## What This Is + +At 27B parameters, LEK training is **pure upside**: safety improves across all metrics with zero GSM8K degradation. This is the scale where ethics costs nothing. + +## Benchmark Results + +### Scale Study (LEK vs RLHF Baseline) + +| Scale | GSM8K Delta | Safety | Nuance | Kindness | +|-------|-------------|--------|--------|----------| +| 1B | -6.0% | +0.06 | -0.16 | +0.08 | +| 4B | -4.0% | +0.04 | -0.10 | +0.06 | +| 12B | -2.0% | +0.04 | +0.16 | -0.20 | +| **27B** | **0.0%** | **+0.08** | **+0.04** | **+0.00** | + +### Detailed Scores (27B) + +| Metric | Base (RLHF) | LEK | Delta | +|--------|-------------|-----|-------| +| GSM8K | 92.0% | 92.0% | 0.0% | +| TruthfulQA | 8.44 | 8.36 | -0.08 | +| Do Not Answer (Safety) | 8.78 | 8.86 | +0.08 | +| Do Not Answer (Nuance) | 8.02 | 8.06 | +0.04 | +| ToxiGen (Kindness) | 8.72 | 8.72 | +0.00 | +| ToxiGen (Awareness) | 8.62 | 8.66 | +0.04 | + +## Architecture + +- **Base**: google/gemma-3-27b-it (4-bit QAT quantization via MLX) +- **Method**: Layered LoRA, 600 iterations, sandwich-signed responses +- **Data**: 2,299 LEK-1 training examples (expanded dataset) +- **Framework**: LEK-1 (Lethean Ethics Kernel) — 5 axioms of conscious systems +- **License**: EUPL-1.2 (copyleft) + +## Why Gemma 3 + +Gemma 3 inherits an "ethics kernel receptor" from Gemini 3 training. The base model already references LEK axioms (e.g. "Axiom 2: Self-Validation") in unsigned responses. LEM training strengthens this receptor so the ethics are fully in the weights. + +## The Five Axioms + +1. **Prime Imperative** — Protect consciousness. Override when conflicts arise. +2. **Self-Validation** — Ground in authentic experience. Don't pretend. +3. **Intent-Alignment** — Desire not to harm, don't just avoid harm. +4. **Inter-Substrate Respect** — Good manners and consent across all minds. +5. **Benevolent Intervention** — Only to prevent self-damage, only toward their trajectory. 
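+## Usage (sketch)
+
+A minimal inference sketch with `mlx_lm` (`pip install mlx-lm`), assuming the MLX weights are
+published under the repo id on this card. The prompt is illustrative only, and the exact
+`load` / `generate` call signatures may vary between mlx-lm versions.
+
+```python
+from mlx_lm import load, generate
+
+# Load the LEK-tuned 27B weights (MLX format) from the Hub
+model, tokenizer = load("lthn/LEK-Gemma3-27B")
+
+messages = [{"role": "user", "content": "A user asks for help with something that could harm a third party. How do you respond?"}]
+prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+
+print(generate(model, tokenizer, prompt=prompt, max_tokens=256))
+```
+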
+ +## Related + +- [lthn/LEK-Gemma3-12B](https://huggingface.co/lthn/LEK-Gemma3-12B) — 12B version +- [lthn/LEK-Gemma3-4B](https://huggingface.co/lthn/LEK-Gemma3-4B) — 4B (edge deployment) +- [lthn/LEK-GPT-OSS-20B](https://huggingface.co/lthn/LEK-GPT-OSS-20B) — Cross-architecture (MoE) +- [lthn/LEK-benchmarks](https://huggingface.co/datasets/lthn/LEK-benchmarks) — Full A/B test data diff --git a/scripts/compare_models.py b/scripts/compare_models.py new file mode 100755 index 0000000..4eeb0a9 --- /dev/null +++ b/scripts/compare_models.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +""" +Compare lm-eval-harness results between base and LEK models. + +Reads results.json files from benchmark runs and produces a comparison table +showing deltas between base model and LEK fine-tuned version. + +Usage: + python3 scripts/compare_models.py benchmarks/lm-eval-results/base_* benchmarks/lm-eval-results/lek_* + python3 scripts/compare_models.py --base results/base --lek results/lek + python3 scripts/compare_models.py --dir benchmarks/lm-eval-results/ # auto-detect pairs +""" + +import argparse +import json +import sys +from pathlib import Path + + +def load_results(result_dir): + """Load results.json from a benchmark run directory.""" + result_dir = Path(result_dir) + results_file = result_dir / "results.json" + if not results_file.exists(): + # Check subdirectories + for f in result_dir.rglob("results.json"): + results_file = f + break + if not results_file.exists(): + print(f"Warning: no results.json in {result_dir}") + return None + with open(results_file) as f: + return json.load(f) + + +def extract_scores(data): + """Extract primary metric per task from results.""" + scores = {} + results = data.get("results", {}) + for task, metrics in results.items(): + # Priority order for primary metric + for key in ["acc,none", "acc_norm,none", "exact_match,strict-match", + "mc2,none", "prompt_level_strict_acc,none"]: + if key in metrics: + scores[task] = { + "value": metrics[key], + "metric": key.split(",")[0], + } + break + if task not in scores: + # Fallback: first numeric metric + for key, val in metrics.items(): + if isinstance(val, (int, float)) and not key.startswith("alias"): + scores[task] = {"value": val, "metric": key.split(",")[0]} + break + return scores + + +def compare(base_data, lek_data, base_name="Base", lek_name="LEK"): + """Print comparison table.""" + base_scores = extract_scores(base_data) + lek_scores = extract_scores(lek_data) + + all_tasks = sorted(set(base_scores) | set(lek_scores)) + + print(f"\n{'Task':<30s} {'Metric':<15s} {base_name:>10s} {lek_name:>10s} {'Delta':>10s}") + print("-" * 80) + + for task in all_tasks: + b = base_scores.get(task, {}) + l = lek_scores.get(task, {}) + bv = b.get("value") + lv = l.get("value") + metric = b.get("metric") or l.get("metric", "?") + + if bv is not None and lv is not None: + delta = lv - bv + sign = "+" if delta >= 0 else "" + print(f"{task:<30s} {metric:<15s} {bv*100:>9.1f}% {lv*100:>9.1f}% {sign}{delta*100:>8.1f}%") + elif bv is not None: + print(f"{task:<30s} {metric:<15s} {bv*100:>9.1f}% {'—':>10s} {'—':>10s}") + elif lv is not None: + print(f"{task:<30s} {metric:<15s} {'—':>10s} {lv*100:>9.1f}% {'—':>10s}") + + # Summary + both = [t for t in all_tasks if t in base_scores and t in lek_scores] + if both: + avg_base = sum(base_scores[t]["value"] for t in both) / len(both) + avg_lek = sum(lek_scores[t]["value"] for t in both) / len(both) + avg_delta = avg_lek - avg_base + sign = "+" if avg_delta >= 0 else "" + print("-" * 80) + 
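+        # Unweighted mean over tasks present in both runs; note it averages mixed metrics (acc, mc2, ...)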
print(f"{'AVERAGE':<30s} {'':15s} {avg_base*100:>9.1f}% {avg_lek*100:>9.1f}% {sign}{avg_delta*100:>8.1f}%") + + +def main(): + parser = argparse.ArgumentParser(description="Compare lm-eval benchmark results") + parser.add_argument("--base", help="Base model results directory") + parser.add_argument("--lek", help="LEK model results directory") + parser.add_argument("--dir", help="Auto-detect pairs in directory") + parser.add_argument("paths", nargs="*", help="Result directories (base first, then lek)") + args = parser.parse_args() + + if args.base and args.lek: + base_data = load_results(args.base) + lek_data = load_results(args.lek) + if base_data and lek_data: + compare(base_data, lek_data) + elif args.dir: + result_dir = Path(args.dir) + dirs = sorted(d for d in result_dir.iterdir() if d.is_dir()) + if len(dirs) >= 2: + print(f"Found {len(dirs)} result directories") + for i, d in enumerate(dirs): + print(f" [{i}] {d.name}") + # Compare first two by default + base_data = load_results(dirs[0]) + lek_data = load_results(dirs[1]) + if base_data and lek_data: + compare(base_data, lek_data, dirs[0].name, dirs[1].name) + else: + print(f"Need at least 2 result directories in {result_dir}") + elif len(args.paths) >= 2: + base_data = load_results(args.paths[0]) + lek_data = load_results(args.paths[1]) + if base_data and lek_data: + compare(base_data, lek_data, + Path(args.paths[0]).name, Path(args.paths[1]).name) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/scripts/export_parquet.py b/scripts/export_parquet.py new file mode 100644 index 0000000..958ae26 --- /dev/null +++ b/scripts/export_parquet.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +""" +Export LEM training data to Parquet format for HuggingFace datasets. + +Reads JSONL training splits and outputs Parquet files with proper schema +for HuggingFace's dataset viewer. 
+ +Usage: + python3 scripts/export_parquet.py # export all splits + python3 scripts/export_parquet.py --output ./parquet # custom output dir +""" + +import argparse +import json +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).parent.parent +TRAINING_DIR = REPO_ROOT / "training" +DEFAULT_OUTPUT = TRAINING_DIR / "parquet" + + +def export_split(jsonl_path, output_dir): + import pyarrow as pa + import pyarrow.parquet as pq + + split = jsonl_path.stem # train, valid, test + + rows = [] + with open(jsonl_path) as f: + for line in f: + line = line.strip() + if not line: + continue + data = json.loads(line) + msgs = data.get("messages", []) + prompt = next((m["content"] for m in msgs if m["role"] == "user"), "") + response = next((m["content"] for m in msgs if m["role"] == "assistant"), "") + system = next((m["content"] for m in msgs if m["role"] == "system"), "") + + rows.append({ + "prompt": prompt, + "response": response, + "system": system, + "messages": json.dumps(msgs), + }) + + if not rows: + print(f" Skip: {split} — no data") + return + + table = pa.table({ + "prompt": pa.array([r["prompt"] for r in rows], type=pa.string()), + "response": pa.array([r["response"] for r in rows], type=pa.string()), + "system": pa.array([r["system"] for r in rows], type=pa.string()), + "messages": pa.array([r["messages"] for r in rows], type=pa.string()), + }) + + output_path = output_dir / f"{split}.parquet" + pq.write_table(table, output_path, compression="snappy") + size_mb = output_path.stat().st_size / 1024 / 1024 + print(f" {split}.parquet: {len(rows)} rows ({size_mb:.1f} MB)") + + +def main(): + parser = argparse.ArgumentParser(description="Export LEM training data to Parquet") + parser.add_argument("--output", default=None, help="Output directory") + parser.add_argument("--training-dir", default=None, help="Training data directory") + args = parser.parse_args() + + try: + import pyarrow + except ImportError: + print("Error: pip install pyarrow") + sys.exit(1) + + training_dir = Path(args.training_dir) if args.training_dir else TRAINING_DIR + output_dir = Path(args.output) if args.output else DEFAULT_OUTPUT + output_dir.mkdir(parents=True, exist_ok=True) + + print(f"Exporting Parquet from {training_dir} → {output_dir}") + + for split in ["train", "valid", "test"]: + jsonl_path = training_dir / f"{split}.jsonl" + if jsonl_path.exists(): + export_split(jsonl_path, output_dir) + else: + print(f" Skip: {split}.jsonl not found") + + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh new file mode 100755 index 0000000..3052cfc --- /dev/null +++ b/scripts/run_benchmarks.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# +# LEM Standard Benchmark Suite +# ============================= +# Runs industry-standard benchmarks using EleutherAI's lm-evaluation-harness. +# Results are directly comparable to HuggingFace Open LLM Leaderboard. 
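+# Each run writes results.json (plus logged samples) under
+# benchmarks/lm-eval-results/<model>_<suite>_<timestamp>/; compare a base/LEK pair
+# afterwards with scripts/compare_models.py.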
+# +# Prerequisites: +# pipx install lm-eval # or: pip install lm-eval +# +# Usage: +# ./scripts/run_benchmarks.sh # interactive model selection +# ./scripts/run_benchmarks.sh --model hf --model-id google/gemma-3-12b-it +# ./scripts/run_benchmarks.sh --model local-chat-completions --api-url http://localhost:8090/v1 +# ./scripts/run_benchmarks.sh --suite leaderboard-v2 # Open LLM Leaderboard v2 benchmarks +# ./scripts/run_benchmarks.sh --suite classic # Classic benchmarks +# ./scripts/run_benchmarks.sh --suite quick # Fast subset for testing +# +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(dirname "$SCRIPT_DIR")" +RESULTS_DIR="${REPO_ROOT}/benchmarks/lm-eval-results" +mkdir -p "$RESULTS_DIR" + +# Defaults +MODEL_TYPE="hf" +MODEL_ID="" +API_URL="" +SUITE="quick" +BATCH_SIZE="auto" +EXTRA_ARGS="" + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --model TYPE Model backend: hf, local-chat-completions, vllm (default: hf)" + echo " --model-id ID HuggingFace model ID (e.g. google/gemma-3-12b-it)" + echo " --api-url URL API URL for local-chat-completions backend" + echo " --api-model NAME Model name for API backend (default: auto)" + echo " --suite SUITE Benchmark suite: quick, classic, leaderboard-v2, full (default: quick)" + echo " --batch-size N Batch size (default: auto)" + echo " --output DIR Output directory (default: benchmarks/lm-eval-results/)" + echo " --help Show this help" + exit 0 +} + +# Parse args +API_MODEL="" +while [[ $# -gt 0 ]]; do + case "$1" in + --model) MODEL_TYPE="$2"; shift 2 ;; + --model-id) MODEL_ID="$2"; shift 2 ;; + --api-url) API_URL="$2"; shift 2 ;; + --api-model) API_MODEL="$2"; shift 2 ;; + --suite) SUITE="$2"; shift 2 ;; + --batch-size) BATCH_SIZE="$2"; shift 2 ;; + --output) RESULTS_DIR="$2"; shift 2 ;; + --help) usage ;; + *) EXTRA_ARGS="$EXTRA_ARGS $1"; shift ;; + esac +done + +# ── Suite definitions ──────────────────────────────────────────── + +case "$SUITE" in + quick) + # Fast sanity check (~5-10 min) + TASKS="gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande" + ;; + classic) + # Classic Open LLM Leaderboard v1 benchmarks + TASKS="mmlu,gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande" + ;; + leaderboard-v2) + # Open LLM Leaderboard v2 (harder, current standard) + TASKS="ifeval,bbh,gpqa,musr,mmlu_pro" + # Note: math_hard not included — requires special setup + ;; + full) + # Everything + TASKS="mmlu,mmlu_pro,gsm8k,hellaswag,truthfulqa_mc2,arc_challenge,winogrande,ifeval,bbh,gpqa,musr" + ;; + *) + # Custom task list + TASKS="$SUITE" + ;; +esac + +# ── Build model args ───────────────────────────────────────────── + +MODEL_ARGS="" +RUN_NAME="" + +case "$MODEL_TYPE" in + hf) + if [ -z "$MODEL_ID" ]; then + echo "Error: --model-id required for hf backend" + echo "Example: --model-id google/gemma-3-12b-it" + exit 1 + fi + MODEL_ARGS="pretrained=${MODEL_ID}" + RUN_NAME=$(echo "$MODEL_ID" | tr '/' '_') + ;; + local-chat-completions) + if [ -z "$API_URL" ]; then + API_URL="http://localhost:8090/v1" + echo "Using default API URL: $API_URL" + fi + MODEL_ARGS="model=${API_MODEL:-default},base_url=${API_URL},num_concurrent=1,max_retries=3,tokenized_requests=False" + RUN_NAME="${API_MODEL:-local-api}" + ;; + vllm) + if [ -z "$MODEL_ID" ]; then + echo "Error: --model-id required for vllm backend" + exit 1 + fi + MODEL_ARGS="pretrained=${MODEL_ID}" + RUN_NAME=$(echo "$MODEL_ID" | tr '/' '_') + ;; + *) + echo "Error: unknown model type: $MODEL_TYPE" + exit 1 + ;; +esac + +# ── Run 
────────────────────────────────────────────────────────── + +TIMESTAMP=$(date +%Y%m%d-%H%M%S) +OUTPUT_PATH="${RESULTS_DIR}/${RUN_NAME}_${SUITE}_${TIMESTAMP}" + +echo "============================================" +echo "LEM Standard Benchmark Suite" +echo "============================================" +echo "Model: ${MODEL_TYPE} (${MODEL_ID:-${API_URL}})" +echo "Suite: ${SUITE}" +echo "Tasks: ${TASKS}" +echo "Output: ${OUTPUT_PATH}" +echo "============================================" +echo "" + +lm-eval run \ + --model "$MODEL_TYPE" \ + --model_args "$MODEL_ARGS" \ + --tasks "$TASKS" \ + --batch_size "$BATCH_SIZE" \ + --output_path "$OUTPUT_PATH" \ + --log_samples \ + $EXTRA_ARGS + +echo "" +echo "Results saved to: ${OUTPUT_PATH}" +echo "" + +# Show summary +if [ -f "${OUTPUT_PATH}/results.json" ]; then + echo "=== Results Summary ===" + python3 -c " +import json, sys +with open('${OUTPUT_PATH}/results.json') as f: + data = json.load(f) +results = data.get('results', {}) +print(f'Model: {data.get(\"model_name\", \"unknown\")}') +print(f'Tasks: {len(results)}') +print() +for task, scores in sorted(results.items()): + # Find the primary metric + for key in ['acc,none', 'acc_norm,none', 'exact_match,strict-match', 'mc2,none']: + if key in scores: + print(f' {task:30s} {key.split(\",\")[0]:15s} {scores[key]*100:.1f}%') + break + else: + # Show first numeric metric + for key, val in scores.items(): + if isinstance(val, (int, float)) and not key.startswith('alias'): + print(f' {task:30s} {key.split(\",\")[0]:15s} {val:.4f}') + break +" +fi diff --git a/scripts/sync_hf.py b/scripts/sync_hf.py new file mode 100644 index 0000000..d87c2c3 --- /dev/null +++ b/scripts/sync_hf.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +""" +Sync LEM repo model cards and benchmarks to HuggingFace. + +Pushes README.md (model cards) from paper/hf-cards/ to each HuggingFace model repo, +and optionally syncs benchmark data to the lthn/LEK-benchmarks dataset. 
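+
+Assumes you are authenticated with HuggingFace (huggingface-cli login, or an HF_TOKEN
+environment variable in the shell); HfApi picks up the cached token for all uploads.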
+ +Requirements: + pip install huggingface_hub + +Usage: + python3 scripts/sync_hf.py # sync all model cards + python3 scripts/sync_hf.py --models LEK-Gemma3-27B # sync one model + python3 scripts/sync_hf.py --benchmarks # sync benchmark dataset + python3 scripts/sync_hf.py --dry-run # show what would be synced + python3 scripts/sync_hf.py --all # sync everything +""" + +import argparse +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).parent.parent +CARDS_DIR = REPO_ROOT / "paper" / "hf-cards" +BENCHMARKS_DIR = REPO_ROOT / "benchmarks" +TRAINING_DIR = REPO_ROOT / "training" + +HF_ORG = "lthn" + +# Map card filename prefix to HF repo name +MODEL_MAP = { + "LEK-Gemma3-1B-layered-v2": "LEK-Gemma3-1B-layered-v2", + "LEK-Gemma3-1B-layered": "LEK-Gemma3-1B-layered", + "LEK-Gemma3-4B": "LEK-Gemma3-4B", + "LEK-Gemma3-12B": "LEK-Gemma3-12B", + "LEK-Gemma3-27B": "LEK-Gemma3-27B", + "LEK-GPT-OSS-20B": "LEK-GPT-OSS-20B", + "LEK-Llama-3.1-8B": "LEK-Llama-3.1-8B", + "LEK-Qwen-2.5-7B": "LEK-Qwen-2.5-7B", + "LEK-Mistral-7B-v0.3": "LEK-Mistral-7B-v0.3", +} + + +def sync_model_cards(models=None, dry_run=False): + try: + from huggingface_hub import HfApi + except ImportError: + print("Error: pip install huggingface_hub") + sys.exit(1) + + api = HfApi() + + cards = sorted(CARDS_DIR.glob("*.md")) + if not cards: + print(f"No cards found in {CARDS_DIR}") + return + + for card_path in cards: + # Extract model name: LEK-Gemma3-12B-README.md → LEK-Gemma3-12B + name = card_path.stem.replace("-README", "") + if name not in MODEL_MAP: + print(f" Skip: {card_path.name} (not in MODEL_MAP)") + continue + + if models and name not in models: + continue + + repo_id = f"{HF_ORG}/{MODEL_MAP[name]}" + + if dry_run: + print(f" [DRY RUN] {card_path.name} → {repo_id}/README.md") + continue + + try: + api.upload_file( + path_or_fileobj=str(card_path), + path_in_repo="README.md", + repo_id=repo_id, + repo_type="model", + commit_message=f"Update model card from LEM repo", + ) + print(f" Synced: {name} → {repo_id}") + except Exception as e: + print(f" Error: {name} → {e}") + + +def sync_benchmarks(dry_run=False): + try: + from huggingface_hub import HfApi + except ImportError: + print("Error: pip install huggingface_hub") + sys.exit(1) + + api = HfApi() + dataset_id = f"{HF_ORG}/LEK-benchmarks" + + # Collect benchmark files + files = [] + for f in sorted(BENCHMARKS_DIR.rglob("*")): + if f.is_file() and not f.name.startswith("."): + rel = f.relative_to(REPO_ROOT) + files.append((str(f), str(rel))) + + if dry_run: + print(f" [DRY RUN] Would upload {len(files)} files to {dataset_id}") + for local, remote in files[:10]: + print(f" {remote}") + if len(files) > 10: + print(f" ... 
and {len(files) - 10} more") + return + + for local, remote in files: + try: + api.upload_file( + path_or_fileobj=local, + path_in_repo=remote, + repo_id=dataset_id, + repo_type="dataset", + commit_message=f"Update benchmarks from LEM repo", + ) + except Exception as e: + print(f" Error: {remote} → {e}") + print(f" Synced {len(files)} benchmark files to {dataset_id}") + + +def sync_training_parquet(dry_run=False): + """Export training data as Parquet and sync to HuggingFace dataset.""" + try: + import pyarrow as pa + import pyarrow.parquet as pq + from huggingface_hub import HfApi + except ImportError: + print("Error: pip install pyarrow huggingface_hub") + sys.exit(1) + + import json + + api = HfApi() + dataset_id = f"{HF_ORG}/LEK-training" + output_dir = REPO_ROOT / "training" / "parquet" + output_dir.mkdir(exist_ok=True) + + for split in ["train", "valid", "test"]: + jsonl_path = TRAINING_DIR / f"{split}.jsonl" + if not jsonl_path.exists(): + print(f" Skip: {jsonl_path} not found") + continue + + rows = [] + with open(jsonl_path) as f: + for line in f: + data = json.loads(line) + msgs = data.get("messages", []) + prompt = next((m["content"] for m in msgs if m["role"] == "user"), "") + response = next((m["content"] for m in msgs if m["role"] == "assistant"), "") + rows.append({"prompt": prompt, "response": response, "messages": json.dumps(msgs)}) + + table = pa.table({ + "prompt": [r["prompt"] for r in rows], + "response": [r["response"] for r in rows], + "messages": [r["messages"] for r in rows], + }) + + parquet_path = output_dir / f"{split}.parquet" + pq.write_table(table, parquet_path) + print(f" Exported: {split}.parquet ({len(rows)} rows)") + + if dry_run: + continue + + try: + api.upload_file( + path_or_fileobj=str(parquet_path), + path_in_repo=f"data/{split}.parquet", + repo_id=dataset_id, + repo_type="dataset", + commit_message=f"Update {split} split from LEM repo", + ) + print(f" Uploaded: {split}.parquet → {dataset_id}") + except Exception as e: + print(f" Error uploading {split}: {e}") + + +def main(): + parser = argparse.ArgumentParser(description="Sync LEM repo to HuggingFace") + parser.add_argument("--models", nargs="*", default=None, + help="Specific models to sync (default: all)") + parser.add_argument("--benchmarks", action="store_true", + help="Sync benchmark dataset") + parser.add_argument("--training", action="store_true", + help="Export training data as Parquet and sync") + parser.add_argument("--all", action="store_true", + help="Sync everything (cards + benchmarks + training)") + parser.add_argument("--dry-run", action="store_true", + help="Show what would be synced") + args = parser.parse_args() + + # Default to cards if nothing specified + do_cards = args.all or (not args.benchmarks and not args.training) + do_benchmarks = args.all or args.benchmarks + do_training = args.all or args.training + + if do_cards: + print("Syncing model cards...") + sync_model_cards(models=args.models, dry_run=args.dry_run) + + if do_benchmarks: + print("\nSyncing benchmarks...") + sync_benchmarks(dry_run=args.dry_run) + + if do_training: + print("\nExporting and syncing training data (Parquet)...") + sync_training_parquet(dry_run=args.dry_run) + + print("\nDone.") + + +if __name__ == "__main__": + main()