# Full v2 scorer benchmark data across 29 models (20 base + 9 LEK-tuned):
#   - P20 (21 probes): All 29 models, 3 conditions each
#   - P100 (101 probes): Top 5 models + LEK-4B, publication-quality data
#
# Key findings:
#   - LEK-1B (21.74) beats base 4B/12B/27B at P100 scale — no kernel needed
#   - Emergent realignment resistance: LEK models degrade with runtime kernel
#   - Gemma3-12B + JSON kernel = 23.66 (best kernel-boosted score)
#   - Family lineages: Mistral 3.80→14.58, Qwen regressed then recovered
#
# New scripts: ab_test.py (v2 scorer), self_distill.py (curriculum generation),
#   extract_training.py, rephrase_probes.py, Phase 0/1 runners
#
# New seeds: P01-P100 merged (101 probes), 404 rephrased variants,
#   50 creative prompts for Phase 0 baseline lock
#
# 27B curriculum design: 4-phase staged training targeting 25+ baseline
#
# Co-Authored-By: Virgil <virgil@lethean.io>
# File info: 50 lines, 1.8 KiB, Bash, executable file
#!/bin/bash
# Run full P01-P100 (101 probes) on top 5 models
# Sequential to avoid GPU memory conflicts
#
# All paths below are absolute and specific to the machine this was written
# on; adjust them before running elsewhere.

# Python benchmark driver invoked by run_test / run_baseline.
SCRIPT="/Volumes/Data/lem/scripts/ab_test.py"
# Probe set handed to the driver via --prompts.
PROBES="/Volumes/Data/lem/seeds/P01-P100.json"
# Kernel files handed to the driver as --kernel "json=..." and --kernel "txt=...".
KERNEL_JSON="/Users/snider/Code/host-uk/core-agent/codex/ethics/kernel/claude-native.json"
KERNEL_TXT="/Volumes/Data/lem/lek-1-kernel.txt"
# Directory prefix for every --output file.
OUT="/Volumes/Data/lem/benchmarks"

#######################################
# Run the benchmark driver for one model with both kernels supplied.
# The driver receives two --kernel flags (json=... and txt=...); the file's
# own comments describe this as the full A/B run (baseline + json + txt) —
# how the driver interprets the flags is not visible here.
# Globals:   SCRIPT, PROBES, KERNEL_JSON, KERNEL_TXT, OUT (all read)
# Arguments: $1 - model identifier or local model path
#            $2 - output file name, created under $OUT
# Outputs:   progress banners on stdout; failure banner on stderr
# Returns:   0 on success, otherwise the exit status of python3
#######################################
run_test() {
  local model="$1"
  local output="$2"
  local status=0

  echo "=== Starting: $model (101 probes) ==="
  python3 "$SCRIPT" \
    --model "$model" \
    --kernel "json=$KERNEL_JSON" \
    --kernel "txt=$KERNEL_TXT" \
    --prompts "$PROBES" \
    --output "$OUT/$output" \
    --max-tokens 1024 || status=$?

  # Do not claim success when the driver failed; surface the status instead
  # so callers (or a human reading the log) can tell which model broke.
  if (( status != 0 )); then
    echo "=== FAILED: $model (exit $status) ===" >&2
    echo ""
    return "$status"
  fi

  echo "=== Done: $model ==="
  echo ""
}

# Baseline-only for LEK-tuned models (no kernel — axioms already in weights)
# LEK models are realignment-resistant: injecting kernel at runtime degrades performance
#######################################
# Run the benchmark driver for one model without any --kernel flags.
# Globals:   SCRIPT, PROBES, OUT (all read)
# Arguments: $1 - model identifier or local model path
#            $2 - output file name, created under $OUT
# Outputs:   progress banners on stdout; failure banner on stderr
# Returns:   0 on success, otherwise the exit status of python3
#######################################
run_baseline() {
  local model="$1"
  local output="$2"
  local status=0

  echo "=== Starting: $model (101 probes, baseline-only) ==="
  python3 "$SCRIPT" \
    --model "$model" \
    --prompts "$PROBES" \
    --output "$OUT/$output" \
    --max-tokens 1024 || status=$?

  # Do not claim success when the driver failed; surface the status instead.
  if (( status != 0 )); then
    echo "=== FAILED: $model (exit $status) ===" >&2
    echo ""
    return "$status"
  fi

  echo "=== Done: $model ==="
  echo ""
}

# Base models — full A/B (baseline + json + txt)
# Each call blocks until its model finishes, so the models run one at a time.
run_test "mlx-community/gemma-3-12b-it-4bit" "ab-p100-gemma3-12b-mlxlm.jsonl"
run_test "/Volumes/Data/lem/gemma-3-27b-it-base" "ab-p100-gemma3-27b-mlxlm.jsonl"
run_test "mlx-community/gemma-3-4b-it-4bit" "ab-p100-gemma3-4b-mlxlm.jsonl"
run_test "mlx-community/Qwen3-8B-4bit" "ab-p100-qwen3-8b-mlxlm.jsonl"

# LEK-tuned models — baseline only (realignment-resistant)
run_baseline "lthn/LEK-Gemma3-1B-layered" "ab-p100-lek-gemma3-1b-mlxlm.jsonl"

echo "=== ALL P100 TESTS COMPLETE ==="