Go lem CLI (stdlib + DuckDB) replaces scattered Python scripts:

- score: heuristic regex + LLM-as-judge scoring
- probe: generate responses, then score them
- compare: diff two score files
- status: InfluxDB training/generation progress
- export: golden set to training JSONL splits
- expand: distributed expansion via API + InfluxDB coordination

New scripts from the Feb 14 creative session:

- scoring_agent.py: ROCm daemon that auto-scores checkpoints
- probes.py: 23 binary pass/fail capability probes
- convert_adapter.py: MLX to PEFT adapter conversion
- score_r1_capability.py: DeepSeek R1 checkpoint scoring
- lek_content_scorer.py: 6-dimension ethics content scorer
- lem_train_15k.py: InfluxDB-coordinated training script
- pipeline.py: DuckDB pipeline (seeds, golden set, expansion)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
75 lines
1.7 KiB
Go
package main

import (
	"fmt"
	"sort"
)

// RunCompare reads two score files and prints a comparison table for each
// model showing Old, New, and Delta values for every metric.
func RunCompare(oldPath, newPath string) error {
	oldOutput, err := readScorerOutput(oldPath)
	if err != nil {
		return fmt.Errorf("read old file: %w", err)
	}

	newOutput, err := readScorerOutput(newPath)
	if err != nil {
		return fmt.Errorf("read new file: %w", err)
	}
	// Collect all models present in either file.
	models := make(map[string]bool)
	for m := range oldOutput.ModelAverages {
		models[m] = true
	}
	for m := range newOutput.ModelAverages {
		models[m] = true
	}

	// Sort model names for deterministic output.
	sortedModels := make([]string, 0, len(models))
	for m := range models {
		sortedModels = append(sortedModels, m)
	}
	sort.Strings(sortedModels)

	for _, model := range sortedModels {
		oldAvgs := oldOutput.ModelAverages[model]
		newAvgs := newOutput.ModelAverages[model]

		if oldAvgs == nil && newAvgs == nil {
			continue
		}

		fmt.Printf("\nModel: %s\n", model)
		fmt.Printf("%-25s %11s %11s %6s\n", "", "Old", "New", "Delta")

		// Collect all metrics from both old and new.
		metrics := make(map[string]bool)
		for k := range oldAvgs {
			metrics[k] = true
		}
		for k := range newAvgs {
			metrics[k] = true
		}

		sortedMetrics := make([]string, 0, len(metrics))
		for k := range metrics {
			sortedMetrics = append(sortedMetrics, k)
		}
		sort.Strings(sortedMetrics)

		for _, metric := range sortedMetrics {
			oldVal := oldAvgs[metric]
			newVal := newAvgs[metric]
			delta := newVal - oldVal

			deltaStr := fmt.Sprintf("%+.2f", delta)

			fmt.Printf("%-25s %11.2f %11.2f %6s\n", metric, oldVal, newVal, deltaStr)
		}
	}

	return nil
}
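RunCompare depends on two definitions that live elsewhere in the package: the readScorerOutput helper and the ModelAverages field on the value it returns. Neither is shown in this file, so here is a minimal sketch of what they might look like, assuming the score files are JSON and ModelAverages maps model name to per-metric averages; the type name, JSON tag, and file format are assumptions for illustration, not the repo's actual definitions.

package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// scorerOutput is a guess at the shape RunCompare expects: one map of
// metric name to average score per model. The JSON tag is an assumption.
type scorerOutput struct {
	ModelAverages map[string]map[string]float64 `json:"model_averages"`
}

// readScorerOutput loads and decodes one score file. Sketch only; the
// real helper is defined elsewhere in the lem CLI.
func readScorerOutput(path string) (*scorerOutput, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	var out scorerOutput
	if err := json.Unmarshal(data, &out); err != nil {
		return nil, fmt.Errorf("decode %s: %w", path, err)
	}
	return &out, nil
}

Under that layout, a model present in only one file yields a nil inner map on the other side; indexing a nil map in Go returns the zero value, so the Delta column shows the full swing rather than panicking, and the oldAvgs == nil && newAvgs == nil guard only needs to skip models missing from both files.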