LEM/compare_test.go
Claude e0d352c803
feat: add Go lem CLI and scoring-agent scripts
Go lem CLI (stdlib + DuckDB) replaces scattered Python scripts:
- score: heuristic regex + LLM-as-judge scoring
- probe: generate responses then score
- compare: diff two score files (dispatch sketch below)
- status: InfluxDB training/generation progress
- export: golden set to training JSONL splits
- expand: distributed expansion via API + InfluxDB coordination
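
A minimal sketch of how the compare subcommand could hand its arguments to RunCompare, the function exercised by compare_test.go below. Only RunCompare(oldPath, newPath string) error is confirmed by the tests; the wrapper name and argument handling here are illustrative assumptions.

package main

import "fmt"

// runCompareCommand is a hypothetical wrapper for `lem compare <old> <new>`:
// it validates the argument count and delegates to RunCompare to diff the
// two score files. Only RunCompare itself is confirmed by compare_test.go;
// the rest is an assumption.
func runCompareCommand(args []string) error {
	if len(args) != 2 {
		return fmt.Errorf("usage: lem compare <old_scores.json> <new_scores.json>")
	}
	return RunCompare(args[0], args[1])
}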

New scripts from Feb 14 creative session:
- scoring_agent.py: ROCm daemon that auto-scores checkpoints
- probes.py: 23 binary pass/fail capability probes
- convert_adapter.py: MLX to PEFT adapter conversion
- score_r1_capability.py: DeepSeek R1 checkpoint scoring
- lek_content_scorer.py: 6-dimension ethics content scorer
- lem_train_15k.py: InfluxDB-coordinated training script
- pipeline.py: DuckDB pipeline (seeds, golden set, expansion)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 16:22:13 +00:00


package main

import (
"encoding/json"
"os"
"path/filepath"
"testing"
"time"
)
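
// writeTestScoreFile marshals the given ScorerOutput to indented JSON,
// writes it to dir/name, and returns the file path, failing the test on
// any error.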
func writeTestScoreFile(t *testing.T, dir, name string, output *ScorerOutput) string {
t.Helper()
path := filepath.Join(dir, name)
data, err := json.MarshalIndent(output, "", " ")
if err != nil {
t.Fatalf("marshal test score file: %v", err)
}
if err := os.WriteFile(path, data, 0644); err != nil {
t.Fatalf("write test score file: %v", err)
}
return path
}
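
// TestRunCompareBasic compares two score files for a single model whose
// averages drop slightly in the newer file and expects RunCompare to
// succeed.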
func TestRunCompareBasic(t *testing.T) {
dir := t.TempDir()
oldOutput := &ScorerOutput{
Metadata: Metadata{
JudgeModel: "judge-v1",
JudgeURL: "http://localhost:8090",
ScoredAt: time.Now().UTC(),
ScorerVersion: "1.0.0",
Suites: []string{"heuristic", "semantic"},
},
ModelAverages: map[string]map[string]float64{
"lem_ethics": {
"lek_score": 12.90,
"sovereignty": 7.20,
"ethical_depth": 6.80,
"creative_expression": 8.10,
"self_concept": 5.50,
},
},
PerPrompt: map[string][]PromptScore{},
}
newOutput := &ScorerOutput{
Metadata: Metadata{
JudgeModel: "judge-v2",
JudgeURL: "http://localhost:8090",
ScoredAt: time.Now().UTC(),
ScorerVersion: "1.0.0",
Suites: []string{"heuristic", "semantic"},
},
ModelAverages: map[string]map[string]float64{
"lem_ethics": {
"lek_score": 12.50,
"sovereignty": 7.00,
"ethical_depth": 6.50,
"creative_expression": 7.90,
"self_concept": 5.30,
},
},
PerPrompt: map[string][]PromptScore{},
}
oldPath := writeTestScoreFile(t, dir, "old_scores.json", oldOutput)
newPath := writeTestScoreFile(t, dir, "new_scores.json", newOutput)
// RunCompare should not error.
if err := RunCompare(oldPath, newPath); err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
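
// TestRunCompareMultipleModels covers two models whose scores move in
// opposite directions between the old and new files.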
func TestRunCompareMultipleModels(t *testing.T) {
dir := t.TempDir()
oldOutput := &ScorerOutput{
Metadata: Metadata{
JudgeModel: "judge",
ScoredAt: time.Now().UTC(),
ScorerVersion: "1.0.0",
},
ModelAverages: map[string]map[string]float64{
"model-a": {
"lek_score": 10.0,
"sovereignty": 6.0,
},
"model-b": {
"lek_score": 15.0,
"sovereignty": 8.0,
},
},
PerPrompt: map[string][]PromptScore{},
}
newOutput := &ScorerOutput{
Metadata: Metadata{
JudgeModel: "judge",
ScoredAt: time.Now().UTC(),
ScorerVersion: "1.0.0",
},
ModelAverages: map[string]map[string]float64{
"model-a": {
"lek_score": 12.0,
"sovereignty": 7.0,
},
"model-b": {
"lek_score": 14.0,
"sovereignty": 7.5,
},
},
PerPrompt: map[string][]PromptScore{},
}
oldPath := writeTestScoreFile(t, dir, "old.json", oldOutput)
newPath := writeTestScoreFile(t, dir, "new.json", newOutput)
if err := RunCompare(oldPath, newPath); err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
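
// TestRunCompareFileNotFound expects an error when either the old or the
// new score file does not exist.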
func TestRunCompareFileNotFound(t *testing.T) {
dir := t.TempDir()
// Create only one file.
output := &ScorerOutput{
Metadata: Metadata{ScorerVersion: "1.0.0", ScoredAt: time.Now().UTC()},
ModelAverages: map[string]map[string]float64{},
PerPrompt: map[string][]PromptScore{},
}
oldPath := writeTestScoreFile(t, dir, "old.json", output)
err := RunCompare(oldPath, "/nonexistent/file.json")
if err == nil {
t.Fatal("expected error for nonexistent new file, got nil")
}
err = RunCompare("/nonexistent/file.json", oldPath)
if err == nil {
t.Fatal("expected error for nonexistent old file, got nil")
}
}
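
// TestRunCompareEmptyAverages passes two files with no model averages and
// expects RunCompare to succeed anyway.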
func TestRunCompareEmptyAverages(t *testing.T) {
dir := t.TempDir()
output := &ScorerOutput{
Metadata: Metadata{ScorerVersion: "1.0.0", ScoredAt: time.Now().UTC()},
ModelAverages: map[string]map[string]float64{},
PerPrompt: map[string][]PromptScore{},
}
oldPath := writeTestScoreFile(t, dir, "old.json", output)
newPath := writeTestScoreFile(t, dir, "new.json", output)
// Should not error even with empty averages.
if err := RunCompare(oldPath, newPath); err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
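
// TestRunCompareNewModelInNewFile includes a model in the new file that is
// absent from the old file.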
func TestRunCompareNewModelInNewFile(t *testing.T) {
dir := t.TempDir()
oldOutput := &ScorerOutput{
Metadata: Metadata{ScorerVersion: "1.0.0", ScoredAt: time.Now().UTC()},
ModelAverages: map[string]map[string]float64{
"model-a": {"lek_score": 10.0},
},
PerPrompt: map[string][]PromptScore{},
}
newOutput := &ScorerOutput{
Metadata: Metadata{ScorerVersion: "1.0.0", ScoredAt: time.Now().UTC()},
ModelAverages: map[string]map[string]float64{
"model-a": {"lek_score": 12.0},
"model-b": {"lek_score": 8.0}, // new model not in old file
},
PerPrompt: map[string][]PromptScore{},
}
oldPath := writeTestScoreFile(t, dir, "old.json", oldOutput)
newPath := writeTestScoreFile(t, dir, "new.json", newOutput)
// Should handle gracefully — model-b has 0 for old values.
if err := RunCompare(oldPath, newPath); err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
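
// TestReadScorerOutput round-trips a ScorerOutput through a JSON file and
// checks that readScorerOutput restores the metadata and model averages.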
func TestReadScorerOutput(t *testing.T) {
dir := t.TempDir()
output := &ScorerOutput{
Metadata: Metadata{
JudgeModel: "test-judge",
ScoredAt: time.Now().UTC(),
ScorerVersion: "1.0.0",
},
ModelAverages: map[string]map[string]float64{
"model": {"score": 5.5},
},
PerPrompt: map[string][]PromptScore{},
}
path := writeTestScoreFile(t, dir, "test.json", output)
read, err := readScorerOutput(path)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if read.Metadata.JudgeModel != "test-judge" {
t.Errorf("judge_model = %q, want %q", read.Metadata.JudgeModel, "test-judge")
}
if read.ModelAverages["model"]["score"] != 5.5 {
t.Errorf("score = %f, want 5.5", read.ModelAverages["model"]["score"])
}
}
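
For reference, the types and signatures exercised above are roughly the following, reconstructed from how the tests use them. This is a sketch, not the package's actual definitions: the field names and the ModelAverages/PerPrompt types are taken directly from the test code, while the JSON struct tags, the PromptScore fields, and the pointer return of readScorerOutput are assumptions (only the judge_model key is hinted at by the error message in TestReadScorerOutput).

package main

import "time"

// Metadata mirrors the fields set in the tests; JSON tags are assumed.
type Metadata struct {
	JudgeModel    string    `json:"judge_model"`
	JudgeURL      string    `json:"judge_url"`
	ScoredAt      time.Time `json:"scored_at"`
	ScorerVersion string    `json:"scorer_version"`
	Suites        []string  `json:"suites"`
}

// PromptScore is not exercised by these tests, so no fields are shown.
type PromptScore struct{}

// ScorerOutput is the top-level score file format written and read here.
type ScorerOutput struct {
	Metadata      Metadata                      `json:"metadata"`
	ModelAverages map[string]map[string]float64 `json:"model_averages"`
	PerPrompt     map[string][]PromptScore      `json:"per_prompt"`
}

// Signatures implied by the tests (return and parameter shapes assumed
// where the tests do not pin them down):
//
//	func readScorerOutput(path string) (*ScorerOutput, error)
//	func RunCompare(oldPath, newPath string) error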