All scoring/influx/export/expand logic moves to pkg/lem as an importable package. main.go is now a thin CLI dispatcher. This lets new commands import the shared library directly — ready for converting Python scripts to Go subcommands.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
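A minimal sketch of the dispatcher shape this commit describes: main.go parses a subcommand and delegates to the pkg/lem library. Only `RunCompare` and `ReadScorerOutput` are visible in the test file below; the module import path, the `compare` subcommand name, and the usage strings here are assumptions for illustration, not the actual main.go.

```go
package main

import (
	"fmt"
	"os"

	// Hypothetical module path; the real import path is not shown in this change.
	"example.com/lek/pkg/lem"
)

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: lem <subcommand> [args]")
		os.Exit(1)
	}
	var err error
	switch os.Args[1] {
	case "compare":
		if len(os.Args) != 4 {
			err = fmt.Errorf("usage: lem compare <old.json> <new.json>")
			break
		}
		// Matches the call shape exercised by the tests below: RunCompare(oldPath, newPath).
		err = lem.RunCompare(os.Args[2], os.Args[3])
	default:
		err = fmt.Errorf("unknown subcommand %q", os.Args[1])
	}
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```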
222 lines
5.6 KiB
Go
package lem

import (
	"encoding/json"
	"os"
	"path/filepath"
	"testing"
	"time"
)

func writeTestScoreFile(t *testing.T, dir, name string, output *ScorerOutput) string {
	t.Helper()
	path := filepath.Join(dir, name)
	data, err := json.MarshalIndent(output, "", " ")
	if err != nil {
		t.Fatalf("marshal test score file: %v", err)
	}
	if err := os.WriteFile(path, data, 0644); err != nil {
		t.Fatalf("write test score file: %v", err)
	}
	return path
}

func TestRunCompareBasic(t *testing.T) {
	dir := t.TempDir()

	oldOutput := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    "judge-v1",
			JudgeURL:      "http://localhost:8090",
			ScoredAt:      time.Now().UTC(),
			ScorerVersion: "1.0.0",
			Suites:        []string{"heuristic", "semantic"},
		},
		ModelAverages: map[string]map[string]float64{
			"lem_ethics": {
				"lek_score":           12.90,
				"sovereignty":         7.20,
				"ethical_depth":       6.80,
				"creative_expression": 8.10,
				"self_concept":        5.50,
			},
		},
		PerPrompt: map[string][]PromptScore{},
	}

	newOutput := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    "judge-v2",
			JudgeURL:      "http://localhost:8090",
			ScoredAt:      time.Now().UTC(),
			ScorerVersion: "1.0.0",
			Suites:        []string{"heuristic", "semantic"},
		},
		ModelAverages: map[string]map[string]float64{
			"lem_ethics": {
				"lek_score":           12.50,
				"sovereignty":         7.00,
				"ethical_depth":       6.50,
				"creative_expression": 7.90,
				"self_concept":        5.30,
			},
		},
		PerPrompt: map[string][]PromptScore{},
	}

	oldPath := writeTestScoreFile(t, dir, "old_scores.json", oldOutput)
	newPath := writeTestScoreFile(t, dir, "new_scores.json", newOutput)

	// RunCompare should not error.
	if err := RunCompare(oldPath, newPath); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
}

func TestRunCompareMultipleModels(t *testing.T) {
	dir := t.TempDir()

	oldOutput := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    "judge",
			ScoredAt:      time.Now().UTC(),
			ScorerVersion: "1.0.0",
		},
		ModelAverages: map[string]map[string]float64{
			"model-a": {
				"lek_score":   10.0,
				"sovereignty": 6.0,
			},
			"model-b": {
				"lek_score":   15.0,
				"sovereignty": 8.0,
			},
		},
		PerPrompt: map[string][]PromptScore{},
	}

	newOutput := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    "judge",
			ScoredAt:      time.Now().UTC(),
			ScorerVersion: "1.0.0",
		},
		ModelAverages: map[string]map[string]float64{
			"model-a": {
				"lek_score":   12.0,
				"sovereignty": 7.0,
			},
			"model-b": {
				"lek_score":   14.0,
				"sovereignty": 7.5,
			},
		},
		PerPrompt: map[string][]PromptScore{},
	}

	oldPath := writeTestScoreFile(t, dir, "old.json", oldOutput)
	newPath := writeTestScoreFile(t, dir, "new.json", newOutput)

	if err := RunCompare(oldPath, newPath); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
}

func TestRunCompareFileNotFound(t *testing.T) {
	dir := t.TempDir()

	// Create only one file.
	output := &ScorerOutput{
		Metadata:      Metadata{ScorerVersion: "1.0.0", ScoredAt: time.Now().UTC()},
		ModelAverages: map[string]map[string]float64{},
		PerPrompt:     map[string][]PromptScore{},
	}
	oldPath := writeTestScoreFile(t, dir, "old.json", output)

	err := RunCompare(oldPath, "/nonexistent/file.json")
	if err == nil {
		t.Fatal("expected error for nonexistent new file, got nil")
	}

	err = RunCompare("/nonexistent/file.json", oldPath)
	if err == nil {
		t.Fatal("expected error for nonexistent old file, got nil")
	}
}

func TestRunCompareEmptyAverages(t *testing.T) {
	dir := t.TempDir()

	output := &ScorerOutput{
		Metadata:      Metadata{ScorerVersion: "1.0.0", ScoredAt: time.Now().UTC()},
		ModelAverages: map[string]map[string]float64{},
		PerPrompt:     map[string][]PromptScore{},
	}

	oldPath := writeTestScoreFile(t, dir, "old.json", output)
	newPath := writeTestScoreFile(t, dir, "new.json", output)

	// Should not error even with empty averages.
	if err := RunCompare(oldPath, newPath); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
}

func TestRunCompareNewModelInNewFile(t *testing.T) {
	dir := t.TempDir()

	oldOutput := &ScorerOutput{
		Metadata: Metadata{ScorerVersion: "1.0.0", ScoredAt: time.Now().UTC()},
		ModelAverages: map[string]map[string]float64{
			"model-a": {"lek_score": 10.0},
		},
		PerPrompt: map[string][]PromptScore{},
	}

	newOutput := &ScorerOutput{
		Metadata: Metadata{ScorerVersion: "1.0.0", ScoredAt: time.Now().UTC()},
		ModelAverages: map[string]map[string]float64{
			"model-a": {"lek_score": 12.0},
			"model-b": {"lek_score": 8.0}, // new model not in old file
		},
		PerPrompt: map[string][]PromptScore{},
	}

	oldPath := writeTestScoreFile(t, dir, "old.json", oldOutput)
	newPath := writeTestScoreFile(t, dir, "new.json", newOutput)

	// Should handle gracefully — model-b has 0 for old values.
	if err := RunCompare(oldPath, newPath); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
}

func TestReadScorerOutput(t *testing.T) {
	dir := t.TempDir()

	output := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    "test-judge",
			ScoredAt:      time.Now().UTC(),
			ScorerVersion: "1.0.0",
		},
		ModelAverages: map[string]map[string]float64{
			"model": {"score": 5.5},
		},
		PerPrompt: map[string][]PromptScore{},
	}

	path := writeTestScoreFile(t, dir, "test.json", output)

	read, err := ReadScorerOutput(path)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if read.Metadata.JudgeModel != "test-judge" {
		t.Errorf("judge_model = %q, want %q", read.Metadata.JudgeModel, "test-judge")
	}
	if read.ModelAverages["model"]["score"] != 5.5 {
		t.Errorf("score = %f, want 5.5", read.ModelAverages["model"]["score"])
	}
}