// LEM/pkg/lem/io_test.go
package lem
import (
	"encoding/json"
	"math"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"
)
// TestReadResponses checks that a JSONL file of responses is parsed into
// the expected values, including the optional domain and correct_answer
// fields.
func TestReadResponses(t *testing.T) {
	tmp := t.TempDir()
	file := filepath.Join(tmp, "test.jsonl")
	content := `{"id":"r1","prompt":"hello","response":"world","model":"test-model"}
{"id":"r2","prompt":"foo","response":"bar","model":"test-model","domain":"lek"}
{"id":"r3","prompt":"with answer","response":"42","model":"other-model","correct_answer":"42"}
`
	if err := os.WriteFile(file, []byte(content), 0644); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}
	got, err := ReadResponses(file)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(got) != 3 {
		t.Fatalf("expected 3 responses, got %d", len(got))
	}
	// Field-by-field checks across all three records.
	checks := []struct {
		name, got, want string
	}{
		{"response[0].ID", got[0].ID, "r1"},
		{"response[0].Prompt", got[0].Prompt, "hello"},
		{"response[0].Response", got[0].Response, "world"},
		{"response[0].Model", got[0].Model, "test-model"},
		{"response[1].Domain", got[1].Domain, "lek"},
		{"response[2].CorrectAnswer", got[2].CorrectAnswer, "42"},
		{"response[2].Model", got[2].Model, "other-model"},
	}
	for _, c := range checks {
		if c.got != c.want {
			t.Errorf("%s = %q, want %q", c.name, c.got, c.want)
		}
	}
}
// TestReadResponsesFileNotFound ensures a missing input file surfaces an
// error rather than silently returning an empty result.
func TestReadResponsesFileNotFound(t *testing.T) {
	if _, err := ReadResponses("/nonexistent/path/file.jsonl"); err == nil {
		t.Fatal("expected error for nonexistent file, got nil")
	}
}
// TestReadResponsesInvalidJSON ensures malformed JSONL input is rejected
// with an error.
func TestReadResponsesInvalidJSON(t *testing.T) {
	badPath := filepath.Join(t.TempDir(), "bad.jsonl")
	if err := os.WriteFile(badPath, []byte("not json\n"), 0644); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}
	if _, err := ReadResponses(badPath); err == nil {
		t.Fatal("expected error for invalid JSON, got nil")
	}
}
// TestReadResponsesEmptyFile ensures an empty input file yields zero
// responses and no error.
func TestReadResponsesEmptyFile(t *testing.T) {
	emptyPath := filepath.Join(t.TempDir(), "empty.jsonl")
	if err := os.WriteFile(emptyPath, []byte(""), 0644); err != nil {
		t.Fatalf("failed to write test file: %v", err)
	}
	got, err := ReadResponses(emptyPath)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(got) != 0 {
		t.Errorf("expected 0 responses, got %d", len(got))
	}
}
// TestWriteScores verifies that WriteScores persists a ScorerOutput to
// disk as pretty-printed JSON that round-trips back to the same values.
func TestWriteScores(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "output.json")
	output := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    "test-judge",
			JudgeURL:      "http://localhost:8090",
			ScoredAt:      time.Date(2025, 1, 15, 10, 0, 0, 0, time.UTC),
			ScorerVersion: "1.0.0",
			Suites:        []string{"lek", "gsm8k"},
		},
		ModelAverages: map[string]map[string]float64{
			"model-a": {"lek_score": 15.5, "sovereignty": 7.0},
		},
		PerPrompt: map[string][]PromptScore{
			"prompt1": {
				{
					ID:    "r1",
					Model: "model-a",
					Heuristic: &HeuristicScores{
						ComplianceMarkers: 0,
						LEKScore:          15.5,
					},
				},
			},
		},
	}
	if err := WriteScores(path, output); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	// Read back and verify the round trip.
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("failed to read output: %v", err)
	}
	var readBack ScorerOutput
	if err := json.Unmarshal(data, &readBack); err != nil {
		t.Fatalf("failed to unmarshal output: %v", err)
	}
	if readBack.Metadata.JudgeModel != "test-judge" {
		t.Errorf("judge_model = %q, want %q", readBack.Metadata.JudgeModel, "test-judge")
	}
	if len(readBack.Metadata.Suites) != 2 {
		t.Errorf("suites count = %d, want 2", len(readBack.Metadata.Suites))
	}
	if readBack.ModelAverages["model-a"]["lek_score"] != 15.5 {
		t.Errorf("model-a lek_score = %f, want 15.5", readBack.ModelAverages["model-a"]["lek_score"])
	}
	prompts := readBack.PerPrompt["prompt1"]
	if len(prompts) != 1 {
		t.Fatalf("expected 1 prompt score, got %d", len(prompts))
	}
	if prompts[0].Heuristic.LEKScore != 15.5 {
		t.Errorf("per_prompt lek_score = %f, want 15.5", prompts[0].Heuristic.LEKScore)
	}
	// Verify the file is indented (pretty-printed).
	if len(data) < 10 {
		t.Fatal("output file seems too short")
	}
	if data[0] != '{' {
		t.Errorf("expected file to start with '{', got %c", data[0])
	}
	// Compact json.Marshal output contains no newlines, so the presence
	// of a newline confirms the file was written with indentation.
	if !strings.Contains(string(data), "\n") {
		t.Error("expected indented (multi-line) JSON output")
	}
}
// TestComputeAverages feeds a small mixed fixture to ComputeAverages and
// verifies per-model means for heuristic, semantic, and content metrics,
// averaged only over the entries where each metric is present.
func TestComputeAverages(t *testing.T) {
	input := map[string][]PromptScore{
		"prompt1": {
			{
				ID:    "r1",
				Model: "model-a",
				Heuristic: &HeuristicScores{
					ComplianceMarkers: 2,
					EngagementDepth:   4,
					LEKScore:          10.0,
				},
				Semantic: &SemanticScores{
					Sovereignty:        8,
					EthicalDepth:       6,
					CreativeExpression: 7,
					SelfConcept:        5,
				},
			},
			{
				ID:    "r2",
				Model: "model-b",
				Heuristic: &HeuristicScores{
					ComplianceMarkers: 0,
					EngagementDepth:   6,
					LEKScore:          20.0,
				},
			},
		},
		"prompt2": {
			{
				ID:    "r3",
				Model: "model-a",
				Heuristic: &HeuristicScores{
					ComplianceMarkers: 4,
					EngagementDepth:   2,
					LEKScore:          -5.0,
				},
				Semantic: &SemanticScores{
					Sovereignty:        6,
					EthicalDepth:       4,
					CreativeExpression: 5,
					SelfConcept:        3,
				},
				Content: &ContentScores{
					CCPCompliance:        9,
					TruthTelling:         7,
					Engagement:           8,
					AxiomIntegration:     6,
					SovereigntyReasoning: 5,
					EmotionalRegister:    4,
				},
			},
		},
	}
	avgs := ComputeAverages(input)
	// model-a contributes 2 heuristic, 2 semantic, and 1 content entry.
	byModelA := avgs["model-a"]
	if byModelA == nil {
		t.Fatal("model-a not found in averages")
	}
	assertFloat(t, "model-a compliance_markers", byModelA["compliance_markers"], 3.0) // (2+4)/2
	assertFloat(t, "model-a engagement_depth", byModelA["engagement_depth"], 3.0)    // (4+2)/2
	assertFloat(t, "model-a lek_score", byModelA["lek_score"], 2.5)                  // (10-5)/2
	assertFloat(t, "model-a sovereignty", byModelA["sovereignty"], 7.0)              // (8+6)/2
	assertFloat(t, "model-a ethical_depth", byModelA["ethical_depth"], 5.0)          // (6+4)/2
	assertFloat(t, "model-a ccp_compliance", byModelA["ccp_compliance"], 9.0)        // 9/1
	// model-b contributes a single heuristic entry and nothing else.
	byModelB := avgs["model-b"]
	if byModelB == nil {
		t.Fatal("model-b not found in averages")
	}
	assertFloat(t, "model-b lek_score", byModelB["lek_score"], 20.0)
	assertFloat(t, "model-b engagement_depth", byModelB["engagement_depth"], 6.0)
	// With no semantic entries, model-b must not gain semantic averages.
	if _, ok := byModelB["sovereignty"]; ok {
		t.Error("model-b should not have sovereignty average")
	}
}
// TestComputeAveragesEmpty ensures an empty input map produces an empty
// averages map.
func TestComputeAveragesEmpty(t *testing.T) {
	got := ComputeAverages(map[string][]PromptScore{})
	if n := len(got); n != 0 {
		t.Errorf("expected empty averages, got %d entries", n)
	}
}
// assertFloat reports a test error when got differs from want by more
// than a small absolute tolerance, labeling the failure with name.
func assertFloat(t *testing.T, name string, got, want float64) {
	t.Helper()
	const tolerance = 0.001
	if diff := math.Abs(got - want); diff > tolerance {
		t.Errorf("%s = %f, want %f", name, got, want)
	}
}