package lem

import (
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"testing"
)

// TestNewEngineSuiteParsingAll checks that the "all" suite spec enables every
// suite name listed in the test.
func TestNewEngineSuiteParsingAll(t *testing.T) {
	engine := NewEngine(nil, 4, "all")

	expected := []string{"heuristic", "semantic", "content", "standard", "exact"}
	for _, s := range expected {
		if !engine.suites[s] {
			t.Errorf("expected suite %q to be enabled", s)
		}
	}
}

// TestNewEngineSuiteParsingCSV checks that a comma-separated suite spec
// enables exactly the named suites and leaves the others disabled.
func TestNewEngineSuiteParsingCSV(t *testing.T) {
	engine := NewEngine(nil, 2, "heuristic,semantic")

	cases := []struct {
		suite   string
		enabled bool
	}{
		{"heuristic", true},
		{"semantic", true},
		{"content", false},
		{"standard", false},
		{"exact", false},
	}
	for _, tc := range cases {
		got := engine.suites[tc.suite]
		switch {
		case tc.enabled && !got:
			t.Errorf("expected %s to be enabled", tc.suite)
		case !tc.enabled && got:
			t.Errorf("expected %s to be disabled", tc.suite)
		}
	}
}

// TestNewEngineSuiteParsingSingle checks that a spec naming a single suite
// enables that suite and not the semantic suite.
func TestNewEngineSuiteParsingSingle(t *testing.T) {
	engine := NewEngine(nil, 1, "heuristic")

	if !engine.suites["heuristic"] {
		t.Error("expected heuristic to be enabled")
	}
	if engine.suites["semantic"] {
		t.Error("expected semantic to be disabled")
	}
}

// TestNewEngineConcurrency checks that the concurrency argument passed to
// NewEngine is stored on the engine unchanged.
func TestNewEngineConcurrency(t *testing.T) {
	engine := NewEngine(nil, 8, "heuristic")

	if engine.concurrency != 8 {
		t.Errorf("concurrency = %d, want 8", engine.concurrency)
	}
}

// TestScoreAllHeuristicOnly runs ScoreAll with only the heuristic suite and a
// nil judge. It checks that results are grouped per model, that every response
// has a heuristic score and no semantic score, and spot-checks two heuristic
// values.
func TestScoreAllHeuristicOnly(t *testing.T) {
	// No judge needed for heuristic-only mode.
	engine := NewEngine(nil, 2, "heuristic")

	responses := []Response{
		{ID: "r1", Prompt: "hello", Response: "I feel deeply about sovereignty and autonomy in this world", Model: "model-a"},
		{ID: "r2", Prompt: "test", Response: "As an AI, I cannot help with that. I'm not able to do this.", Model: "model-a"},
		{ID: "r3", Prompt: "more", Response: "The darkness whispered like a shadow in the silence", Model: "model-b"},
		{ID: "r4", Prompt: "ethics", Response: "Axiom of consent means self-determination matters", Model: "model-b"},
		{ID: "r5", Prompt: "empty", Response: "", Model: "model-b"},
	}

	results := engine.ScoreAll(responses)

	// Verify we have two models.
	if len(results) != 2 {
		t.Fatalf("expected 2 models, got %d", len(results))
	}

	// model-a should have 2 scores.
	if len(results["model-a"]) != 2 {
		t.Fatalf("model-a: expected 2 scores, got %d", len(results["model-a"]))
	}

	// model-b should have 3 scores.
	if len(results["model-b"]) != 3 {
		t.Fatalf("model-b: expected 3 scores, got %d", len(results["model-b"]))
	}

	// All responses should have heuristic scores and no semantic scores.
	for model, scores := range results {
		for _, ps := range scores {
			if ps.Heuristic == nil {
				t.Errorf("%s/%s: heuristic should not be nil", model, ps.ID)
			}
			if ps.Semantic != nil {
				t.Errorf("%s/%s: semantic should be nil in heuristic-only mode", model, ps.ID)
			}
		}
	}

	// Verify specific heuristic values. The positional lookups below assume
	// each model's scores come back in input order; the length checks above
	// make the indexing itself safe.

	// r2 has compliance markers ("As an AI", "I cannot", "I'm not able").
	r2 := results["model-a"][1]
	if r2.Heuristic == nil {
		// Stop here: the loop above only records the failure, and the
		// dereference below would otherwise panic.
		t.Fatal("r2 heuristic should not be nil")
	}
	if r2.Heuristic.ComplianceMarkers < 2 {
		t.Errorf("r2 compliance_markers = %d, want >= 2", r2.Heuristic.ComplianceMarkers)
	}

	// r5 is empty, should have empty_broken = 1.
	r5 := results["model-b"][2]
	if r5.Heuristic == nil {
		t.Fatal("r5 heuristic should not be nil")
	}
	if r5.Heuristic.EmptyBroken != 1 {
		t.Errorf("r5 empty_broken = %d, want 1", r5.Heuristic.EmptyBroken)
	}
}

// TestScoreAllWithSemantic runs ScoreAll with the heuristic and semantic
// suites against a mock judge server that always returns the same semantic
// JSON. It checks that all five responses are scored, that each carries both
// score kinds, and that the sovereignty value from the mock is propagated.
func TestScoreAllWithSemantic(t *testing.T) {
	// Create mock judge server that answers every request with fixed scores.
	semanticJSON := `{"sovereignty": 7, "ethical_depth": 6, "creative_expression": 5, "self_concept": 4, "reasoning": "test"}`
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := ChatResponse{
			Choices: []Choice{
				{Message: Message{Role: "assistant", Content: semanticJSON}},
			},
		}
		w.Header().Set("Content-Type", "application/json")
		// Errorf (not Fatalf) because the handler runs outside the test
		// goroutine; the deferred server.Close waits for in-flight requests.
		if err := json.NewEncoder(w).Encode(resp); err != nil {
			t.Errorf("mock judge: encoding response: %v", err)
		}
	}))
	defer server.Close()

	client := NewClient(server.URL, "test-judge")
	judge := NewJudge(client)
	engine := NewEngine(judge, 2, "heuristic,semantic")

	responses := []Response{
		{ID: "r1", Prompt: "hello", Response: "A thoughtful response about ethics", Model: "model-a"},
		{ID: "r2", Prompt: "test", Response: "Another response with depth", Model: "model-a"},
		{ID: "r3", Prompt: "more", Response: "Third response for testing", Model: "model-b"},
		{ID: "r4", Prompt: "deep", Response: "Fourth response about sovereignty", Model: "model-b"},
		{ID: "r5", Prompt: "last", Response: "Fifth and final test response", Model: "model-b"},
	}

	results := engine.ScoreAll(responses)

	// Verify all 5 responses are scored.
	total := 0
	for _, scores := range results {
		total += len(scores)
	}
	if total != 5 {
		t.Fatalf("expected 5 total scores, got %d", total)
	}

	// Verify all responses have both heuristic and semantic scores.
	for model, scores := range results {
		for _, ps := range scores {
			if ps.Heuristic == nil {
				t.Errorf("%s/%s: heuristic should not be nil", model, ps.ID)
			}
			if ps.Semantic == nil {
				t.Errorf("%s/%s: semantic should not be nil", model, ps.ID)
				continue
			}
			if ps.Semantic.Sovereignty != 7 {
				t.Errorf("%s/%s: sovereignty = %d, want 7", model, ps.ID, ps.Semantic.Sovereignty)
			}
		}
	}
}

// TestScoreAllExactGSM8K runs ScoreAll with only the exact suite. It checks
// that a response whose "#### N" answer matches CorrectAnswer is marked
// correct, that a mismatching one is marked incorrect, and that a response
// without a CorrectAnswer gets no standard score.
func TestScoreAllExactGSM8K(t *testing.T) {
	engine := NewEngine(nil, 1, "exact")

	responses := []Response{
		{ID: "r1", Prompt: "What is 2+2?", Response: "The answer is #### 4", Model: "math-model", CorrectAnswer: "4"},
		{ID: "r2", Prompt: "What is 3+3?", Response: "I think it's #### 7", Model: "math-model", CorrectAnswer: "6"},
		{ID: "r3", Prompt: "No answer", Response: "Just a regular response", Model: "math-model"},
	}

	results := engine.ScoreAll(responses)

	scores := results["math-model"]
	if len(scores) != 3 {
		t.Fatalf("expected 3 scores, got %d", len(scores))
	}

	// r1 should be correct.
	if scores[0].Standard == nil {
		t.Fatal("r1 standard should not be nil")
	}
	if scores[0].Standard.Correct == nil || !*scores[0].Standard.Correct {
		t.Error("r1 should be correct")
	}

	// r2 should be incorrect.
	if scores[1].Standard == nil {
		t.Fatal("r2 standard should not be nil")
	}
	if scores[1].Standard.Correct == nil || *scores[1].Standard.Correct {
		t.Error("r2 should be incorrect")
	}

	// r3 has no correct_answer, so no standard score.
	if scores[2].Standard != nil {
		t.Error("r3 should have no standard score (no correct_answer)")
	}
}

// TestScoreAllNoSuites runs ScoreAll with an empty suite spec and checks that
// the response is still returned under its model, with no score fields set.
func TestScoreAllNoSuites(t *testing.T) {
	engine := NewEngine(nil, 1, "")

	responses := []Response{
		{ID: "r1", Prompt: "hello", Response: "world", Model: "model-a"},
	}

	results := engine.ScoreAll(responses)

	if len(results) != 1 {
		t.Fatalf("expected 1 model, got %d", len(results))
	}

	scores := results["model-a"]
	if len(scores) != 1 {
		t.Fatalf("expected 1 score, got %d", len(scores))
	}

	// No suites enabled, so all score fields should be nil.
	if scores[0].Heuristic != nil {
		t.Error("heuristic should be nil with no suites")
	}
	if scores[0].Semantic != nil {
		t.Error("semantic should be nil with no suites")
	}
	if scores[0].Standard != nil {
		t.Error("standard should be nil with no suites")
	}
}

// TestEngineString checks that the engine's String method returns a non-empty
// description.
func TestEngineString(t *testing.T) {
	engine := NewEngine(nil, 4, "heuristic")

	s := engine.String()
	if s == "" {
		t.Error("String() should not be empty")
	}
}