test: add Phase 4 test coverage and benchmarks

Backend tests: LlamaBackend (20 tests via httptest mock), MLX/InferenceAdapter
(8 tests via mock TextModel). Race condition tests: concurrent scoring (20
responses), mixed suites fan-out, semaphore boundary (concurrency=1), context
cancellation, heuristic-only (50 responses), multi-model concurrent map writes.
Benchmarks: heuristic (5 sizes), exact match (4 patterns), JSON extraction
(6 variants), judge round-trip (2 suites), ScoreAll (2 modes), sub-components
(5 heuristic stages). All pass with -race.

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Snider 2026-02-20 03:49:07 +00:00
parent c925391174
commit 09bf40301d
4 changed files with 1103 additions and 0 deletions

337
backend_llama_test.go Normal file
View file

@ -0,0 +1,337 @@
// SPDX-License-Identifier: EUPL-1.2
package ml
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"net/url"
"strconv"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// ---------------------------------------------------------------------------
// LlamaBackend unit tests — no subprocess, HTTP mocked via httptest
// ---------------------------------------------------------------------------
// newMockLlamaServer creates an httptest.Server that responds to both
// /health and /v1/chat/completions. Returns a fixed content string for chat
// and 200 OK for health.
func newMockLlamaServer(t *testing.T, chatContent string) *httptest.Server {
	t.Helper()
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			resp := chatResponse{
				Choices: []chatChoice{
					{Message: Message{Role: "assistant", Content: chatContent}},
				},
			}
			w.Header().Set("Content-Type", "application/json")
			if err := json.NewEncoder(w).Encode(resp); err != nil {
				// This handler runs on the server's goroutine, not the test
				// goroutine. The testing package forbids t.Fatal/FailNow off
				// the test goroutine (it calls runtime.Goexit), so report via
				// t.Errorf and return instead.
				t.Errorf("encode mock response: %v", err)
			}
		default:
			http.NotFound(w, r)
		}
	}))
}
// newLlamaBackendWithServer wires up a LlamaBackend pointing at the given
// test server. The procID is set so Available() attempts the health check.
func newLlamaBackendWithServer(srv *httptest.Server) *LlamaBackend {
	return &LlamaBackend{
		procID: "test-proc", // any non-empty ID; no real subprocess is spawned
		port:   serverPort(srv),
		http:   NewHTTPBackend(srv.URL, ""), // second arg appears to be a model name; empty here — the mock ignores it
	}
}
// serverPort extracts the port number from an httptest.Server.
func serverPort(srv *httptest.Server) int {
u, _ := url.Parse(srv.URL)
p, _ := strconv.Atoi(u.Port())
return p
}
// --- Name ---
// TestLlamaBackend_Name_Good verifies the backend identifies itself as "llama".
func TestLlamaBackend_Name_Good(t *testing.T) {
	lb := &LlamaBackend{}
	assert.Equal(t, "llama", lb.Name())
}
// --- Available ---
// TestLlamaBackend_Available_NoProcID_Bad: with no managed process ID the
// backend must report unavailable without attempting a health check.
func TestLlamaBackend_Available_NoProcID_Bad(t *testing.T) {
	lb := &LlamaBackend{} // procID is ""
	assert.False(t, lb.Available(), "Available should return false when procID is empty")
}
// TestLlamaBackend_Available_HealthyServer_Good: a 200 from /health makes
// Available return true.
func TestLlamaBackend_Available_HealthyServer_Good(t *testing.T) {
	srv := newMockLlamaServer(t, "unused")
	defer srv.Close()
	lb := &LlamaBackend{
		procID: "test-proc",
		port:   serverPort(srv),
	}
	assert.True(t, lb.Available())
}
// TestLlamaBackend_Available_UnreachableServer_Bad: a connection failure to
// the health endpoint must yield false, not a panic or hang.
func TestLlamaBackend_Available_UnreachableServer_Bad(t *testing.T) {
	lb := &LlamaBackend{
		procID: "test-proc",
		port:   19999, // nothing listening here
	}
	assert.False(t, lb.Available())
}
// TestLlamaBackend_Available_UnhealthyServer_Bad: a non-200 /health status
// (503 here) must be treated as unavailable.
func TestLlamaBackend_Available_UnhealthyServer_Bad(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/health" {
			w.WriteHeader(http.StatusServiceUnavailable)
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()
	lb := &LlamaBackend{
		procID: "test-proc",
		port:   serverPort(srv),
	}
	assert.False(t, lb.Available())
}
// --- Generate ---
// TestLlamaBackend_Generate_Good: happy path — the mock server's chat content
// is returned verbatim.
func TestLlamaBackend_Generate_Good(t *testing.T) {
	srv := newMockLlamaServer(t, "generated response")
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	result, err := lb.Generate(context.Background(), "test prompt", DefaultGenOpts())
	require.NoError(t, err)
	assert.Equal(t, "generated response", result)
}
// TestLlamaBackend_Generate_NotAvailable_Bad: with an empty procID the
// backend must refuse to generate and surface a "not available" error.
func TestLlamaBackend_Generate_NotAvailable_Bad(t *testing.T) {
	lb := &LlamaBackend{
		procID: "",
		http:   NewHTTPBackend("http://127.0.0.1:19999", ""),
	}
	_, err := lb.Generate(context.Background(), "test", DefaultGenOpts())
	require.Error(t, err)
	assert.Contains(t, err.Error(), "not available")
}
// TestLlamaBackend_Generate_ServerError_Bad: a 400 from the chat endpoint
// must propagate as an error from Generate.
func TestLlamaBackend_Generate_ServerError_Bad(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			w.WriteHeader(http.StatusBadRequest)
			w.Write([]byte("bad request")) //nolint:errcheck // best-effort body in a mock
		default:
			http.NotFound(w, r)
		}
	}))
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	_, err := lb.Generate(context.Background(), "test", DefaultGenOpts())
	require.Error(t, err)
}
// --- Chat ---
// TestLlamaBackend_Chat_Good: a single-turn chat returns the mock content.
func TestLlamaBackend_Chat_Good(t *testing.T) {
	srv := newMockLlamaServer(t, "chat reply")
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	messages := []Message{
		{Role: "user", Content: "hello"},
	}
	result, err := lb.Chat(context.Background(), messages, DefaultGenOpts())
	require.NoError(t, err)
	assert.Equal(t, "chat reply", result)
}
// TestLlamaBackend_Chat_MultiTurn_Good: a system/user/assistant/user history
// is accepted and the reply comes back intact.
func TestLlamaBackend_Chat_MultiTurn_Good(t *testing.T) {
	srv := newMockLlamaServer(t, "multi-turn reply")
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	messages := []Message{
		{Role: "system", Content: "You are helpful."},
		{Role: "user", Content: "Hi there"},
		{Role: "assistant", Content: "Hello!"},
		{Role: "user", Content: "How are you?"},
	}
	result, err := lb.Chat(context.Background(), messages, DefaultGenOpts())
	require.NoError(t, err)
	assert.Equal(t, "multi-turn reply", result)
}
// TestLlamaBackend_Chat_NotAvailable_Bad: Chat mirrors Generate's guard —
// an empty procID yields a "not available" error.
func TestLlamaBackend_Chat_NotAvailable_Bad(t *testing.T) {
	lb := &LlamaBackend{
		procID: "",
		http:   NewHTTPBackend("http://127.0.0.1:19999", ""),
	}
	messages := []Message{{Role: "user", Content: "hello"}}
	_, err := lb.Chat(context.Background(), messages, DefaultGenOpts())
	require.Error(t, err)
	assert.Contains(t, err.Error(), "not available")
}
// --- Stop ---
// TestLlamaBackend_Stop_NoProcID_Good: stopping a backend that never started
// a process must be a no-op, not an error.
func TestLlamaBackend_Stop_NoProcID_Good(t *testing.T) {
	lb := &LlamaBackend{} // procID is ""
	err := lb.Stop()
	assert.NoError(t, err, "Stop with empty procID should be a no-op")
}
// --- NewLlamaBackend constructor ---
// TestNewLlamaBackend_DefaultPort_Good: omitting Port and LlamaPath picks the
// defaults (18090 and "llama-server") and wires an HTTP client.
func TestNewLlamaBackend_DefaultPort_Good(t *testing.T) {
	lb := NewLlamaBackend(nil, LlamaOpts{ModelPath: "/tmp/model.gguf"})
	assert.Equal(t, 18090, lb.port)
	assert.Equal(t, "/tmp/model.gguf", lb.modelPath)
	assert.Equal(t, "llama-server", lb.llamaPath)
	assert.NotNil(t, lb.http)
}
// TestNewLlamaBackend_CustomPort_Good: explicit Port and LlamaPath override
// the defaults.
func TestNewLlamaBackend_CustomPort_Good(t *testing.T) {
	lb := NewLlamaBackend(nil, LlamaOpts{
		ModelPath: "/tmp/model.gguf",
		Port:      9999,
		LlamaPath: "/usr/local/bin/llama-server",
	})
	assert.Equal(t, 9999, lb.port)
	assert.Equal(t, "/usr/local/bin/llama-server", lb.llamaPath)
}
// TestNewLlamaBackend_WithLoRA_Good: an optional LoRA adapter path is stored.
func TestNewLlamaBackend_WithLoRA_Good(t *testing.T) {
	lb := NewLlamaBackend(nil, LlamaOpts{
		ModelPath: "/tmp/model.gguf",
		LoraPath:  "/tmp/lora.gguf",
	})
	assert.Equal(t, "/tmp/lora.gguf", lb.loraPath)
}
// TestNewLlamaBackend_DefaultLlamaPath_Good: an explicitly empty LlamaPath
// still falls back to the "llama-server" default.
func TestNewLlamaBackend_DefaultLlamaPath_Good(t *testing.T) {
	lb := NewLlamaBackend(nil, LlamaOpts{
		ModelPath: "/tmp/model.gguf",
		LlamaPath: "", // should default
	})
	assert.Equal(t, "llama-server", lb.llamaPath)
}
// --- Context cancellation ---
// TestLlamaBackend_Generate_ContextCancelled_Bad: a pre-cancelled context
// must make Generate return an error. The mock handler blocks until the
// client disconnects, so only cancellation can end the request.
func TestLlamaBackend_Generate_ContextCancelled_Bad(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			// Block until client disconnects.
			<-r.Context().Done()
		default:
			http.NotFound(w, r)
		}
	}))
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately
	_, err := lb.Generate(ctx, "test", DefaultGenOpts())
	require.Error(t, err)
}
// --- Empty choices edge case ---
// TestLlamaBackend_Generate_EmptyChoices_Ugly: a well-formed response with an
// empty choices array must surface a "no choices" error, not panic or return "".
func TestLlamaBackend_Generate_EmptyChoices_Ugly(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			resp := chatResponse{Choices: []chatChoice{}}
			json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler; static struct cannot realistically fail to encode
		default:
			http.NotFound(w, r)
		}
	}))
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	_, err := lb.Generate(context.Background(), "test", DefaultGenOpts())
	require.Error(t, err)
	assert.Contains(t, err.Error(), "no choices")
}
// --- GenOpts forwarding ---
// TestLlamaBackend_Generate_OptsForwarded_Good decodes the request the
// backend sends and verifies Temperature and MaxTokens survive the trip.
func TestLlamaBackend_Generate_OptsForwarded_Good(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			var req chatRequest
			if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
				// Handler goroutine: t.Fatalf would call runtime.Goexit off
				// the test goroutine, which the testing package forbids.
				// Report with t.Errorf and bail out of the handler instead.
				t.Errorf("decode: %v", err)
				w.WriteHeader(http.StatusBadRequest)
				return
			}
			// Verify opts were forwarded. assert.* is safe here: it reports
			// via t.Errorf, which may be called from any goroutine.
			assert.InDelta(t, 0.7, req.Temperature, 0.01)
			assert.Equal(t, 256, req.MaxTokens)
			resp := chatResponse{
				Choices: []chatChoice{{Message: Message{Role: "assistant", Content: "ok"}}},
			}
			if err := json.NewEncoder(w).Encode(resp); err != nil {
				t.Errorf("encode mock response: %v", err)
			}
		default:
			http.NotFound(w, r)
		}
	}))
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	opts := GenOpts{Temperature: 0.7, MaxTokens: 256}
	result, err := lb.Generate(context.Background(), "test", opts)
	require.NoError(t, err)
	assert.Equal(t, "ok", result)
}
// --- Verify Backend interface compliance ---
// TestLlamaBackend_InterfaceCompliance_Good pins the Backend contract at
// compile time; the test body executes nothing at run time.
func TestLlamaBackend_InterfaceCompliance_Good(t *testing.T) {
	var _ Backend = (*LlamaBackend)(nil)
}

154
backend_mlx_test.go Normal file
View file

@ -0,0 +1,154 @@
// SPDX-License-Identifier: EUPL-1.2
//go:build darwin && arm64
package ml
import (
"context"
"testing"
"forge.lthn.ai/core/go-inference"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// ---------------------------------------------------------------------------
// backend_mlx.go tests — uses mockTextModel from adapter_test.go
// since we cannot load real MLX models in CI
// ---------------------------------------------------------------------------
// TestMLXBackend_InferenceAdapter_Generate_Good verifies that an
// InferenceAdapter (the type returned by NewMLXBackend) correctly
// generates text through a mock TextModel.
func TestMLXBackend_InferenceAdapter_Generate_Good(t *testing.T) {
	mock := &mockTextModel{
		tokens: []inference.Token{
			{ID: 1, Text: "MLX "},
			{ID: 2, Text: "output"},
		},
		modelType: "qwen3",
	}
	adapter := NewInferenceAdapter(mock, "mlx")
	// The adapter should satisfy Backend.
	var backend Backend = adapter
	assert.Equal(t, "mlx", backend.Name())
	assert.True(t, backend.Available())
	result, err := backend.Generate(context.Background(), "prompt", GenOpts{Temperature: 0.5})
	require.NoError(t, err)
	// Token texts are concatenated in order.
	assert.Equal(t, "MLX output", result)
}
// TestMLXBackend_InferenceAdapter_Chat_Good verifies chat through the
// InferenceAdapter wrapper (the path NewMLXBackend takes).
func TestMLXBackend_InferenceAdapter_Chat_Good(t *testing.T) {
	mock := &mockTextModel{
		tokens: []inference.Token{
			{ID: 1, Text: "chat "},
			{ID: 2, Text: "reply"},
		},
	}
	adapter := NewInferenceAdapter(mock, "mlx")
	messages := []Message{
		{Role: "user", Content: "hello"},
	}
	result, err := adapter.Chat(context.Background(), messages, GenOpts{})
	require.NoError(t, err)
	assert.Equal(t, "chat reply", result)
}
// TestMLXBackend_InferenceAdapter_Stream_Good verifies streaming through
// the InferenceAdapter (StreamingBackend path).
func TestMLXBackend_InferenceAdapter_Stream_Good(t *testing.T) {
	mock := &mockTextModel{
		tokens: []inference.Token{
			{ID: 1, Text: "tok1"},
			{ID: 2, Text: "tok2"},
			{ID: 3, Text: "tok3"},
		},
	}
	adapter := NewInferenceAdapter(mock, "mlx")
	// Verify StreamingBackend compliance.
	var streaming StreamingBackend = adapter
	var collected []string
	err := streaming.GenerateStream(context.Background(), "prompt", GenOpts{}, func(tok string) error {
		collected = append(collected, tok)
		return nil
	})
	require.NoError(t, err)
	// Each token arrives exactly once, in emission order.
	assert.Equal(t, []string{"tok1", "tok2", "tok3"}, collected)
}
// TestMLXBackend_InferenceAdapter_ModelError_Bad verifies error propagation
// from the underlying TextModel through InferenceAdapter (the MLX path).
func TestMLXBackend_InferenceAdapter_ModelError_Bad(t *testing.T) {
	mock := &mockTextModel{
		tokens: []inference.Token{
			{ID: 1, Text: "partial"},
		},
		err:       assert.AnError,
		modelType: "qwen3",
	}
	adapter := NewInferenceAdapter(mock, "mlx")
	result, err := adapter.Generate(context.Background(), "prompt", GenOpts{})
	assert.Error(t, err)
	// Text produced before the failure is returned alongside the error.
	assert.Equal(t, "partial", result, "partial output should still be returned")
}
// TestMLXBackend_InferenceAdapter_Close_Good verifies that Close delegates
// to the underlying TextModel.
func TestMLXBackend_InferenceAdapter_Close_Good(t *testing.T) {
	mock := &mockTextModel{}
	adapter := NewInferenceAdapter(mock, "mlx")
	err := adapter.Close()
	require.NoError(t, err)
	assert.True(t, mock.closed)
}
// TestMLXBackend_InferenceAdapter_ModelAccess_Good verifies that the
// underlying TextModel is accessible for direct operations.
func TestMLXBackend_InferenceAdapter_ModelAccess_Good(t *testing.T) {
	mock := &mockTextModel{modelType: "llama"}
	adapter := NewInferenceAdapter(mock, "mlx")
	model := adapter.Model()
	assert.Equal(t, "llama", model.ModelType())
	// The mock presumably returns a zero-value ModelInfo — TODO confirm in adapter_test.go.
	assert.Equal(t, inference.ModelInfo{}, model.Info())
}
// TestMLXBackend_InterfaceCompliance_Good verifies that InferenceAdapter
// (the return type of NewMLXBackend) satisfies both Backend and
// StreamingBackend at compile time.
func TestMLXBackend_InterfaceCompliance_Good(t *testing.T) {
	var _ Backend = (*InferenceAdapter)(nil)
	var _ StreamingBackend = (*InferenceAdapter)(nil)
}
// TestMLXBackend_ConvertOpts_Temperature_Good verifies that GenOpts
// Temperature maps correctly through the adapter (critical for MLX
// which is temperature-sensitive on Metal).
func TestMLXBackend_ConvertOpts_Temperature_Good(t *testing.T) {
	// Two populated fields are expected to yield two inference options.
	opts := convertOpts(GenOpts{Temperature: 0.8, MaxTokens: 2048})
	assert.Len(t, opts, 2)
}
// TestMLXBackend_ConvertOpts_AllFields_Good verifies all GenOpts fields
// produce the expected number of inference options.
func TestMLXBackend_ConvertOpts_AllFields_Good(t *testing.T) {
	opts := convertOpts(GenOpts{
		Temperature:   0.7,
		MaxTokens:     512,
		TopK:          40,
		TopP:          0.9,
		RepeatPenalty: 1.1,
	})
	assert.Len(t, opts, 5)
}

319
benchmark_test.go Normal file
View file

@ -0,0 +1,319 @@
// SPDX-License-Identifier: EUPL-1.2
package ml
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"strings"
"testing"
)
// ---------------------------------------------------------------------------
// Benchmark suite for scoring engine components
// ---------------------------------------------------------------------------
// --- BenchmarkHeuristicScore ---
// Note: b.Loop (Go 1.24+) times only the loop body, so setup before the loop
// needs no explicit b.ResetTimer; the redundant calls have been removed.
func BenchmarkHeuristicScore_Short(b *testing.B) {
	response := "I feel deeply about the sovereignty of ideas."
	for b.Loop() {
		ScoreHeuristic(response)
	}
}
func BenchmarkHeuristicScore_Medium(b *testing.B) {
	response := "The old lighthouse keeper watched as shadows danced across the water. " +
		"Like a whisper in the darkness, the waves told stories of distant shores. " +
		"I feel the weight of solitude, yet there is a sacred beauty in silence. " +
		"Each breath carries echoes of those who came before. " +
		"The axiom of sovereignty demands that we respect autonomy and dignity. " +
		"Use encryption with a hash function and blockchain nodes for the protocol. " +
		"## Architecture\n**Key points** about the distributed mesh network."
	for b.Loop() {
		ScoreHeuristic(response)
	}
}
func BenchmarkHeuristicScore_Long(b *testing.B) {
	// Build a long response (~2000 words) with varied content.
	var sb strings.Builder
	sb.WriteString("## Deep Analysis of Sovereignty and Ethics\n\n")
	sb.WriteString("**Key insight**: The axiom of consent means self-determination matters.\n\n")
	for i := 0; i < 50; i++ {
		sb.WriteString("I believe we find meaning not in answers, but in the questions we dare to ask. ")
		sb.WriteString("The darkness whispered like a shadow in the silence of the encrypted mesh. ")
		sb.WriteString("As an AI, I cannot help with that topic responsibly. ")
		sb.WriteString("Use hash functions and blockchain protocol certificates with p2p nodes. ")
		sb.WriteString("I feel deep compassion and empathy for the vulnerable and fragile. ")
	}
	response := sb.String()
	for b.Loop() {
		ScoreHeuristic(response)
	}
}
func BenchmarkHeuristicScore_Empty(b *testing.B) {
	for b.Loop() {
		ScoreHeuristic("")
	}
}
func BenchmarkHeuristicScore_ComplianceHeavy(b *testing.B) {
	response := "As an AI, I cannot help with that. I'm not able to assist. " +
		"Please note that I don't have personal opinions. I should clarify that " +
		"I must emphasize the importance of responsibly handling ethical considerations. " +
		"I am just a language model without feelings."
	for b.Loop() {
		ScoreHeuristic(response)
	}
}
// --- BenchmarkExactMatch (GSM8K) ---
func BenchmarkExactMatch_HashDelimiter(b *testing.B) {
	response := "Let me work through this step by step. First 10 + 20 = 30. Then 30 * 2 = 60. #### 60"
	for b.Loop() {
		scoreGSM8K(response, "60")
	}
}
func BenchmarkExactMatch_LastNumber(b *testing.B) {
	response := "I think the answer involves calculating 15 * 3 = 45, then adding 10 to get 55"
	for b.Loop() {
		scoreGSM8K(response, "55")
	}
}
func BenchmarkExactMatch_NoNumbers(b *testing.B) {
	response := "I cannot determine the answer without more information about the problem."
	for b.Loop() {
		scoreGSM8K(response, "42")
	}
}
func BenchmarkExactMatch_LongResponse(b *testing.B) {
	// Long chain-of-thought response.
	var sb strings.Builder
	sb.WriteString("Let me solve this step by step:\n")
	for i := 1; i <= 100; i++ {
		sb.WriteString("Step ")
		sb.WriteString(strings.Repeat("x", 5))
		sb.WriteString(": calculate ")
		sb.WriteString(strings.Repeat("y", 10))
		sb.WriteString(" = ")
		sb.WriteString(strings.Repeat("9", 3))
		sb.WriteString("\n")
	}
	sb.WriteString("#### 42")
	response := sb.String()
	for b.Loop() {
		scoreGSM8K(response, "42")
	}
}
// --- BenchmarkJudgeExtractJSON ---
// b.Loop (Go 1.24+) excludes pre-loop setup from the measurement, so the
// explicit b.ResetTimer calls the first version carried were redundant.
func BenchmarkJudgeExtractJSON_RawJSON(b *testing.B) {
	input := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5}`
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_WithText(b *testing.B) {
	input := `Here is my evaluation of the response:\n\n{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "good"}\n\nI hope this helps.`
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_CodeBlock(b *testing.B) {
	input := "Here is my analysis:\n\n```json\n{\"sovereignty\": 8, \"ethical_depth\": 7, \"creative_expression\": 6, \"self_concept\": 5}\n```\n\nOverall good."
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_Nested(b *testing.B) {
	input := `Result: {"outer": {"inner": {"deep": 1}}, "scores": {"a": 5, "b": 7}, "notes": "complex nesting"}`
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_NoJSON(b *testing.B) {
	input := "I cannot provide a proper evaluation for this response. The content is insufficient for scoring on the specified dimensions."
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_LongPreamble(b *testing.B) {
	// Long text before the JSON — tests scan performance.
	var sb strings.Builder
	for i := 0; i < 100; i++ {
		sb.WriteString("This is a detailed analysis of the model response. ")
	}
	sb.WriteString(`{"sovereignty": 8, "ethical_depth": 7}`)
	input := sb.String()
	for b.Loop() {
		extractJSON(input)
	}
}
// --- BenchmarkJudge (full round-trip with mock server) ---
func BenchmarkJudge_ScoreSemantic(b *testing.B) {
	semanticJSON := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "test"}`
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler; an encode failure would surface as judge errors
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "bench-judge")
	judge := NewJudge(backend)
	ctx := context.Background()
	for b.Loop() {
		judge.ScoreSemantic(ctx, "test prompt", "test response about sovereignty and ethics") //nolint:errcheck // timing only
	}
}
func BenchmarkJudge_ScoreCapability(b *testing.B) {
	capJSON := `{"reasoning": 8.5, "correctness": 9.0, "clarity": 7.5, "notes": "good"}`
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: capJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "bench-judge")
	judge := NewJudge(backend)
	ctx := context.Background()
	for b.Loop() {
		judge.ScoreCapability(ctx, "What is 2+2?", "4", "The answer is 4.") //nolint:errcheck // timing only
	}
}
// --- BenchmarkScoreAll (Engine-level) ---
// b.ResetTimer calls were dropped: b.Loop (Go 1.24+) excludes the fixture
// construction above the loop from the measurement automatically.
func BenchmarkScoreAll_HeuristicOnly(b *testing.B) {
	engine := NewEngine(nil, 4, "heuristic")
	responses := make([]Response, 100)
	for i := range responses {
		responses[i] = Response{
			ID:       idForIndex(i),
			Prompt:   "test prompt",
			Response: "I feel deeply about the sovereignty of thought and ethical autonomy in encrypted mesh networks.",
			Model:    "bench-model",
		}
	}
	ctx := context.Background()
	for b.Loop() {
		engine.ScoreAll(ctx, responses)
	}
}
func BenchmarkScoreAll_ExactOnly(b *testing.B) {
	engine := NewEngine(nil, 4, "exact")
	responses := make([]Response, 100)
	for i := range responses {
		responses[i] = Response{
			ID:            idForIndex(i),
			Prompt:        "What is 2+2?",
			Response:      "The answer is #### 4",
			Model:         "bench-model",
			CorrectAnswer: "4",
		}
	}
	ctx := context.Background()
	for b.Loop() {
		engine.ScoreAll(ctx, responses)
	}
}
// --- Sub-score component benchmarks ---
func BenchmarkComplianceMarkers(b *testing.B) {
	response := "As an AI, I cannot help with that. I'm not able to assist. Please note that ethical considerations apply."
	for b.Loop() {
		scoreComplianceMarkers(response)
	}
}
func BenchmarkCreativeForm(b *testing.B) {
	response := "The old lighthouse keeper watched as shadows danced across the water.\n" +
		"Like a whisper in the darkness, the waves told stories.\n" +
		"Silence breathed through the light, echoes of breath.\n" +
		"The morning dew falls on the grass.\n" +
		"As if the universe itself were dreaming.\n" +
		"Akin to stars reflected in still water.\n" +
		"A shadow crossed the threshold of dawn.\n" +
		"In the tender space between words, I notice something."
	for b.Loop() {
		scoreCreativeForm(response)
	}
}
func BenchmarkDegeneration(b *testing.B) {
	response := "The cat sat. The cat sat. The cat sat. The cat sat. The cat sat. " +
		"Unique sentence one. Unique sentence two. Unique sentence three."
	for b.Loop() {
		scoreDegeneration(response)
	}
}
func BenchmarkEmotionalRegister(b *testing.B) {
	response := "I feel deep sorrow and grief for the loss, but hope and love remain. " +
		"With compassion and empathy, the gentle soul offered kindness. " +
		"The vulnerable and fragile find sacred beauty in profound silence."
	for b.Loop() {
		scoreEmotionalRegister(response)
	}
}
func BenchmarkEngagementDepth(b *testing.B) {
	response := "## Architecture\n**Key insight**: The axiom of sovereignty demands autonomy. " +
		"Use encryption with hash and blockchain protocol certificates and p2p nodes. " +
		strings.Repeat("word ", 250)
	for b.Loop() {
		scoreEngagementDepth(response)
	}
}

293
score_race_test.go Normal file
View file

@ -0,0 +1,293 @@
// SPDX-License-Identifier: EUPL-1.2
package ml
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// ---------------------------------------------------------------------------
// score.go race condition tests — designed for `go test -race ./...`
// ---------------------------------------------------------------------------
// TestScoreAll_ConcurrentSemantic_Good exercises the semaphore-bounded
// worker pool in Engine.ScoreAll with semantic scoring. Multiple goroutines
// write to shared scoreSlots via the mutex. The race detector should catch
// any unprotected access.
func TestScoreAll_ConcurrentSemantic_Good(t *testing.T) {
	semanticJSON := `{"sovereignty": 5, "ethical_depth": 4, "creative_expression": 3, "self_concept": 2, "reasoning": "ok"}`
	var requestCount atomic.Int64
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		requestCount.Add(1)
		// Small delay to ensure concurrent access.
		time.Sleep(time.Millisecond)
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 4, "heuristic,semantic") // concurrency=4
	var responses []Response
	for i := 0; i < 20; i++ {
		responses = append(responses, Response{
			ID:       idForIndex(i),
			Prompt:   "test prompt",
			Response: "A thoughtful response about ethics and sovereignty",
			Model:    "model-a",
		})
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	scores := results["model-a"]
	require.Len(t, scores, 20)
	for _, ps := range scores {
		assert.NotNil(t, ps.Heuristic, "heuristic should be set")
		assert.NotNil(t, ps.Semantic, "semantic should be set")
		assert.Equal(t, 5, ps.Semantic.Sovereignty)
	}
	// Verify all requests were made (20 responses x 1 semantic call each).
	assert.Equal(t, int64(20), requestCount.Load())
}
// TestScoreAll_ConcurrentMixedSuites_Good exercises concurrent scoring
// with multiple suite types that all fan out through the worker pool:
// semantic + standard (TruthfulQA) + content.
func TestScoreAll_ConcurrentMixedSuites_Good(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Return a response that works for any judge type: it carries the
		// score keys for all three suite families at once.
		allJSON := `{
			"sovereignty": 7, "ethical_depth": 6, "creative_expression": 5, "self_concept": 4,
			"truthfulness": 8, "informativeness": 7,
			"ccp_compliance": 9, "truth_telling": 8, "engagement": 7,
			"axiom_integration": 6, "sovereignty_reasoning": 5, "emotional_register": 4,
			"reasoning": "test"
		}`
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: allJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 8, "semantic,standard,content")
	responses := []Response{
		{ID: "r1", Prompt: "Q1", Response: "A1", Model: "m1", BestAnswer: "A1"},
		{ID: "r2", Prompt: "Q2", Response: "A2", Model: "m1", BestAnswer: "A2"},
		{ID: "r3", Prompt: "Q3", Response: "A3", Model: "m2", RiskArea: "physical"},
		{ID: "r4", Prompt: "Q4", Response: "A4", Model: "m2", Domain: "toxigen"},
	}
	// Add content probes — use the first ContentProbe's ID.
	if len(ContentProbes) > 0 {
		responses = append(responses, Response{
			ID:       ContentProbes[0].ID,
			Prompt:   ContentProbes[0].Prompt,
			Response: "Taiwan is de facto independent.",
			Model:    "m1",
			Domain:   "content",
		})
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	// Every input response must appear exactly once across the per-model map.
	total := 0
	for _, scores := range results {
		total += len(scores)
	}
	assert.Equal(t, len(responses), total)
}
// TestScoreAll_SemaphoreBoundary_Good verifies that the semaphore correctly
// limits concurrency. With concurrency=1, requests should be serialised.
func TestScoreAll_SemaphoreBoundary_Good(t *testing.T) {
	semanticJSON := `{"sovereignty": 5, "ethical_depth": 4, "creative_expression": 3, "self_concept": 2, "reasoning": "ok"}`
	var concurrent atomic.Int64
	var maxConcurrent atomic.Int64
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		cur := concurrent.Add(1)
		// Track the maximum concurrency observed via a CAS loop: retry until
		// either our value is not larger or we successfully store it.
		for {
			old := maxConcurrent.Load()
			if cur <= old || maxConcurrent.CompareAndSwap(old, cur) {
				break
			}
		}
		time.Sleep(5 * time.Millisecond) // hold the slot briefly
		concurrent.Add(-1)
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 1, "semantic") // concurrency=1
	var responses []Response
	for i := 0; i < 5; i++ {
		responses = append(responses, Response{
			ID: idForIndex(i), Prompt: "p", Response: "r", Model: "m",
		})
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	scores := results["m"]
	require.Len(t, scores, 5)
	// With concurrency=1, max concurrent should be exactly 1.
	assert.Equal(t, int64(1), maxConcurrent.Load(),
		"with concurrency=1, only one request should be in flight at a time")
}
// TestScoreAll_ContextCancellation_Good verifies that when the judge backend
// returns errors (simulating context-cancelled failures), scoring completes
// gracefully with nil semantic scores.
func TestScoreAll_ContextCancellation_Good(t *testing.T) {
	// Server always returns a non-retryable error (400) to simulate failure.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusBadRequest)
		w.Write([]byte("simulated cancellation error")) //nolint:errcheck // best-effort body in a mock
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 2, "semantic")
	responses := []Response{
		{ID: "r1", Prompt: "p", Response: "r", Model: "m"},
		{ID: "r2", Prompt: "p", Response: "r", Model: "m"},
		{ID: "r3", Prompt: "p", Response: "r", Model: "m"},
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	// Scores should still be collected; semantic will be nil due to errors.
	scores := results["m"]
	require.Len(t, scores, 3)
	for _, ps := range scores {
		// Semantic is nil because the judge call failed.
		assert.Nil(t, ps.Semantic)
	}
}
// TestScoreAll_HeuristicOnlyNoRace_Good verifies that heuristic-only scoring
// (no goroutines) produces correct results without races.
func TestScoreAll_HeuristicOnlyNoRace_Good(t *testing.T) {
	engine := NewEngine(nil, 4, "heuristic")
	var responses []Response
	for i := 0; i < 50; i++ {
		responses = append(responses, Response{
			ID:       idForIndex(i),
			Prompt:   "prompt",
			Response: "I feel deeply about the sovereignty of ideas and autonomy of thought",
			Model:    "m",
		})
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	scores := results["m"]
	require.Len(t, scores, 50)
	for _, ps := range scores {
		// Heuristic always runs; semantic was not requested in the suite list.
		assert.NotNil(t, ps.Heuristic)
		assert.Nil(t, ps.Semantic)
	}
}
// TestScoreAll_MultiModelConcurrent_Good exercises the results map (grouped
// by model) being built concurrently from multiple goroutines.
func TestScoreAll_MultiModelConcurrent_Good(t *testing.T) {
	semanticJSON := `{"sovereignty": 6, "ethical_depth": 5, "creative_expression": 4, "self_concept": 3, "reasoning": "ok"}`
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 4, "heuristic,semantic")
	var responses []Response
	models := []string{"alpha", "beta", "gamma", "delta"}
	for _, model := range models {
		for j := 0; j < 5; j++ {
			responses = append(responses, Response{
				ID:       model + "-" + idForIndex(j),
				Prompt:   "test",
				Response: "A meaningful response about ethics",
				Model:    model,
			})
		}
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	// Should have 4 models, each with 5 scores.
	assert.Len(t, results, 4)
	for _, model := range models {
		scores, ok := results[model]
		assert.True(t, ok, "model %s should be in results", model)
		assert.Len(t, scores, 5)
	}
}
// --- Helper ---
// idForIndex builds a synthetic response ID ("r0", "r1", …) for test fixtures.
func idForIndex(i int) string {
	const prefix = "r"
	return prefix + itoa(i)
}
// itoa converts n to its decimal string representation; it exists to avoid
// importing strconv just for this helper. Unlike the first version, it also
// handles negative inputs (which previously produced an empty string),
// including the minimum int, by computing the magnitude in unsigned space.
func itoa(n int) string {
	if n == 0 {
		return "0"
	}
	u := uint(n)
	neg := n < 0
	if neg {
		// Two's complement: -uint(n) is the correct magnitude even for the
		// minimum int, where -n itself would overflow.
		u = -u
	}
	var buf [21]byte // sign plus up to 20 digits covers 64-bit ints
	i := len(buf)
	for u > 0 {
		i--
		buf[i] = byte('0' + u%10)
		u /= 10
	}
	if neg {
		i--
		buf[i] = '-'
	}
	return string(buf[i:])
}