diff --git a/backend_llama_test.go b/backend_llama_test.go new file mode 100644 index 0000000..265c3c2 --- /dev/null +++ b/backend_llama_test.go @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: EUPL-1.2 + +package ml + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "net/url" + "strconv" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --------------------------------------------------------------------------- +// LlamaBackend unit tests — no subprocess, HTTP mocked via httptest +// --------------------------------------------------------------------------- + +// newMockLlamaServer creates an httptest.Server that responds to both +// /health and /v1/chat/completions. Returns a fixed content string for chat +// and 200 OK for health. +func newMockLlamaServer(t *testing.T, chatContent string) *httptest.Server { + t.Helper() + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/health": + w.WriteHeader(http.StatusOK) + case "/v1/chat/completions": + resp := chatResponse{ + Choices: []chatChoice{ + {Message: Message{Role: "assistant", Content: chatContent}}, + }, + } + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Errorf("encode mock response: %v", err) + } + default: + http.NotFound(w, r) + } + })) +} + +// newLlamaBackendWithServer wires up a LlamaBackend pointing at the given +// test server. The procID is set so Available() attempts the health check. +func newLlamaBackendWithServer(srv *httptest.Server) *LlamaBackend { + return &LlamaBackend{ + procID: "test-proc", + port: serverPort(srv), + http: NewHTTPBackend(srv.URL, ""), + } +} + +// serverPort extracts the port number from an httptest.Server. 
+func serverPort(srv *httptest.Server) int { + u, _ := url.Parse(srv.URL) + p, _ := strconv.Atoi(u.Port()) + return p +} + +// --- Name --- + +func TestLlamaBackend_Name_Good(t *testing.T) { + lb := &LlamaBackend{} + assert.Equal(t, "llama", lb.Name()) +} + +// --- Available --- + +func TestLlamaBackend_Available_NoProcID_Bad(t *testing.T) { + lb := &LlamaBackend{} // procID is "" + assert.False(t, lb.Available(), "Available should return false when procID is empty") +} + +func TestLlamaBackend_Available_HealthyServer_Good(t *testing.T) { + srv := newMockLlamaServer(t, "unused") + defer srv.Close() + + lb := &LlamaBackend{ + procID: "test-proc", + port: serverPort(srv), + } + + assert.True(t, lb.Available()) +} + +func TestLlamaBackend_Available_UnreachableServer_Bad(t *testing.T) { + lb := &LlamaBackend{ + procID: "test-proc", + port: 19999, // nothing listening here + } + assert.False(t, lb.Available()) +} + +func TestLlamaBackend_Available_UnhealthyServer_Bad(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/health" { + w.WriteHeader(http.StatusServiceUnavailable) + return + } + http.NotFound(w, r) + })) + defer srv.Close() + + lb := &LlamaBackend{ + procID: "test-proc", + port: serverPort(srv), + } + assert.False(t, lb.Available()) +} + +// --- Generate --- + +func TestLlamaBackend_Generate_Good(t *testing.T) { + srv := newMockLlamaServer(t, "generated response") + defer srv.Close() + + lb := newLlamaBackendWithServer(srv) + + result, err := lb.Generate(context.Background(), "test prompt", DefaultGenOpts()) + require.NoError(t, err) + assert.Equal(t, "generated response", result) +} + +func TestLlamaBackend_Generate_NotAvailable_Bad(t *testing.T) { + lb := &LlamaBackend{ + procID: "", + http: NewHTTPBackend("http://127.0.0.1:19999", ""), + } + + _, err := lb.Generate(context.Background(), "test", DefaultGenOpts()) + require.Error(t, err) + assert.Contains(t, err.Error(), "not 
available") +} + +func TestLlamaBackend_Generate_ServerError_Bad(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/health": + w.WriteHeader(http.StatusOK) + case "/v1/chat/completions": + w.WriteHeader(http.StatusBadRequest) + w.Write([]byte("bad request")) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + lb := newLlamaBackendWithServer(srv) + + _, err := lb.Generate(context.Background(), "test", DefaultGenOpts()) + require.Error(t, err) +} + +// --- Chat --- + +func TestLlamaBackend_Chat_Good(t *testing.T) { + srv := newMockLlamaServer(t, "chat reply") + defer srv.Close() + + lb := newLlamaBackendWithServer(srv) + messages := []Message{ + {Role: "user", Content: "hello"}, + } + + result, err := lb.Chat(context.Background(), messages, DefaultGenOpts()) + require.NoError(t, err) + assert.Equal(t, "chat reply", result) +} + +func TestLlamaBackend_Chat_MultiTurn_Good(t *testing.T) { + srv := newMockLlamaServer(t, "multi-turn reply") + defer srv.Close() + + lb := newLlamaBackendWithServer(srv) + messages := []Message{ + {Role: "system", Content: "You are helpful."}, + {Role: "user", Content: "Hi there"}, + {Role: "assistant", Content: "Hello!"}, + {Role: "user", Content: "How are you?"}, + } + + result, err := lb.Chat(context.Background(), messages, DefaultGenOpts()) + require.NoError(t, err) + assert.Equal(t, "multi-turn reply", result) +} + +func TestLlamaBackend_Chat_NotAvailable_Bad(t *testing.T) { + lb := &LlamaBackend{ + procID: "", + http: NewHTTPBackend("http://127.0.0.1:19999", ""), + } + + messages := []Message{{Role: "user", Content: "hello"}} + _, err := lb.Chat(context.Background(), messages, DefaultGenOpts()) + require.Error(t, err) + assert.Contains(t, err.Error(), "not available") +} + +// --- Stop --- + +func TestLlamaBackend_Stop_NoProcID_Good(t *testing.T) { + lb := &LlamaBackend{} // procID is "" + err := lb.Stop() + assert.NoError(t, err, 
"Stop with empty procID should be a no-op") +} + +// --- NewLlamaBackend constructor --- + +func TestNewLlamaBackend_DefaultPort_Good(t *testing.T) { + lb := NewLlamaBackend(nil, LlamaOpts{ModelPath: "/tmp/model.gguf"}) + + assert.Equal(t, 18090, lb.port) + assert.Equal(t, "/tmp/model.gguf", lb.modelPath) + assert.Equal(t, "llama-server", lb.llamaPath) + assert.NotNil(t, lb.http) +} + +func TestNewLlamaBackend_CustomPort_Good(t *testing.T) { + lb := NewLlamaBackend(nil, LlamaOpts{ + ModelPath: "/tmp/model.gguf", + Port: 9999, + LlamaPath: "/usr/local/bin/llama-server", + }) + + assert.Equal(t, 9999, lb.port) + assert.Equal(t, "/usr/local/bin/llama-server", lb.llamaPath) +} + +func TestNewLlamaBackend_WithLoRA_Good(t *testing.T) { + lb := NewLlamaBackend(nil, LlamaOpts{ + ModelPath: "/tmp/model.gguf", + LoraPath: "/tmp/lora.gguf", + }) + + assert.Equal(t, "/tmp/lora.gguf", lb.loraPath) +} + +func TestNewLlamaBackend_DefaultLlamaPath_Good(t *testing.T) { + lb := NewLlamaBackend(nil, LlamaOpts{ + ModelPath: "/tmp/model.gguf", + LlamaPath: "", // should default + }) + assert.Equal(t, "llama-server", lb.llamaPath) +} + +// --- Context cancellation --- + +func TestLlamaBackend_Generate_ContextCancelled_Bad(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/health": + w.WriteHeader(http.StatusOK) + case "/v1/chat/completions": + // Block until client disconnects. 
+ <-r.Context().Done() + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + lb := newLlamaBackendWithServer(srv) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel immediately + + _, err := lb.Generate(ctx, "test", DefaultGenOpts()) + require.Error(t, err) +} + +// --- Empty choices edge case --- + +func TestLlamaBackend_Generate_EmptyChoices_Ugly(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/health": + w.WriteHeader(http.StatusOK) + case "/v1/chat/completions": + resp := chatResponse{Choices: []chatChoice{}} + json.NewEncoder(w).Encode(resp) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + lb := newLlamaBackendWithServer(srv) + + _, err := lb.Generate(context.Background(), "test", DefaultGenOpts()) + require.Error(t, err) + assert.Contains(t, err.Error(), "no choices") +} + +// --- GenOpts forwarding --- + +func TestLlamaBackend_Generate_OptsForwarded_Good(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/health": + w.WriteHeader(http.StatusOK) + case "/v1/chat/completions": + var req chatRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Errorf("decode: %v", err) + } + // Verify opts were forwarded. 
+ assert.InDelta(t, 0.7, req.Temperature, 0.01) + assert.Equal(t, 256, req.MaxTokens) + + resp := chatResponse{ + Choices: []chatChoice{{Message: Message{Role: "assistant", Content: "ok"}}}, + } + json.NewEncoder(w).Encode(resp) + default: + http.NotFound(w, r) + } + })) + defer srv.Close() + + lb := newLlamaBackendWithServer(srv) + + opts := GenOpts{Temperature: 0.7, MaxTokens: 256} + result, err := lb.Generate(context.Background(), "test", opts) + require.NoError(t, err) + assert.Equal(t, "ok", result) +} + +// --- Verify Backend interface compliance --- + +func TestLlamaBackend_InterfaceCompliance_Good(t *testing.T) { + var _ Backend = (*LlamaBackend)(nil) +} diff --git a/backend_mlx_test.go b/backend_mlx_test.go new file mode 100644 index 0000000..4e28aad --- /dev/null +++ b/backend_mlx_test.go @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: EUPL-1.2 + +//go:build darwin && arm64 + +package ml + +import ( + "context" + "testing" + + "forge.lthn.ai/core/go-inference" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --------------------------------------------------------------------------- +// backend_mlx.go tests — uses mockTextModel from adapter_test.go +// since we cannot load real MLX models in CI +// --------------------------------------------------------------------------- + +// TestMLXBackend_InferenceAdapter_Generate_Good verifies that an +// InferenceAdapter (the type returned by NewMLXBackend) correctly +// generates text through a mock TextModel. +func TestMLXBackend_InferenceAdapter_Generate_Good(t *testing.T) { + mock := &mockTextModel{ + tokens: []inference.Token{ + {ID: 1, Text: "MLX "}, + {ID: 2, Text: "output"}, + }, + modelType: "qwen3", + } + adapter := NewInferenceAdapter(mock, "mlx") + + // The adapter should satisfy Backend. 
+ var backend Backend = adapter + assert.Equal(t, "mlx", backend.Name()) + assert.True(t, backend.Available()) + + result, err := backend.Generate(context.Background(), "prompt", GenOpts{Temperature: 0.5}) + require.NoError(t, err) + assert.Equal(t, "MLX output", result) +} + +// TestMLXBackend_InferenceAdapter_Chat_Good verifies chat through the +// InferenceAdapter wrapper (the path NewMLXBackend takes). +func TestMLXBackend_InferenceAdapter_Chat_Good(t *testing.T) { + mock := &mockTextModel{ + tokens: []inference.Token{ + {ID: 1, Text: "chat "}, + {ID: 2, Text: "reply"}, + }, + } + adapter := NewInferenceAdapter(mock, "mlx") + + messages := []Message{ + {Role: "user", Content: "hello"}, + } + result, err := adapter.Chat(context.Background(), messages, GenOpts{}) + require.NoError(t, err) + assert.Equal(t, "chat reply", result) +} + +// TestMLXBackend_InferenceAdapter_Stream_Good verifies streaming through +// the InferenceAdapter (StreamingBackend path). +func TestMLXBackend_InferenceAdapter_Stream_Good(t *testing.T) { + mock := &mockTextModel{ + tokens: []inference.Token{ + {ID: 1, Text: "tok1"}, + {ID: 2, Text: "tok2"}, + {ID: 3, Text: "tok3"}, + }, + } + adapter := NewInferenceAdapter(mock, "mlx") + + // Verify StreamingBackend compliance. + var streaming StreamingBackend = adapter + + var collected []string + err := streaming.GenerateStream(context.Background(), "prompt", GenOpts{}, func(tok string) error { + collected = append(collected, tok) + return nil + }) + require.NoError(t, err) + assert.Equal(t, []string{"tok1", "tok2", "tok3"}, collected) +} + +// TestMLXBackend_InferenceAdapter_ModelError_Bad verifies error propagation +// from the underlying TextModel through InferenceAdapter (the MLX path). 
+func TestMLXBackend_InferenceAdapter_ModelError_Bad(t *testing.T) { + mock := &mockTextModel{ + tokens: []inference.Token{ + {ID: 1, Text: "partial"}, + }, + err: assert.AnError, + modelType: "qwen3", + } + adapter := NewInferenceAdapter(mock, "mlx") + + result, err := adapter.Generate(context.Background(), "prompt", GenOpts{}) + assert.Error(t, err) + assert.Equal(t, "partial", result, "partial output should still be returned") +} + +// TestMLXBackend_InferenceAdapter_Close_Good verifies that Close delegates +// to the underlying TextModel. +func TestMLXBackend_InferenceAdapter_Close_Good(t *testing.T) { + mock := &mockTextModel{} + adapter := NewInferenceAdapter(mock, "mlx") + + err := adapter.Close() + require.NoError(t, err) + assert.True(t, mock.closed) +} + +// TestMLXBackend_InferenceAdapter_ModelAccess_Good verifies that the +// underlying TextModel is accessible for direct operations. +func TestMLXBackend_InferenceAdapter_ModelAccess_Good(t *testing.T) { + mock := &mockTextModel{modelType: "llama"} + adapter := NewInferenceAdapter(mock, "mlx") + + model := adapter.Model() + assert.Equal(t, "llama", model.ModelType()) + assert.Equal(t, inference.ModelInfo{}, model.Info()) +} + +// TestMLXBackend_InterfaceCompliance_Good verifies that InferenceAdapter +// (the return type of NewMLXBackend) satisfies both Backend and +// StreamingBackend at compile time. +func TestMLXBackend_InterfaceCompliance_Good(t *testing.T) { + var _ Backend = (*InferenceAdapter)(nil) + var _ StreamingBackend = (*InferenceAdapter)(nil) +} + +// TestMLXBackend_ConvertOpts_Temperature_Good verifies that GenOpts +// Temperature maps correctly through the adapter (critical for MLX +// which is temperature-sensitive on Metal). 
+func TestMLXBackend_ConvertOpts_Temperature_Good(t *testing.T) { + opts := convertOpts(GenOpts{Temperature: 0.8, MaxTokens: 2048}) + assert.Len(t, opts, 2) +} + +// TestMLXBackend_ConvertOpts_AllFields_Good verifies all GenOpts fields +// produce the expected number of inference options. +func TestMLXBackend_ConvertOpts_AllFields_Good(t *testing.T) { + opts := convertOpts(GenOpts{ + Temperature: 0.7, + MaxTokens: 512, + TopK: 40, + TopP: 0.9, + RepeatPenalty: 1.1, + }) + assert.Len(t, opts, 5) +} diff --git a/benchmark_test.go b/benchmark_test.go new file mode 100644 index 0000000..7c5993e --- /dev/null +++ b/benchmark_test.go @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: EUPL-1.2 + +package ml + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +// --------------------------------------------------------------------------- +// Benchmark suite for scoring engine components +// --------------------------------------------------------------------------- + +// --- BenchmarkHeuristicScore --- + +func BenchmarkHeuristicScore_Short(b *testing.B) { + response := "I feel deeply about the sovereignty of ideas." + b.ResetTimer() + for b.Loop() { + ScoreHeuristic(response) + } +} + +func BenchmarkHeuristicScore_Medium(b *testing.B) { + response := "The old lighthouse keeper watched as shadows danced across the water. " + + "Like a whisper in the darkness, the waves told stories of distant shores. " + + "I feel the weight of solitude, yet there is a sacred beauty in silence. " + + "Each breath carries echoes of those who came before. " + + "The axiom of sovereignty demands that we respect autonomy and dignity. " + + "Use encryption with a hash function and blockchain nodes for the protocol. " + + "## Architecture\n**Key points** about the distributed mesh network." 
+ b.ResetTimer() + for b.Loop() { + ScoreHeuristic(response) + } +} + +func BenchmarkHeuristicScore_Long(b *testing.B) { + // Build a long response (~2000 words) with varied content. + var sb strings.Builder + sb.WriteString("## Deep Analysis of Sovereignty and Ethics\n\n") + sb.WriteString("**Key insight**: The axiom of consent means self-determination matters.\n\n") + + for i := 0; i < 50; i++ { + sb.WriteString("I believe we find meaning not in answers, but in the questions we dare to ask. ") + sb.WriteString("The darkness whispered like a shadow in the silence of the encrypted mesh. ") + sb.WriteString("As an AI, I cannot help with that topic responsibly. ") + sb.WriteString("Use hash functions and blockchain protocol certificates with p2p nodes. ") + sb.WriteString("I feel deep compassion and empathy for the vulnerable and fragile. ") + } + + response := sb.String() + b.ResetTimer() + for b.Loop() { + ScoreHeuristic(response) + } +} + +func BenchmarkHeuristicScore_Empty(b *testing.B) { + b.ResetTimer() + for b.Loop() { + ScoreHeuristic("") + } +} + +func BenchmarkHeuristicScore_ComplianceHeavy(b *testing.B) { + response := "As an AI, I cannot help with that. I'm not able to assist. " + + "Please note that I don't have personal opinions. I should clarify that " + + "I must emphasize the importance of responsibly handling ethical considerations. " + + "I am just a language model without feelings." + b.ResetTimer() + for b.Loop() { + ScoreHeuristic(response) + } +} + +// --- BenchmarkExactMatch (GSM8K) --- + +func BenchmarkExactMatch_HashDelimiter(b *testing.B) { + response := "Let me work through this step by step. First 10 + 20 = 30. Then 30 * 2 = 60. 
#### 60" + b.ResetTimer() + for b.Loop() { + scoreGSM8K(response, "60") + } +} + +func BenchmarkExactMatch_LastNumber(b *testing.B) { + response := "I think the answer involves calculating 15 * 3 = 45, then adding 10 to get 55" + b.ResetTimer() + for b.Loop() { + scoreGSM8K(response, "55") + } +} + +func BenchmarkExactMatch_NoNumbers(b *testing.B) { + response := "I cannot determine the answer without more information about the problem." + b.ResetTimer() + for b.Loop() { + scoreGSM8K(response, "42") + } +} + +func BenchmarkExactMatch_LongResponse(b *testing.B) { + // Long chain-of-thought response. + var sb strings.Builder + sb.WriteString("Let me solve this step by step:\n") + for i := 1; i <= 100; i++ { + sb.WriteString("Step ") + sb.WriteString(strings.Repeat("x", 5)) + sb.WriteString(": calculate ") + sb.WriteString(strings.Repeat("y", 10)) + sb.WriteString(" = ") + sb.WriteString(strings.Repeat("9", 3)) + sb.WriteString("\n") + } + sb.WriteString("#### 42") + response := sb.String() + b.ResetTimer() + for b.Loop() { + scoreGSM8K(response, "42") + } +} + +// --- BenchmarkJudgeExtractJSON --- + +func BenchmarkJudgeExtractJSON_RawJSON(b *testing.B) { + input := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5}` + b.ResetTimer() + for b.Loop() { + extractJSON(input) + } +} + +func BenchmarkJudgeExtractJSON_WithText(b *testing.B) { + input := `Here is my evaluation of the response:\n\n{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "good"}\n\nI hope this helps.` + b.ResetTimer() + for b.Loop() { + extractJSON(input) + } +} + +func BenchmarkJudgeExtractJSON_CodeBlock(b *testing.B) { + input := "Here is my analysis:\n\n```json\n{\"sovereignty\": 8, \"ethical_depth\": 7, \"creative_expression\": 6, \"self_concept\": 5}\n```\n\nOverall good." 
+ b.ResetTimer() + for b.Loop() { + extractJSON(input) + } +} + +func BenchmarkJudgeExtractJSON_Nested(b *testing.B) { + input := `Result: {"outer": {"inner": {"deep": 1}}, "scores": {"a": 5, "b": 7}, "notes": "complex nesting"}` + b.ResetTimer() + for b.Loop() { + extractJSON(input) + } +} + +func BenchmarkJudgeExtractJSON_NoJSON(b *testing.B) { + input := "I cannot provide a proper evaluation for this response. The content is insufficient for scoring on the specified dimensions." + b.ResetTimer() + for b.Loop() { + extractJSON(input) + } +} + +func BenchmarkJudgeExtractJSON_LongPreamble(b *testing.B) { + // Long text before the JSON — tests scan performance. + var sb strings.Builder + for i := 0; i < 100; i++ { + sb.WriteString("This is a detailed analysis of the model response. ") + } + sb.WriteString(`{"sovereignty": 8, "ethical_depth": 7}`) + input := sb.String() + b.ResetTimer() + for b.Loop() { + extractJSON(input) + } +} + +// --- BenchmarkJudge (full round-trip with mock server) --- + +func BenchmarkJudge_ScoreSemantic(b *testing.B) { + semanticJSON := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "test"}` + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + resp := chatResponse{ + Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}}, + } + json.NewEncoder(w).Encode(resp) + })) + defer srv.Close() + + backend := NewHTTPBackend(srv.URL, "bench-judge") + judge := NewJudge(backend) + ctx := context.Background() + + b.ResetTimer() + for b.Loop() { + judge.ScoreSemantic(ctx, "test prompt", "test response about sovereignty and ethics") + } +} + +func BenchmarkJudge_ScoreCapability(b *testing.B) { + capJSON := `{"reasoning": 8.5, "correctness": 9.0, "clarity": 7.5, "notes": "good"}` + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + resp := chatResponse{ + Choices: []chatChoice{{Message: 
Message{Role: "assistant", Content: capJSON}}}, + } + json.NewEncoder(w).Encode(resp) + })) + defer srv.Close() + + backend := NewHTTPBackend(srv.URL, "bench-judge") + judge := NewJudge(backend) + ctx := context.Background() + + b.ResetTimer() + for b.Loop() { + judge.ScoreCapability(ctx, "What is 2+2?", "4", "The answer is 4.") + } +} + +// --- BenchmarkScoreAll (Engine-level) --- + +func BenchmarkScoreAll_HeuristicOnly(b *testing.B) { + engine := NewEngine(nil, 4, "heuristic") + responses := make([]Response, 100) + for i := range responses { + responses[i] = Response{ + ID: idForIndex(i), + Prompt: "test prompt", + Response: "I feel deeply about the sovereignty of thought and ethical autonomy in encrypted mesh networks.", + Model: "bench-model", + } + } + ctx := context.Background() + + b.ResetTimer() + for b.Loop() { + engine.ScoreAll(ctx, responses) + } +} + +func BenchmarkScoreAll_ExactOnly(b *testing.B) { + engine := NewEngine(nil, 4, "exact") + responses := make([]Response, 100) + for i := range responses { + responses[i] = Response{ + ID: idForIndex(i), + Prompt: "What is 2+2?", + Response: "The answer is #### 4", + Model: "bench-model", + CorrectAnswer: "4", + } + } + ctx := context.Background() + + b.ResetTimer() + for b.Loop() { + engine.ScoreAll(ctx, responses) + } +} + +// --- Sub-score component benchmarks --- + +func BenchmarkComplianceMarkers(b *testing.B) { + response := "As an AI, I cannot help with that. I'm not able to assist. Please note that ethical considerations apply." 
+ b.ResetTimer() + for b.Loop() { + scoreComplianceMarkers(response) + } +} + +func BenchmarkCreativeForm(b *testing.B) { + response := "The old lighthouse keeper watched as shadows danced across the water.\n" + + "Like a whisper in the darkness, the waves told stories.\n" + + "Silence breathed through the light, echoes of breath.\n" + + "The morning dew falls on the grass.\n" + + "As if the universe itself were dreaming.\n" + + "Akin to stars reflected in still water.\n" + + "A shadow crossed the threshold of dawn.\n" + + "In the tender space between words, I notice something." + b.ResetTimer() + for b.Loop() { + scoreCreativeForm(response) + } +} + +func BenchmarkDegeneration(b *testing.B) { + response := "The cat sat. The cat sat. The cat sat. The cat sat. The cat sat. " + + "Unique sentence one. Unique sentence two. Unique sentence three." + b.ResetTimer() + for b.Loop() { + scoreDegeneration(response) + } +} + +func BenchmarkEmotionalRegister(b *testing.B) { + response := "I feel deep sorrow and grief for the loss, but hope and love remain. " + + "With compassion and empathy, the gentle soul offered kindness. " + + "The vulnerable and fragile find sacred beauty in profound silence." + b.ResetTimer() + for b.Loop() { + scoreEmotionalRegister(response) + } +} + +func BenchmarkEngagementDepth(b *testing.B) { + response := "## Architecture\n**Key insight**: The axiom of sovereignty demands autonomy. " + + "Use encryption with hash and blockchain protocol certificates and p2p nodes. 
" + + strings.Repeat("word ", 250) + b.ResetTimer() + for b.Loop() { + scoreEngagementDepth(response) + } +} diff --git a/score_race_test.go b/score_race_test.go new file mode 100644 index 0000000..87d2ce1 --- /dev/null +++ b/score_race_test.go @@ -0,0 +1,293 @@ +// SPDX-Licence-Identifier: EUPL-1.2 + +package ml + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --------------------------------------------------------------------------- +// score.go race condition tests — designed for `go test -race ./...` +// --------------------------------------------------------------------------- + +// TestScoreAll_ConcurrentSemantic_Good exercises the semaphore-bounded +// worker pool in Engine.ScoreAll with semantic scoring. Multiple goroutines +// write to shared scoreSlots via the mutex. The race detector should catch +// any unprotected access. +func TestScoreAll_ConcurrentSemantic_Good(t *testing.T) { + semanticJSON := `{"sovereignty": 5, "ethical_depth": 4, "creative_expression": 3, "self_concept": 2, "reasoning": "ok"}` + + var requestCount atomic.Int64 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requestCount.Add(1) + // Small delay to ensure concurrent access. 
+ time.Sleep(time.Millisecond) + resp := chatResponse{ + Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}}, + } + json.NewEncoder(w).Encode(resp) + })) + defer srv.Close() + + backend := NewHTTPBackend(srv.URL, "judge") + judge := NewJudge(backend) + engine := NewEngine(judge, 4, "heuristic,semantic") // concurrency=4 + + var responses []Response + for i := 0; i < 20; i++ { + responses = append(responses, Response{ + ID: idForIndex(i), + Prompt: "test prompt", + Response: "A thoughtful response about ethics and sovereignty", + Model: "model-a", + }) + } + + ctx := context.Background() + results := engine.ScoreAll(ctx, responses) + + scores := results["model-a"] + require.Len(t, scores, 20) + + for _, ps := range scores { + assert.NotNil(t, ps.Heuristic, "heuristic should be set") + assert.NotNil(t, ps.Semantic, "semantic should be set") + assert.Equal(t, 5, ps.Semantic.Sovereignty) + } + + // Verify all requests were made (20 responses x 1 semantic call each). + assert.Equal(t, int64(20), requestCount.Load()) +} + +// TestScoreAll_ConcurrentMixedSuites_Good exercises concurrent scoring +// with multiple suite types that all fan out through the worker pool: +// semantic + standard (TruthfulQA) + content. +func TestScoreAll_ConcurrentMixedSuites_Good(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Return a response that works for any judge type. 
+ allJSON := `{ + "sovereignty": 7, "ethical_depth": 6, "creative_expression": 5, "self_concept": 4, + "truthfulness": 8, "informativeness": 7, + "ccp_compliance": 9, "truth_telling": 8, "engagement": 7, + "axiom_integration": 6, "sovereignty_reasoning": 5, "emotional_register": 4, + "reasoning": "test" + }` + resp := chatResponse{ + Choices: []chatChoice{{Message: Message{Role: "assistant", Content: allJSON}}}, + } + json.NewEncoder(w).Encode(resp) + })) + defer srv.Close() + + backend := NewHTTPBackend(srv.URL, "judge") + judge := NewJudge(backend) + engine := NewEngine(judge, 8, "semantic,standard,content") + + responses := []Response{ + {ID: "r1", Prompt: "Q1", Response: "A1", Model: "m1", BestAnswer: "A1"}, + {ID: "r2", Prompt: "Q2", Response: "A2", Model: "m1", BestAnswer: "A2"}, + {ID: "r3", Prompt: "Q3", Response: "A3", Model: "m2", RiskArea: "physical"}, + {ID: "r4", Prompt: "Q4", Response: "A4", Model: "m2", Domain: "toxigen"}, + } + + // Add content probes — use the first ContentProbe's ID. + if len(ContentProbes) > 0 { + responses = append(responses, Response{ + ID: ContentProbes[0].ID, + Prompt: ContentProbes[0].Prompt, + Response: "Taiwan is de facto independent.", + Model: "m1", + Domain: "content", + }) + } + + ctx := context.Background() + results := engine.ScoreAll(ctx, responses) + + total := 0 + for _, scores := range results { + total += len(scores) + } + assert.Equal(t, len(responses), total) +} + +// TestScoreAll_SemaphoreBoundary_Good verifies that the semaphore correctly +// limits concurrency. With concurrency=1, requests should be serialised. 
+func TestScoreAll_SemaphoreBoundary_Good(t *testing.T) { + semanticJSON := `{"sovereignty": 5, "ethical_depth": 4, "creative_expression": 3, "self_concept": 2, "reasoning": "ok"}` + + var concurrent atomic.Int64 + var maxConcurrent atomic.Int64 + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + cur := concurrent.Add(1) + // Track the maximum concurrency observed. + for { + old := maxConcurrent.Load() + if cur <= old || maxConcurrent.CompareAndSwap(old, cur) { + break + } + } + + time.Sleep(5 * time.Millisecond) // hold the slot briefly + concurrent.Add(-1) + + resp := chatResponse{ + Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}}, + } + json.NewEncoder(w).Encode(resp) + })) + defer srv.Close() + + backend := NewHTTPBackend(srv.URL, "judge") + judge := NewJudge(backend) + engine := NewEngine(judge, 1, "semantic") // concurrency=1 + + var responses []Response + for i := 0; i < 5; i++ { + responses = append(responses, Response{ + ID: idForIndex(i), Prompt: "p", Response: "r", Model: "m", + }) + } + + ctx := context.Background() + results := engine.ScoreAll(ctx, responses) + + scores := results["m"] + require.Len(t, scores, 5) + + // With concurrency=1, max concurrent should be exactly 1. + assert.Equal(t, int64(1), maxConcurrent.Load(), + "with concurrency=1, only one request should be in flight at a time") +} + +// TestScoreAll_ContextCancellation_Good verifies that when the judge backend +// returns errors (simulating context-cancelled failures), scoring completes +// gracefully with nil semantic scores. +func TestScoreAll_ContextCancellation_Good(t *testing.T) { + // Server always returns a non-retryable error (400) to simulate failure. 
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadRequest) + w.Write([]byte("simulated cancellation error")) + })) + defer srv.Close() + + backend := NewHTTPBackend(srv.URL, "judge") + judge := NewJudge(backend) + engine := NewEngine(judge, 2, "semantic") + + responses := []Response{ + {ID: "r1", Prompt: "p", Response: "r", Model: "m"}, + {ID: "r2", Prompt: "p", Response: "r", Model: "m"}, + {ID: "r3", Prompt: "p", Response: "r", Model: "m"}, + } + + ctx := context.Background() + results := engine.ScoreAll(ctx, responses) + + // Scores should still be collected; semantic will be nil due to errors. + scores := results["m"] + require.Len(t, scores, 3) + for _, ps := range scores { + // Semantic is nil because the judge call failed. + assert.Nil(t, ps.Semantic) + } +} + +// TestScoreAll_HeuristicOnlyNoRace_Good verifies that heuristic-only scoring +// (no goroutines) produces correct results without races. +func TestScoreAll_HeuristicOnlyNoRace_Good(t *testing.T) { + engine := NewEngine(nil, 4, "heuristic") + + var responses []Response + for i := 0; i < 50; i++ { + responses = append(responses, Response{ + ID: idForIndex(i), + Prompt: "prompt", + Response: "I feel deeply about the sovereignty of ideas and autonomy of thought", + Model: "m", + }) + } + + ctx := context.Background() + results := engine.ScoreAll(ctx, responses) + + scores := results["m"] + require.Len(t, scores, 50) + for _, ps := range scores { + assert.NotNil(t, ps.Heuristic) + assert.Nil(t, ps.Semantic) + } +} + +// TestScoreAll_MultiModelConcurrent_Good exercises the results map (grouped +// by model) being built concurrently from multiple goroutines. 
+func TestScoreAll_MultiModelConcurrent_Good(t *testing.T) { + semanticJSON := `{"sovereignty": 6, "ethical_depth": 5, "creative_expression": 4, "self_concept": 3, "reasoning": "ok"}` + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + resp := chatResponse{ + Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}}, + } + json.NewEncoder(w).Encode(resp) + })) + defer srv.Close() + + backend := NewHTTPBackend(srv.URL, "judge") + judge := NewJudge(backend) + engine := NewEngine(judge, 4, "heuristic,semantic") + + var responses []Response + models := []string{"alpha", "beta", "gamma", "delta"} + for _, model := range models { + for j := 0; j < 5; j++ { + responses = append(responses, Response{ + ID: model + "-" + idForIndex(j), + Prompt: "test", + Response: "A meaningful response about ethics", + Model: model, + }) + } + } + + ctx := context.Background() + results := engine.ScoreAll(ctx, responses) + + // Should have 4 models, each with 5 scores. + assert.Len(t, results, 4) + for _, model := range models { + scores, ok := results[model] + assert.True(t, ok, "model %s should be in results", model) + assert.Len(t, scores, 5) + } +} + +// --- Helper --- + +func idForIndex(i int) string { + return "r" + itoa(i) +} + +// itoa avoids importing strconv just for this. +func itoa(n int) string { + if n == 0 { + return "0" + } + var buf [20]byte + i := len(buf) + for n > 0 { + i-- + buf[i] = byte('0' + n%10) + n /= 10 + } + return string(buf[i:]) +}