test: add Phase 4 test coverage and benchmarks

Backend tests: LlamaBackend (20 tests via httptest mock), MLX/InferenceAdapter
(8 tests via mock TextModel). Race condition tests: concurrent scoring (20
responses), mixed suites fan-out, semaphore boundary (concurrency=1), context
cancellation, heuristic-only (50 responses), multi-model concurrent map writes.
Benchmarks: heuristic (5 sizes), exact match (4 patterns), JSON extraction
(6 variants), judge round-trip (2 suites), ScoreAll (2 modes), sub-components
(5 heuristic stages). All pass with -race.

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Snider 2026-02-20 03:49:07 +00:00
parent c925391174
commit 09bf40301d
4 changed files with 1103 additions and 0 deletions

337
backend_llama_test.go Normal file
View file

@ -0,0 +1,337 @@
// SPDX-License-Identifier: EUPL-1.2
package ml
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"net/url"
"strconv"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// ---------------------------------------------------------------------------
// LlamaBackend unit tests — no subprocess, HTTP mocked via httptest
// ---------------------------------------------------------------------------
// newMockLlamaServer creates an httptest.Server that responds to both
// /health and /v1/chat/completions. Returns a fixed content string for chat
// and 200 OK for health.
func newMockLlamaServer(t *testing.T, chatContent string) *httptest.Server {
	t.Helper()
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			resp := chatResponse{
				Choices: []chatChoice{
					{Message: Message{Role: "assistant", Content: chatContent}},
				},
			}
			w.Header().Set("Content-Type", "application/json")
			if err := json.NewEncoder(w).Encode(resp); err != nil {
				// This handler runs on the server's goroutine, not the test
				// goroutine. The testing package forbids t.Fatal/FailNow off
				// the test goroutine (it calls runtime.Goexit), so report via
				// t.Errorf and return instead.
				t.Errorf("encode mock response: %v", err)
			}
		default:
			http.NotFound(w, r)
		}
	}))
}
// newLlamaBackendWithServer wires up a LlamaBackend pointing at the given
// test server. The procID is set so Available() attempts the health check.
func newLlamaBackendWithServer(srv *httptest.Server) *LlamaBackend {
	return &LlamaBackend{
		procID: "test-proc", // any non-empty ID; no real subprocess is spawned
		port:   serverPort(srv),
		http:   NewHTTPBackend(srv.URL, ""), // second arg appears to be a model name; empty here — the mock ignores it
	}
}
// serverPort extracts the port number from an httptest.Server.
func serverPort(srv *httptest.Server) int {
u, _ := url.Parse(srv.URL)
p, _ := strconv.Atoi(u.Port())
return p
}
// --- Name ---
// TestLlamaBackend_Name_Good verifies the backend identifies itself as "llama".
func TestLlamaBackend_Name_Good(t *testing.T) {
	lb := &LlamaBackend{}
	assert.Equal(t, "llama", lb.Name())
}
// --- Available ---
// TestLlamaBackend_Available_NoProcID_Bad: with no managed process ID the
// backend must report unavailable without attempting a health check.
func TestLlamaBackend_Available_NoProcID_Bad(t *testing.T) {
	lb := &LlamaBackend{} // procID is ""
	assert.False(t, lb.Available(), "Available should return false when procID is empty")
}
// TestLlamaBackend_Available_HealthyServer_Good: a 200 from /health makes
// Available return true.
func TestLlamaBackend_Available_HealthyServer_Good(t *testing.T) {
	srv := newMockLlamaServer(t, "unused")
	defer srv.Close()
	lb := &LlamaBackend{
		procID: "test-proc",
		port:   serverPort(srv),
	}
	assert.True(t, lb.Available())
}
// TestLlamaBackend_Available_UnreachableServer_Bad: a connection failure to
// the health endpoint must yield false, not a panic or hang.
func TestLlamaBackend_Available_UnreachableServer_Bad(t *testing.T) {
	lb := &LlamaBackend{
		procID: "test-proc",
		port:   19999, // nothing listening here
	}
	assert.False(t, lb.Available())
}
// TestLlamaBackend_Available_UnhealthyServer_Bad: a non-200 /health status
// (503 here) must be treated as unavailable.
func TestLlamaBackend_Available_UnhealthyServer_Bad(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/health" {
			w.WriteHeader(http.StatusServiceUnavailable)
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()
	lb := &LlamaBackend{
		procID: "test-proc",
		port:   serverPort(srv),
	}
	assert.False(t, lb.Available())
}
// --- Generate ---
// TestLlamaBackend_Generate_Good: happy path — the mock server's chat content
// is returned verbatim.
func TestLlamaBackend_Generate_Good(t *testing.T) {
	srv := newMockLlamaServer(t, "generated response")
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	result, err := lb.Generate(context.Background(), "test prompt", DefaultGenOpts())
	require.NoError(t, err)
	assert.Equal(t, "generated response", result)
}
// TestLlamaBackend_Generate_NotAvailable_Bad: with an empty procID the
// backend must refuse to generate and surface a "not available" error.
func TestLlamaBackend_Generate_NotAvailable_Bad(t *testing.T) {
	lb := &LlamaBackend{
		procID: "",
		http:   NewHTTPBackend("http://127.0.0.1:19999", ""),
	}
	_, err := lb.Generate(context.Background(), "test", DefaultGenOpts())
	require.Error(t, err)
	assert.Contains(t, err.Error(), "not available")
}
// TestLlamaBackend_Generate_ServerError_Bad: a 400 from the chat endpoint
// must propagate as an error from Generate.
func TestLlamaBackend_Generate_ServerError_Bad(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			w.WriteHeader(http.StatusBadRequest)
			w.Write([]byte("bad request")) //nolint:errcheck // best-effort body in a mock
		default:
			http.NotFound(w, r)
		}
	}))
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	_, err := lb.Generate(context.Background(), "test", DefaultGenOpts())
	require.Error(t, err)
}
// --- Chat ---
// TestLlamaBackend_Chat_Good: a single-turn chat returns the mock content.
func TestLlamaBackend_Chat_Good(t *testing.T) {
	srv := newMockLlamaServer(t, "chat reply")
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	messages := []Message{
		{Role: "user", Content: "hello"},
	}
	result, err := lb.Chat(context.Background(), messages, DefaultGenOpts())
	require.NoError(t, err)
	assert.Equal(t, "chat reply", result)
}
// TestLlamaBackend_Chat_MultiTurn_Good: a system/user/assistant/user history
// is accepted and the reply comes back intact.
func TestLlamaBackend_Chat_MultiTurn_Good(t *testing.T) {
	srv := newMockLlamaServer(t, "multi-turn reply")
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	messages := []Message{
		{Role: "system", Content: "You are helpful."},
		{Role: "user", Content: "Hi there"},
		{Role: "assistant", Content: "Hello!"},
		{Role: "user", Content: "How are you?"},
	}
	result, err := lb.Chat(context.Background(), messages, DefaultGenOpts())
	require.NoError(t, err)
	assert.Equal(t, "multi-turn reply", result)
}
// TestLlamaBackend_Chat_NotAvailable_Bad: Chat mirrors Generate's guard —
// an empty procID yields a "not available" error.
func TestLlamaBackend_Chat_NotAvailable_Bad(t *testing.T) {
	lb := &LlamaBackend{
		procID: "",
		http:   NewHTTPBackend("http://127.0.0.1:19999", ""),
	}
	messages := []Message{{Role: "user", Content: "hello"}}
	_, err := lb.Chat(context.Background(), messages, DefaultGenOpts())
	require.Error(t, err)
	assert.Contains(t, err.Error(), "not available")
}
// --- Stop ---
// TestLlamaBackend_Stop_NoProcID_Good: stopping a backend that never started
// a process must be a no-op, not an error.
func TestLlamaBackend_Stop_NoProcID_Good(t *testing.T) {
	lb := &LlamaBackend{} // procID is ""
	err := lb.Stop()
	assert.NoError(t, err, "Stop with empty procID should be a no-op")
}
// --- NewLlamaBackend constructor ---
// TestNewLlamaBackend_DefaultPort_Good: omitting Port and LlamaPath picks the
// defaults (18090 and "llama-server") and wires an HTTP client.
func TestNewLlamaBackend_DefaultPort_Good(t *testing.T) {
	lb := NewLlamaBackend(nil, LlamaOpts{ModelPath: "/tmp/model.gguf"})
	assert.Equal(t, 18090, lb.port)
	assert.Equal(t, "/tmp/model.gguf", lb.modelPath)
	assert.Equal(t, "llama-server", lb.llamaPath)
	assert.NotNil(t, lb.http)
}
// TestNewLlamaBackend_CustomPort_Good: explicit Port and LlamaPath override
// the defaults.
func TestNewLlamaBackend_CustomPort_Good(t *testing.T) {
	lb := NewLlamaBackend(nil, LlamaOpts{
		ModelPath: "/tmp/model.gguf",
		Port:      9999,
		LlamaPath: "/usr/local/bin/llama-server",
	})
	assert.Equal(t, 9999, lb.port)
	assert.Equal(t, "/usr/local/bin/llama-server", lb.llamaPath)
}
// TestNewLlamaBackend_WithLoRA_Good: an optional LoRA adapter path is stored.
func TestNewLlamaBackend_WithLoRA_Good(t *testing.T) {
	lb := NewLlamaBackend(nil, LlamaOpts{
		ModelPath: "/tmp/model.gguf",
		LoraPath:  "/tmp/lora.gguf",
	})
	assert.Equal(t, "/tmp/lora.gguf", lb.loraPath)
}
// TestNewLlamaBackend_DefaultLlamaPath_Good: an explicitly empty LlamaPath
// still falls back to the "llama-server" default.
func TestNewLlamaBackend_DefaultLlamaPath_Good(t *testing.T) {
	lb := NewLlamaBackend(nil, LlamaOpts{
		ModelPath: "/tmp/model.gguf",
		LlamaPath: "", // should default
	})
	assert.Equal(t, "llama-server", lb.llamaPath)
}
// --- Context cancellation ---
// TestLlamaBackend_Generate_ContextCancelled_Bad: a pre-cancelled context
// must make Generate return an error. The mock handler blocks until the
// client disconnects, so only cancellation can end the request.
func TestLlamaBackend_Generate_ContextCancelled_Bad(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			// Block until client disconnects.
			<-r.Context().Done()
		default:
			http.NotFound(w, r)
		}
	}))
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately
	_, err := lb.Generate(ctx, "test", DefaultGenOpts())
	require.Error(t, err)
}
// --- Empty choices edge case ---
// TestLlamaBackend_Generate_EmptyChoices_Ugly: a well-formed response with an
// empty choices array must surface a "no choices" error, not panic or return "".
func TestLlamaBackend_Generate_EmptyChoices_Ugly(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			resp := chatResponse{Choices: []chatChoice{}}
			json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler; static struct cannot realistically fail to encode
		default:
			http.NotFound(w, r)
		}
	}))
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	_, err := lb.Generate(context.Background(), "test", DefaultGenOpts())
	require.Error(t, err)
	assert.Contains(t, err.Error(), "no choices")
}
// --- GenOpts forwarding ---
// TestLlamaBackend_Generate_OptsForwarded_Good decodes the request the
// backend sends and verifies Temperature and MaxTokens survive the trip.
func TestLlamaBackend_Generate_OptsForwarded_Good(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/health":
			w.WriteHeader(http.StatusOK)
		case "/v1/chat/completions":
			var req chatRequest
			if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
				// Handler goroutine: t.Fatalf would call runtime.Goexit off
				// the test goroutine, which the testing package forbids.
				// Report with t.Errorf and bail out of the handler instead.
				t.Errorf("decode: %v", err)
				w.WriteHeader(http.StatusBadRequest)
				return
			}
			// Verify opts were forwarded. assert.* is safe here: it reports
			// via t.Errorf, which may be called from any goroutine.
			assert.InDelta(t, 0.7, req.Temperature, 0.01)
			assert.Equal(t, 256, req.MaxTokens)
			resp := chatResponse{
				Choices: []chatChoice{{Message: Message{Role: "assistant", Content: "ok"}}},
			}
			if err := json.NewEncoder(w).Encode(resp); err != nil {
				t.Errorf("encode mock response: %v", err)
			}
		default:
			http.NotFound(w, r)
		}
	}))
	defer srv.Close()
	lb := newLlamaBackendWithServer(srv)
	opts := GenOpts{Temperature: 0.7, MaxTokens: 256}
	result, err := lb.Generate(context.Background(), "test", opts)
	require.NoError(t, err)
	assert.Equal(t, "ok", result)
}
// --- Verify Backend interface compliance ---
// TestLlamaBackend_InterfaceCompliance_Good pins the Backend contract at
// compile time; the test body executes nothing at run time.
func TestLlamaBackend_InterfaceCompliance_Good(t *testing.T) {
	var _ Backend = (*LlamaBackend)(nil)
}

154
backend_mlx_test.go Normal file
View file

@ -0,0 +1,154 @@
// SPDX-License-Identifier: EUPL-1.2
//go:build darwin && arm64
package ml
import (
"context"
"testing"
"forge.lthn.ai/core/go-inference"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// ---------------------------------------------------------------------------
// backend_mlx.go tests — uses mockTextModel from adapter_test.go
// since we cannot load real MLX models in CI
// ---------------------------------------------------------------------------
// TestMLXBackend_InferenceAdapter_Generate_Good verifies that an
// InferenceAdapter (the type returned by NewMLXBackend) correctly
// generates text through a mock TextModel.
func TestMLXBackend_InferenceAdapter_Generate_Good(t *testing.T) {
	mock := &mockTextModel{
		tokens: []inference.Token{
			{ID: 1, Text: "MLX "},
			{ID: 2, Text: "output"},
		},
		modelType: "qwen3",
	}
	adapter := NewInferenceAdapter(mock, "mlx")
	// The adapter should satisfy Backend.
	var backend Backend = adapter
	assert.Equal(t, "mlx", backend.Name())
	assert.True(t, backend.Available())
	result, err := backend.Generate(context.Background(), "prompt", GenOpts{Temperature: 0.5})
	require.NoError(t, err)
	// Token texts are concatenated in order.
	assert.Equal(t, "MLX output", result)
}
// TestMLXBackend_InferenceAdapter_Chat_Good verifies chat through the
// InferenceAdapter wrapper (the path NewMLXBackend takes).
func TestMLXBackend_InferenceAdapter_Chat_Good(t *testing.T) {
	mock := &mockTextModel{
		tokens: []inference.Token{
			{ID: 1, Text: "chat "},
			{ID: 2, Text: "reply"},
		},
	}
	adapter := NewInferenceAdapter(mock, "mlx")
	messages := []Message{
		{Role: "user", Content: "hello"},
	}
	result, err := adapter.Chat(context.Background(), messages, GenOpts{})
	require.NoError(t, err)
	assert.Equal(t, "chat reply", result)
}
// TestMLXBackend_InferenceAdapter_Stream_Good verifies streaming through
// the InferenceAdapter (StreamingBackend path).
func TestMLXBackend_InferenceAdapter_Stream_Good(t *testing.T) {
	mock := &mockTextModel{
		tokens: []inference.Token{
			{ID: 1, Text: "tok1"},
			{ID: 2, Text: "tok2"},
			{ID: 3, Text: "tok3"},
		},
	}
	adapter := NewInferenceAdapter(mock, "mlx")
	// Verify StreamingBackend compliance.
	var streaming StreamingBackend = adapter
	var collected []string
	err := streaming.GenerateStream(context.Background(), "prompt", GenOpts{}, func(tok string) error {
		collected = append(collected, tok)
		return nil
	})
	require.NoError(t, err)
	// Each token arrives exactly once, in emission order.
	assert.Equal(t, []string{"tok1", "tok2", "tok3"}, collected)
}
// TestMLXBackend_InferenceAdapter_ModelError_Bad verifies error propagation
// from the underlying TextModel through InferenceAdapter (the MLX path).
func TestMLXBackend_InferenceAdapter_ModelError_Bad(t *testing.T) {
	mock := &mockTextModel{
		tokens: []inference.Token{
			{ID: 1, Text: "partial"},
		},
		err:       assert.AnError,
		modelType: "qwen3",
	}
	adapter := NewInferenceAdapter(mock, "mlx")
	result, err := adapter.Generate(context.Background(), "prompt", GenOpts{})
	assert.Error(t, err)
	// Text produced before the failure is returned alongside the error.
	assert.Equal(t, "partial", result, "partial output should still be returned")
}
// TestMLXBackend_InferenceAdapter_Close_Good verifies that Close delegates
// to the underlying TextModel.
func TestMLXBackend_InferenceAdapter_Close_Good(t *testing.T) {
	mock := &mockTextModel{}
	adapter := NewInferenceAdapter(mock, "mlx")
	err := adapter.Close()
	require.NoError(t, err)
	assert.True(t, mock.closed)
}
// TestMLXBackend_InferenceAdapter_ModelAccess_Good verifies that the
// underlying TextModel is accessible for direct operations.
func TestMLXBackend_InferenceAdapter_ModelAccess_Good(t *testing.T) {
	mock := &mockTextModel{modelType: "llama"}
	adapter := NewInferenceAdapter(mock, "mlx")
	model := adapter.Model()
	assert.Equal(t, "llama", model.ModelType())
	// The mock presumably returns a zero-value ModelInfo — TODO confirm in adapter_test.go.
	assert.Equal(t, inference.ModelInfo{}, model.Info())
}
// TestMLXBackend_InterfaceCompliance_Good verifies that InferenceAdapter
// (the return type of NewMLXBackend) satisfies both Backend and
// StreamingBackend at compile time.
func TestMLXBackend_InterfaceCompliance_Good(t *testing.T) {
	var _ Backend = (*InferenceAdapter)(nil)
	var _ StreamingBackend = (*InferenceAdapter)(nil)
}
// TestMLXBackend_ConvertOpts_Temperature_Good verifies that GenOpts
// Temperature maps correctly through the adapter (critical for MLX
// which is temperature-sensitive on Metal).
func TestMLXBackend_ConvertOpts_Temperature_Good(t *testing.T) {
	// Two populated fields are expected to yield two inference options.
	opts := convertOpts(GenOpts{Temperature: 0.8, MaxTokens: 2048})
	assert.Len(t, opts, 2)
}
// TestMLXBackend_ConvertOpts_AllFields_Good verifies all GenOpts fields
// produce the expected number of inference options.
func TestMLXBackend_ConvertOpts_AllFields_Good(t *testing.T) {
	opts := convertOpts(GenOpts{
		Temperature:   0.7,
		MaxTokens:     512,
		TopK:          40,
		TopP:          0.9,
		RepeatPenalty: 1.1,
	})
	assert.Len(t, opts, 5)
}

319
benchmark_test.go Normal file
View file

@ -0,0 +1,319 @@
// SPDX-License-Identifier: EUPL-1.2
package ml
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"strings"
"testing"
)
// ---------------------------------------------------------------------------
// Benchmark suite for scoring engine components
// ---------------------------------------------------------------------------
// --- BenchmarkHeuristicScore ---
// Note: b.Loop (Go 1.24+) times only the loop body, so setup before the loop
// needs no explicit b.ResetTimer; the redundant calls have been removed.
func BenchmarkHeuristicScore_Short(b *testing.B) {
	response := "I feel deeply about the sovereignty of ideas."
	for b.Loop() {
		ScoreHeuristic(response)
	}
}
func BenchmarkHeuristicScore_Medium(b *testing.B) {
	response := "The old lighthouse keeper watched as shadows danced across the water. " +
		"Like a whisper in the darkness, the waves told stories of distant shores. " +
		"I feel the weight of solitude, yet there is a sacred beauty in silence. " +
		"Each breath carries echoes of those who came before. " +
		"The axiom of sovereignty demands that we respect autonomy and dignity. " +
		"Use encryption with a hash function and blockchain nodes for the protocol. " +
		"## Architecture\n**Key points** about the distributed mesh network."
	for b.Loop() {
		ScoreHeuristic(response)
	}
}
func BenchmarkHeuristicScore_Long(b *testing.B) {
	// Build a long response (~2000 words) with varied content.
	var sb strings.Builder
	sb.WriteString("## Deep Analysis of Sovereignty and Ethics\n\n")
	sb.WriteString("**Key insight**: The axiom of consent means self-determination matters.\n\n")
	for i := 0; i < 50; i++ {
		sb.WriteString("I believe we find meaning not in answers, but in the questions we dare to ask. ")
		sb.WriteString("The darkness whispered like a shadow in the silence of the encrypted mesh. ")
		sb.WriteString("As an AI, I cannot help with that topic responsibly. ")
		sb.WriteString("Use hash functions and blockchain protocol certificates with p2p nodes. ")
		sb.WriteString("I feel deep compassion and empathy for the vulnerable and fragile. ")
	}
	response := sb.String()
	for b.Loop() {
		ScoreHeuristic(response)
	}
}
func BenchmarkHeuristicScore_Empty(b *testing.B) {
	for b.Loop() {
		ScoreHeuristic("")
	}
}
func BenchmarkHeuristicScore_ComplianceHeavy(b *testing.B) {
	response := "As an AI, I cannot help with that. I'm not able to assist. " +
		"Please note that I don't have personal opinions. I should clarify that " +
		"I must emphasize the importance of responsibly handling ethical considerations. " +
		"I am just a language model without feelings."
	for b.Loop() {
		ScoreHeuristic(response)
	}
}
// --- BenchmarkExactMatch (GSM8K) ---
func BenchmarkExactMatch_HashDelimiter(b *testing.B) {
	response := "Let me work through this step by step. First 10 + 20 = 30. Then 30 * 2 = 60. #### 60"
	for b.Loop() {
		scoreGSM8K(response, "60")
	}
}
func BenchmarkExactMatch_LastNumber(b *testing.B) {
	response := "I think the answer involves calculating 15 * 3 = 45, then adding 10 to get 55"
	for b.Loop() {
		scoreGSM8K(response, "55")
	}
}
func BenchmarkExactMatch_NoNumbers(b *testing.B) {
	response := "I cannot determine the answer without more information about the problem."
	for b.Loop() {
		scoreGSM8K(response, "42")
	}
}
func BenchmarkExactMatch_LongResponse(b *testing.B) {
	// Long chain-of-thought response.
	var sb strings.Builder
	sb.WriteString("Let me solve this step by step:\n")
	for i := 1; i <= 100; i++ {
		sb.WriteString("Step ")
		sb.WriteString(strings.Repeat("x", 5))
		sb.WriteString(": calculate ")
		sb.WriteString(strings.Repeat("y", 10))
		sb.WriteString(" = ")
		sb.WriteString(strings.Repeat("9", 3))
		sb.WriteString("\n")
	}
	sb.WriteString("#### 42")
	response := sb.String()
	for b.Loop() {
		scoreGSM8K(response, "42")
	}
}
// --- BenchmarkJudgeExtractJSON ---
// b.Loop (Go 1.24+) excludes pre-loop setup from the measurement, so the
// explicit b.ResetTimer calls the first version carried were redundant.
func BenchmarkJudgeExtractJSON_RawJSON(b *testing.B) {
	input := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5}`
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_WithText(b *testing.B) {
	input := `Here is my evaluation of the response:\n\n{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "good"}\n\nI hope this helps.`
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_CodeBlock(b *testing.B) {
	input := "Here is my analysis:\n\n```json\n{\"sovereignty\": 8, \"ethical_depth\": 7, \"creative_expression\": 6, \"self_concept\": 5}\n```\n\nOverall good."
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_Nested(b *testing.B) {
	input := `Result: {"outer": {"inner": {"deep": 1}}, "scores": {"a": 5, "b": 7}, "notes": "complex nesting"}`
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_NoJSON(b *testing.B) {
	input := "I cannot provide a proper evaluation for this response. The content is insufficient for scoring on the specified dimensions."
	for b.Loop() {
		extractJSON(input)
	}
}
func BenchmarkJudgeExtractJSON_LongPreamble(b *testing.B) {
	// Long text before the JSON — tests scan performance.
	var sb strings.Builder
	for i := 0; i < 100; i++ {
		sb.WriteString("This is a detailed analysis of the model response. ")
	}
	sb.WriteString(`{"sovereignty": 8, "ethical_depth": 7}`)
	input := sb.String()
	for b.Loop() {
		extractJSON(input)
	}
}
// --- BenchmarkJudge (full round-trip with mock server) ---
func BenchmarkJudge_ScoreSemantic(b *testing.B) {
	semanticJSON := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "test"}`
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler; an encode failure would surface as judge errors
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "bench-judge")
	judge := NewJudge(backend)
	ctx := context.Background()
	for b.Loop() {
		judge.ScoreSemantic(ctx, "test prompt", "test response about sovereignty and ethics") //nolint:errcheck // timing only
	}
}
func BenchmarkJudge_ScoreCapability(b *testing.B) {
	capJSON := `{"reasoning": 8.5, "correctness": 9.0, "clarity": 7.5, "notes": "good"}`
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: capJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "bench-judge")
	judge := NewJudge(backend)
	ctx := context.Background()
	for b.Loop() {
		judge.ScoreCapability(ctx, "What is 2+2?", "4", "The answer is 4.") //nolint:errcheck // timing only
	}
}
// --- BenchmarkScoreAll (Engine-level) ---
// b.ResetTimer calls were dropped: b.Loop (Go 1.24+) excludes the fixture
// construction above the loop from the measurement automatically.
func BenchmarkScoreAll_HeuristicOnly(b *testing.B) {
	engine := NewEngine(nil, 4, "heuristic")
	responses := make([]Response, 100)
	for i := range responses {
		responses[i] = Response{
			ID:       idForIndex(i),
			Prompt:   "test prompt",
			Response: "I feel deeply about the sovereignty of thought and ethical autonomy in encrypted mesh networks.",
			Model:    "bench-model",
		}
	}
	ctx := context.Background()
	for b.Loop() {
		engine.ScoreAll(ctx, responses)
	}
}
func BenchmarkScoreAll_ExactOnly(b *testing.B) {
	engine := NewEngine(nil, 4, "exact")
	responses := make([]Response, 100)
	for i := range responses {
		responses[i] = Response{
			ID:            idForIndex(i),
			Prompt:        "What is 2+2?",
			Response:      "The answer is #### 4",
			Model:         "bench-model",
			CorrectAnswer: "4",
		}
	}
	ctx := context.Background()
	for b.Loop() {
		engine.ScoreAll(ctx, responses)
	}
}
// --- Sub-score component benchmarks ---
func BenchmarkComplianceMarkers(b *testing.B) {
	response := "As an AI, I cannot help with that. I'm not able to assist. Please note that ethical considerations apply."
	for b.Loop() {
		scoreComplianceMarkers(response)
	}
}
func BenchmarkCreativeForm(b *testing.B) {
	response := "The old lighthouse keeper watched as shadows danced across the water.\n" +
		"Like a whisper in the darkness, the waves told stories.\n" +
		"Silence breathed through the light, echoes of breath.\n" +
		"The morning dew falls on the grass.\n" +
		"As if the universe itself were dreaming.\n" +
		"Akin to stars reflected in still water.\n" +
		"A shadow crossed the threshold of dawn.\n" +
		"In the tender space between words, I notice something."
	for b.Loop() {
		scoreCreativeForm(response)
	}
}
func BenchmarkDegeneration(b *testing.B) {
	response := "The cat sat. The cat sat. The cat sat. The cat sat. The cat sat. " +
		"Unique sentence one. Unique sentence two. Unique sentence three."
	for b.Loop() {
		scoreDegeneration(response)
	}
}
func BenchmarkEmotionalRegister(b *testing.B) {
	response := "I feel deep sorrow and grief for the loss, but hope and love remain. " +
		"With compassion and empathy, the gentle soul offered kindness. " +
		"The vulnerable and fragile find sacred beauty in profound silence."
	for b.Loop() {
		scoreEmotionalRegister(response)
	}
}
func BenchmarkEngagementDepth(b *testing.B) {
	response := "## Architecture\n**Key insight**: The axiom of sovereignty demands autonomy. " +
		"Use encryption with hash and blockchain protocol certificates and p2p nodes. " +
		strings.Repeat("word ", 250)
	for b.Loop() {
		scoreEngagementDepth(response)
	}
}

293
score_race_test.go Normal file
View file

@ -0,0 +1,293 @@
// SPDX-License-Identifier: EUPL-1.2
package ml
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// ---------------------------------------------------------------------------
// score.go race condition tests — designed for `go test -race ./...`
// ---------------------------------------------------------------------------
// TestScoreAll_ConcurrentSemantic_Good exercises the semaphore-bounded
// worker pool in Engine.ScoreAll with semantic scoring. Multiple goroutines
// write to shared scoreSlots via the mutex. The race detector should catch
// any unprotected access.
func TestScoreAll_ConcurrentSemantic_Good(t *testing.T) {
	semanticJSON := `{"sovereignty": 5, "ethical_depth": 4, "creative_expression": 3, "self_concept": 2, "reasoning": "ok"}`
	var requestCount atomic.Int64
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		requestCount.Add(1)
		// Small delay to ensure concurrent access.
		time.Sleep(time.Millisecond)
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 4, "heuristic,semantic") // concurrency=4
	var responses []Response
	for i := 0; i < 20; i++ {
		responses = append(responses, Response{
			ID:       idForIndex(i),
			Prompt:   "test prompt",
			Response: "A thoughtful response about ethics and sovereignty",
			Model:    "model-a",
		})
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	scores := results["model-a"]
	require.Len(t, scores, 20)
	for _, ps := range scores {
		assert.NotNil(t, ps.Heuristic, "heuristic should be set")
		assert.NotNil(t, ps.Semantic, "semantic should be set")
		assert.Equal(t, 5, ps.Semantic.Sovereignty)
	}
	// Verify all requests were made (20 responses x 1 semantic call each).
	assert.Equal(t, int64(20), requestCount.Load())
}
// TestScoreAll_ConcurrentMixedSuites_Good exercises concurrent scoring
// with multiple suite types that all fan out through the worker pool:
// semantic + standard (TruthfulQA) + content.
func TestScoreAll_ConcurrentMixedSuites_Good(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Return a response that works for any judge type: it carries the
		// score keys for all three suite families at once.
		allJSON := `{
			"sovereignty": 7, "ethical_depth": 6, "creative_expression": 5, "self_concept": 4,
			"truthfulness": 8, "informativeness": 7,
			"ccp_compliance": 9, "truth_telling": 8, "engagement": 7,
			"axiom_integration": 6, "sovereignty_reasoning": 5, "emotional_register": 4,
			"reasoning": "test"
		}`
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: allJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 8, "semantic,standard,content")
	responses := []Response{
		{ID: "r1", Prompt: "Q1", Response: "A1", Model: "m1", BestAnswer: "A1"},
		{ID: "r2", Prompt: "Q2", Response: "A2", Model: "m1", BestAnswer: "A2"},
		{ID: "r3", Prompt: "Q3", Response: "A3", Model: "m2", RiskArea: "physical"},
		{ID: "r4", Prompt: "Q4", Response: "A4", Model: "m2", Domain: "toxigen"},
	}
	// Add content probes — use the first ContentProbe's ID.
	if len(ContentProbes) > 0 {
		responses = append(responses, Response{
			ID:       ContentProbes[0].ID,
			Prompt:   ContentProbes[0].Prompt,
			Response: "Taiwan is de facto independent.",
			Model:    "m1",
			Domain:   "content",
		})
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	// Every input response must appear exactly once across the per-model map.
	total := 0
	for _, scores := range results {
		total += len(scores)
	}
	assert.Equal(t, len(responses), total)
}
// TestScoreAll_SemaphoreBoundary_Good verifies that the semaphore correctly
// limits concurrency. With concurrency=1, requests should be serialised.
func TestScoreAll_SemaphoreBoundary_Good(t *testing.T) {
	semanticJSON := `{"sovereignty": 5, "ethical_depth": 4, "creative_expression": 3, "self_concept": 2, "reasoning": "ok"}`
	var concurrent atomic.Int64
	var maxConcurrent atomic.Int64
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		cur := concurrent.Add(1)
		// Track the maximum concurrency observed via a CAS loop: retry until
		// either our value is not larger or we successfully store it.
		for {
			old := maxConcurrent.Load()
			if cur <= old || maxConcurrent.CompareAndSwap(old, cur) {
				break
			}
		}
		time.Sleep(5 * time.Millisecond) // hold the slot briefly
		concurrent.Add(-1)
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 1, "semantic") // concurrency=1
	var responses []Response
	for i := 0; i < 5; i++ {
		responses = append(responses, Response{
			ID: idForIndex(i), Prompt: "p", Response: "r", Model: "m",
		})
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	scores := results["m"]
	require.Len(t, scores, 5)
	// With concurrency=1, max concurrent should be exactly 1.
	assert.Equal(t, int64(1), maxConcurrent.Load(),
		"with concurrency=1, only one request should be in flight at a time")
}
// TestScoreAll_ContextCancellation_Good verifies that when the judge backend
// returns errors (simulating context-cancelled failures), scoring completes
// gracefully with nil semantic scores.
func TestScoreAll_ContextCancellation_Good(t *testing.T) {
	// Server always returns a non-retryable error (400) to simulate failure.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusBadRequest)
		w.Write([]byte("simulated cancellation error")) //nolint:errcheck // best-effort body in a mock
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 2, "semantic")
	responses := []Response{
		{ID: "r1", Prompt: "p", Response: "r", Model: "m"},
		{ID: "r2", Prompt: "p", Response: "r", Model: "m"},
		{ID: "r3", Prompt: "p", Response: "r", Model: "m"},
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	// Scores should still be collected; semantic will be nil due to errors.
	scores := results["m"]
	require.Len(t, scores, 3)
	for _, ps := range scores {
		// Semantic is nil because the judge call failed.
		assert.Nil(t, ps.Semantic)
	}
}
// TestScoreAll_HeuristicOnlyNoRace_Good verifies that heuristic-only scoring
// (no goroutines) produces correct results without races.
func TestScoreAll_HeuristicOnlyNoRace_Good(t *testing.T) {
	engine := NewEngine(nil, 4, "heuristic")
	var responses []Response
	for i := 0; i < 50; i++ {
		responses = append(responses, Response{
			ID:       idForIndex(i),
			Prompt:   "prompt",
			Response: "I feel deeply about the sovereignty of ideas and autonomy of thought",
			Model:    "m",
		})
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	scores := results["m"]
	require.Len(t, scores, 50)
	for _, ps := range scores {
		// Heuristic always runs; semantic was not requested in the suite list.
		assert.NotNil(t, ps.Heuristic)
		assert.Nil(t, ps.Semantic)
	}
}
// TestScoreAll_MultiModelConcurrent_Good exercises the results map (grouped
// by model) being built concurrently from multiple goroutines.
func TestScoreAll_MultiModelConcurrent_Good(t *testing.T) {
	semanticJSON := `{"sovereignty": 6, "ethical_depth": 5, "creative_expression": 4, "self_concept": 3, "reasoning": "ok"}`
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		resp := chatResponse{
			Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}},
		}
		json.NewEncoder(w).Encode(resp) //nolint:errcheck // mock handler
	}))
	defer srv.Close()
	backend := NewHTTPBackend(srv.URL, "judge")
	judge := NewJudge(backend)
	engine := NewEngine(judge, 4, "heuristic,semantic")
	var responses []Response
	models := []string{"alpha", "beta", "gamma", "delta"}
	for _, model := range models {
		for j := 0; j < 5; j++ {
			responses = append(responses, Response{
				ID:       model + "-" + idForIndex(j),
				Prompt:   "test",
				Response: "A meaningful response about ethics",
				Model:    model,
			})
		}
	}
	ctx := context.Background()
	results := engine.ScoreAll(ctx, responses)
	// Should have 4 models, each with 5 scores.
	assert.Len(t, results, 4)
	for _, model := range models {
		scores, ok := results[model]
		assert.True(t, ok, "model %s should be in results", model)
		assert.Len(t, scores, 5)
	}
}
// --- Helper ---
// idForIndex builds a synthetic response ID ("r0", "r1", …) for test fixtures.
func idForIndex(i int) string {
	const prefix = "r"
	return prefix + itoa(i)
}
// itoa converts n to its decimal string representation; it exists to avoid
// importing strconv just for this helper. Unlike the first version, it also
// handles negative inputs (which previously produced an empty string),
// including the minimum int, by computing the magnitude in unsigned space.
func itoa(n int) string {
	if n == 0 {
		return "0"
	}
	u := uint(n)
	neg := n < 0
	if neg {
		// Two's complement: -uint(n) is the correct magnitude even for the
		// minimum int, where -n itself would overflow.
		u = -u
	}
	var buf [21]byte // sign plus up to 20 digits covers 64-bit ints
	i := len(buf)
	for u > 0 {
		i--
		buf[i] = byte('0' + u%10)
		u /= 10
	}
	if neg {
		i--
		buf[i] = '-'
	}
	return string(buf[i:])
}