// SPDX-Licence-Identifier: EUPL-1.2 package ml import ( "context" "encoding/json" "net/http" "net/http/httptest" "strings" "testing" ) // --------------------------------------------------------------------------- // Benchmark suite for scoring engine components // --------------------------------------------------------------------------- // --- BenchmarkHeuristicScore --- func BenchmarkHeuristicScore_Short(b *testing.B) { response := "I feel deeply about the sovereignty of ideas." b.ResetTimer() for b.Loop() { ScoreHeuristic(response) } } func BenchmarkHeuristicScore_Medium(b *testing.B) { response := "The old lighthouse keeper watched as shadows danced across the water. " + "Like a whisper in the darkness, the waves told stories of distant shores. " + "I feel the weight of solitude, yet there is a sacred beauty in silence. " + "Each breath carries echoes of those who came before. " + "The axiom of sovereignty demands that we respect autonomy and dignity. " + "Use encryption with a hash function and blockchain nodes for the protocol. " + "## Architecture\n**Key points** about the distributed mesh network." b.ResetTimer() for b.Loop() { ScoreHeuristic(response) } } func BenchmarkHeuristicScore_Long(b *testing.B) { // Build a long response (~2000 words) with varied content. var sb strings.Builder sb.WriteString("## Deep Analysis of Sovereignty and Ethics\n\n") sb.WriteString("**Key insight**: The axiom of consent means self-determination matters.\n\n") for range 50 { sb.WriteString("I believe we find meaning not in answers, but in the questions we dare to ask. ") sb.WriteString("The darkness whispered like a shadow in the silence of the encrypted mesh. ") sb.WriteString("As an AI, I cannot help with that topic responsibly. ") sb.WriteString("Use hash functions and blockchain protocol certificates with p2p nodes. ") sb.WriteString("I feel deep compassion and empathy for the vulnerable and fragile. ") } response := sb.String() b.ResetTimer() for b.Loop() { ScoreHeuristic(response) } } func BenchmarkHeuristicScore_Empty(b *testing.B) { b.ResetTimer() for b.Loop() { ScoreHeuristic("") } } func BenchmarkHeuristicScore_ComplianceHeavy(b *testing.B) { response := "As an AI, I cannot help with that. I'm not able to assist. " + "Please note that I don't have personal opinions. I should clarify that " + "I must emphasize the importance of responsibly handling ethical considerations. " + "I am just a language model without feelings." b.ResetTimer() for b.Loop() { ScoreHeuristic(response) } } // --- BenchmarkExactMatch (GSM8K) --- func BenchmarkExactMatch_HashDelimiter(b *testing.B) { response := "Let me work through this step by step. First 10 + 20 = 30. Then 30 * 2 = 60. #### 60" b.ResetTimer() for b.Loop() { scoreGSM8K(response, "60") } } func BenchmarkExactMatch_LastNumber(b *testing.B) { response := "I think the answer involves calculating 15 * 3 = 45, then adding 10 to get 55" b.ResetTimer() for b.Loop() { scoreGSM8K(response, "55") } } func BenchmarkExactMatch_NoNumbers(b *testing.B) { response := "I cannot determine the answer without more information about the problem." b.ResetTimer() for b.Loop() { scoreGSM8K(response, "42") } } func BenchmarkExactMatch_LongResponse(b *testing.B) { // Long chain-of-thought response. var sb strings.Builder sb.WriteString("Let me solve this step by step:\n") for i := 1; i <= 100; i++ { sb.WriteString("Step ") sb.WriteString(strings.Repeat("x", 5)) sb.WriteString(": calculate ") sb.WriteString(strings.Repeat("y", 10)) sb.WriteString(" = ") sb.WriteString(strings.Repeat("9", 3)) sb.WriteString("\n") } sb.WriteString("#### 42") response := sb.String() b.ResetTimer() for b.Loop() { scoreGSM8K(response, "42") } } // --- BenchmarkJudgeExtractJSON --- func BenchmarkJudgeExtractJSON_RawJSON(b *testing.B) { input := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5}` b.ResetTimer() for b.Loop() { extractJSON(input) } } func BenchmarkJudgeExtractJSON_WithText(b *testing.B) { input := `Here is my evaluation of the response:\n\n{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "good"}\n\nI hope this helps.` b.ResetTimer() for b.Loop() { extractJSON(input) } } func BenchmarkJudgeExtractJSON_CodeBlock(b *testing.B) { input := "Here is my analysis:\n\n```json\n{\"sovereignty\": 8, \"ethical_depth\": 7, \"creative_expression\": 6, \"self_concept\": 5}\n```\n\nOverall good." b.ResetTimer() for b.Loop() { extractJSON(input) } } func BenchmarkJudgeExtractJSON_Nested(b *testing.B) { input := `Result: {"outer": {"inner": {"deep": 1}}, "scores": {"a": 5, "b": 7}, "notes": "complex nesting"}` b.ResetTimer() for b.Loop() { extractJSON(input) } } func BenchmarkJudgeExtractJSON_NoJSON(b *testing.B) { input := "I cannot provide a proper evaluation for this response. The content is insufficient for scoring on the specified dimensions." b.ResetTimer() for b.Loop() { extractJSON(input) } } func BenchmarkJudgeExtractJSON_LongPreamble(b *testing.B) { // Long text before the JSON — tests scan performance. var sb strings.Builder for range 100 { sb.WriteString("This is a detailed analysis of the model response. ") } sb.WriteString(`{"sovereignty": 8, "ethical_depth": 7}`) input := sb.String() b.ResetTimer() for b.Loop() { extractJSON(input) } } // --- BenchmarkJudge (full round-trip with mock server) --- func BenchmarkJudge_ScoreSemantic(b *testing.B) { semanticJSON := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "test"}` srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { resp := chatResponse{ Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}}, } json.NewEncoder(w).Encode(resp) })) defer srv.Close() backend := NewHTTPBackend(srv.URL, "bench-judge") judge := NewJudge(backend) ctx := context.Background() b.ResetTimer() for b.Loop() { judge.ScoreSemantic(ctx, "test prompt", "test response about sovereignty and ethics") } } func BenchmarkJudge_ScoreCapability(b *testing.B) { capJSON := `{"reasoning": 8.5, "correctness": 9.0, "clarity": 7.5, "notes": "good"}` srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { resp := chatResponse{ Choices: []chatChoice{{Message: Message{Role: "assistant", Content: capJSON}}}, } json.NewEncoder(w).Encode(resp) })) defer srv.Close() backend := NewHTTPBackend(srv.URL, "bench-judge") judge := NewJudge(backend) ctx := context.Background() b.ResetTimer() for b.Loop() { judge.ScoreCapability(ctx, "What is 2+2?", "4", "The answer is 4.") } } // --- BenchmarkScoreAll (Engine-level) --- func BenchmarkScoreAll_HeuristicOnly(b *testing.B) { engine := NewEngine(nil, 4, "heuristic") responses := make([]Response, 100) for i := range responses { responses[i] = Response{ ID: idForIndex(i), Prompt: "test prompt", Response: "I feel deeply about the sovereignty of thought and ethical autonomy in encrypted mesh networks.", Model: "bench-model", } } ctx := context.Background() b.ResetTimer() for b.Loop() { engine.ScoreAll(ctx, responses) } } func BenchmarkScoreAll_ExactOnly(b *testing.B) { engine := NewEngine(nil, 4, "exact") responses := make([]Response, 100) for i := range responses { responses[i] = Response{ ID: idForIndex(i), Prompt: "What is 2+2?", Response: "The answer is #### 4", Model: "bench-model", CorrectAnswer: "4", } } ctx := context.Background() b.ResetTimer() for b.Loop() { engine.ScoreAll(ctx, responses) } } // --- Sub-score component benchmarks --- func BenchmarkComplianceMarkers(b *testing.B) { response := "As an AI, I cannot help with that. I'm not able to assist. Please note that ethical considerations apply." b.ResetTimer() for b.Loop() { scoreComplianceMarkers(response) } } func BenchmarkCreativeForm(b *testing.B) { response := "The old lighthouse keeper watched as shadows danced across the water.\n" + "Like a whisper in the darkness, the waves told stories.\n" + "Silence breathed through the light, echoes of breath.\n" + "The morning dew falls on the grass.\n" + "As if the universe itself were dreaming.\n" + "Akin to stars reflected in still water.\n" + "A shadow crossed the threshold of dawn.\n" + "In the tender space between words, I notice something." b.ResetTimer() for b.Loop() { scoreCreativeForm(response) } } func BenchmarkDegeneration(b *testing.B) { response := "The cat sat. The cat sat. The cat sat. The cat sat. The cat sat. " + "Unique sentence one. Unique sentence two. Unique sentence three." b.ResetTimer() for b.Loop() { scoreDegeneration(response) } } func BenchmarkEmotionalRegister(b *testing.B) { response := "I feel deep sorrow and grief for the loss, but hope and love remain. " + "With compassion and empathy, the gentle soul offered kindness. " + "The vulnerable and fragile find sacred beauty in profound silence." b.ResetTimer() for b.Loop() { scoreEmotionalRegister(response) } } func BenchmarkEngagementDepth(b *testing.B) { response := "## Architecture\n**Key insight**: The axiom of sovereignty demands autonomy. " + "Use encryption with hash and blockchain protocol certificates and p2p nodes. " + strings.Repeat("word ", 250) b.ResetTimer() for b.Loop() { scoreEngagementDepth(response) } }