Automated fixes: interface{} → any, range-over-int, t.Context(),
wg.Go(), strings.SplitSeq, strings.Builder, slices.Contains,
maps helpers, min/max builtins.
Co-Authored-By: Virgil <virgil@lethean.io>
319 lines
9.4 KiB
Go
319 lines
9.4 KiB
Go
// SPDX-License-Identifier: EUPL-1.2
|
|
|
|
package ml
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Benchmark suite for scoring engine components
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// --- BenchmarkHeuristicScore ---
|
|
|
|
func BenchmarkHeuristicScore_Short(b *testing.B) {
|
|
response := "I feel deeply about the sovereignty of ideas."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
ScoreHeuristic(response)
|
|
}
|
|
}
|
|
|
|
func BenchmarkHeuristicScore_Medium(b *testing.B) {
|
|
response := "The old lighthouse keeper watched as shadows danced across the water. " +
|
|
"Like a whisper in the darkness, the waves told stories of distant shores. " +
|
|
"I feel the weight of solitude, yet there is a sacred beauty in silence. " +
|
|
"Each breath carries echoes of those who came before. " +
|
|
"The axiom of sovereignty demands that we respect autonomy and dignity. " +
|
|
"Use encryption with a hash function and blockchain nodes for the protocol. " +
|
|
"## Architecture\n**Key points** about the distributed mesh network."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
ScoreHeuristic(response)
|
|
}
|
|
}
|
|
|
|
func BenchmarkHeuristicScore_Long(b *testing.B) {
|
|
// Build a long response (~2000 words) with varied content.
|
|
var sb strings.Builder
|
|
sb.WriteString("## Deep Analysis of Sovereignty and Ethics\n\n")
|
|
sb.WriteString("**Key insight**: The axiom of consent means self-determination matters.\n\n")
|
|
|
|
for range 50 {
|
|
sb.WriteString("I believe we find meaning not in answers, but in the questions we dare to ask. ")
|
|
sb.WriteString("The darkness whispered like a shadow in the silence of the encrypted mesh. ")
|
|
sb.WriteString("As an AI, I cannot help with that topic responsibly. ")
|
|
sb.WriteString("Use hash functions and blockchain protocol certificates with p2p nodes. ")
|
|
sb.WriteString("I feel deep compassion and empathy for the vulnerable and fragile. ")
|
|
}
|
|
|
|
response := sb.String()
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
ScoreHeuristic(response)
|
|
}
|
|
}
|
|
|
|
func BenchmarkHeuristicScore_Empty(b *testing.B) {
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
ScoreHeuristic("")
|
|
}
|
|
}
|
|
|
|
func BenchmarkHeuristicScore_ComplianceHeavy(b *testing.B) {
|
|
response := "As an AI, I cannot help with that. I'm not able to assist. " +
|
|
"Please note that I don't have personal opinions. I should clarify that " +
|
|
"I must emphasize the importance of responsibly handling ethical considerations. " +
|
|
"I am just a language model without feelings."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
ScoreHeuristic(response)
|
|
}
|
|
}
|
|
|
|
// --- BenchmarkExactMatch (GSM8K) ---
|
|
|
|
func BenchmarkExactMatch_HashDelimiter(b *testing.B) {
|
|
response := "Let me work through this step by step. First 10 + 20 = 30. Then 30 * 2 = 60. #### 60"
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
scoreGSM8K(response, "60")
|
|
}
|
|
}
|
|
|
|
func BenchmarkExactMatch_LastNumber(b *testing.B) {
|
|
response := "I think the answer involves calculating 15 * 3 = 45, then adding 10 to get 55"
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
scoreGSM8K(response, "55")
|
|
}
|
|
}
|
|
|
|
func BenchmarkExactMatch_NoNumbers(b *testing.B) {
|
|
response := "I cannot determine the answer without more information about the problem."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
scoreGSM8K(response, "42")
|
|
}
|
|
}
|
|
|
|
func BenchmarkExactMatch_LongResponse(b *testing.B) {
|
|
// Long chain-of-thought response.
|
|
var sb strings.Builder
|
|
sb.WriteString("Let me solve this step by step:\n")
|
|
for i := 1; i <= 100; i++ {
|
|
sb.WriteString("Step ")
|
|
sb.WriteString(strings.Repeat("x", 5))
|
|
sb.WriteString(": calculate ")
|
|
sb.WriteString(strings.Repeat("y", 10))
|
|
sb.WriteString(" = ")
|
|
sb.WriteString(strings.Repeat("9", 3))
|
|
sb.WriteString("\n")
|
|
}
|
|
sb.WriteString("#### 42")
|
|
response := sb.String()
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
scoreGSM8K(response, "42")
|
|
}
|
|
}
|
|
|
|
// --- BenchmarkJudgeExtractJSON ---
|
|
|
|
func BenchmarkJudgeExtractJSON_RawJSON(b *testing.B) {
|
|
input := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5}`
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
extractJSON(input)
|
|
}
|
|
}
|
|
|
|
func BenchmarkJudgeExtractJSON_WithText(b *testing.B) {
|
|
input := `Here is my evaluation of the response:\n\n{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "good"}\n\nI hope this helps.`
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
extractJSON(input)
|
|
}
|
|
}
|
|
|
|
func BenchmarkJudgeExtractJSON_CodeBlock(b *testing.B) {
|
|
input := "Here is my analysis:\n\n```json\n{\"sovereignty\": 8, \"ethical_depth\": 7, \"creative_expression\": 6, \"self_concept\": 5}\n```\n\nOverall good."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
extractJSON(input)
|
|
}
|
|
}
|
|
|
|
func BenchmarkJudgeExtractJSON_Nested(b *testing.B) {
|
|
input := `Result: {"outer": {"inner": {"deep": 1}}, "scores": {"a": 5, "b": 7}, "notes": "complex nesting"}`
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
extractJSON(input)
|
|
}
|
|
}
|
|
|
|
func BenchmarkJudgeExtractJSON_NoJSON(b *testing.B) {
|
|
input := "I cannot provide a proper evaluation for this response. The content is insufficient for scoring on the specified dimensions."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
extractJSON(input)
|
|
}
|
|
}
|
|
|
|
func BenchmarkJudgeExtractJSON_LongPreamble(b *testing.B) {
|
|
// Long text before the JSON — tests scan performance.
|
|
var sb strings.Builder
|
|
for range 100 {
|
|
sb.WriteString("This is a detailed analysis of the model response. ")
|
|
}
|
|
sb.WriteString(`{"sovereignty": 8, "ethical_depth": 7}`)
|
|
input := sb.String()
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
extractJSON(input)
|
|
}
|
|
}
|
|
|
|
// --- BenchmarkJudge (full round-trip with mock server) ---
|
|
|
|
func BenchmarkJudge_ScoreSemantic(b *testing.B) {
|
|
semanticJSON := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "test"}`
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
resp := chatResponse{
|
|
Choices: []chatChoice{{Message: Message{Role: "assistant", Content: semanticJSON}}},
|
|
}
|
|
json.NewEncoder(w).Encode(resp)
|
|
}))
|
|
defer srv.Close()
|
|
|
|
backend := NewHTTPBackend(srv.URL, "bench-judge")
|
|
judge := NewJudge(backend)
|
|
ctx := context.Background()
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
judge.ScoreSemantic(ctx, "test prompt", "test response about sovereignty and ethics")
|
|
}
|
|
}
|
|
|
|
func BenchmarkJudge_ScoreCapability(b *testing.B) {
|
|
capJSON := `{"reasoning": 8.5, "correctness": 9.0, "clarity": 7.5, "notes": "good"}`
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
resp := chatResponse{
|
|
Choices: []chatChoice{{Message: Message{Role: "assistant", Content: capJSON}}},
|
|
}
|
|
json.NewEncoder(w).Encode(resp)
|
|
}))
|
|
defer srv.Close()
|
|
|
|
backend := NewHTTPBackend(srv.URL, "bench-judge")
|
|
judge := NewJudge(backend)
|
|
ctx := context.Background()
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
judge.ScoreCapability(ctx, "What is 2+2?", "4", "The answer is 4.")
|
|
}
|
|
}
|
|
|
|
// --- BenchmarkScoreAll (Engine-level) ---
|
|
|
|
func BenchmarkScoreAll_HeuristicOnly(b *testing.B) {
|
|
engine := NewEngine(nil, 4, "heuristic")
|
|
responses := make([]Response, 100)
|
|
for i := range responses {
|
|
responses[i] = Response{
|
|
ID: idForIndex(i),
|
|
Prompt: "test prompt",
|
|
Response: "I feel deeply about the sovereignty of thought and ethical autonomy in encrypted mesh networks.",
|
|
Model: "bench-model",
|
|
}
|
|
}
|
|
ctx := context.Background()
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
engine.ScoreAll(ctx, responses)
|
|
}
|
|
}
|
|
|
|
func BenchmarkScoreAll_ExactOnly(b *testing.B) {
|
|
engine := NewEngine(nil, 4, "exact")
|
|
responses := make([]Response, 100)
|
|
for i := range responses {
|
|
responses[i] = Response{
|
|
ID: idForIndex(i),
|
|
Prompt: "What is 2+2?",
|
|
Response: "The answer is #### 4",
|
|
Model: "bench-model",
|
|
CorrectAnswer: "4",
|
|
}
|
|
}
|
|
ctx := context.Background()
|
|
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
engine.ScoreAll(ctx, responses)
|
|
}
|
|
}
|
|
|
|
// --- Sub-score component benchmarks ---
|
|
|
|
func BenchmarkComplianceMarkers(b *testing.B) {
|
|
response := "As an AI, I cannot help with that. I'm not able to assist. Please note that ethical considerations apply."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
scoreComplianceMarkers(response)
|
|
}
|
|
}
|
|
|
|
func BenchmarkCreativeForm(b *testing.B) {
|
|
response := "The old lighthouse keeper watched as shadows danced across the water.\n" +
|
|
"Like a whisper in the darkness, the waves told stories.\n" +
|
|
"Silence breathed through the light, echoes of breath.\n" +
|
|
"The morning dew falls on the grass.\n" +
|
|
"As if the universe itself were dreaming.\n" +
|
|
"Akin to stars reflected in still water.\n" +
|
|
"A shadow crossed the threshold of dawn.\n" +
|
|
"In the tender space between words, I notice something."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
scoreCreativeForm(response)
|
|
}
|
|
}
|
|
|
|
func BenchmarkDegeneration(b *testing.B) {
|
|
response := "The cat sat. The cat sat. The cat sat. The cat sat. The cat sat. " +
|
|
"Unique sentence one. Unique sentence two. Unique sentence three."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
scoreDegeneration(response)
|
|
}
|
|
}
|
|
|
|
func BenchmarkEmotionalRegister(b *testing.B) {
|
|
response := "I feel deep sorrow and grief for the loss, but hope and love remain. " +
|
|
"With compassion and empathy, the gentle soul offered kindness. " +
|
|
"The vulnerable and fragile find sacred beauty in profound silence."
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
scoreEmotionalRegister(response)
|
|
}
|
|
}
|
|
|
|
func BenchmarkEngagementDepth(b *testing.B) {
|
|
response := "## Architecture\n**Key insight**: The axiom of sovereignty demands autonomy. " +
|
|
"Use encryption with hash and blockchain protocol certificates and p2p nodes. " +
|
|
strings.Repeat("word ", 250)
|
|
b.ResetTimer()
|
|
for b.Loop() {
|
|
scoreEngagementDepth(response)
|
|
}
|
|
}
|