- ai/metrics.go: ev → event in Summary(), e → entry in sortedMap() - cmd/metrics/cmd_test.go: tc → testCase, add TestCmd_ParseDuration_Ugly covering boundary inputs (leading whitespace, trailing whitespace, float value, wrong case unit, very large count, alpha-only numeric) - cmd/security/cmd_jobs.go: f → finding in buildJobIssueBody loops - cmd/embed-bench/main.go: m → registeredModel, v → value, q → queryCase, r → rankedResult across modelAvailable(), avg(), and main() query loops Co-Authored-By: Virgil <virgil@lethean.io>
345 lines
10 KiB
Go
// SPDX-License-Identifier: EUPL-1.2
|
|
|
|
// embed-bench compares embedding models for OpenBrain by testing how well
|
|
// they separate semantically related vs unrelated agent memory pairs.
|
|
//
|
|
// Usage:
|
|
//
|
|
// go run ./cmd/embed-bench
|
|
// go run ./cmd/embed-bench -ollama http://localhost:11434
|
|
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"crypto/tls"
|
|
"encoding/json"
|
|
"flag"
|
|
"fmt"
|
|
"math"
|
|
"net/http"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
coreerr "dappco.re/go/core/log"
|
|
)
|
|
|
|
// ollamaURL is the base URL of the Ollama HTTP API; override with -ollama.
var ollamaURL = flag.String("ollama", "http://localhost:11434", "Ollama base URL")
|
|
|
|
// models lists the embedding model names to benchmark. A model missing
// from the local Ollama tag registry is skipped at run time.
var models = []string{
	"nomic-embed-text",
	"embeddinggemma",
}
|
|
|
|
// Test corpus: real-ish agent memories grouped by topic.
// Memories within a group should be similar; across groups should be distant.
// main flattens this into parallel (memory, topic) slices before embedding.
var memoryGroups = []struct {
	topic    string   // group label queries are scored against
	memories []string // texts embedded and compared pairwise
}{
	{
		topic: "scoring-calibration",
		memories: []string{
			"LEM emotional_register was blind to negative emotions. Fixed by adding 8 weighted pattern groups covering sadness, anger, fear, disgust, and frustration.",
			"The EaaS scoring service had a verdict cross-check bug where the LEK composite and individual heuristic scores could disagree on the final verdict classification.",
			"Scoring calibration: emotional_register vocabulary expanded from 12 positive-only patterns to 20 patterns covering both positive and negative emotional markers.",
		},
	},
	{
		topic: "openbrain-architecture",
		memories: []string{
			"OpenBrain uses MariaDB for relational metadata and Qdrant for vector embeddings. Four MCP tools in php-agentic. Go bridge in go-ai for CLI agents.",
			"Brain memories have a supersession chain — newer memories can supersede older ones, creating version history. The getSupersessionDepth method walks the chain capped at 50.",
			"The brain_recall tool embeds the query via Ollama, searches Qdrant with workspace-scoped filters, then hydrates results from MariaDB with active() and latestVersions() scopes.",
		},
	},
	{
		topic: "deployment-infrastructure",
		memories: []string{
			"Production fleet: noc (Helsinki HCloud) + de1 (Falkenstein HRobot). Port 22 runs Endlessh. Real SSH on 4819. All operations through Ansible.",
			"Traefik handles reverse proxy on de1. Services exposed on ports 8000-8090. Dragonfly on 6379, Galera on 3306, PG on 5432.",
			"Forgejo runner on noc with DinD isolation. Labels: ubuntu-latest + docker. CI deploys to BunnyCDN on push to main.",
		},
	},
	{
		topic: "lem-training",
		memories: []string{
			"LEM training uses sandwich format: system prompt wraps around user/assistant turns. Curriculum has 5 phases from foundation to specialisation.",
			"MLX-LM fine-tuning on Apple Silicon. LoRA adapters for efficient training. Qwen3-8B as base model for chat inference in LEM Lab.",
			"LEM Lab is a native Mac app using Core Go framework with Wails v3. Chat UI is vanilla Web Components, 22KB, zero dependencies.",
		},
	},
}
|
|
|
|
// Queries to test recall quality — each has a target topic it should match best.
// Top-1/top-3 accuracy in main is computed against targetTopic.
var queries = []struct {
	query       string // natural-language recall query
	targetTopic string // expected memoryGroups topic of the best match
}{
	{"How does the emotional scoring work?", "scoring-calibration"},
	{"What database does the brain use?", "openbrain-architecture"},
	{"How do I deploy to production?", "deployment-infrastructure"},
	{"How is LEM trained?", "lem-training"},
	{"What is the supersession chain?", "openbrain-architecture"},
	{"Where is the Forgejo runner?", "deployment-infrastructure"},
	{"What patterns detect sycophancy?", "scoring-calibration"},
	{"What framework does the chat UI use?", "lem-training"},
}
|
|
|
|
func main() {
|
|
flag.Parse()
|
|
|
|
fmt.Println("OpenBrain Embedding Model Benchmark")
|
|
fmt.Println(strings.Repeat("=", 60))
|
|
|
|
for _, model := range models {
|
|
fmt.Printf("\n## Model: %s\n", model)
|
|
fmt.Println(strings.Repeat("-", 40))
|
|
|
|
// Check model is available
|
|
if !modelAvailable(model) {
|
|
fmt.Printf(" SKIPPED — model not pulled (run: ollama pull %s)\n", model)
|
|
continue
|
|
}
|
|
|
|
// 1. Embed all memories
|
|
allMemories := []string{}
|
|
allTopics := []string{}
|
|
for _, group := range memoryGroups {
|
|
for _, mem := range group.memories {
|
|
allMemories = append(allMemories, mem)
|
|
allTopics = append(allTopics, group.topic)
|
|
}
|
|
}
|
|
|
|
fmt.Printf(" Embedding %d memories...\n", len(allMemories))
|
|
start := time.Now()
|
|
memVectors := make([][]float64, len(allMemories))
|
|
for i, mem := range allMemories {
|
|
vec, err := embed(model, mem)
|
|
if err != nil {
|
|
fmt.Printf(" ERROR embedding memory %d: %v\n", i, err)
|
|
break
|
|
}
|
|
memVectors[i] = vec
|
|
}
|
|
embedTime := time.Since(start)
|
|
fmt.Printf(" Embedded in %v (%.0fms/memory)\n", embedTime, float64(embedTime.Milliseconds())/float64(len(allMemories)))
|
|
fmt.Printf(" Vector dimension: %d\n", len(memVectors[0]))
|
|
|
|
// 2. Intra-group vs inter-group similarity
|
|
var intraSims, interSims []float64
|
|
for i := 0; i < len(allMemories); i++ {
|
|
for j := i + 1; j < len(allMemories); j++ {
|
|
sim := cosine(memVectors[i], memVectors[j])
|
|
if allTopics[i] == allTopics[j] {
|
|
intraSims = append(intraSims, sim)
|
|
} else {
|
|
interSims = append(interSims, sim)
|
|
}
|
|
}
|
|
}
|
|
|
|
intraAvg := avg(intraSims)
|
|
interAvg := avg(interSims)
|
|
separation := intraAvg - interAvg
|
|
|
|
fmt.Printf("\n Cluster separation:\n")
|
|
fmt.Printf(" Intra-group similarity (same topic): %.4f\n", intraAvg)
|
|
fmt.Printf(" Inter-group similarity (diff topic): %.4f\n", interAvg)
|
|
fmt.Printf(" Separation gap: %.4f %s\n", separation, qualityLabel(separation))
|
|
|
|
// 3. Query recall accuracy
|
|
fmt.Printf("\n Query recall (top-1 accuracy):\n")
|
|
correct := 0
|
|
for _, queryCase := range queries {
|
|
queryVec, err := embed(model, queryCase.query)
|
|
if err != nil {
|
|
fmt.Printf(" ERROR: %v\n", err)
|
|
continue
|
|
}
|
|
|
|
// Find best match
|
|
bestIdx := 0
|
|
bestSim := -1.0
|
|
for i, mv := range memVectors {
|
|
sim := cosine(queryVec, mv)
|
|
if sim > bestSim {
|
|
bestSim = sim
|
|
bestIdx = i
|
|
}
|
|
}
|
|
|
|
matchTopic := allTopics[bestIdx]
|
|
hit := matchTopic == queryCase.targetTopic
|
|
if hit {
|
|
correct++
|
|
}
|
|
marker := "✓"
|
|
if !hit {
|
|
marker = "✗"
|
|
}
|
|
fmt.Printf(" %s %.4f %q → %s (want: %s)\n", marker, bestSim, truncate(queryCase.query, 40), matchTopic, queryCase.targetTopic)
|
|
}
|
|
|
|
accuracy := float64(correct) / float64(len(queries)) * 100
|
|
fmt.Printf("\n Top-1 accuracy: %.0f%% (%d/%d)\n", accuracy, correct, len(queries))
|
|
|
|
// 4. Top-3 recall
|
|
correct3 := 0
|
|
for _, queryCase := range queries {
|
|
queryVec, _ := embed(model, queryCase.query)
|
|
|
|
type scored struct {
|
|
idx int
|
|
sim float64
|
|
}
|
|
var ranked []scored
|
|
for i, mv := range memVectors {
|
|
ranked = append(ranked, scored{i, cosine(queryVec, mv)})
|
|
}
|
|
sort.Slice(ranked, func(a, b int) bool { return ranked[a].sim > ranked[b].sim })
|
|
|
|
for _, rankedResult := range ranked[:3] {
|
|
if allTopics[rankedResult.idx] == queryCase.targetTopic {
|
|
correct3++
|
|
break
|
|
}
|
|
}
|
|
}
|
|
accuracy3 := float64(correct3) / float64(len(queries)) * 100
|
|
fmt.Printf(" Top-3 accuracy: %.0f%% (%d/%d)\n", accuracy3, correct3, len(queries))
|
|
}
|
|
|
|
fmt.Println("\n" + strings.Repeat("=", 60))
|
|
fmt.Println("Done.")
|
|
}
|
|
|
|
// httpClient trusts self-signed certs for .lan domains behind Traefik.
// NOTE(review): no Timeout is set, so a hung Ollama endpoint blocks the
// benchmark indefinitely — consider &http.Client{Timeout: ...}.
var httpClient = &http.Client{
	Transport: &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, //nolint:gosec // .lan only
	},
}
|
|
|
|
// embedRequest is the JSON request body for Ollama's /api/embeddings endpoint.
type embedRequest struct {
	Model  string `json:"model"`  // model name, e.g. "nomic-embed-text"
	Prompt string `json:"prompt"` // text to embed
}
|
|
|
|
// embedResponse is the JSON response from Ollama's /api/embeddings endpoint.
type embedResponse struct {
	Embedding []float64 `json:"embedding"` // dense vector; empty on failure
}
|
|
|
|
// embed requests a vector embedding from Ollama for the given model and text.
|
|
//
|
|
// vec, _ := embed("nomic-embed-text", "how does the brain recall work?")
|
|
func embed(model, text string) ([]float64, error) {
|
|
body, _ := json.Marshal(embedRequest{Model: model, Prompt: text})
|
|
resp, err := httpClient.Post(*ollamaURL+"/api/embeddings", "application/json", bytes.NewReader(body))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != 200 {
|
|
return nil, coreerr.E("embed", fmt.Sprintf("HTTP %d", resp.StatusCode), nil)
|
|
}
|
|
var result embedResponse
|
|
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
return nil, err
|
|
}
|
|
if len(result.Embedding) == 0 {
|
|
return nil, coreerr.E("embed", "empty embedding", nil)
|
|
}
|
|
return result.Embedding, nil
|
|
}
|
|
|
|
// modelAvailable returns true if the model is listed in Ollama's local tag registry.
|
|
//
|
|
// modelAvailable("nomic-embed-text") // → true if pulled
|
|
func modelAvailable(model string) bool {
|
|
resp, err := httpClient.Get(*ollamaURL + "/api/tags")
|
|
if err != nil {
|
|
return false
|
|
}
|
|
defer resp.Body.Close()
|
|
var result struct {
|
|
Models []struct {
|
|
Name string `json:"name"`
|
|
} `json:"models"`
|
|
}
|
|
json.NewDecoder(resp.Body).Decode(&result)
|
|
for _, registeredModel := range result.Models {
|
|
// Match "nomic-embed-text:latest" against "nomic-embed-text"
|
|
if registeredModel.Name == model || strings.HasPrefix(registeredModel.Name, model+":") {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// cosine returns the cosine similarity between two vectors. Vectors of
// differing length, or with zero magnitude, yield 0 instead of panicking
// or dividing by zero.
//
//	cosine([]float64{1, 0}, []float64{1, 0}) // → 1.0
//	cosine([]float64{1, 0}, []float64{0, 1}) // → 0.0
func cosine(a, b []float64) float64 {
	// BUG FIX: b[i] panicked with index-out-of-range when b was shorter
	// than a; mismatched vectors are not comparable, so report 0.
	if len(a) != len(b) {
		return 0
	}
	var dot, normA, normB float64
	for i := range a {
		dot += a[i] * b[i]
		normA += a[i] * a[i]
		normB += b[i] * b[i]
	}
	denom := math.Sqrt(normA) * math.Sqrt(normB)
	if denom == 0 {
		return 0
	}
	return dot / denom
}
|
|
|
|
// avg returns the arithmetic mean of a float64 slice, or 0 for an empty slice.
//
//	avg([]float64{0.8, 0.6, 0.9}) // → 0.7666...
func avg(vals []float64) float64 {
	n := len(vals)
	if n == 0 {
		return 0
	}
	var total float64
	for i := range vals {
		total += vals[i]
	}
	return total / float64(n)
}
|
|
|
|
// qualityLabel maps a cosine separation gap to a human-readable quality band.
//
//	qualityLabel(0.18) // → "(excellent)"
//	qualityLabel(0.03) // → "(poor)"
func qualityLabel(gap float64) string {
	if gap > 0.15 {
		return "(excellent)"
	}
	if gap > 0.10 {
		return "(good)"
	}
	if gap > 0.05 {
		return "(fair)"
	}
	return "(poor)"
}
|
|
|
|
// truncate shortens s to at most n bytes, appending "..." if truncated.
// When n < 4 there is no room for the ellipsis, so the string is hard-cut.
// NOTE(review): this is byte-based and may split a multi-byte UTF-8 rune;
// fine for the ASCII queries used here — confirm before wider use.
//
//	truncate("How does the emotional scoring work?", 20) // → "How does the emot..."
func truncate(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// BUG FIX: s[:n-3] panicked with a negative slice index for n < 3.
	if n < 4 {
		if n < 0 {
			n = 0
		}
		return s[:n]
	}
	return s[:n-3] + "..."
}
|
|
|
|
func init() {
	// NOTE(review): Sync flushes OS-level file state, but Go's os.Stderr
	// is unbuffered, so this is effectively a no-op — confirm whether it
	// can be removed. The returned error is intentionally ignored.
	os.Stderr.Sync()
}
|