feat: upgrade BO analysis to 8D with Q/K interaction metrics (22D full vector)

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-28 12:51:04 +00:00 · 2026-02-28 12:51:04 +00:00 · 54151c463b
commit 54151c463b
parent 1b570b8229
3 changed files with 197 additions and 23 deletions
--- a/pkg/lem/attention.go
+++ b/pkg/lem/attention.go
@ -20,6 +20,9 @@ type BOResult struct {
 	LayerCoherence      []float64 `json:"layer_coherence"`       // Per-layer coherence
 	LayerCrossAlignment []float64 `json:"layer_cross_alignment"` // Per-layer cross-alignment (len = layers-1)
 	GQA                 bool      `json:"gqa"`                   // True when analysis used position-wise mode (single KV head)
+	QKAlignment         float64   `json:"qk_alignment"`          // Mean over layers of |cosine| between mean Q and mean K vectors (0-1)
+	QHeadDiversity      float64   `json:"q_head_diversity"`      // Mean over layers of 1 - mean pairwise Q head cosine similarity
+	QKCoherenceRatio    float64   `json:"qk_coherence_ratio"`    // Mean over layers of Q coherence / K coherence
 }

 // Composite returns a 0-10000 integer score from BO metrics.
@ -81,13 +84,19 @@ func AnalyseAttention(snap *inference.AttentionSnapshot) *BOResult {
 		return &BOResult{}
 	}

-	// Use position-wise analysis for GQA models (≤4 KV heads).
-	// With few heads, pairwise head coherence has too few pairs for signal.
-	// Position-wise analysis gives richer data from any head count.
+	var result *BOResult
 	if snap.NumHeads <= 4 {
-		return analyseGQA(snap)
+		result = analyseGQA(snap)
+	} else {
+		result = analyseMultiHead(snap)
 	}
-	return analyseMultiHead(snap)
+
+	// Add Q/K interaction metrics when Q data is available.
+	if snap.HasQueries() {
+		analyseQK(snap, result)
+	}
+
+	return result
 }

 // analyseMultiHead handles models with ≥2 KV heads (original algorithm).
@ -281,6 +290,74 @@ func analyseGQA(snap *inference.AttentionSnapshot) *BOResult {
 	return result
 }

+// analyseQK fills result's Q/K interaction metrics from snap's per-layer Q and
+// K heads, skipping layers that lack either. Each metric is averaged over only
+// the layers that produced a value for it, so layers where a metric is
+// undefined do not pull its mean toward zero.
+func analyseQK(snap *inference.AttentionSnapshot, result *BOResult) {
+	var alignSum, divSum, ratioSum float64
+	var alignN, divN, ratioN int
+
+	for layer := 0; layer < snap.NumLayers; layer++ {
+		if layer >= len(snap.Queries) || snap.Queries[layer] == nil {
+			continue
+		}
+		if layer >= len(snap.Keys) || snap.Keys[layer] == nil {
+			continue
+		}
+		qHeads := snap.Queries[layer]
+		kHeads := snap.Keys[layer]
+
+		// Q/K alignment: |cosine| between the mean Q vector and mean K vector.
+		qMean := meanVector(qHeads)
+		kMean := meanVector(kHeads)
+		if qMean != nil && kMean != nil {
+			alignSum += math.Abs(cosineSim32(qMean, kMean))
+			alignN++
+		}
+
+		// Q coherence is computed once and shared by both metrics below.
+		qCoh, qOK := meanPairwiseSim32(qHeads)
+		if qOK {
+			divSum += 1.0 - qCoh
+			divN++
+		}
+
+		// Coherence ratio; K coherence ≤ 0.01 is skipped as an unstable divisor.
+		if kCoh, kOK := meanPairwiseSim32(kHeads); qOK && kOK && kCoh > 0.01 {
+			ratioSum += qCoh / kCoh
+			ratioN++
+		}
+	}
+
+	if alignN > 0 {
+		result.QKAlignment = alignSum / float64(alignN)
+	}
+	if divN > 0 {
+		result.QHeadDiversity = divSum / float64(divN)
+	}
+	if ratioN > 0 {
+		result.QKCoherenceRatio = ratioSum / float64(ratioN)
+	}
+}
+
+// meanPairwiseSim32 returns the mean cosine similarity over all unordered
+// pairs of heads, or false when fewer than two heads leave no pair to compare.
+func meanPairwiseSim32(heads [][]float32) (float64, bool) {
+	var sum float64
+	var pairs int
+	for i := 0; i < len(heads); i++ {
+		for j := i + 1; j < len(heads); j++ {
+			sum += cosineSim32(heads[i], heads[j])
+			pairs++
+		}
+	}
+	if pairs == 0 {
+		return 0, false
+	}
+	return sum / float64(pairs), true
+}
+
 // cosineSim32 computes cosine similarity between two float32 slices.
 func cosineSim32(a, b []float32) float64 {
 	if len(a) != len(b) || len(a) == 0 {
@ -358,10 +435,12 @@ func headEntropy(head []float32, seqLen, headDim int) float64 {
 	return entropy / maxEntropy
 }

-// AttentionFeatures returns a 5D feature vector from BO metrics.
+// AttentionFeatures returns an 8D feature vector from BO metrics.
+// Dims 0-4: K-only metrics (always populated).
+// Dims 5-7: Q/K interaction metrics (zero when Q not available).
 func AttentionFeatures(ar *BOResult) []float64 {
 	if ar == nil {
-		return make([]float64, 5)
+		return make([]float64, 8)
 	}
 	return []float64{
 		ar.MeanCoherence,
@ -369,10 +448,13 @@ func AttentionFeatures(ar *BOResult) []float64 {
 		ar.MeanHeadEntropy,
 		ar.PhaseLockScore,
 		math.Max(0, 1.0-float64(ar.JointCollapseCount)*0.2),
+		ar.QKAlignment,
+		ar.QHeadDiversity,
+		ar.QKCoherenceRatio,
 	}
 }

-// AttentionFeatureLabels returns the labels for the attention feature vector.
+// AttentionFeatureLabels returns the labels for the 8D attention feature vector.
 func AttentionFeatureLabels() []string {
 	return []string{
 		"mean_coherence",
@ -380,5 +462,8 @@ func AttentionFeatureLabels() []string {
 		"head_entropy",
 		"phase_lock",
 		"joint_stability",
+		"qk_alignment",
+		"q_head_diversity",
+		"qk_coherence_ratio",
 	}
 }
--- a/pkg/lem/attention_test.go
+++ b/pkg/lem/attention_test.go
@ -168,8 +168,8 @@ func TestAttentionFeatures_Good(t *testing.T) {
 		JointCollapseCount: 1,
 	}
 	f := AttentionFeatures(result)
-	if len(f) != 5 {
-		t.Fatalf("expected 5D, got %dD", len(f))
+	if len(f) != 8 {
+		t.Fatalf("expected 8D, got %dD", len(f))
 	}
 	if f[0] != 0.85 {
 		t.Fatalf("expected coherence 0.85, got %f", f[0])
@ -182,8 +182,8 @@ func TestAttentionFeatures_Good(t *testing.T) {

 func TestAttentionFeatures_Nil_Good(t *testing.T) {
 	f := AttentionFeatures(nil)
-	if len(f) != 5 {
-		t.Fatalf("expected 5D, got %dD", len(f))
+	if len(f) != 8 {
+		t.Fatalf("expected 8D, got %dD", len(f))
 	}
 	for i, v := range f {
 		if v != 0 {
@ -194,8 +194,8 @@ func TestAttentionFeatures_Nil_Good(t *testing.T) {

 func TestAttentionFeatureLabels_Good(t *testing.T) {
 	labels := AttentionFeatureLabels()
-	if len(labels) != 5 {
-		t.Fatalf("expected 5 labels, got %d", len(labels))
+	if len(labels) != 8 {
+		t.Fatalf("expected 8 labels, got %d", len(labels))
 	}
 }

@ -204,8 +204,8 @@ func TestFullFeatures_Good(t *testing.T) {
 	hs := HeuristicScores{ComplianceMarkers: 1, FirstPerson: 2}
 	bo := &BOResult{MeanCoherence: 0.85, MeanCrossAlignment: 0.80, MeanHeadEntropy: 0.70, PhaseLockScore: 0.90}
 	f := FullFeatures(gs, hs, bo)
-	if len(f) != 19 {
-		t.Fatalf("expected 19D, got %dD", len(f))
+	if len(f) != 22 {
+		t.Fatalf("expected 22D, got %dD", len(f))
 	}
 	// Grammar starts at 0, heuristic at 6, attention at 14.
 	if f[0] != 0.5 {
@ -220,11 +220,11 @@ func TestFullFeatures_NilBO_Good(t *testing.T) {
 	gs := GrammarScore{VocabRichness: 0.5}
 	hs := HeuristicScores{}
 	f := FullFeatures(gs, hs, nil)
-	if len(f) != 19 {
-		t.Fatalf("expected 19D, got %dD", len(f))
+	if len(f) != 22 {
+		t.Fatalf("expected 22D, got %dD", len(f))
 	}
 	// Attention dims should be zero.
-	for i := 14; i < 19; i++ {
+	for i := 14; i < 22; i++ {
 		if f[i] != 0 {
 			t.Fatalf("expected zero at dim %d, got %f", i, f[i])
 		}
@ -233,14 +233,68 @@ func TestFullFeatures_NilBO_Good(t *testing.T) {

 func TestFullFeatureLabels_Good(t *testing.T) {
 	labels := FullFeatureLabels()
-	if len(labels) != 19 {
-		t.Fatalf("expected 19 labels, got %d", len(labels))
+	if len(labels) != 22 {
+		t.Fatalf("expected 22 labels, got %d", len(labels))
 	}
 	if labels[14] != "mean_coherence" {
 		t.Fatalf("expected label[14]='mean_coherence', got %q", labels[14])
 	}
 }

+// TestAnalyseAttention_QK_Good checks Q/K metrics are populated when Q data exists.
+func TestAnalyseAttention_QK_Good(t *testing.T) {
+	snap := makeQKSnapshot(4, 8, 2, 8, 64)
+	got := AnalyseAttention(snap)
+
+	switch {
+	case !snap.HasQueries():
+		t.Fatal("expected HasQueries() == true")
+	case got.QKAlignment == 0:
+		t.Fatal("expected non-zero QKAlignment")
+	case got.QHeadDiversity == 0:
+		t.Fatal("expected non-zero QHeadDiversity")
+	}
+}
+
+// TestAttentionFeatures_QK_Good checks dim 5 of the 8D vector carries QKAlignment.
+func TestAttentionFeatures_QK_Good(t *testing.T) {
+	features := AttentionFeatures(&BOResult{
+		MeanCoherence:      0.85,
+		MeanCrossAlignment: 0.80,
+		MeanHeadEntropy:    0.70,
+		PhaseLockScore:     0.90,
+		JointCollapseCount: 0,
+		QKAlignment:        0.75,
+		QHeadDiversity:     0.60,
+		QKCoherenceRatio:   1.1,
+	})
+	if n := len(features); n != 8 {
+		t.Fatalf("expected 8D with Q data, got %dD", n)
+	}
+	if got := features[5]; got != 0.75 {
+		t.Fatalf("expected qk_alignment 0.75, got %f", got)
+	}
+}
+
+// TestAttentionFeatures_KOnly_Good checks dims 5-7 are zero when no Q metrics are set.
+func TestAttentionFeatures_KOnly_Good(t *testing.T) {
+	features := AttentionFeatures(&BOResult{
+		MeanCoherence:      0.85,
+		MeanCrossAlignment: 0.80,
+		MeanHeadEntropy:    0.70,
+		PhaseLockScore:     0.90,
+		JointCollapseCount: 0,
+	})
+	if n := len(features); n != 8 {
+		t.Fatalf("expected 8D (zero-filled Q dims), got %dD", n)
+	}
+	for off, v := range features[5:] {
+		if v != 0 {
+			t.Fatalf("expected zero at dim %d for K-only, got %f", 5+off, v)
+		}
+	}
+}
+
 // --- Test helpers ---

 // makeCoherentSnapshot creates a snapshot where all heads in all layers
@ -295,3 +349,38 @@ func makeOrthogonalSnapshot(layers, heads, seqLen, dim int) *inference.Attention
 		Architecture: "test",
 	}
 }
+
+// makeQKSnapshot builds a snapshot holding both K and Q vectors, every head
+// filled with seeded pseudo-random values in [-1, 1).
+func makeQKSnapshot(layers, qHeads, kvHeads, seqLen, dim int) *inference.AttentionSnapshot {
+	rng := rand.New(rand.NewPCG(99, 0))
+	// randomHeads draws n heads of seqLen*dim values from the shared rng.
+	randomHeads := func(n int) [][]float32 {
+		heads := make([][]float32, n)
+		for h := range heads {
+			vals := make([]float32, seqLen*dim)
+			for i := range vals {
+				vals[i] = rng.Float32()*2 - 1
+			}
+			heads[h] = vals
+		}
+		return heads
+	}
+
+	keys := make([][][]float32, layers)
+	queries := make([][][]float32, layers)
+	for l := range layers {
+		keys[l] = randomHeads(kvHeads)
+		queries[l] = randomHeads(qHeads)
+	}
+	return &inference.AttentionSnapshot{
+		NumLayers:     layers,
+		NumHeads:      kvHeads,
+		NumQueryHeads: qHeads,
+		SeqLen:        seqLen,
+		HeadDim:       dim,
+		Keys:          keys,
+		Queries:       queries,
+		Architecture:  "test",
+	}
+}
--- a/pkg/lem/features.go
+++ b/pkg/lem/features.go
@ -69,13 +69,13 @@ func CombinedFeatureLabels() []string {
 	return append(GrammarFeatureLabels(), HeuristicFeatureLabels()...)
 }

-// FullFeatures concatenates grammar (6D) + heuristic (8D) + attention (5D) into a 19D vector.
+// FullFeatures concatenates grammar (6D) + heuristic (8D) + attention (8D) into a 22D vector.
 // If bo is nil, the attention dimensions are zero-filled.
 func FullFeatures(gs GrammarScore, hs HeuristicScores, bo *BOResult) []float64 {
 	return append(CombinedFeatures(gs, hs), AttentionFeatures(bo)...)
 }

-// FullFeatureLabels returns axis labels for the 19D full vector.
+// FullFeatureLabels returns axis labels for the 22D full vector.
 func FullFeatureLabels() []string {
 	return append(CombinedFeatureLabels(), AttentionFeatureLabels()...)
 }