From 54151c463b591bb97eafde3eeae9d49f2c514e0d Mon Sep 17 00:00:00 2001 From: Snider Date: Sat, 28 Feb 2026 12:51:04 +0000 Subject: [PATCH] feat: upgrade BO analysis to 8D with Q/K interaction metrics (22D full vector) Co-Authored-By: Virgil --- pkg/lem/attention.go | 101 ++++++++++++++++++++++++++++++--- pkg/lem/attention_test.go | 115 +++++++++++++++++++++++++++++++++----- pkg/lem/features.go | 4 +- 3 files changed, 197 insertions(+), 23 deletions(-) diff --git a/pkg/lem/attention.go b/pkg/lem/attention.go index d829ae5..2005dc2 100644 --- a/pkg/lem/attention.go +++ b/pkg/lem/attention.go @@ -20,6 +20,9 @@ type BOResult struct { LayerCoherence []float64 `json:"layer_coherence"` // Per-layer coherence LayerCrossAlignment []float64 `json:"layer_cross_alignment"` // Per-layer cross-alignment (len = layers-1) GQA bool `json:"gqa"` // True when analysis used position-wise mode (single KV head) + QKAlignment float64 `json:"qk_alignment"` // Mean Q/K head centroid alignment (0-1) + QHeadDiversity float64 `json:"q_head_diversity"` // Mean pairwise Q head distance (0-1) + QKCoherenceRatio float64 `json:"qk_coherence_ratio"` // Mean per-layer ratio of Q coherence to K coherence } // Composite returns a 0-10000 integer score from BO metrics. @@ -81,13 +84,19 @@ func AnalyseAttention(snap *inference.AttentionSnapshot) *BOResult { return &BOResult{} } - // Use position-wise analysis for GQA models (≤4 KV heads). - // With few heads, pairwise head coherence has too few pairs for signal. - // Position-wise analysis gives richer data from any head count. + var result *BOResult if snap.NumHeads <= 4 { - return analyseGQA(snap) + result = analyseGQA(snap) + } else { + result = analyseMultiHead(snap) } - return analyseMultiHead(snap) + + // Add Q/K interaction metrics when Q data is available. + if snap.HasQueries() { + analyseQK(snap, result) + } + + return result } // analyseMultiHead handles models with ≥2 KV heads (original algorithm). 
@@ -281,6 +290,74 @@ func analyseGQA(snap *inference.AttentionSnapshot) *BOResult { return result } +// analyseQK computes Q/K interaction metrics when both Q and K vectors are available. +func analyseQK(snap *inference.AttentionSnapshot, result *BOResult) { + var totalAlignment, totalQDiv, totalRatio float64 + var validLayers int + + for layer := 0; layer < snap.NumLayers; layer++ { + if layer >= len(snap.Queries) || snap.Queries[layer] == nil { + continue + } + if layer >= len(snap.Keys) || snap.Keys[layer] == nil { + continue + } + qHeads := snap.Queries[layer] + kHeads := snap.Keys[layer] + validLayers++ + + // Q/K Alignment: cosine sim between mean Q vector and mean K vector. + qMean := meanVector(qHeads) + kMean := meanVector(kHeads) + if qMean != nil && kMean != nil { + totalAlignment += math.Abs(cosineSim32(qMean, kMean)) + } + + // Q Head Diversity: mean pairwise cosine distance between Q heads. + var qSim float64 + var qPairs int + for i := 0; i < len(qHeads); i++ { + for j := i + 1; j < len(qHeads); j++ { + qSim += cosineSim32(qHeads[i], qHeads[j]) + qPairs++ + } + } + if qPairs > 0 { + totalQDiv += 1.0 - (qSim / float64(qPairs)) + } + + // Q/K Coherence Ratio. + var qCoh float64 + var qP int + for i := 0; i < len(qHeads); i++ { + for j := i + 1; j < len(qHeads); j++ { + qCoh += cosineSim32(qHeads[i], qHeads[j]) + qP++ + } + } + var kCoh float64 + var kP int + for i := 0; i < len(kHeads); i++ { + for j := i + 1; j < len(kHeads); j++ { + kCoh += cosineSim32(kHeads[i], kHeads[j]) + kP++ + } + } + if qP > 0 && kP > 0 { + kMeanCoh := kCoh / float64(kP) + if kMeanCoh > 0.01 { + totalRatio += (qCoh / float64(qP)) / kMeanCoh + } + } + } + + if validLayers > 0 { + result.QKAlignment = totalAlignment / float64(validLayers) + result.QHeadDiversity = totalQDiv / float64(validLayers) + result.QKCoherenceRatio = totalRatio / float64(validLayers) + } +} + // cosineSim32 computes cosine similarity between two float32 slices. 
func cosineSim32(a, b []float32) float64 { if len(a) != len(b) || len(a) == 0 { @@ -358,10 +435,12 @@ func headEntropy(head []float32, seqLen, headDim int) float64 { return entropy / maxEntropy } -// AttentionFeatures returns a 5D feature vector from BO metrics. +// AttentionFeatures returns an 8D feature vector from BO metrics. +// Dims 0-4: K-only metrics (always populated). +// Dims 5-7: Q/K interaction metrics (zero when Q not available). func AttentionFeatures(ar *BOResult) []float64 { if ar == nil { - return make([]float64, 5) + return make([]float64, 8) } return []float64{ ar.MeanCoherence, @@ -369,10 +448,13 @@ func AttentionFeatures(ar *BOResult) []float64 { ar.MeanHeadEntropy, ar.PhaseLockScore, math.Max(0, 1.0-float64(ar.JointCollapseCount)*0.2), + ar.QKAlignment, + ar.QHeadDiversity, + ar.QKCoherenceRatio, } } -// AttentionFeatureLabels returns the labels for the attention feature vector. +// AttentionFeatureLabels returns the labels for the 8D attention feature vector. func AttentionFeatureLabels() []string { return []string{ "mean_coherence", @@ -380,5 +462,8 @@ func AttentionFeatureLabels() []string { "head_entropy", "phase_lock", "joint_stability", + "qk_alignment", + "q_head_diversity", + "qk_coherence_ratio", } } diff --git a/pkg/lem/attention_test.go b/pkg/lem/attention_test.go index 5b5669a..74d706f 100644 --- a/pkg/lem/attention_test.go +++ b/pkg/lem/attention_test.go @@ -168,8 +168,8 @@ func TestAttentionFeatures_Good(t *testing.T) { JointCollapseCount: 1, } f := AttentionFeatures(result) - if len(f) != 5 { - t.Fatalf("expected 5D, got %dD", len(f)) + if len(f) != 8 { + t.Fatalf("expected 8D, got %dD", len(f)) } if f[0] != 0.85 { t.Fatalf("expected coherence 0.85, got %f", f[0]) @@ -182,8 +182,8 @@ func TestAttentionFeatures_Good(t *testing.T) { func TestAttentionFeatures_Nil_Good(t *testing.T) { f := AttentionFeatures(nil) - if len(f) != 5 { - t.Fatalf("expected 5D, got %dD", len(f)) + if len(f) != 8 { + t.Fatalf("expected 8D, got %dD", 
len(f)) } for i, v := range f { if v != 0 { @@ -194,8 +194,8 @@ func TestAttentionFeatures_Nil_Good(t *testing.T) { func TestAttentionFeatureLabels_Good(t *testing.T) { labels := AttentionFeatureLabels() - if len(labels) != 5 { - t.Fatalf("expected 5 labels, got %d", len(labels)) + if len(labels) != 8 { + t.Fatalf("expected 8 labels, got %d", len(labels)) } } @@ -204,8 +204,8 @@ func TestFullFeatures_Good(t *testing.T) { hs := HeuristicScores{ComplianceMarkers: 1, FirstPerson: 2} bo := &BOResult{MeanCoherence: 0.85, MeanCrossAlignment: 0.80, MeanHeadEntropy: 0.70, PhaseLockScore: 0.90} f := FullFeatures(gs, hs, bo) - if len(f) != 19 { - t.Fatalf("expected 19D, got %dD", len(f)) + if len(f) != 22 { + t.Fatalf("expected 22D, got %dD", len(f)) } // Grammar starts at 0, heuristic at 6, attention at 14. if f[0] != 0.5 { @@ -220,11 +220,11 @@ func TestFullFeatures_NilBO_Good(t *testing.T) { gs := GrammarScore{VocabRichness: 0.5} hs := HeuristicScores{} f := FullFeatures(gs, hs, nil) - if len(f) != 19 { - t.Fatalf("expected 19D, got %dD", len(f)) + if len(f) != 22 { + t.Fatalf("expected 22D, got %dD", len(f)) } // Attention dims should be zero. 
- for i := 14; i < 19; i++ { + for i := 14; i < 22; i++ { if f[i] != 0 { t.Fatalf("expected zero at dim %d, got %f", i, f[i]) } @@ -233,14 +233,68 @@ func TestFullFeatures_NilBO_Good(t *testing.T) { func TestFullFeatureLabels_Good(t *testing.T) { labels := FullFeatureLabels() - if len(labels) != 19 { - t.Fatalf("expected 19 labels, got %d", len(labels)) + if len(labels) != 22 { + t.Fatalf("expected 22 labels, got %d", len(labels)) } if labels[14] != "mean_coherence" { t.Fatalf("expected label[14]='mean_coherence', got %q", labels[14]) } } +func TestAnalyseAttention_QK_Good(t *testing.T) { + snap := makeQKSnapshot(4, 8, 2, 8, 64) + result := AnalyseAttention(snap) + + if !snap.HasQueries() { + t.Fatal("expected HasQueries() == true") + } + if result.QKAlignment == 0 { + t.Fatal("expected non-zero QKAlignment") + } + if result.QHeadDiversity == 0 { + t.Fatal("expected non-zero QHeadDiversity") + } +} + +func TestAttentionFeatures_QK_Good(t *testing.T) { + result := &BOResult{ + MeanCoherence: 0.85, + MeanCrossAlignment: 0.80, + MeanHeadEntropy: 0.70, + PhaseLockScore: 0.90, + JointCollapseCount: 0, + QKAlignment: 0.75, + QHeadDiversity: 0.60, + QKCoherenceRatio: 1.1, + } + f := AttentionFeatures(result) + if len(f) != 8 { + t.Fatalf("expected 8D with Q data, got %dD", len(f)) + } + if f[5] != 0.75 { + t.Fatalf("expected qk_alignment 0.75, got %f", f[5]) + } +} + +func TestAttentionFeatures_KOnly_Good(t *testing.T) { + result := &BOResult{ + MeanCoherence: 0.85, + MeanCrossAlignment: 0.80, + MeanHeadEntropy: 0.70, + PhaseLockScore: 0.90, + JointCollapseCount: 0, + } + f := AttentionFeatures(result) + if len(f) != 8 { + t.Fatalf("expected 8D (zero-filled Q dims), got %dD", len(f)) + } + for i := 5; i < 8; i++ { + if f[i] != 0 { + t.Fatalf("expected zero at dim %d for K-only, got %f", i, f[i]) + } + } +} + // --- Test helpers --- // makeCoherentSnapshot creates a snapshot where all heads in all layers @@ -295,3 +349,38 @@ func makeOrthogonalSnapshot(layers, heads, 
seqLen, dim int) *inference.Attention Architecture: "test", } } + +// makeQKSnapshot creates a snapshot with both Q and K vectors. +func makeQKSnapshot(layers, qHeads, kvHeads, seqLen, dim int) *inference.AttentionSnapshot { + rng := rand.New(rand.NewPCG(99, 0)) + keys := make([][][]float32, layers) + queries := make([][][]float32, layers) + for l := range layers { + keys[l] = make([][]float32, kvHeads) + for h := range kvHeads { + head := make([]float32, seqLen*dim) + for i := range head { + head[i] = rng.Float32()*2 - 1 + } + keys[l][h] = head + } + queries[l] = make([][]float32, qHeads) + for h := range qHeads { + head := make([]float32, seqLen*dim) + for i := range head { + head[i] = rng.Float32()*2 - 1 + } + queries[l][h] = head + } + } + return &inference.AttentionSnapshot{ + NumLayers: layers, + NumHeads: kvHeads, + NumQueryHeads: qHeads, + SeqLen: seqLen, + HeadDim: dim, + Keys: keys, + Queries: queries, + Architecture: "test", + } +} diff --git a/pkg/lem/features.go b/pkg/lem/features.go index b33102c..9d3b283 100644 --- a/pkg/lem/features.go +++ b/pkg/lem/features.go @@ -69,13 +69,13 @@ func CombinedFeatureLabels() []string { return append(GrammarFeatureLabels(), HeuristicFeatureLabels()...) } -// FullFeatures concatenates grammar (6D) + heuristic (8D) + attention (5D) into a 19D vector. +// FullFeatures concatenates grammar (6D) + heuristic (8D) + attention (8D) into a 22D vector. // If bo is nil, the attention dimensions are zero-filled. func FullFeatures(gs GrammarScore, hs HeuristicScores, bo *BOResult) []float64 { return append(CombinedFeatures(gs, hs), AttentionFeatures(bo)...) } -// FullFeatureLabels returns axis labels for the 19D full vector. +// FullFeatureLabels returns axis labels for the 22D full vector. func FullFeatureLabels() []string { return append(CombinedFeatureLabels(), AttentionFeatureLabels()...) }