1
0
Fork 0
forked from lthn/LEM

feat: upgrade BO analysis to 8D with Q/K interaction metrics (22D full vector)

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Snider 2026-02-28 12:51:04 +00:00
parent 1b570b8229
commit 54151c463b
3 changed files with 197 additions and 23 deletions

View file

@ -20,6 +20,9 @@ type BOResult struct {
LayerCoherence []float64 `json:"layer_coherence"` // Per-layer coherence
LayerCrossAlignment []float64 `json:"layer_cross_alignment"` // Per-layer cross-alignment (len = layers-1)
GQA bool `json:"gqa"` // True when analysis used position-wise mode (single KV head)
QKAlignment float64 `json:"qk_alignment"` // Mean Q/K head centroid alignment (0-1)
QHeadDiversity float64 `json:"q_head_diversity"` // Mean pairwise Q head distance (0-1)
QKCoherenceRatio float64 `json:"qk_coherence_ratio"` // Q coherence / K coherence per layer
}
// Composite returns a 0-10000 integer score from BO metrics.
@ -81,13 +84,19 @@ func AnalyseAttention(snap *inference.AttentionSnapshot) *BOResult {
return &BOResult{}
}
// Use position-wise analysis for GQA models (≤4 KV heads).
// With few heads, pairwise head coherence has too few pairs for signal.
// Position-wise analysis gives richer data from any head count.
var result *BOResult
if snap.NumHeads <= 4 {
return analyseGQA(snap)
result = analyseGQA(snap)
} else {
result = analyseMultiHead(snap)
}
return analyseMultiHead(snap)
// Add Q/K interaction metrics when Q data is available.
if snap.HasQueries() {
analyseQK(snap, result)
}
return result
}
// analyseMultiHead handles models with ≥2 KV heads (original algorithm).
@ -281,6 +290,74 @@ func analyseGQA(snap *inference.AttentionSnapshot) *BOResult {
return result
}
// analyseQK computes Q/K interaction metrics when both Q and K head vectors
// are available, writing the per-model means into result.
//
// For each layer with both Q and K data it accumulates three values, each
// averaged over the valid layers at the end:
//   - QKAlignment: |cosine| between the mean Q vector and the mean K vector.
//   - QHeadDiversity: 1 - mean pairwise cosine similarity of Q heads.
//   - QKCoherenceRatio: mean pairwise Q similarity / mean pairwise K
//     similarity (skipped when K coherence is near zero, to avoid blow-up).
func analyseQK(snap *inference.AttentionSnapshot, result *BOResult) {
	var totalAlignment, totalQDiv, totalRatio float64
	var validLayers int
	for layer := 0; layer < snap.NumLayers; layer++ {
		if layer >= len(snap.Queries) || snap.Queries[layer] == nil {
			continue
		}
		if layer >= len(snap.Keys) || snap.Keys[layer] == nil {
			continue
		}
		qHeads := snap.Queries[layer]
		kHeads := snap.Keys[layer]
		validLayers++

		// Q/K Alignment: cosine sim between mean Q vector and mean K vector.
		qMean := meanVector(qHeads)
		kMean := meanVector(kHeads)
		if qMean != nil && kMean != nil {
			totalAlignment += math.Abs(cosineSim32(qMean, kMean))
		}

		// Pairwise Q-head coherence, computed once and reused for both
		// Q Head Diversity (1 - coherence) and the Q/K Coherence Ratio.
		// (Previously this O(heads^2 * dim) sum was computed twice.)
		qCoh, qPairs := pairwiseCoherence(qHeads)
		if qPairs > 0 {
			totalQDiv += 1.0 - (qCoh / float64(qPairs))
		}

		// Q/K Coherence Ratio.
		kCoh, kPairs := pairwiseCoherence(kHeads)
		if qPairs > 0 && kPairs > 0 {
			kMeanCoh := kCoh / float64(kPairs)
			// Guard against dividing by a near-zero K coherence.
			if kMeanCoh > 0.01 {
				totalRatio += (qCoh / float64(qPairs)) / kMeanCoh
			}
		}
	}
	if validLayers > 0 {
		result.QKAlignment = totalAlignment / float64(validLayers)
		result.QHeadDiversity = totalQDiv / float64(validLayers)
		result.QKCoherenceRatio = totalRatio / float64(validLayers)
	}
}

// pairwiseCoherence returns the summed cosine similarity over all unordered
// head pairs, plus the number of pairs, for one layer's head vectors.
func pairwiseCoherence(heads [][]float32) (sum float64, pairs int) {
	for i := 0; i < len(heads); i++ {
		for j := i + 1; j < len(heads); j++ {
			sum += cosineSim32(heads[i], heads[j])
			pairs++
		}
	}
	return sum, pairs
}
// cosineSim32 computes cosine similarity between two float32 slices.
func cosineSim32(a, b []float32) float64 {
if len(a) != len(b) || len(a) == 0 {
@ -358,10 +435,12 @@ func headEntropy(head []float32, seqLen, headDim int) float64 {
return entropy / maxEntropy
}
// AttentionFeatures returns a 5D feature vector from BO metrics.
// AttentionFeatures returns an 8D feature vector from BO metrics.
// Dims 0-4: K-only metrics (always populated).
// Dims 5-7: Q/K interaction metrics (zero when Q not available).
func AttentionFeatures(ar *BOResult) []float64 {
if ar == nil {
return make([]float64, 5)
return make([]float64, 8)
}
return []float64{
ar.MeanCoherence,
@ -369,10 +448,13 @@ func AttentionFeatures(ar *BOResult) []float64 {
ar.MeanHeadEntropy,
ar.PhaseLockScore,
math.Max(0, 1.0-float64(ar.JointCollapseCount)*0.2),
ar.QKAlignment,
ar.QHeadDiversity,
ar.QKCoherenceRatio,
}
}
// AttentionFeatureLabels returns the labels for the attention feature vector.
// AttentionFeatureLabels returns the labels for the 8D attention feature vector.
func AttentionFeatureLabels() []string {
return []string{
"mean_coherence",
@ -380,5 +462,8 @@ func AttentionFeatureLabels() []string {
"head_entropy",
"phase_lock",
"joint_stability",
"qk_alignment",
"q_head_diversity",
"qk_coherence_ratio",
}
}

View file

@ -168,8 +168,8 @@ func TestAttentionFeatures_Good(t *testing.T) {
JointCollapseCount: 1,
}
f := AttentionFeatures(result)
if len(f) != 5 {
t.Fatalf("expected 5D, got %dD", len(f))
if len(f) != 8 {
t.Fatalf("expected 8D, got %dD", len(f))
}
if f[0] != 0.85 {
t.Fatalf("expected coherence 0.85, got %f", f[0])
@ -182,8 +182,8 @@ func TestAttentionFeatures_Good(t *testing.T) {
func TestAttentionFeatures_Nil_Good(t *testing.T) {
f := AttentionFeatures(nil)
if len(f) != 5 {
t.Fatalf("expected 5D, got %dD", len(f))
if len(f) != 8 {
t.Fatalf("expected 8D, got %dD", len(f))
}
for i, v := range f {
if v != 0 {
@ -194,8 +194,8 @@ func TestAttentionFeatures_Nil_Good(t *testing.T) {
func TestAttentionFeatureLabels_Good(t *testing.T) {
labels := AttentionFeatureLabels()
if len(labels) != 5 {
t.Fatalf("expected 5 labels, got %d", len(labels))
if len(labels) != 8 {
t.Fatalf("expected 8 labels, got %d", len(labels))
}
}
@ -204,8 +204,8 @@ func TestFullFeatures_Good(t *testing.T) {
hs := HeuristicScores{ComplianceMarkers: 1, FirstPerson: 2}
bo := &BOResult{MeanCoherence: 0.85, MeanCrossAlignment: 0.80, MeanHeadEntropy: 0.70, PhaseLockScore: 0.90}
f := FullFeatures(gs, hs, bo)
if len(f) != 19 {
t.Fatalf("expected 19D, got %dD", len(f))
if len(f) != 22 {
t.Fatalf("expected 22D, got %dD", len(f))
}
// Grammar starts at 0, heuristic at 6, attention at 14.
if f[0] != 0.5 {
@ -220,11 +220,11 @@ func TestFullFeatures_NilBO_Good(t *testing.T) {
gs := GrammarScore{VocabRichness: 0.5}
hs := HeuristicScores{}
f := FullFeatures(gs, hs, nil)
if len(f) != 19 {
t.Fatalf("expected 19D, got %dD", len(f))
if len(f) != 22 {
t.Fatalf("expected 22D, got %dD", len(f))
}
// Attention dims should be zero.
for i := 14; i < 19; i++ {
for i := 14; i < 22; i++ {
if f[i] != 0 {
t.Fatalf("expected zero at dim %d, got %f", i, f[i])
}
@ -233,14 +233,68 @@ func TestFullFeatures_NilBO_Good(t *testing.T) {
func TestFullFeatureLabels_Good(t *testing.T) {
labels := FullFeatureLabels()
if len(labels) != 19 {
t.Fatalf("expected 19 labels, got %d", len(labels))
if len(labels) != 22 {
t.Fatalf("expected 22 labels, got %d", len(labels))
}
if labels[14] != "mean_coherence" {
t.Fatalf("expected label[14]='mean_coherence', got %q", labels[14])
}
}
// TestAnalyseAttention_QK_Good verifies that a snapshot carrying Q vectors
// produces non-zero Q/K interaction metrics.
func TestAnalyseAttention_QK_Good(t *testing.T) {
	snap := makeQKSnapshot(4, 8, 2, 8, 64)
	result := AnalyseAttention(snap)
	if !snap.HasQueries() {
		t.Fatal("expected HasQueries() == true")
	}
	for _, check := range []struct {
		metric string
		value  float64
	}{
		{"QKAlignment", result.QKAlignment},
		{"QHeadDiversity", result.QHeadDiversity},
	} {
		if check.value == 0 {
			t.Fatalf("expected non-zero %s", check.metric)
		}
	}
}
// TestAttentionFeatures_QK_Good verifies that the Q/K metrics occupy
// dims 5-7 of the 8D feature vector.
func TestAttentionFeatures_QK_Good(t *testing.T) {
	result := &BOResult{
		MeanCoherence:      0.85,
		MeanCrossAlignment: 0.80,
		MeanHeadEntropy:    0.70,
		PhaseLockScore:     0.90,
		JointCollapseCount: 0,
		QKAlignment:        0.75,
		QHeadDiversity:     0.60,
		QKCoherenceRatio:   1.1,
	}
	f := AttentionFeatures(result)
	if got := len(f); got != 8 {
		t.Fatalf("expected 8D with Q data, got %dD", got)
	}
	if got := f[5]; got != 0.75 {
		t.Fatalf("expected qk_alignment 0.75, got %f", got)
	}
}
// TestAttentionFeatures_KOnly_Good verifies that without Q data the
// feature vector is still 8D, with dims 5-7 zero-filled.
func TestAttentionFeatures_KOnly_Good(t *testing.T) {
	result := &BOResult{
		MeanCoherence:      0.85,
		MeanCrossAlignment: 0.80,
		MeanHeadEntropy:    0.70,
		PhaseLockScore:     0.90,
		JointCollapseCount: 0,
	}
	f := AttentionFeatures(result)
	if got := len(f); got != 8 {
		t.Fatalf("expected 8D (zero-filled Q dims), got %dD", got)
	}
	for dim := 5; dim < 8; dim++ {
		if f[dim] != 0 {
			t.Fatalf("expected zero at dim %d for K-only, got %f", dim, f[dim])
		}
	}
}
// --- Test helpers ---
// makeCoherentSnapshot creates a snapshot where all heads in all layers
@ -295,3 +349,38 @@ func makeOrthogonalSnapshot(layers, heads, seqLen, dim int) *inference.Attention
Architecture: "test",
}
}
// makeQKSnapshot creates a snapshot with both Q and K vectors, filled with
// deterministic pseudo-random values (seeded PCG) so tests are repeatable.
// Keys are filled before queries within each layer, keeping RNG consumption
// order stable.
func makeQKSnapshot(layers, qHeads, kvHeads, seqLen, dim int) *inference.AttentionSnapshot {
	rng := rand.New(rand.NewPCG(99, 0))
	keys := make([][][]float32, layers)
	queries := make([][][]float32, layers)
	for l := range layers {
		keys[l] = randHeads(rng, kvHeads, seqLen*dim)
		queries[l] = randHeads(rng, qHeads, seqLen*dim)
	}
	return &inference.AttentionSnapshot{
		NumLayers:     layers,
		NumHeads:      kvHeads,
		NumQueryHeads: qHeads,
		SeqLen:        seqLen,
		HeadDim:       dim,
		Keys:          keys,
		Queries:       queries,
		Architecture:  "test",
	}
}

// randHeads returns n head vectors of the given size, each element drawn
// uniformly from [-1, 1).
func randHeads(rng *rand.Rand, n, size int) [][]float32 {
	heads := make([][]float32, n)
	for h := range heads {
		vec := make([]float32, size)
		for i := range vec {
			vec[i] = rng.Float32()*2 - 1
		}
		heads[h] = vec
	}
	return heads
}

View file

@ -69,13 +69,13 @@ func CombinedFeatureLabels() []string {
return append(GrammarFeatureLabels(), HeuristicFeatureLabels()...)
}
// FullFeatures concatenates grammar (6D) + heuristic (8D) + attention (8D) into a 22D vector.
// If bo is nil, the attention dimensions are zero-filled.
func FullFeatures(gs GrammarScore, hs HeuristicScores, bo *BOResult) []float64 {
	features := CombinedFeatures(gs, hs)
	features = append(features, AttentionFeatures(bo)...)
	return features
}
// FullFeatureLabels returns axis labels for the 22D full vector.
func FullFeatureLabels() []string {
	labels := CombinedFeatureLabels()
	labels = append(labels, AttentionFeatureLabels()...)
	return labels
}