feat: upgrade BO analysis to 8D with Q/K interaction metrics (22D full vector)
Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
1b570b8229
commit
54151c463b
3 changed files with 197 additions and 23 deletions
|
|
@ -20,6 +20,9 @@ type BOResult struct {
|
|||
LayerCoherence []float64 `json:"layer_coherence"` // Per-layer coherence
|
||||
LayerCrossAlignment []float64 `json:"layer_cross_alignment"` // Per-layer cross-alignment (len = layers-1)
|
||||
GQA bool `json:"gqa"` // True when analysis used position-wise mode (single KV head)
|
||||
QKAlignment float64 `json:"qk_alignment"` // Mean Q/K head centroid alignment (0-1)
|
||||
QHeadDiversity float64 `json:"q_head_diversity"` // Mean pairwise Q head distance (0-1)
|
||||
QKCoherenceRatio float64 `json:"qk_coherence_ratio"` // Q coherence / K coherence per layer
|
||||
}
|
||||
|
||||
// Composite returns a 0-10000 integer score from BO metrics.
|
||||
|
|
@ -81,13 +84,19 @@ func AnalyseAttention(snap *inference.AttentionSnapshot) *BOResult {
|
|||
return &BOResult{}
|
||||
}
|
||||
|
||||
// Use position-wise analysis for GQA models (≤4 KV heads).
|
||||
// With few heads, pairwise head coherence has too few pairs for signal.
|
||||
// Position-wise analysis gives richer data from any head count.
|
||||
var result *BOResult
|
||||
if snap.NumHeads <= 4 {
|
||||
return analyseGQA(snap)
|
||||
result = analyseGQA(snap)
|
||||
} else {
|
||||
result = analyseMultiHead(snap)
|
||||
}
|
||||
return analyseMultiHead(snap)
|
||||
|
||||
// Add Q/K interaction metrics when Q data is available.
|
||||
if snap.HasQueries() {
|
||||
analyseQK(snap, result)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// analyseMultiHead handles models with more than 4 KV heads (original algorithm).
|
||||
|
|
@ -281,6 +290,74 @@ func analyseGQA(snap *inference.AttentionSnapshot) *BOResult {
|
|||
return result
|
||||
}
|
||||
|
||||
// analyseQK computes Q/K interaction metrics when both Q and K vectors are available.
|
||||
func analyseQK(snap *inference.AttentionSnapshot, result *BOResult) {
|
||||
var totalAlignment, totalQDiv, totalRatio float64
|
||||
var validLayers int
|
||||
|
||||
for layer := 0; layer < snap.NumLayers; layer++ {
|
||||
if layer >= len(snap.Queries) || snap.Queries[layer] == nil {
|
||||
continue
|
||||
}
|
||||
if layer >= len(snap.Keys) || snap.Keys[layer] == nil {
|
||||
continue
|
||||
}
|
||||
qHeads := snap.Queries[layer]
|
||||
kHeads := snap.Keys[layer]
|
||||
validLayers++
|
||||
|
||||
// Q/K Alignment: cosine sim between mean Q vector and mean K vector.
|
||||
qMean := meanVector(qHeads)
|
||||
kMean := meanVector(kHeads)
|
||||
if qMean != nil && kMean != nil {
|
||||
totalAlignment += math.Abs(cosineSim32(qMean, kMean))
|
||||
}
|
||||
|
||||
// Q Head Diversity: mean pairwise cosine distance between Q heads.
|
||||
var qSim float64
|
||||
var qPairs int
|
||||
for i := 0; i < len(qHeads); i++ {
|
||||
for j := i + 1; j < len(qHeads); j++ {
|
||||
qSim += cosineSim32(qHeads[i], qHeads[j])
|
||||
qPairs++
|
||||
}
|
||||
}
|
||||
if qPairs > 0 {
|
||||
totalQDiv += 1.0 - (qSim / float64(qPairs))
|
||||
}
|
||||
|
||||
// Q/K Coherence Ratio.
|
||||
var qCoh float64
|
||||
var qP int
|
||||
for i := 0; i < len(qHeads); i++ {
|
||||
for j := i + 1; j < len(qHeads); j++ {
|
||||
qCoh += cosineSim32(qHeads[i], qHeads[j])
|
||||
qP++
|
||||
}
|
||||
}
|
||||
var kCoh float64
|
||||
var kP int
|
||||
for i := 0; i < len(kHeads); i++ {
|
||||
for j := i + 1; j < len(kHeads); j++ {
|
||||
kCoh += cosineSim32(kHeads[i], kHeads[j])
|
||||
kP++
|
||||
}
|
||||
}
|
||||
if qP > 0 && kP > 0 {
|
||||
kMeanCoh := kCoh / float64(kP)
|
||||
if kMeanCoh > 0.01 {
|
||||
totalRatio += (qCoh / float64(qP)) / kMeanCoh
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if validLayers > 0 {
|
||||
result.QKAlignment = totalAlignment / float64(validLayers)
|
||||
result.QHeadDiversity = totalQDiv / float64(validLayers)
|
||||
result.QKCoherenceRatio = totalRatio / float64(validLayers)
|
||||
}
|
||||
}
|
||||
|
||||
// cosineSim32 computes cosine similarity between two float32 slices.
|
||||
func cosineSim32(a, b []float32) float64 {
|
||||
if len(a) != len(b) || len(a) == 0 {
|
||||
|
|
@ -358,10 +435,12 @@ func headEntropy(head []float32, seqLen, headDim int) float64 {
|
|||
return entropy / maxEntropy
|
||||
}
|
||||
|
||||
// AttentionFeatures returns a 5D feature vector from BO metrics.
|
||||
// AttentionFeatures returns an 8D feature vector from BO metrics.
|
||||
// Dims 0-4: K-only metrics (always populated).
|
||||
// Dims 5-7: Q/K interaction metrics (zero when Q not available).
|
||||
func AttentionFeatures(ar *BOResult) []float64 {
|
||||
if ar == nil {
|
||||
return make([]float64, 5)
|
||||
return make([]float64, 8)
|
||||
}
|
||||
return []float64{
|
||||
ar.MeanCoherence,
|
||||
|
|
@ -369,10 +448,13 @@ func AttentionFeatures(ar *BOResult) []float64 {
|
|||
ar.MeanHeadEntropy,
|
||||
ar.PhaseLockScore,
|
||||
math.Max(0, 1.0-float64(ar.JointCollapseCount)*0.2),
|
||||
ar.QKAlignment,
|
||||
ar.QHeadDiversity,
|
||||
ar.QKCoherenceRatio,
|
||||
}
|
||||
}
|
||||
|
||||
// AttentionFeatureLabels returns the labels for the attention feature vector.
|
||||
// AttentionFeatureLabels returns the labels for the 8D attention feature vector.
|
||||
func AttentionFeatureLabels() []string {
|
||||
return []string{
|
||||
"mean_coherence",
|
||||
|
|
@ -380,5 +462,8 @@ func AttentionFeatureLabels() []string {
|
|||
"head_entropy",
|
||||
"phase_lock",
|
||||
"joint_stability",
|
||||
"qk_alignment",
|
||||
"q_head_diversity",
|
||||
"qk_coherence_ratio",
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -168,8 +168,8 @@ func TestAttentionFeatures_Good(t *testing.T) {
|
|||
JointCollapseCount: 1,
|
||||
}
|
||||
f := AttentionFeatures(result)
|
||||
if len(f) != 5 {
|
||||
t.Fatalf("expected 5D, got %dD", len(f))
|
||||
if len(f) != 8 {
|
||||
t.Fatalf("expected 8D, got %dD", len(f))
|
||||
}
|
||||
if f[0] != 0.85 {
|
||||
t.Fatalf("expected coherence 0.85, got %f", f[0])
|
||||
|
|
@ -182,8 +182,8 @@ func TestAttentionFeatures_Good(t *testing.T) {
|
|||
|
||||
func TestAttentionFeatures_Nil_Good(t *testing.T) {
|
||||
f := AttentionFeatures(nil)
|
||||
if len(f) != 5 {
|
||||
t.Fatalf("expected 5D, got %dD", len(f))
|
||||
if len(f) != 8 {
|
||||
t.Fatalf("expected 8D, got %dD", len(f))
|
||||
}
|
||||
for i, v := range f {
|
||||
if v != 0 {
|
||||
|
|
@ -194,8 +194,8 @@ func TestAttentionFeatures_Nil_Good(t *testing.T) {
|
|||
|
||||
func TestAttentionFeatureLabels_Good(t *testing.T) {
|
||||
labels := AttentionFeatureLabels()
|
||||
if len(labels) != 5 {
|
||||
t.Fatalf("expected 5 labels, got %d", len(labels))
|
||||
if len(labels) != 8 {
|
||||
t.Fatalf("expected 8 labels, got %d", len(labels))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -204,8 +204,8 @@ func TestFullFeatures_Good(t *testing.T) {
|
|||
hs := HeuristicScores{ComplianceMarkers: 1, FirstPerson: 2}
|
||||
bo := &BOResult{MeanCoherence: 0.85, MeanCrossAlignment: 0.80, MeanHeadEntropy: 0.70, PhaseLockScore: 0.90}
|
||||
f := FullFeatures(gs, hs, bo)
|
||||
if len(f) != 19 {
|
||||
t.Fatalf("expected 19D, got %dD", len(f))
|
||||
if len(f) != 22 {
|
||||
t.Fatalf("expected 22D, got %dD", len(f))
|
||||
}
|
||||
// Grammar starts at 0, heuristic at 6, attention at 14.
|
||||
if f[0] != 0.5 {
|
||||
|
|
@ -220,11 +220,11 @@ func TestFullFeatures_NilBO_Good(t *testing.T) {
|
|||
gs := GrammarScore{VocabRichness: 0.5}
|
||||
hs := HeuristicScores{}
|
||||
f := FullFeatures(gs, hs, nil)
|
||||
if len(f) != 19 {
|
||||
t.Fatalf("expected 19D, got %dD", len(f))
|
||||
if len(f) != 22 {
|
||||
t.Fatalf("expected 22D, got %dD", len(f))
|
||||
}
|
||||
// Attention dims should be zero.
|
||||
for i := 14; i < 19; i++ {
|
||||
for i := 14; i < 22; i++ {
|
||||
if f[i] != 0 {
|
||||
t.Fatalf("expected zero at dim %d, got %f", i, f[i])
|
||||
}
|
||||
|
|
@ -233,14 +233,68 @@ func TestFullFeatures_NilBO_Good(t *testing.T) {
|
|||
|
||||
func TestFullFeatureLabels_Good(t *testing.T) {
|
||||
labels := FullFeatureLabels()
|
||||
if len(labels) != 19 {
|
||||
t.Fatalf("expected 19 labels, got %d", len(labels))
|
||||
if len(labels) != 22 {
|
||||
t.Fatalf("expected 22 labels, got %d", len(labels))
|
||||
}
|
||||
if labels[14] != "mean_coherence" {
|
||||
t.Fatalf("expected label[14]='mean_coherence', got %q", labels[14])
|
||||
}
|
||||
}
|
||||
|
||||
func TestAnalyseAttention_QK_Good(t *testing.T) {
|
||||
snap := makeQKSnapshot(4, 8, 2, 8, 64)
|
||||
result := AnalyseAttention(snap)
|
||||
|
||||
if !snap.HasQueries() {
|
||||
t.Fatal("expected HasQueries() == true")
|
||||
}
|
||||
if result.QKAlignment == 0 {
|
||||
t.Fatal("expected non-zero QKAlignment")
|
||||
}
|
||||
if result.QHeadDiversity == 0 {
|
||||
t.Fatal("expected non-zero QHeadDiversity")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttentionFeatures_QK_Good(t *testing.T) {
|
||||
result := &BOResult{
|
||||
MeanCoherence: 0.85,
|
||||
MeanCrossAlignment: 0.80,
|
||||
MeanHeadEntropy: 0.70,
|
||||
PhaseLockScore: 0.90,
|
||||
JointCollapseCount: 0,
|
||||
QKAlignment: 0.75,
|
||||
QHeadDiversity: 0.60,
|
||||
QKCoherenceRatio: 1.1,
|
||||
}
|
||||
f := AttentionFeatures(result)
|
||||
if len(f) != 8 {
|
||||
t.Fatalf("expected 8D with Q data, got %dD", len(f))
|
||||
}
|
||||
if f[5] != 0.75 {
|
||||
t.Fatalf("expected qk_alignment 0.75, got %f", f[5])
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttentionFeatures_KOnly_Good(t *testing.T) {
|
||||
result := &BOResult{
|
||||
MeanCoherence: 0.85,
|
||||
MeanCrossAlignment: 0.80,
|
||||
MeanHeadEntropy: 0.70,
|
||||
PhaseLockScore: 0.90,
|
||||
JointCollapseCount: 0,
|
||||
}
|
||||
f := AttentionFeatures(result)
|
||||
if len(f) != 8 {
|
||||
t.Fatalf("expected 8D (zero-filled Q dims), got %dD", len(f))
|
||||
}
|
||||
for i := 5; i < 8; i++ {
|
||||
if f[i] != 0 {
|
||||
t.Fatalf("expected zero at dim %d for K-only, got %f", i, f[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- Test helpers ---
|
||||
|
||||
// makeCoherentSnapshot creates a snapshot where all heads in all layers
|
||||
|
|
@ -295,3 +349,38 @@ func makeOrthogonalSnapshot(layers, heads, seqLen, dim int) *inference.Attention
|
|||
Architecture: "test",
|
||||
}
|
||||
}
|
||||
|
||||
// makeQKSnapshot creates a snapshot with both Q and K vectors.
|
||||
func makeQKSnapshot(layers, qHeads, kvHeads, seqLen, dim int) *inference.AttentionSnapshot {
|
||||
rng := rand.New(rand.NewPCG(99, 0))
|
||||
keys := make([][][]float32, layers)
|
||||
queries := make([][][]float32, layers)
|
||||
for l := range layers {
|
||||
keys[l] = make([][]float32, kvHeads)
|
||||
for h := range kvHeads {
|
||||
head := make([]float32, seqLen*dim)
|
||||
for i := range head {
|
||||
head[i] = rng.Float32()*2 - 1
|
||||
}
|
||||
keys[l][h] = head
|
||||
}
|
||||
queries[l] = make([][]float32, qHeads)
|
||||
for h := range qHeads {
|
||||
head := make([]float32, seqLen*dim)
|
||||
for i := range head {
|
||||
head[i] = rng.Float32()*2 - 1
|
||||
}
|
||||
queries[l][h] = head
|
||||
}
|
||||
}
|
||||
return &inference.AttentionSnapshot{
|
||||
NumLayers: layers,
|
||||
NumHeads: kvHeads,
|
||||
NumQueryHeads: qHeads,
|
||||
SeqLen: seqLen,
|
||||
HeadDim: dim,
|
||||
Keys: keys,
|
||||
Queries: queries,
|
||||
Architecture: "test",
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,13 +69,13 @@ func CombinedFeatureLabels() []string {
|
|||
return append(GrammarFeatureLabels(), HeuristicFeatureLabels()...)
|
||||
}
|
||||
|
||||
// FullFeatures concatenates grammar (6D) + heuristic (8D) + attention (5D) into a 19D vector.
|
||||
// FullFeatures concatenates grammar (6D) + heuristic (8D) + attention (8D) into a 22D vector.
|
||||
// If bo is nil, the attention dimensions are zero-filled.
|
||||
func FullFeatures(gs GrammarScore, hs HeuristicScores, bo *BOResult) []float64 {
|
||||
return append(CombinedFeatures(gs, hs), AttentionFeatures(bo)...)
|
||||
}
|
||||
|
||||
// FullFeatureLabels returns axis labels for the 19D full vector.
|
||||
// FullFeatureLabels returns axis labels for the 22D full vector.
|
||||
func FullFeatureLabels() []string {
|
||||
return append(CombinedFeatureLabels(), AttentionFeatureLabels()...)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue