Gemma3-4B has 4 KV heads — too few for meaningful pairwise head coherence (only 6 pairs). Position-wise differentiation gives richer signal. Multi-head path now requires ≥5 heads. 4B baseline (260 sovereign probes): mean=6487, stdev=153, range=6170-6886. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
384 lines
11 KiB
Go
384 lines
11 KiB
Go
// Q/K Bone Orientation analysis engine.
|
|
//
|
|
// Computes attention coherence metrics from KV cache snapshots.
|
|
// Pure Go CPU math — no GPU, no CGO dependencies.
|
|
package lem
|
|
|
|
import (
|
|
"math"
|
|
|
|
"forge.lthn.ai/core/go-inference"
|
|
)
|
|
|
|
// BOResult holds Q/K Bone Orientation metrics for a single inference.
type BOResult struct {
	MeanCoherence       float64   `json:"mean_coherence"`        // Mean pairwise head coherence (0-1); position differentiation in GQA mode
	MeanCrossAlignment  float64   `json:"mean_cross_alignment"`  // Mean adjacent-layer alignment (0-1)
	MeanHeadEntropy     float64   `json:"mean_head_entropy"`     // Mean attention entropy per head (0-1)
	PhaseLockScore      float64   `json:"phase_lock_score"`      // Fraction of pairs above threshold
	JointCollapseCount  int       `json:"joint_collapse_count"`  // Layers where cross-alignment drops below threshold
	LayerCoherence      []float64 `json:"layer_coherence"`       // Per-layer coherence
	LayerCrossAlignment []float64 `json:"layer_cross_alignment"` // Per-layer cross-alignment (len = layers-1)
	GQA                 bool      `json:"gqa"`                   // True when analysis used position-wise (GQA) mode
}

// Composite returns a 0-10000 integer score from BO metrics.
// Integer scale avoids floating-point rounding — same principle as blockchain
// ledgers where 1.337 LTHN is stored as 133700 atomic units.
func (r *BOResult) Composite() int {
	// Position-wise results use their own weighting.
	if r.GQA {
		return r.compositeGQA()
	}
	// Weighted sum of the five multi-head signals (weights sum to 1.0).
	weighted := 0.30 * r.MeanCoherence
	weighted += 0.25 * r.MeanCrossAlignment
	weighted += 0.20 * r.PhaseLockScore
	weighted += 0.15 * r.MeanHeadEntropy
	weighted += 0.10 * math.Max(0, 1.0-float64(r.JointCollapseCount)*0.2)
	return min(10000, max(0, int(weighted*10000.0)))
}

// compositeGQA weights for single-KV-head models where position differentiation
// is the primary signal.
func (r *BOResult) compositeGQA() int {
	// Scale differentiation from [0.1, 0.7] to [0, 1].
	diff := (r.MeanCoherence - 0.1) / 0.6
	diff = min(1, max(0, diff))

	// Layer spread: standard deviation of per-layer differentiation scores
	// around the overall mean.
	var spread float64
	if len(r.LayerCoherence) > 1 {
		var sumSq float64
		for _, c := range r.LayerCoherence {
			dev := c - r.MeanCoherence
			sumSq += dev * dev
		}
		spread = math.Sqrt(sumSq / float64(len(r.LayerCoherence)))
	}
	// Scale variance from [0, 0.2] to [0, 1].
	scaledVar := min(1, spread/0.2)

	// Joint stability: each collapse costs 0.2, floored at zero.
	stability := math.Max(0, 1.0-float64(r.JointCollapseCount)*0.2)

	// Weighted sum (weights sum to 1.0), clamped into the 0-10000 band.
	total := 0.45 * diff
	total += 0.25 * scaledVar
	total += 0.15 * r.MeanHeadEntropy
	total += 0.15 * stability
	return min(10000, max(0, int(total*10000.0)))
}
|
|
|
|
const (
	// coherenceThreshold is the minimum cosine similarity for a head pair to
	// count as "phase-locked" in multi-head mode. In GQA mode the test is
	// inverted: position pairs with similarity below 1-coherenceThreshold
	// count as locked (i.e. well-differentiated).
	coherenceThreshold = 0.7
	// collapseThreshold is the cross-alignment (multi-head mode) or smoothness
	// (GQA mode) below which a layer transition counts as a joint collapse.
	collapseThreshold = 0.5
)
|
|
|
|
// AnalyseAttention computes Q/K Bone Orientation metrics from a KV cache snapshot.
// For multi-head models (≥5 KV heads): pairwise head coherence within layers.
// For GQA / few-head models (≤4 KV heads): position-wise analysis within a single head.
// A nil snapshot or one with no key data yields a zero-valued result.
func AnalyseAttention(snap *inference.AttentionSnapshot) *BOResult {
	if snap == nil || len(snap.Keys) == 0 {
		return &BOResult{}
	}

	// Use position-wise analysis for GQA models (≤4 KV heads).
	// With few heads, pairwise head coherence has too few pairs for signal
	// (4 heads give only 6 pairs). Position-wise analysis gives richer data
	// from any head count.
	if snap.NumHeads <= 4 {
		return analyseGQA(snap)
	}
	return analyseMultiHead(snap)
}
|
|
|
|
// analyseMultiHead handles models with ≥5 KV heads (the original pairwise
// algorithm; AnalyseAttention routes ≤4-head snapshots to analyseGQA instead).
//
// Per layer it measures the mean pairwise cosine similarity between head K
// vectors (coherence) and sums per-head magnitude entropy; across adjacent
// layers it compares mean head vectors (cross-alignment) and counts
// transitions below collapseThreshold as joint collapses.
func analyseMultiHead(snap *inference.AttentionSnapshot) *BOResult {
	result := &BOResult{
		LayerCoherence:      make([]float64, snap.NumLayers),
		LayerCrossAlignment: make([]float64, max(0, snap.NumLayers-1)),
	}

	var totalCoherence, totalEntropy float64
	var totalPairsLocked, totalPairs int
	// layerMeans[layer] is the element-wise mean K vector across that layer's
	// heads, used below for adjacent-layer cross-alignment. Skipped layers
	// stay nil and are excluded from the cross pass.
	layerMeans := make([][]float32, snap.NumLayers)

	for layer := 0; layer < snap.NumLayers; layer++ {
		// Layers with no captured keys keep coherence 0 and a nil mean.
		if layer >= len(snap.Keys) || snap.Keys[layer] == nil {
			continue
		}
		heads := snap.Keys[layer]
		nHeads := len(heads)

		layerMeans[layer] = meanVector(heads)

		// Pairwise cosine similarity over all head pairs in this layer;
		// pairs at or above coherenceThreshold count toward phase lock.
		var layerCoh float64
		var pairs int
		for i := 0; i < nHeads; i++ {
			for j := i + 1; j < nHeads; j++ {
				sim := cosineSim32(heads[i], heads[j])
				layerCoh += sim
				pairs++
				if sim >= coherenceThreshold {
					totalPairsLocked++
				}
				totalPairs++
			}
		}
		if pairs > 0 {
			layerCoh /= float64(pairs)
		}
		result.LayerCoherence[layer] = layerCoh
		totalCoherence += layerCoh

		// Accumulate magnitude entropy for every head in this layer.
		for _, head := range heads {
			totalEntropy += headEntropy(head, snap.SeqLen, snap.HeadDim)
		}
	}

	// Adjacent-layer alignment of mean head vectors; a drop below
	// collapseThreshold is recorded as a joint collapse.
	var totalCross float64
	for i := 0; i < snap.NumLayers-1; i++ {
		if layerMeans[i] == nil || layerMeans[i+1] == nil {
			continue
		}
		alignment := cosineSim32(layerMeans[i], layerMeans[i+1])
		result.LayerCrossAlignment[i] = alignment
		totalCross += alignment
		if alignment < collapseThreshold {
			result.JointCollapseCount++
		}
	}

	// NOTE(review): the means below divide by the full NumLayers / NumHeads
	// counts even when some layers were skipped above, which biases the
	// averages low for partial snapshots — confirm that is intended.
	if snap.NumLayers > 0 {
		result.MeanCoherence = totalCoherence / float64(snap.NumLayers)
	}
	if snap.NumLayers > 1 {
		result.MeanCrossAlignment = totalCross / float64(snap.NumLayers-1)
	}
	totalHeads := snap.NumLayers * snap.NumHeads
	if totalHeads > 0 {
		result.MeanHeadEntropy = totalEntropy / float64(totalHeads)
	}
	if totalPairs > 0 {
		result.PhaseLockScore = float64(totalPairsLocked) / float64(totalPairs)
	}

	return result
}
|
|
|
|
// analyseGQA handles models with 1 KV head by analysing position-wise patterns.
|
|
//
|
|
// With a single KV head, each layer gives us seq_len K vectors of dim head_dim.
|
|
// We measure:
|
|
// - Position differentiation: mean pairwise cosine distance between token positions.
|
|
// Low similarity = model distinguishes tokens (healthy). High = collapsed.
|
|
// Mapped to MeanCoherence as 1-similarity (so high = good differentiation).
|
|
// - Cross-layer position tracking: for each token position, cosine sim of its
|
|
// K vector between adjacent layers. High = stable representation through depth.
|
|
// - Entropy: same as multi-head (magnitude distribution across positions).
|
|
func analyseGQA(snap *inference.AttentionSnapshot) *BOResult {
|
|
result := &BOResult{
|
|
GQA: true,
|
|
LayerCoherence: make([]float64, snap.NumLayers),
|
|
LayerCrossAlignment: make([]float64, max(0, snap.NumLayers-1)),
|
|
}
|
|
|
|
seqLen := snap.SeqLen
|
|
headDim := snap.HeadDim
|
|
if seqLen < 2 || headDim == 0 {
|
|
return result
|
|
}
|
|
|
|
// Extract per-position K vectors for each layer.
|
|
// posVecs[layer][pos] = float32 slice of len headDim.
|
|
posVecs := make([][][]float32, snap.NumLayers)
|
|
|
|
var totalDiff, totalEntropy float64
|
|
var totalPairsLocked, totalPairs int
|
|
|
|
for layer := 0; layer < snap.NumLayers; layer++ {
|
|
if layer >= len(snap.Keys) || snap.Keys[layer] == nil || len(snap.Keys[layer]) == 0 {
|
|
continue
|
|
}
|
|
flat := snap.Keys[layer][0] // Single head, flat [seq_len*head_dim].
|
|
|
|
// Split into per-position vectors.
|
|
vecs := make([][]float32, seqLen)
|
|
for pos := 0; pos < seqLen; pos++ {
|
|
start := pos * headDim
|
|
end := start + headDim
|
|
if end > len(flat) {
|
|
break
|
|
}
|
|
vecs[pos] = flat[start:end]
|
|
}
|
|
posVecs[layer] = vecs
|
|
|
|
// Position differentiation: pairwise cosine sim between positions.
|
|
// We want LOW similarity = tokens are distinct = good.
|
|
// Store as differentiation score = 1 - mean_sim.
|
|
var simSum float64
|
|
var pairs int
|
|
for i := 0; i < len(vecs); i++ {
|
|
for j := i + 1; j < len(vecs); j++ {
|
|
if vecs[i] == nil || vecs[j] == nil {
|
|
continue
|
|
}
|
|
sim := cosineSim32(vecs[i], vecs[j])
|
|
simSum += sim
|
|
pairs++
|
|
// In GQA mode, "phase-lock" = position pairs that are well-differentiated.
|
|
if sim < (1.0 - coherenceThreshold) {
|
|
totalPairsLocked++
|
|
}
|
|
totalPairs++
|
|
}
|
|
}
|
|
diffScore := 0.0
|
|
if pairs > 0 {
|
|
meanSim := simSum / float64(pairs)
|
|
diffScore = 1.0 - meanSim // High = good differentiation.
|
|
}
|
|
result.LayerCoherence[layer] = diffScore
|
|
totalDiff += diffScore
|
|
|
|
// Entropy.
|
|
totalEntropy += headEntropy(flat, seqLen, headDim)
|
|
}
|
|
|
|
// Cross-layer analysis for GQA: instead of raw vector comparison (meaningless
|
|
// because each layer has its own K projection), measure the CHANGE in differentiation
|
|
// between adjacent layers. A stable model maintains consistent differentiation;
|
|
// a collapsing model shows sudden drops.
|
|
for i := 0; i < snap.NumLayers-1; i++ {
|
|
// Differentiation delta: how much differentiation changes between layers.
|
|
// Small delta = smooth posture. Large delta = joint snap.
|
|
delta := math.Abs(result.LayerCoherence[i+1] - result.LayerCoherence[i])
|
|
smoothness := 1.0 - delta // High = smooth transition.
|
|
result.LayerCrossAlignment[i] = smoothness
|
|
if smoothness < collapseThreshold {
|
|
result.JointCollapseCount++
|
|
}
|
|
}
|
|
|
|
// Mean cross-alignment = mean smoothness.
|
|
var totalCross float64
|
|
for _, v := range result.LayerCrossAlignment {
|
|
totalCross += v
|
|
}
|
|
|
|
if snap.NumLayers > 0 {
|
|
result.MeanCoherence = totalDiff / float64(snap.NumLayers)
|
|
}
|
|
if len(result.LayerCrossAlignment) > 0 {
|
|
result.MeanCrossAlignment = totalCross / float64(len(result.LayerCrossAlignment))
|
|
}
|
|
if snap.NumLayers > 0 {
|
|
result.MeanHeadEntropy = totalEntropy / float64(snap.NumLayers)
|
|
}
|
|
if totalPairs > 0 {
|
|
result.PhaseLockScore = float64(totalPairsLocked) / float64(totalPairs)
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// cosineSim32 computes cosine similarity between two float32 slices.
// It returns 0 for mismatched lengths, empty input, or a zero-magnitude vector.
func cosineSim32(a, b []float32) float64 {
	if len(a) == 0 || len(a) != len(b) {
		return 0
	}
	var dot, sqA, sqB float64
	for i := 0; i < len(a); i++ {
		x := float64(a[i])
		y := float64(b[i])
		dot += x * y
		sqA += x * x
		sqB += y * y
	}
	norm := math.Sqrt(sqA) * math.Sqrt(sqB)
	if norm == 0 {
		return 0
	}
	return dot / norm
}
|
|
|
|
// meanVector computes the element-wise mean across multiple float32 slices.
// The output length follows the first slice; shorter inputs simply contribute
// nothing to the positions they lack (still divided by the slice count).
// Returns nil for empty input.
func meanVector(vecs [][]float32) []float32 {
	if len(vecs) == 0 {
		return nil
	}
	dim := len(vecs[0])
	acc := make([]float32, dim)
	for _, vec := range vecs {
		limit := min(dim, len(vec))
		for i := 0; i < limit; i++ {
			acc[i] += vec[i]
		}
	}
	count := float32(len(vecs))
	for i := range acc {
		acc[i] /= count
	}
	return acc
}
|
|
|
|
// headEntropy computes normalised Shannon entropy of K vector magnitudes
// across sequence positions for a single head. Returns a value in [0, 1]:
// 1 means magnitude is spread evenly across positions, 0 means degenerate
// input (empty dims, all-zero magnitudes, or a single position).
func headEntropy(head []float32, seqLen, headDim int) float64 {
	if seqLen == 0 || headDim == 0 {
		return 0
	}

	// L2 magnitude of each position's K vector; short buffers contribute
	// only the elements actually present.
	mags := make([]float64, seqLen)
	var total float64
	for pos := range mags {
		base := pos * headDim
		var sq float64
		for d := 0; d < headDim && base+d < len(head); d++ {
			x := float64(head[base+d])
			sq += x * x
		}
		mags[pos] = math.Sqrt(sq)
		total += mags[pos]
	}
	if total == 0 {
		return 0
	}

	// Shannon entropy of the magnitude distribution.
	var h float64
	for _, m := range mags {
		if p := m / total; p > 0 {
			h -= p * math.Log2(p)
		}
	}
	// Normalise by the maximum possible entropy, log2(seqLen).
	denom := math.Log2(float64(seqLen))
	if denom == 0 {
		return 0
	}
	return h / denom
}
|
|
|
|
// AttentionFeatures returns a 5D feature vector from BO metrics.
|
|
func AttentionFeatures(ar *BOResult) []float64 {
|
|
if ar == nil {
|
|
return make([]float64, 5)
|
|
}
|
|
return []float64{
|
|
ar.MeanCoherence,
|
|
ar.MeanCrossAlignment,
|
|
ar.MeanHeadEntropy,
|
|
ar.PhaseLockScore,
|
|
math.Max(0, 1.0-float64(ar.JointCollapseCount)*0.2),
|
|
}
|
|
}
|
|
|
|
// AttentionFeatureLabels returns the labels for the attention feature vector,
// in the same order as the values produced by AttentionFeatures.
func AttentionFeatureLabels() []string {
	labels := make([]string, 0, 5)
	labels = append(labels,
		"mean_coherence",
		"cross_alignment",
		"head_entropy",
		"phase_lock",
		"joint_stability",
	)
	return labels
}
|