1
0
Fork 0
forked from lthn/LEM
LEM/pkg/lem/attention.go
Snider 42c0af728b fix: raise GQA threshold to ≤4 KV heads for position-wise analysis
Gemma3-4B has 4 KV heads — too few for meaningful pairwise head
coherence (only 6 pairs). Position-wise differentiation gives richer
signal. Multi-head path now requires ≥5 heads.

4B baseline (260 sovereign probes): mean=6487, stdev=153, range=6170-6886.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 01:02:13 +00:00

384 lines
11 KiB
Go

// Q/K Bone Orientation analysis engine.
//
// Computes attention coherence metrics from KV cache snapshots.
// Pure Go CPU math — no GPU, no CGO dependencies.
package lem
import (
"math"
"forge.lthn.ai/core/go-inference"
)
// BOResult holds Q/K Bone Orientation metrics for a single inference.
//
// Produced by AnalyseAttention. In multi-head mode (≥5 KV heads) the metrics
// come from pairwise head coherence; in GQA/position-wise mode (≤4 KV heads,
// GQA == true) the same fields carry the position-differentiation analogues
// noted per field below.
type BOResult struct {
MeanCoherence float64 `json:"mean_coherence"` // Mean pairwise head coherence (0-1), or position differentiation for GQA
MeanCrossAlignment float64 `json:"mean_cross_alignment"` // Mean adjacent-layer alignment (0-1); GQA: mean smoothness of differentiation change
MeanHeadEntropy float64 `json:"mean_head_entropy"` // Mean attention entropy per head (0-1)
PhaseLockScore float64 `json:"phase_lock_score"` // Fraction of pairs above threshold; GQA: fraction of well-differentiated position pairs
JointCollapseCount int `json:"joint_collapse_count"` // Layers where cross-alignment drops below threshold
LayerCoherence []float64 `json:"layer_coherence"` // Per-layer coherence; GQA: per-layer differentiation score
LayerCrossAlignment []float64 `json:"layer_cross_alignment"` // Per-layer cross-alignment (len = layers-1)
GQA bool `json:"gqa"` // True when analysis used position-wise mode (≤4 KV heads)
}
// Composite returns a 0-10000 integer score from BO metrics.
// Integer scale avoids floating-point rounding — same principle as blockchain
// ledgers where 1.337 LTHN is stored as 133700 atomic units.
//
// A nil receiver scores 0, mirroring AttentionFeatures' nil handling.
func (r *BOResult) Composite() int {
	if r == nil {
		return 0
	}
	// GQA results use differentiation-based weights instead.
	if r.GQA {
		return r.compositeGQA()
	}
	// Weighted blend of the multi-head metrics; each joint collapse removes
	// 20% of the stability term, floored at zero.
	score := (0.30*r.MeanCoherence +
		0.25*r.MeanCrossAlignment +
		0.20*r.PhaseLockScore +
		0.15*r.MeanHeadEntropy +
		0.10*math.Max(0, 1.0-float64(r.JointCollapseCount)*0.2)) * 10000.0
	return min(10000, max(0, int(score)))
}
// compositeGQA weights for single-KV-head models where position differentiation
// is the primary signal.
func (r *BOResult) compositeGQA() int {
	// Differentiation rescaled from the working band [0.1, 0.7] onto [0, 1].
	diff := min(1, max(0, (r.MeanCoherence-0.1)/0.6))

	// Spread of per-layer differentiation: population std dev around the mean.
	var spread float64
	if n := len(r.LayerCoherence); n > 1 {
		var acc float64
		for _, c := range r.LayerCoherence {
			acc += (c - r.MeanCoherence) * (c - r.MeanCoherence)
		}
		spread = math.Sqrt(acc / float64(n))
	}
	// Spread band [0, 0.2] mapped onto [0, 1].
	variance := min(1, spread/0.2)

	// Each joint collapse costs 20% of the stability term, floored at zero.
	stability := math.Max(0, 1.0-float64(r.JointCollapseCount)*0.2)

	total := (0.45*diff +
		0.25*variance +
		0.15*r.MeanHeadEntropy +
		0.15*stability) * 10000.0
	return min(10000, max(0, int(total)))
}
// Thresholds shared by both analysis modes. Note the GQA path inverts the
// phase-lock test: a position pair counts as "locked" (well-differentiated)
// when its similarity is below 1-coherenceThreshold.
const (
coherenceThreshold = 0.7 // Minimum cosine sim for "phase-locked" head pair
collapseThreshold = 0.5 // Below this cross-alignment = joint collapse
)
// AnalyseAttention computes Q/K Bone Orientation metrics from a KV cache snapshot.
//
// Models with at least 5 KV heads use pairwise head coherence within layers.
// Models with 4 or fewer KV heads (GQA) use position-wise analysis instead:
// a handful of heads yields too few pairs for a meaningful coherence signal,
// while position-wise differentiation works at any head count.
// A nil or empty snapshot yields a zero-valued result.
func AnalyseAttention(snap *inference.AttentionSnapshot) *BOResult {
	switch {
	case snap == nil, len(snap.Keys) == 0:
		return &BOResult{}
	case snap.NumHeads <= 4:
		return analyseGQA(snap)
	default:
		return analyseMultiHead(snap)
	}
}
// analyseMultiHead handles models with ≥5 KV heads (the pairwise-head
// algorithm; AnalyseAttention routes ≤4-head models to analyseGQA).
//
// Per layer it measures mean pairwise cosine similarity between head K
// vectors (coherence), the fraction of head pairs at or above
// coherenceThreshold (phase lock), and per-head magnitude entropy. Across
// layers it measures cosine alignment of adjacent layer-mean K vectors;
// alignment below collapseThreshold counts as a joint collapse.
func analyseMultiHead(snap *inference.AttentionSnapshot) *BOResult {
result := &BOResult{
LayerCoherence: make([]float64, snap.NumLayers),
LayerCrossAlignment: make([]float64, max(0, snap.NumLayers-1)),
}
var totalCoherence, totalEntropy float64
var totalPairsLocked, totalPairs int
// Mean K vector per layer, consumed by the cross-layer pass below.
layerMeans := make([][]float32, snap.NumLayers)
for layer := 0; layer < snap.NumLayers; layer++ {
// Missing layers keep coherence 0 and a nil layerMeans entry, which
// excludes them from cross-alignment.
if layer >= len(snap.Keys) || snap.Keys[layer] == nil {
continue
}
heads := snap.Keys[layer]
nHeads := len(heads)
layerMeans[layer] = meanVector(heads)
// Pairwise cosine similarity over all head pairs within the layer.
var layerCoh float64
var pairs int
for i := 0; i < nHeads; i++ {
for j := i + 1; j < nHeads; j++ {
sim := cosineSim32(heads[i], heads[j])
layerCoh += sim
pairs++
if sim >= coherenceThreshold {
totalPairsLocked++
}
totalPairs++
}
}
if pairs > 0 {
layerCoh /= float64(pairs)
}
result.LayerCoherence[layer] = layerCoh
totalCoherence += layerCoh
for _, head := range heads {
totalEntropy += headEntropy(head, snap.SeqLen, snap.HeadDim)
}
}
// Adjacent-layer alignment of the layer-mean vectors.
var totalCross float64
for i := 0; i < snap.NumLayers-1; i++ {
if layerMeans[i] == nil || layerMeans[i+1] == nil {
continue
}
alignment := cosineSim32(layerMeans[i], layerMeans[i+1])
result.LayerCrossAlignment[i] = alignment
totalCross += alignment
if alignment < collapseThreshold {
result.JointCollapseCount++
}
}
// Means divide by the declared layer/head counts, so skipped (nil) layers
// dilute the averages rather than being excluded — NOTE(review): presumably
// intentional; analyseGQA averages the same way.
if snap.NumLayers > 0 {
result.MeanCoherence = totalCoherence / float64(snap.NumLayers)
}
if snap.NumLayers > 1 {
result.MeanCrossAlignment = totalCross / float64(snap.NumLayers-1)
}
totalHeads := snap.NumLayers * snap.NumHeads
if totalHeads > 0 {
result.MeanHeadEntropy = totalEntropy / float64(totalHeads)
}
if totalPairs > 0 {
result.PhaseLockScore = float64(totalPairsLocked) / float64(totalPairs)
}
return result
}
// analyseGQA handles GQA models (≤4 KV heads) by analysing position-wise
// patterns. Only the first KV head of each layer is analysed —
// NOTE(review): heads 1..N-1 are ignored when a model has 2-4 KV heads;
// confirm this is intended rather than averaging over all KV heads.
//
// Each layer contributes seq_len K vectors of dim head_dim. We measure:
// - Position differentiation: mean pairwise cosine distance between token positions.
// Low similarity = model distinguishes tokens (healthy). High = collapsed.
// Mapped to MeanCoherence as 1-similarity (so high = good differentiation).
// - Cross-layer smoothness: change in differentiation between adjacent layers
// (raw K vectors are not comparable across layers; see comment below).
// - Entropy: same as multi-head (magnitude distribution across positions).
func analyseGQA(snap *inference.AttentionSnapshot) *BOResult {
result := &BOResult{
GQA: true,
LayerCoherence: make([]float64, snap.NumLayers),
LayerCrossAlignment: make([]float64, max(0, snap.NumLayers-1)),
}
seqLen := snap.SeqLen
headDim := snap.HeadDim
// Fewer than two positions yields no pairs; zero headDim no vectors.
if seqLen < 2 || headDim == 0 {
return result
}
// Extract per-position K vectors for each layer.
// posVecs[layer][pos] = float32 slice of len headDim.
posVecs := make([][][]float32, snap.NumLayers)
var totalDiff, totalEntropy float64
var totalPairsLocked, totalPairs int
for layer := 0; layer < snap.NumLayers; layer++ {
if layer >= len(snap.Keys) || snap.Keys[layer] == nil || len(snap.Keys[layer]) == 0 {
continue
}
flat := snap.Keys[layer][0] // Single head, flat [seq_len*head_dim].
// Split into per-position vectors. Positions past the end of flat stay
// nil and are skipped in the pair loop below.
vecs := make([][]float32, seqLen)
for pos := 0; pos < seqLen; pos++ {
start := pos * headDim
end := start + headDim
if end > len(flat) {
break
}
vecs[pos] = flat[start:end]
}
posVecs[layer] = vecs
// Position differentiation: pairwise cosine sim between positions.
// We want LOW similarity = tokens are distinct = good.
// Store as differentiation score = 1 - mean_sim.
var simSum float64
var pairs int
for i := 0; i < len(vecs); i++ {
for j := i + 1; j < len(vecs); j++ {
if vecs[i] == nil || vecs[j] == nil {
continue
}
sim := cosineSim32(vecs[i], vecs[j])
simSum += sim
pairs++
// In GQA mode, "phase-lock" = position pairs that are well-differentiated.
if sim < (1.0 - coherenceThreshold) {
totalPairsLocked++
}
totalPairs++
}
}
diffScore := 0.0
if pairs > 0 {
meanSim := simSum / float64(pairs)
diffScore = 1.0 - meanSim // High = good differentiation.
}
result.LayerCoherence[layer] = diffScore
totalDiff += diffScore
// Entropy of the single analysed head.
totalEntropy += headEntropy(flat, seqLen, headDim)
}
// Cross-layer analysis for GQA: instead of raw vector comparison (meaningless
// because each layer has its own K projection), measure the CHANGE in differentiation
// between adjacent layers. A stable model maintains consistent differentiation;
// a collapsing model shows sudden drops.
for i := 0; i < snap.NumLayers-1; i++ {
// Differentiation delta: how much differentiation changes between layers.
// Small delta = smooth posture. Large delta = joint snap.
delta := math.Abs(result.LayerCoherence[i+1] - result.LayerCoherence[i])
smoothness := 1.0 - delta // High = smooth transition.
result.LayerCrossAlignment[i] = smoothness
if smoothness < collapseThreshold {
result.JointCollapseCount++
}
}
// Mean cross-alignment = mean smoothness.
var totalCross float64
for _, v := range result.LayerCrossAlignment {
totalCross += v
}
if snap.NumLayers > 0 {
result.MeanCoherence = totalDiff / float64(snap.NumLayers)
}
if len(result.LayerCrossAlignment) > 0 {
result.MeanCrossAlignment = totalCross / float64(len(result.LayerCrossAlignment))
}
if snap.NumLayers > 0 {
result.MeanHeadEntropy = totalEntropy / float64(snap.NumLayers)
}
if totalPairs > 0 {
result.PhaseLockScore = float64(totalPairsLocked) / float64(totalPairs)
}
return result
}
// cosineSim32 computes cosine similarity between two float32 slices.
// Mismatched lengths, empty input, or a zero-magnitude vector all yield 0.
func cosineSim32(a, b []float32) float64 {
	if len(a) == 0 || len(a) != len(b) {
		return 0
	}
	var dot, magA, magB float64
	for i, av := range a {
		x := float64(av)
		y := float64(b[i])
		dot += x * y
		magA += x * x
		magB += y * y
	}
	if norm := math.Sqrt(magA) * math.Sqrt(magB); norm != 0 {
		return dot / norm
	}
	return 0
}
// meanVector computes element-wise mean across multiple float32 slices.
// The output length follows the first slice; longer inputs are truncated and
// shorter inputs contribute only to the positions they cover. Returns nil
// for empty input.
func meanVector(vecs [][]float32) []float32 {
	if len(vecs) == 0 {
		return nil
	}
	width := len(vecs[0])
	sums := make([]float32, width)
	for _, vec := range vecs {
		limit := min(len(vec), width)
		for i := 0; i < limit; i++ {
			sums[i] += vec[i]
		}
	}
	count := float32(len(vecs))
	for i := range sums {
		sums[i] /= count
	}
	return sums
}
// headEntropy computes normalised Shannon entropy of K vector magnitudes
// across sequence positions for a single head. The result lies in [0, 1]
// (1 = uniform magnitude distribution); degenerate input (zero seqLen or
// headDim, all-zero magnitudes, or seqLen == 1) returns 0.
func headEntropy(head []float32, seqLen, headDim int) float64 {
	if seqLen == 0 || headDim == 0 {
		return 0
	}
	// L2 magnitude per position, reading only data actually present in head.
	mags := make([]float64, seqLen)
	var total float64
	for pos := range mags {
		base := pos * headDim
		var sq float64
		for d := 0; d < headDim && base+d < len(head); d++ {
			x := float64(head[base+d])
			sq += x * x
		}
		mags[pos] = math.Sqrt(sq)
		total += mags[pos]
	}
	if total == 0 {
		return 0
	}
	// Shannon entropy of the magnitude distribution, normalised by the
	// maximum log2(seqLen).
	var h float64
	for _, m := range mags {
		if p := m / total; p > 0 {
			h -= p * math.Log2(p)
		}
	}
	denom := math.Log2(float64(seqLen))
	if denom == 0 {
		return 0
	}
	return h / denom
}
// AttentionFeatures returns a 5D feature vector from BO metrics, ordered to
// match AttentionFeatureLabels. A nil result maps to the all-zero vector.
func AttentionFeatures(ar *BOResult) []float64 {
	if ar == nil {
		return make([]float64, 5)
	}
	// Joint stability: each collapse removes 20%, floored at zero.
	stability := math.Max(0, 1.0-float64(ar.JointCollapseCount)*0.2)
	return []float64{
		ar.MeanCoherence,
		ar.MeanCrossAlignment,
		ar.MeanHeadEntropy,
		ar.PhaseLockScore,
		stability,
	}
}
// AttentionFeatureLabels returns the labels for the attention feature vector,
// index-aligned with the slice produced by AttentionFeatures.
func AttentionFeatureLabels() []string {
	labels := [...]string{
		"mean_coherence",
		"cross_alignment",
		"head_entropy",
		"phase_lock",
		"joint_stability",
	}
	return labels[:]
}