1
0
Fork 0
forked from lthn/LEM
LEM/pkg/lem/attention_test.go
Snider 42c0af728b fix: raise GQA threshold to ≤4 KV heads for position-wise analysis
Gemma3-4B has 4 KV heads — too few for meaningful pairwise head
coherence (only 6 pairs). Position-wise differentiation gives richer
signal. Multi-head path now requires ≥5 heads.

4B baseline (260 sovereign probes): mean=6487, stdev=153, range=6170-6886.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 01:02:13 +00:00

297 lines
7.9 KiB
Go

package lem
import (
"math"
"math/rand/v2"
"testing"
"forge.lthn.ai/core/go-inference"
)
func TestAnalyseAttention_Coherent_Good(t *testing.T) {
// All heads in all layers point the same direction = high coherence (multi-head path, ≥5 heads).
snap := makeCoherentSnapshot(4, 8, 8, 64)
result := AnalyseAttention(snap)
if result.GQA {
t.Fatal("expected multi-head path for 8 heads")
}
if result.MeanCoherence < 0.9 {
t.Fatalf("expected high coherence for aligned heads, got %.3f", result.MeanCoherence)
}
if result.JointCollapseCount > 0 {
t.Fatalf("expected zero joint collapses, got %d", result.JointCollapseCount)
}
if result.PhaseLockScore < 0.9 {
t.Fatalf("expected high phase-lock, got %.3f", result.PhaseLockScore)
}
}
func TestAnalyseAttention_Collapsed_Good(t *testing.T) {
// Orthogonal heads = low coherence (multi-head path, ≥5 heads).
snap := makeOrthogonalSnapshot(4, 8, 8, 64)
result := AnalyseAttention(snap)
if result.GQA {
t.Fatal("expected multi-head path for 8 heads")
}
if result.MeanCoherence > 0.3 {
t.Fatalf("expected low coherence for orthogonal heads, got %.3f", result.MeanCoherence)
}
}
func TestAnalyseAttention_GQA_Good(t *testing.T) {
// Single KV head = GQA position-wise path.
snap := makeCoherentSnapshot(4, 1, 8, 64)
result := AnalyseAttention(snap)
if !result.GQA {
t.Fatal("expected GQA path for 1 head")
}
// Coherent snapshot: all positions have same vector = low differentiation.
if result.MeanCoherence > 0.1 {
t.Fatalf("expected low differentiation for identical positions, got %.3f", result.MeanCoherence)
}
}
func TestAnalyseAttention_GQA_4Heads_Good(t *testing.T) {
// 4 KV heads still uses GQA path (≤4 threshold).
snap := makeOrthogonalSnapshot(4, 4, 8, 64)
result := AnalyseAttention(snap)
if !result.GQA {
t.Fatal("expected GQA path for 4 heads")
}
}
func TestAnalyseAttention_Nil_Good(t *testing.T) {
	// A nil snapshot must come back as a zero-valued result, not panic.
	got := AnalyseAttention(nil)
	if got.MeanCoherence != 0 {
		t.Fatalf("expected zero coherence for nil snapshot, got %.3f", got.MeanCoherence)
	}
}
func TestBoneOrientationScore_Composite_Good(t *testing.T) {
	// A healthy mid-range result should land inside the upper band of
	// the composite scale.
	r := &BOResult{
		MeanCoherence:       0.85,
		MeanCrossAlignment:  0.80,
		MeanHeadEntropy:     0.70,
		PhaseLockScore:      0.90,
		JointCollapseCount:  0,
		LayerCoherence:      []float64{0.85, 0.85, 0.85, 0.85},
		LayerCrossAlignment: []float64{0.80, 0.80, 0.80},
	}
	got := r.Composite()
	if got < 6000 || got > 10000 {
		t.Fatalf("composite out of range: %d", got)
	}
}
func TestBoneOrientationScore_Composite_ZeroCollapses_Good(t *testing.T) {
	// All components maxed with no collapses must hit the ceiling.
	r := &BOResult{
		MeanCoherence:      1.0,
		MeanCrossAlignment: 1.0,
		MeanHeadEntropy:    1.0,
		PhaseLockScore:     1.0,
		JointCollapseCount: 0,
	}
	if got := r.Composite(); got != 10000 {
		t.Fatalf("expected 10000 for perfect scores, got %d", got)
	}
}
func TestBoneOrientationScore_Composite_ManyCollapses_Good(t *testing.T) {
	// All-zero components plus heavy collapse count must floor at 0.
	r := &BOResult{
		MeanCoherence:      0.0,
		MeanCrossAlignment: 0.0,
		MeanHeadEntropy:    0.0,
		PhaseLockScore:     0.0,
		JointCollapseCount: 10,
	}
	if got := r.Composite(); got != 0 {
		t.Fatalf("expected 0 for zero scores, got %d", got)
	}
}
func TestCosineSim32_Good(t *testing.T) {
	// Identical vectors have cosine similarity 1 up to float error.
	u := []float32{1, 0, 0}
	v := []float32{1, 0, 0}
	got := cosineSim32(u, v)
	if math.Abs(got-1.0) > 1e-6 {
		t.Fatalf("expected cosine sim 1.0 for identical vectors, got %f", got)
	}
}
func TestCosineSim32_Orthogonal_Good(t *testing.T) {
	// Perpendicular basis vectors have cosine similarity 0.
	u := []float32{1, 0, 0}
	v := []float32{0, 1, 0}
	got := cosineSim32(u, v)
	if math.Abs(got) > 1e-6 {
		t.Fatalf("expected cosine sim 0.0 for orthogonal vectors, got %f", got)
	}
}
func TestHeadEntropy_Uniform_Good(t *testing.T) {
	// Equal magnitude at every position maximises positional entropy.
	const seqLen, headDim = 8, 4
	flat := make([]float32, seqLen*headDim)
	for i := range flat {
		flat[i] = 1.0
	}
	got := headEntropy(flat, seqLen, headDim)
	if got < 0.99 {
		t.Fatalf("expected near-max entropy for uniform magnitudes, got %.3f", got)
	}
}
func TestHeadEntropy_Collapsed_Good(t *testing.T) {
// All magnitude concentrated in one position = low entropy.
seqLen, headDim := 8, 4
head := make([]float32, seqLen*headDim)
for d := 0; d < headDim; d++ {
head[d] = 10.0 // Only position 0 has magnitude.
}
ent := headEntropy(head, seqLen, headDim)
if ent > 0.1 {
t.Fatalf("expected near-zero entropy for concentrated magnitude, got %.3f", ent)
}
}
func TestAttentionFeatures_Good(t *testing.T) {
	// A populated result maps to a 5D vector: coherence first, joint
	// stability (1 - collapses*0.2) last.
	r := &BOResult{
		MeanCoherence:      0.85,
		MeanCrossAlignment: 0.80,
		MeanHeadEntropy:    0.70,
		PhaseLockScore:     0.90,
		JointCollapseCount: 1,
	}
	vec := AttentionFeatures(r)
	if len(vec) != 5 {
		t.Fatalf("expected 5D, got %dD", len(vec))
	}
	if vec[0] != 0.85 {
		t.Fatalf("expected coherence 0.85, got %f", vec[0])
	}
	// Joint stability: 1.0 - 1*0.2 = 0.8
	if math.Abs(vec[4]-0.8) > 1e-9 {
		t.Fatalf("expected joint_stability 0.8, got %f", vec[4])
	}
}
func TestAttentionFeatures_Nil_Good(t *testing.T) {
	// Nil input must still produce a fixed-width, all-zero vector.
	vec := AttentionFeatures(nil)
	if len(vec) != 5 {
		t.Fatalf("expected 5D, got %dD", len(vec))
	}
	for i, v := range vec {
		if v != 0 {
			t.Fatalf("expected zero at %d, got %f", i, v)
		}
	}
}
func TestAttentionFeatureLabels_Good(t *testing.T) {
	// Label count must match the 5D attention feature vector.
	got := AttentionFeatureLabels()
	if len(got) != 5 {
		t.Fatalf("expected 5 labels, got %d", len(got))
	}
}
func TestFullFeatures_Good(t *testing.T) {
	// The combined vector is 19D: grammar dims first, then heuristic,
	// then attention.
	grammar := GrammarScore{VocabRichness: 0.5, TenseEntropy: 0.3}
	heur := HeuristicScores{ComplianceMarkers: 1, FirstPerson: 2}
	attn := &BOResult{MeanCoherence: 0.85, MeanCrossAlignment: 0.80, MeanHeadEntropy: 0.70, PhaseLockScore: 0.90}
	vec := FullFeatures(grammar, heur, attn)
	if len(vec) != 19 {
		t.Fatalf("expected 19D, got %dD", len(vec))
	}
	// Grammar starts at 0, heuristic at 6, attention at 14.
	if vec[0] != 0.5 {
		t.Fatalf("expected grammar[0]=0.5, got %f", vec[0])
	}
	if vec[14] != 0.85 {
		t.Fatalf("expected attention[0]=0.85, got %f", vec[14])
	}
}
func TestFullFeatures_NilBO_Good(t *testing.T) {
gs := GrammarScore{VocabRichness: 0.5}
hs := HeuristicScores{}
f := FullFeatures(gs, hs, nil)
if len(f) != 19 {
t.Fatalf("expected 19D, got %dD", len(f))
}
// Attention dims should be zero.
for i := 14; i < 19; i++ {
if f[i] != 0 {
t.Fatalf("expected zero at dim %d, got %f", i, f[i])
}
}
}
func TestFullFeatureLabels_Good(t *testing.T) {
	// 19 labels total; the attention section begins at index 14.
	got := FullFeatureLabels()
	if len(got) != 19 {
		t.Fatalf("expected 19 labels, got %d", len(got))
	}
	if got[14] != "mean_coherence" {
		t.Fatalf("expected label[14]='mean_coherence', got %q", got[14])
	}
}
// --- Test helpers ---
// makeCoherentSnapshot builds a snapshot in which every head of every
// layer carries an identical K matrix, yielding maximal coherence and
// cross-alignment.
func makeCoherentSnapshot(layers, heads, seqLen, dim int) *inference.AttentionSnapshot {
	// One shared pattern, copied verbatim into each head.
	base := make([]float32, seqLen*dim)
	for i := range base {
		base[i] = float32(i%dim+1) * 0.1
	}
	keys := make([][][]float32, layers)
	for l := 0; l < layers; l++ {
		row := make([][]float32, heads)
		for h := 0; h < heads; h++ {
			row[h] = append([]float32(nil), base...)
		}
		keys[l] = row
	}
	return &inference.AttentionSnapshot{
		NumLayers:    layers,
		NumHeads:     heads,
		SeqLen:       seqLen,
		HeadDim:      dim,
		Keys:         keys,
		Architecture: "test",
	}
}
// makeOrthogonalSnapshot builds a snapshot whose heads are filled with
// independent pseudo-random vectors, giving near-zero pairwise coherence.
func makeOrthogonalSnapshot(layers, heads, seqLen, dim int) *inference.AttentionSnapshot {
	// Fixed PCG seed keeps the fixture deterministic across runs; the
	// layer→head→element fill order must stay stable for reproducibility.
	rng := rand.New(rand.NewPCG(42, 0))
	keys := make([][][]float32, layers)
	for l := 0; l < layers; l++ {
		row := make([][]float32, heads)
		for h := 0; h < heads; h++ {
			vals := make([]float32, seqLen*dim)
			for i := range vals {
				vals[i] = rng.Float32()*2 - 1 // Uniform in [-1, 1).
			}
			row[h] = vals
		}
		keys[l] = row
	}
	return &inference.AttentionSnapshot{
		NumLayers:    layers,
		NumHeads:     heads,
		SeqLen:       seqLen,
		HeadDim:      dim,
		Keys:         keys,
		Architecture: "test",
	}
}