// LEM/pkg/lem/cluster_test.go

package lem

import (
	"testing"
)

// TestNewScoreIndex_Empty checks that building an index from nil input is
// rejected: NewScoreIndex must return a non-nil error together with a nil
// index.
func TestNewScoreIndex_Empty(t *testing.T) {
	index, buildErr := NewScoreIndex(nil)
	switch {
	case buildErr == nil:
		t.Fatal("expected error for nil input")
	case index != nil:
		t.Fatal("expected nil index")
	}
}

func TestNewScoreIndex_Build(t *testing.T) {
	entries := []ScoredEntry{
		{ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.5, QuestionRatio: 0.2, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}},
		{ID: "b", Grammar: GrammarScore{VocabRichness: 0.2, TenseEntropy: 1.0, QuestionRatio: 0.4, DomainDepth: 7, VerbDiversity: 20, NounDiversity: 25}},
		{ID: "c", Grammar: GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}},
	}
	idx, err := NewScoreIndex(entries)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if idx.Len() != 3 {
		t.Fatalf("expected 3 points, got %d", idx.Len())
	}
}

// TestScoreIndex_Nearest indexes a "low", "mid" and "high" entry and queries
// with scores slightly below the "mid" entry. The lookup must succeed, must
// return "mid", and must report a non-negative distance.
func TestScoreIndex_Nearest(t *testing.T) {
	low := ScoredEntry{ID: "low", Grammar: GrammarScore{VocabRichness: 0.05, TenseEntropy: 0.2, QuestionRatio: 0.1, DomainDepth: 1, VerbDiversity: 5, NounDiversity: 5}}
	mid := ScoredEntry{ID: "mid", Grammar: GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}}
	high := ScoredEntry{ID: "high", Grammar: GrammarScore{VocabRichness: 0.25, TenseEntropy: 1.5, QuestionRatio: 0.5, DomainDepth: 10, VerbDiversity: 30, NounDiversity: 35}}

	idx, err := NewScoreIndex([]ScoredEntry{low, mid, high})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Probe sits just under the "mid" scores on every field.
	probe := GrammarScore{VocabRichness: 0.14, TenseEntropy: 0.7, QuestionRatio: 0.28, DomainDepth: 4, VerbDiversity: 14, NounDiversity: 18}
	match, distance, found := idx.Nearest(GrammarFeatures(probe))
	if !found {
		t.Fatal("expected a nearest match")
	}
	if gotID := match.ID; gotID != "mid" {
		t.Errorf("nearest = %q, want mid", gotID)
	}
	if distance < 0 {
		t.Errorf("distance should be non-negative, got %f", distance)
	}
}

func TestScoreIndex_KNearest(t *testing.T) {
	entries := []ScoredEntry{
		{ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.3, QuestionRatio: 0.1, DomainDepth: 2, VerbDiversity: 5, NounDiversity: 8}},
		{ID: "b", Grammar: GrammarScore{VocabRichness: 0.2, TenseEntropy: 0.6, QuestionRatio: 0.2, DomainDepth: 4, VerbDiversity: 10, NounDiversity: 15}},
		{ID: "c", Grammar: GrammarScore{VocabRichness: 0.3, TenseEntropy: 0.9, QuestionRatio: 0.3, DomainDepth: 6, VerbDiversity: 15, NounDiversity: 22}},
		{ID: "d", Grammar: GrammarScore{VocabRichness: 0.4, TenseEntropy: 1.2, QuestionRatio: 0.4, DomainDepth: 8, VerbDiversity: 20, NounDiversity: 30}},
	}
	idx, err := NewScoreIndex(entries)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	query := GrammarFeatures(GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.45, QuestionRatio: 0.15, DomainDepth: 3, VerbDiversity: 7, NounDiversity: 11})
	results, dists := idx.KNearest(query, 2)
	if len(results) != 2 {
		t.Fatalf("expected 2 results, got %d", len(results))
	}
	if len(dists) != 2 {
		t.Fatalf("expected 2 distances, got %d", len(dists))
	}
}

func TestScoreIndex_Radius(t *testing.T) {
	entries := []ScoredEntry{
		{ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.5, QuestionRatio: 0.2, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}},
		{ID: "b", Grammar: GrammarScore{VocabRichness: 0.11, TenseEntropy: 0.51, QuestionRatio: 0.21, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}},
		{ID: "far", Grammar: GrammarScore{VocabRichness: 0.9, TenseEntropy: 1.5, QuestionRatio: 0.8, DomainDepth: 20, VerbDiversity: 40, NounDiversity: 50}},
	}
	idx, err := NewScoreIndex(entries)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	query := GrammarFeatures(GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.5, QuestionRatio: 0.2, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15})
	results, _ := idx.Radius(query, 0.01)
	// "a" and "b" should be within radius, "far" should not.
	if len(results) < 1 {
		t.Errorf("expected at least 1 result within radius, got %d", len(results))
	}
}

func TestIsDuplicate_HighSimilarity(t *testing.T) {
	entries := []ScoredEntry{
		{ID: "existing", Grammar: GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}},
	}
	idx, err := NewScoreIndex(entries)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	nearDup := GrammarFeatures(GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20})
	if !idx.IsDuplicate(nearDup, 0.05) {
		t.Error("expected near-identical vector to be flagged as duplicate")
	}
}

// TestIsDuplicate_LowSimilarity indexes one entry and probes with a very
// differently shaped score set; IsDuplicate (called with 0.05 as its second
// argument) must not flag the probe.
func TestIsDuplicate_LowSimilarity(t *testing.T) {
	// Stored profile: high vocab/tense, low verb/noun.
	stored := ScoredEntry{
		ID:      "existing",
		Grammar: GrammarScore{VocabRichness: 0.3, TenseEntropy: 1.5, QuestionRatio: 0.5, DomainDepth: 1, VerbDiversity: 2, NounDiversity: 3},
	}
	idx, err := NewScoreIndex([]ScoredEntry{stored})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Probe profile: low vocab/tense, high verb/noun — a genuinely different
	// angular profile from the stored entry.
	probe := GrammarScore{VocabRichness: 0.01, TenseEntropy: 0.05, QuestionRatio: 0.01, DomainDepth: 20, VerbDiversity: 40, NounDiversity: 50}
	flagged := idx.IsDuplicate(GrammarFeatures(probe), 0.05)
	if flagged {
		t.Error("expected different angular profile to NOT be flagged as duplicate")
	}
}

// TestScoreIndex_Insert seeds an index with one entry, inserts a second one,
// and verifies that Insert succeeds and Len grows to two.
func TestScoreIndex_Insert(t *testing.T) {
	seed := ScoredEntry{ID: "seed", Grammar: GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}}
	idx, err := NewScoreIndex([]ScoredEntry{seed})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	added := ScoredEntry{ID: "new", Grammar: GrammarScore{VocabRichness: 0.25, TenseEntropy: 1.2, QuestionRatio: 0.5, DomainDepth: 8, VerbDiversity: 22, NounDiversity: 30}}
	if insertErr := idx.Insert(added); insertErr != nil {
		t.Fatalf("insert error: %v", insertErr)
	}

	if size := idx.Len(); size != 2 {
		t.Fatalf("expected 2 entries, got %d", size)
	}
}

func TestScoreIndex_Points(t *testing.T) {
	entries := []ScoredEntry{
		{ID: "a", Grammar: GrammarScore{VocabRichness: 0.1}},
		{ID: "b", Grammar: GrammarScore{VocabRichness: 0.2}},
	}
	idx, err := NewScoreIndex(entries)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	pts := idx.Points()
	if len(pts) != 2 {
		t.Fatalf("expected 2 points, got %d", len(pts))
	}
}