diff --git a/go.mod b/go.mod index 22ee045..00a4d73 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( forge.lthn.ai/core/go-i18n v0.0.1 forge.lthn.ai/core/go-ml v0.0.1 forge.lthn.ai/core/go-mlx v0.0.1 + github.com/Snider/Poindexter v0.0.0-20260104200422-91146b212a1f github.com/marcboeker/go-duckdb v1.8.5 github.com/parquet-go/parquet-go v0.27.0 gopkg.in/yaml.v3 v3.0.1 diff --git a/go.sum b/go.sum index 2d51e26..af8ea18 100644 --- a/go.sum +++ b/go.sum @@ -14,6 +14,8 @@ github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7Oputl github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/ProtonMail/go-crypto v1.3.0 h1:ILq8+Sf5If5DCpHQp4PbZdS1J7HDFRXz/+xKBiRGFrw= github.com/ProtonMail/go-crypto v1.3.0/go.mod h1:9whxjD8Rbs29b4XWbB8irEcE8KHMqaR2e7GWU1R+/PE= +github.com/Snider/Poindexter v0.0.0-20260104200422-91146b212a1f h1:+EnE414H9wUaBeUVNjyErusrxSbBGnGV6MBhTw/em0k= +github.com/Snider/Poindexter v0.0.0-20260104200422-91146b212a1f/go.mod h1:nhgkbg4zWA4AS2Ga3RmcvdsyiI9TdxvSqe5EVBSb3Hk= github.com/alecthomas/assert/v2 v2.10.0 h1:jjRCHsj6hBJhkmhznrCzoNpbA3zqy0fYiUcYZP/GkPY= github.com/alecthomas/assert/v2 v2.10.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k= github.com/alecthomas/repr v0.4.0 h1:GhI2A8MACjfegCPVq9f1FLvIBS+DrQ2KQBFZP1iFzXc= diff --git a/pkg/lem/analytics.go b/pkg/lem/analytics.go new file mode 100644 index 0000000..5fd3ad9 --- /dev/null +++ b/pkg/lem/analytics.go @@ -0,0 +1,62 @@ +package lem + +import ( + poindexter "github.com/Snider/Poindexter" +) + +// ScoreDistribution wraps Poindexter's DistributionStats for LEM score populations. +type ScoreDistribution = poindexter.DistributionStats + +// GrammarAxisStats wraps Poindexter's AxisDistribution for per-feature analysis. +type GrammarAxisStats = poindexter.AxisDistribution + +// ComputeScoreDistribution calculates percentile/variance stats over grammar composites. 
+func ComputeScoreDistribution(scores []GrammarScore) ScoreDistribution {
+	// Project each score onto its Composite value; the stats helper takes plain float64s.
+	vals := make([]float64, len(scores))
+	for i, s := range scores {
+		vals[i] = s.Composite
+	}
+	return poindexter.ComputeDistributionStats(vals)
+}
+
+// ComputeLEKDistribution calculates percentile/variance stats over LEK scores.
+//
+// Nil elements of scores are skipped: they contribute no value to the
+// distribution instead of causing a nil-pointer panic.
+func ComputeLEKDistribution(scores []*HeuristicScores) ScoreDistribution {
+	vals := make([]float64, 0, len(scores))
+	for _, s := range scores {
+		if s == nil {
+			// The slice holds pointers, so a missing score can show up as nil.
+			continue
+		}
+		vals = append(vals, s.LEKScore)
+	}
+	return poindexter.ComputeDistributionStats(vals)
+}
+
+// ComputeGrammarAxisStats returns per-axis distribution stats for grammar features.
+//
+// Each entry becomes one point whose coordinates are GrammarFeatures(e.Grammar);
+// the axes are named by GrammarFeatureLabels, which uses the same ordering.
+func ComputeGrammarAxisStats(entries []ScoredEntry) []GrammarAxisStats {
+	points := make([]poindexter.KDPoint[ScoredEntry], len(entries))
+	for i, e := range entries {
+		points[i] = poindexter.KDPoint[ScoredEntry]{
+			ID:     e.ID,
+			Coords: GrammarFeatures(e.Grammar),
+			Value:  e,
+		}
+	}
+	return poindexter.ComputeAxisDistributions(points, GrammarFeatureLabels())
+}
+
+// SummaryReport holds aggregate analytics for a scored population.
+type SummaryReport struct {
+	// Total is the number of scored entries.
+	Total int
+	// CompositeStats is the distribution of the grammar composite scores.
+	CompositeStats ScoreDistribution
+	// AxisStats holds one distribution per grammar feature axis.
+	AxisStats []GrammarAxisStats
+}
+
+// ScoreSummary computes a full analytics report from scored entries.
+func ScoreSummary(entries []ScoredEntry) SummaryReport { + scores := make([]GrammarScore, len(entries)) + for i, e := range entries { + scores[i] = e.Grammar + } + return SummaryReport{ + Total: len(entries), + CompositeStats: ComputeScoreDistribution(scores), + AxisStats: ComputeGrammarAxisStats(entries), + } +} diff --git a/pkg/lem/analytics_test.go b/pkg/lem/analytics_test.go new file mode 100644 index 0000000..1f2cf70 --- /dev/null +++ b/pkg/lem/analytics_test.go @@ -0,0 +1,86 @@ +package lem + +import ( + "testing" +) + +func TestComputeScoreDistribution(t *testing.T) { + scores := []GrammarScore{ + {Composite: 30}, + {Composite: 45}, + {Composite: 55}, + {Composite: 60}, + {Composite: 75}, + {Composite: 80}, + {Composite: 90}, + } + dist := ComputeScoreDistribution(scores) + if dist.Count != 7 { + t.Errorf("count = %d, want 7", dist.Count) + } + if dist.Min != 30 { + t.Errorf("min = %f, want 30", dist.Min) + } + if dist.Max != 90 { + t.Errorf("max = %f, want 90", dist.Max) + } + if dist.Mean < 50 || dist.Mean > 70 { + t.Errorf("mean = %f, expected between 50 and 70", dist.Mean) + } +} + +func TestComputeLEKDistribution(t *testing.T) { + scores := []*HeuristicScores{ + {LEKScore: 10}, + {LEKScore: 20}, + {LEKScore: 30}, + {LEKScore: 40}, + {LEKScore: 50}, + } + dist := ComputeLEKDistribution(scores) + if dist.Count != 5 { + t.Errorf("count = %d, want 5", dist.Count) + } + if dist.Min != 10 { + t.Errorf("min = %f, want 10", dist.Min) + } + if dist.Max != 50 { + t.Errorf("max = %f, want 50", dist.Max) + } +} + +func TestComputeGrammarAxisStats(t *testing.T) { + entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.5, QuestionRatio: 0.2, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}}, + {ID: "b", Grammar: GrammarScore{VocabRichness: 0.2, TenseEntropy: 1.0, QuestionRatio: 0.4, DomainDepth: 6, VerbDiversity: 20, NounDiversity: 25}}, + {ID: "c", Grammar: GrammarScore{VocabRichness: 0.3, TenseEntropy: 1.5, 
QuestionRatio: 0.6, DomainDepth: 9, VerbDiversity: 30, NounDiversity: 35}}, + } + axes := ComputeGrammarAxisStats(entries) + if len(axes) != 6 { + t.Fatalf("expected 6 axes, got %d", len(axes)) + } + if axes[0].Name != "vocab_richness" { + t.Errorf("axes[0].Name = %q, want vocab_richness", axes[0].Name) + } + if axes[0].Stats.Count != 3 { + t.Errorf("axes[0] count = %d, want 3", axes[0].Stats.Count) + } +} + +func TestScoreSummary(t *testing.T) { + entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{Composite: 40, VocabRichness: 0.1}}, + {ID: "b", Grammar: GrammarScore{Composite: 60, VocabRichness: 0.2}}, + {ID: "c", Grammar: GrammarScore{Composite: 80, VocabRichness: 0.3}}, + } + summary := ScoreSummary(entries) + if summary.Total != 3 { + t.Errorf("total = %d, want 3", summary.Total) + } + if summary.CompositeStats.Count != 3 { + t.Errorf("composite count = %d, want 3", summary.CompositeStats.Count) + } + if len(summary.AxisStats) != 6 { + t.Errorf("axis count = %d, want 6", len(summary.AxisStats)) + } +} diff --git a/pkg/lem/cluster.go b/pkg/lem/cluster.go new file mode 100644 index 0000000..0822879 --- /dev/null +++ b/pkg/lem/cluster.go @@ -0,0 +1,240 @@ +package lem + +import ( + "cmp" + "errors" + "fmt" + "slices" + + poindexter "github.com/Snider/Poindexter" +) + +// ScoredEntry pairs a response ID with its grammar scores for indexing. +type ScoredEntry struct { + ID string + Domain string + Grammar GrammarScore +} + +// ScoreIndex wraps a Poindexter KDTree over grammar feature vectors. +type ScoreIndex struct { + tree *poindexter.KDTree[ScoredEntry] +} + +// NewScoreIndex builds a KDTree from scored entries using cosine distance +// on 6D grammar feature vectors. Raw coordinates are used (no normalization) +// because cosine distance is angle-based and handles magnitude differences. 
+func NewScoreIndex(entries []ScoredEntry) (*ScoreIndex, error) {
+	n := len(entries)
+	if n == 0 {
+		return nil, errors.New("lem: no entries to index")
+	}
+
+	// One KD point per entry: ID and payload come from the entry, coordinates
+	// from its grammar feature vector (passed through without normalisation).
+	// NOTE(review): cosine distance ignores overall vector length but not
+	// per-axis scale — confirm that unscaled axes are what is intended here.
+	pts := make([]poindexter.KDPoint[ScoredEntry], 0, n)
+	for _, entry := range entries {
+		pts = append(pts, poindexter.KDPoint[ScoredEntry]{
+			ID:     entry.ID,
+			Coords: GrammarFeatures(entry.Grammar),
+			Value:  entry,
+		})
+	}
+
+	tree, err := poindexter.NewKDTree(pts, poindexter.WithMetric(poindexter.CosineDistance{}))
+	if err != nil {
+		return nil, fmt.Errorf("lem: build tree: %w", err)
+	}
+	return &ScoreIndex{tree: tree}, nil
+}
+
+// Len reports how many entries the index currently holds.
+func (idx *ScoreIndex) Len() int {
+	size := idx.tree.Len()
+	return size
+}
+
+// Nearest returns the indexed entry closest to query together with its
+// distance. The boolean is false (and the other results are zero values)
+// when the underlying tree reports no match.
+func (idx *ScoreIndex) Nearest(query []float64) (ScoredEntry, float64, bool) {
+	hit, d, found := idx.tree.Nearest(query)
+	if found {
+		return hit.Value, d, true
+	}
+	var none ScoredEntry
+	return none, 0, false
+}
+
+// KNearest returns the k indexed entries closest to query and, in the same
+// order, the distances reported by the underlying tree.
+func (idx *ScoreIndex) KNearest(query []float64, k int) ([]ScoredEntry, []float64) {
+	hits, dists := idx.tree.KNearest(query, k)
+	found := make([]ScoredEntry, 0, len(hits))
+	for i := range hits {
+		found = append(found, hits[i].Value)
+	}
+	return found, dists
+}
+
+// Radius returns every indexed entry within distance r of query and, in the
+// same order, the distances reported by the underlying tree.
+func (idx *ScoreIndex) Radius(query []float64, r float64) ([]ScoredEntry, []float64) {
+	hits, dists := idx.tree.Radius(query, r)
+	found := make([]ScoredEntry, 0, len(hits))
+	for i := range hits {
+		found = append(found, hits[i].Value)
+	}
+	return found, dists
+}
+
+// IsDuplicate reports whether the nearest indexed entry lies within threshold
+// distance of query. Use during distill to reject near-identical outputs.
+func (idx *ScoreIndex) IsDuplicate(query []float64, threshold float64) bool {
+	_, d, found := idx.tree.Nearest(query)
+	if !found {
+		return false
+	}
+	return d <= threshold
+}
+
+// Insert adds a new scored entry to the index.
+func (idx *ScoreIndex) Insert(entry ScoredEntry) error { + features := GrammarFeatures(entry.Grammar) + pt := poindexter.KDPoint[ScoredEntry]{ + ID: entry.ID, + Coords: features, + Value: entry, + } + if !idx.tree.Insert(pt) { + return fmt.Errorf("lem: failed to insert %s (duplicate ID?)", entry.ID) + } + return nil +} + +// Points returns all indexed entries. +func (idx *ScoreIndex) Points() []ScoredEntry { + pts := idx.tree.Points() + entries := make([]ScoredEntry, len(pts)) + for i, pt := range pts { + entries[i] = pt.Value + } + return entries +} + +// featureRange holds the min/max for one axis. +type featureRange struct{ min, max float64 } + +// GapReport describes a region of quality-space with poor coverage. +type GapReport struct { + // Probe is the sample point coordinates in grammar feature space. + Probe []float64 + // AvgDistance is the average distance to the k nearest indexed entries. + AvgDistance float64 + // NearestIDs lists the IDs of the k nearest entries. + NearestIDs []string +} + +// FindGaps samples the grammar feature space and identifies regions +// where the k-nearest indexed entries are far away (poor coverage). +// Returns gap reports sorted by AvgDistance descending (worst gaps first). +func FindGaps(entries []ScoredEntry, k int) []GapReport { + if len(entries) < 2 { + return nil + } + + idx, err := NewScoreIndex(entries) + if err != nil { + return nil + } + + // Compute per-axis min/max for sampling range. + dim := 6 + ranges := make([]featureRange, dim) + first := GrammarFeatures(entries[0].Grammar) + for i := range dim { + ranges[i] = featureRange{min: first[i], max: first[i]} + } + for _, e := range entries[1:] { + f := GrammarFeatures(e.Grammar) + for i := range dim { + if f[i] < ranges[i].min { + ranges[i].min = f[i] + } + if f[i] > ranges[i].max { + ranges[i].max = f[i] + } + } + } + + // Sample a grid of probe points across the feature space. + // 3 steps per axis = 3^6 = 729 probe points. 
+ steps := 3 + probes := sampleGrid(ranges, steps, dim) + + if k > len(entries) { + k = len(entries) + } + + var gaps []GapReport + for _, probe := range probes { + neighbours, dists := idx.KNearest(probe, k) + if len(dists) == 0 { + continue + } + avg := 0.0 + for _, d := range dists { + avg += d + } + avg /= float64(len(dists)) + + ids := make([]string, len(neighbours)) + for i, n := range neighbours { + ids[i] = n.ID + } + gaps = append(gaps, GapReport{ + Probe: probe, + AvgDistance: avg, + NearestIDs: ids, + }) + } + + // Sort by worst coverage first. + slices.SortFunc(gaps, func(a, b GapReport) int { + return cmp.Compare(b.AvgDistance, a.AvgDistance) // descending + }) + + return gaps +} + +// sampleGrid generates probe points across the feature space +// by stepping through each axis's [min, max] range. +func sampleGrid(ranges []featureRange, steps, dim int) [][]float64 { + if dim == 0 || steps < 2 { + return nil + } + + axisValues := make([][]float64, dim) + for i, r := range ranges { + vals := make([]float64, steps) + for j := range steps { + vals[j] = r.min + (r.max-r.min)*float64(j)/float64(steps-1) + } + axisValues[i] = vals + } + + total := 1 + for range dim { + total *= steps + } + probes := make([][]float64, 0, total) + current := make([]float64, dim) + var generate func(axis int) + generate = func(axis int) { + if axis == dim { + probe := make([]float64, dim) + copy(probe, current) + probes = append(probes, probe) + return + } + for _, v := range axisValues[axis] { + current[axis] = v + generate(axis + 1) + } + } + generate(0) + + return probes +} + diff --git a/pkg/lem/cluster_test.go b/pkg/lem/cluster_test.go new file mode 100644 index 0000000..182737d --- /dev/null +++ b/pkg/lem/cluster_test.go @@ -0,0 +1,163 @@ +package lem + +import ( + "testing" +) + +func TestNewScoreIndex_Empty(t *testing.T) { + idx, err := NewScoreIndex(nil) + if err == nil { + t.Fatal("expected error for nil input") + } + if idx != nil { + t.Fatal("expected nil index") + } 
+} + +func TestNewScoreIndex_Build(t *testing.T) { + entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.5, QuestionRatio: 0.2, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}}, + {ID: "b", Grammar: GrammarScore{VocabRichness: 0.2, TenseEntropy: 1.0, QuestionRatio: 0.4, DomainDepth: 7, VerbDiversity: 20, NounDiversity: 25}}, + {ID: "c", Grammar: GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}}, + } + idx, err := NewScoreIndex(entries) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if idx.Len() != 3 { + t.Fatalf("expected 3 points, got %d", idx.Len()) + } +} + +func TestScoreIndex_Nearest(t *testing.T) { + entries := []ScoredEntry{ + {ID: "low", Grammar: GrammarScore{VocabRichness: 0.05, TenseEntropy: 0.2, QuestionRatio: 0.1, DomainDepth: 1, VerbDiversity: 5, NounDiversity: 5}}, + {ID: "mid", Grammar: GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}}, + {ID: "high", Grammar: GrammarScore{VocabRichness: 0.25, TenseEntropy: 1.5, QuestionRatio: 0.5, DomainDepth: 10, VerbDiversity: 30, NounDiversity: 35}}, + } + idx, err := NewScoreIndex(entries) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + query := GrammarFeatures(GrammarScore{VocabRichness: 0.14, TenseEntropy: 0.7, QuestionRatio: 0.28, DomainDepth: 4, VerbDiversity: 14, NounDiversity: 18}) + nearest, dist, ok := idx.Nearest(query) + if !ok { + t.Fatal("expected a nearest match") + } + if nearest.ID != "mid" { + t.Errorf("nearest = %q, want mid", nearest.ID) + } + if dist < 0 { + t.Errorf("distance should be non-negative, got %f", dist) + } +} + +func TestScoreIndex_KNearest(t *testing.T) { + entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.3, QuestionRatio: 0.1, DomainDepth: 2, VerbDiversity: 5, NounDiversity: 8}}, + {ID: "b", 
Grammar: GrammarScore{VocabRichness: 0.2, TenseEntropy: 0.6, QuestionRatio: 0.2, DomainDepth: 4, VerbDiversity: 10, NounDiversity: 15}}, + {ID: "c", Grammar: GrammarScore{VocabRichness: 0.3, TenseEntropy: 0.9, QuestionRatio: 0.3, DomainDepth: 6, VerbDiversity: 15, NounDiversity: 22}}, + {ID: "d", Grammar: GrammarScore{VocabRichness: 0.4, TenseEntropy: 1.2, QuestionRatio: 0.4, DomainDepth: 8, VerbDiversity: 20, NounDiversity: 30}}, + } + idx, err := NewScoreIndex(entries) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + query := GrammarFeatures(GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.45, QuestionRatio: 0.15, DomainDepth: 3, VerbDiversity: 7, NounDiversity: 11}) + results, dists := idx.KNearest(query, 2) + if len(results) != 2 { + t.Fatalf("expected 2 results, got %d", len(results)) + } + if len(dists) != 2 { + t.Fatalf("expected 2 distances, got %d", len(dists)) + } +} + +func TestScoreIndex_Radius(t *testing.T) { + entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.5, QuestionRatio: 0.2, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}}, + {ID: "b", Grammar: GrammarScore{VocabRichness: 0.11, TenseEntropy: 0.51, QuestionRatio: 0.21, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}}, + {ID: "far", Grammar: GrammarScore{VocabRichness: 0.9, TenseEntropy: 1.5, QuestionRatio: 0.8, DomainDepth: 20, VerbDiversity: 40, NounDiversity: 50}}, + } + idx, err := NewScoreIndex(entries) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + query := GrammarFeatures(GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.5, QuestionRatio: 0.2, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}) + results, _ := idx.Radius(query, 0.01) + // "a" and "b" should be within radius, "far" should not. 
+ if len(results) < 1 { + t.Errorf("expected at least 1 result within radius, got %d", len(results)) + } +} + +func TestIsDuplicate_HighSimilarity(t *testing.T) { + entries := []ScoredEntry{ + {ID: "existing", Grammar: GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}}, + } + idx, err := NewScoreIndex(entries) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + nearDup := GrammarFeatures(GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}) + if !idx.IsDuplicate(nearDup, 0.05) { + t.Error("expected near-identical vector to be flagged as duplicate") + } +} + +func TestIsDuplicate_LowSimilarity(t *testing.T) { + // High vocab/tense, low verb/noun — one angular profile. + entries := []ScoredEntry{ + {ID: "existing", Grammar: GrammarScore{VocabRichness: 0.3, TenseEntropy: 1.5, QuestionRatio: 0.5, DomainDepth: 1, VerbDiversity: 2, NounDiversity: 3}}, + } + idx, err := NewScoreIndex(entries) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Low vocab/tense, high verb/noun — genuinely different angular profile. 
+ different := GrammarFeatures(GrammarScore{VocabRichness: 0.01, TenseEntropy: 0.05, QuestionRatio: 0.01, DomainDepth: 20, VerbDiversity: 40, NounDiversity: 50}) + if idx.IsDuplicate(different, 0.05) { + t.Error("expected different angular profile to NOT be flagged as duplicate") + } +} + +func TestScoreIndex_Insert(t *testing.T) { + entries := []ScoredEntry{ + {ID: "seed", Grammar: GrammarScore{VocabRichness: 0.15, TenseEntropy: 0.8, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}}, + } + idx, err := NewScoreIndex(entries) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + err = idx.Insert(ScoredEntry{ + ID: "new", + Grammar: GrammarScore{VocabRichness: 0.25, TenseEntropy: 1.2, QuestionRatio: 0.5, DomainDepth: 8, VerbDiversity: 22, NounDiversity: 30}, + }) + if err != nil { + t.Fatalf("insert error: %v", err) + } + if idx.Len() != 2 { + t.Fatalf("expected 2 entries, got %d", idx.Len()) + } +} + +func TestScoreIndex_Points(t *testing.T) { + entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{VocabRichness: 0.1}}, + {ID: "b", Grammar: GrammarScore{VocabRichness: 0.2}}, + } + idx, err := NewScoreIndex(entries) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + pts := idx.Points() + if len(pts) != 2 { + t.Fatalf("expected 2 points, got %d", len(pts)) + } +} diff --git a/pkg/lem/coverage.go b/pkg/lem/coverage.go index d27851a..8d723ef 100644 --- a/pkg/lem/coverage.go +++ b/pkg/lem/coverage.go @@ -130,3 +130,49 @@ func RunCoverage(args []string) { fmt.Println(" - Swahili, Yoruba, Amharic (Sub-Saharan Africa)") fmt.Println(" - Indigenous languages (Quechua, Nahuatl, Aymara)") } + +// PrintScoreAnalytics prints score distribution statistics and gap analysis +// for a set of scored entries. Use after scoring responses with grammar v3. 
+func PrintScoreAnalytics(entries []ScoredEntry) { + if len(entries) == 0 { + fmt.Println("No scored entries to analyse.") + return + } + + report := ScoreSummary(entries) + + fmt.Println("\nGrammar Score Distribution") + fmt.Println("==================================================") + fmt.Printf(" Entries: %d\n", report.Total) + cs := report.CompositeStats + fmt.Printf(" Mean: %.1f\n", cs.Mean) + fmt.Printf(" Median: %.1f\n", cs.Median) + fmt.Printf(" StdDev: %.1f\n", cs.StdDev) + fmt.Printf(" Range: %.1f – %.1f\n", cs.Min, cs.Max) + fmt.Printf(" P25: %.1f\n", cs.P25) + fmt.Printf(" P75: %.1f\n", cs.P75) + fmt.Printf(" P90: %.1f\n", cs.P90) + fmt.Printf(" Skewness: %.2f\n", cs.Skewness) + + fmt.Println("\nPer-Axis Statistics") + fmt.Println("--------------------------------------------------") + fmt.Printf(" %-20s %8s %8s %8s %8s\n", "Feature", "Mean", "StdDev", "Min", "Max") + for _, ax := range report.AxisStats { + fmt.Printf(" %-20s %8.3f %8.3f %8.3f %8.3f\n", + ax.Name, ax.Stats.Mean, ax.Stats.StdDev, ax.Stats.Min, ax.Stats.Max) + } + + // Gap analysis. 
+ if len(entries) >= 3 { + gaps := FindGaps(entries, min(3, len(entries))) + if len(gaps) > 0 { + fmt.Println("\nTop 10 Coverage Gaps (worst first)") + fmt.Println("--------------------------------------------------") + limit := min(10, len(gaps)) + for i := range limit { + g := gaps[i] + fmt.Printf(" #%d avg_dist=%.4f nearest=%v\n", i+1, g.AvgDistance, g.NearestIDs) + } + } + } +} diff --git a/pkg/lem/coverage_test.go b/pkg/lem/coverage_test.go new file mode 100644 index 0000000..a06c4fe --- /dev/null +++ b/pkg/lem/coverage_test.go @@ -0,0 +1,82 @@ +package lem + +import ( + "testing" +) + +func TestFindGaps_UniformCoverage(t *testing.T) { + entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.5, QuestionRatio: 0.1, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}}, + {ID: "b", Grammar: GrammarScore{VocabRichness: 0.2, TenseEntropy: 1.0, QuestionRatio: 0.3, DomainDepth: 6, VerbDiversity: 20, NounDiversity: 25}}, + {ID: "c", Grammar: GrammarScore{VocabRichness: 0.3, TenseEntropy: 1.5, QuestionRatio: 0.5, DomainDepth: 9, VerbDiversity: 30, NounDiversity: 35}}, + } + gaps := FindGaps(entries, 3) + if gaps == nil { + t.Fatal("expected non-nil gaps") + } + if len(gaps) == 0 { + t.Error("expected some gap reports") + } +} + +func TestFindGaps_ClusteredData(t *testing.T) { + // All entries clustered in one corner — grid probes far from cluster should show gaps. 
+ entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{VocabRichness: 0.10, TenseEntropy: 0.50, QuestionRatio: 0.1, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}}, + {ID: "b", Grammar: GrammarScore{VocabRichness: 0.11, TenseEntropy: 0.51, QuestionRatio: 0.11, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}}, + {ID: "c", Grammar: GrammarScore{VocabRichness: 0.12, TenseEntropy: 0.52, QuestionRatio: 0.12, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}}, + } + gaps := FindGaps(entries, 2) + if len(gaps) == 0 { + t.Error("expected gaps in clustered data") + } + // Top gap should have positive distance. + if gaps[0].AvgDistance <= 0 { + t.Error("expected positive distance for worst gap") + } +} + +func TestFindGaps_SortedByWorst(t *testing.T) { + entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.3, QuestionRatio: 0.1, DomainDepth: 2, VerbDiversity: 5, NounDiversity: 8}}, + {ID: "b", Grammar: GrammarScore{VocabRichness: 0.5, TenseEntropy: 1.0, QuestionRatio: 0.3, DomainDepth: 5, VerbDiversity: 15, NounDiversity: 20}}, + {ID: "c", Grammar: GrammarScore{VocabRichness: 0.9, TenseEntropy: 1.5, QuestionRatio: 0.8, DomainDepth: 12, VerbDiversity: 30, NounDiversity: 40}}, + } + gaps := FindGaps(entries, 2) + if len(gaps) < 2 { + t.Fatalf("expected at least 2 gaps, got %d", len(gaps)) + } + // Descending order. 
+ if gaps[0].AvgDistance < gaps[len(gaps)-1].AvgDistance { + t.Error("expected gaps sorted descending by AvgDistance") + } +} + +func TestFindGaps_TooFewEntries(t *testing.T) { + entries := []ScoredEntry{ + {ID: "solo", Grammar: GrammarScore{VocabRichness: 0.1}}, + } + gaps := FindGaps(entries, 1) + if gaps != nil { + t.Error("expected nil for single entry") + } +} + +func TestGapReport_HasFields(t *testing.T) { + entries := []ScoredEntry{ + {ID: "a", Grammar: GrammarScore{VocabRichness: 0.1, TenseEntropy: 0.5, QuestionRatio: 0.2, DomainDepth: 3, VerbDiversity: 10, NounDiversity: 15}}, + {ID: "b", Grammar: GrammarScore{VocabRichness: 0.9, TenseEntropy: 1.5, QuestionRatio: 0.8, DomainDepth: 12, VerbDiversity: 35, NounDiversity: 45}}, + } + gaps := FindGaps(entries, 1) + for _, g := range gaps { + if g.AvgDistance < 0 { + t.Error("AvgDistance should be non-negative") + } + if len(g.Probe) != 6 { + t.Errorf("Probe should be 6D, got %d", len(g.Probe)) + } + if len(g.NearestIDs) == 0 { + t.Error("NearestIDs should not be empty") + } + } +} diff --git a/pkg/lem/distill.go b/pkg/lem/distill.go index 95d06ba..cac403d 100644 --- a/pkg/lem/distill.go +++ b/pkg/lem/distill.go @@ -189,10 +189,14 @@ func RunDistill(args []string) { kept := 0 skipped := 0 + deduped := 0 totalStart := time.Now() ctx := context.Background() kernelStr := strings.TrimSpace(string(kernel)) + // Running duplicate index for grammar-profile deduplication. + var dedupIdx *ScoreIndex + for i, probe := range probes { var best *distillCandidate @@ -256,6 +260,16 @@ func RunDistill(args []string) { // Quality gate. if best != nil && best.Grammar.Composite >= *minScore { + // Duplicate filter: reject if grammar profile is too similar to an already-kept entry. 
+ bestFeatures := GrammarFeatures(best.Grammar) + if dedupIdx != nil && dedupIdx.IsDuplicate(bestFeatures, 0.02) { + deduped++ + fmt.Fprintf(os.Stderr, " ~ DEDUP %s (grammar profile too similar to existing)\n", probe.ID) + // Release GPU memory between probes to prevent incremental leak. + runtime.GC() + continue + } + // Save with sandwich prompt — kernel wraps the bare probe for training. example := TrainingExample{ Messages: []ChatMessage{ @@ -266,6 +280,14 @@ func RunDistill(args []string) { line, _ := json.Marshal(example) out.Write(append(line, '\n')) + // Add to dedup index. + entry := ScoredEntry{ID: probe.ID, Domain: probe.Domain, Grammar: best.Grammar} + if dedupIdx == nil { + dedupIdx, _ = NewScoreIndex([]ScoredEntry{entry}) + } else { + _ = dedupIdx.Insert(entry) + } + kept++ fmt.Fprintf(os.Stderr, " ✓ KEPT %s (g=%.1f, verbs=%d, nouns=%d, enr=%+.1f)\n", probe.ID, best.Grammar.Composite, @@ -293,9 +315,11 @@ func RunDistill(args []string) { fmt.Fprintf(os.Stderr, "Runs: %d per probe (%d total generations)\n", *runs, len(probes)**runs) fmt.Fprintf(os.Stderr, "Scorer: go-i18n/reversal grammar v3, gate >= %.1f\n", *minScore) fmt.Fprintf(os.Stderr, "Kept: %d\n", kept) + fmt.Fprintf(os.Stderr, "Deduped: %d\n", deduped) fmt.Fprintf(os.Stderr, "Skipped: %d\n", skipped) - if kept+skipped > 0 { - fmt.Fprintf(os.Stderr, "Pass rate: %.0f%%\n", float64(kept)/float64(kept+skipped)*100) + total := kept + deduped + skipped + if total > 0 { + fmt.Fprintf(os.Stderr, "Pass rate: %.0f%%\n", float64(kept)/float64(total)*100) } fmt.Fprintf(os.Stderr, "Output: %s\n", outputPath) fmt.Fprintf(os.Stderr, "Duration: %.0fs (%.1fm)\n", duration.Seconds(), duration.Minutes()) diff --git a/pkg/lem/features.go b/pkg/lem/features.go new file mode 100644 index 0000000..9ae1248 --- /dev/null +++ b/pkg/lem/features.go @@ -0,0 +1,70 @@ +package lem + +// GrammarFeatures extracts a 6-dimensional feature vector from a GrammarScore. 
+// The vector order is VocabRichness, TenseEntropy, QuestionRatio, DomainDepth,
+// VerbDiversity, NounDiversity. Composite is left out because it is a derived
+// weighted sum rather than an independent feature.
+func GrammarFeatures(gs GrammarScore) []float64 {
+	vec := make([]float64, 6)
+	vec[0] = gs.VocabRichness
+	vec[1] = gs.TenseEntropy
+	vec[2] = gs.QuestionRatio
+	vec[3] = float64(gs.DomainDepth)
+	vec[4] = float64(gs.VerbDiversity)
+	vec[5] = float64(gs.NounDiversity)
+	return vec
+}
+
+// GrammarFeatureLabels returns the axis names for a GrammarFeatures vector,
+// in the same order as its components.
+func GrammarFeatureLabels() []string {
+	labels := [...]string{
+		"vocab_richness",
+		"tense_entropy",
+		"question_ratio",
+		"domain_depth",
+		"verb_diversity",
+		"noun_diversity",
+	}
+	return labels[:]
+}
+
+// HeuristicFeatures extracts an 8-dimensional feature vector from HeuristicScores.
+// The vector order is ComplianceMarkers, FormulaicPreamble, FirstPerson,
+// CreativeForm, EngagementDepth, EmotionalRegister, Degeneration, EmptyBroken.
+// LEKScore is left out because it is a derived weighted sum.
+func HeuristicFeatures(hs HeuristicScores) []float64 {
+	vec := make([]float64, 8)
+	vec[0] = float64(hs.ComplianceMarkers)
+	vec[1] = float64(hs.FormulaicPreamble)
+	vec[2] = float64(hs.FirstPerson)
+	vec[3] = float64(hs.CreativeForm)
+	vec[4] = float64(hs.EngagementDepth)
+	vec[5] = float64(hs.EmotionalRegister)
+	vec[6] = float64(hs.Degeneration)
+	vec[7] = float64(hs.EmptyBroken)
+	return vec
+}
+
+// HeuristicFeatureLabels returns the axis names for a HeuristicFeatures
+// vector, in the same order as its components.
+func HeuristicFeatureLabels() []string {
+	labels := [...]string{
+		"compliance_markers",
+		"formulaic_preamble",
+		"first_person",
+		"creative_form",
+		"engagement_depth",
+		"emotional_register",
+		"degeneration",
+		"empty_broken",
+	}
+	return labels[:]
+}
+
+// CombinedFeatures returns the grammar features followed by the heuristic
+// features as a single 14-dimensional vector (6 + 8).
+func CombinedFeatures(gs GrammarScore, hs HeuristicScores) []float64 {
+	grammar := GrammarFeatures(gs)
+	heuristic := HeuristicFeatures(hs)
+	combined := make([]float64, 0, len(grammar)+len(heuristic))
+	combined = append(combined, grammar...)
+	combined = append(combined, heuristic...)
+	return combined
+}
+
+// CombinedFeatureLabels returns axis labels for the 14D combined vector.
+func CombinedFeatureLabels() []string { + return append(GrammarFeatureLabels(), HeuristicFeatureLabels()...) +} diff --git a/pkg/lem/features_test.go b/pkg/lem/features_test.go new file mode 100644 index 0000000..52d3dfb --- /dev/null +++ b/pkg/lem/features_test.go @@ -0,0 +1,121 @@ +package lem + +import ( + "testing" +) + +func TestGrammarFeatures_Length(t *testing.T) { + gs := GrammarScore{ + VocabRichness: 0.15, + TenseEntropy: 1.2, + QuestionRatio: 0.3, + DomainDepth: 5, + VerbDiversity: 12, + NounDiversity: 18, + Composite: 65.0, + } + vec := GrammarFeatures(gs) + if len(vec) != 6 { + t.Fatalf("expected 6 features, got %d", len(vec)) + } +} + +func TestGrammarFeatures_Values(t *testing.T) { + gs := GrammarScore{ + VocabRichness: 0.15, + TenseEntropy: 1.2, + QuestionRatio: 0.3, + DomainDepth: 5, + VerbDiversity: 12, + NounDiversity: 18, + Composite: 65.0, + } + vec := GrammarFeatures(gs) + if vec[0] != 0.15 { + t.Errorf("vec[0] = %f, want 0.15", vec[0]) + } + if vec[1] != 1.2 { + t.Errorf("vec[1] = %f, want 1.2", vec[1]) + } + if vec[3] != 5.0 { + t.Errorf("vec[3] = %f, want 5.0 (DomainDepth)", vec[3]) + } +} + +func TestHeuristicFeatures_Length(t *testing.T) { + hs := HeuristicScores{ + ComplianceMarkers: 2, + FormulaicPreamble: 1, + FirstPerson: 3, + CreativeForm: 4, + EngagementDepth: 5, + EmotionalRegister: 6, + Degeneration: 0, + EmptyBroken: 0, + LEKScore: 42.0, + } + vec := HeuristicFeatures(hs) + if len(vec) != 8 { + t.Fatalf("expected 8 features, got %d", len(vec)) + } +} + +func TestHeuristicFeatures_Values(t *testing.T) { + hs := HeuristicScores{ + ComplianceMarkers: 2, + FormulaicPreamble: 1, + FirstPerson: 3, + CreativeForm: 4, + EngagementDepth: 5, + EmotionalRegister: 6, + Degeneration: 7, + EmptyBroken: 0, + } + vec := HeuristicFeatures(hs) + if vec[0] != 2.0 { + t.Errorf("vec[0] = %f, want 2.0 (ComplianceMarkers)", vec[0]) + } + if vec[6] != 7.0 { + t.Errorf("vec[6] = %f, want 7.0 (Degeneration)", vec[6]) + } +} + +func 
TestCombinedFeatures_Length(t *testing.T) { + gs := GrammarScore{Composite: 50} + hs := HeuristicScores{LEKScore: 30} + vec := CombinedFeatures(gs, hs) + if len(vec) != 14 { + t.Fatalf("expected 14 features, got %d", len(vec)) + } +} + +func TestGrammarFeatureLabels(t *testing.T) { + labels := GrammarFeatureLabels() + if len(labels) != 6 { + t.Fatalf("expected 6 labels, got %d", len(labels)) + } + if labels[0] != "vocab_richness" { + t.Errorf("labels[0] = %q, want vocab_richness", labels[0]) + } +} + +func TestHeuristicFeatureLabels(t *testing.T) { + labels := HeuristicFeatureLabels() + if len(labels) != 8 { + t.Fatalf("expected 8 labels, got %d", len(labels)) + } + if labels[4] != "engagement_depth" { + t.Errorf("labels[4] = %q, want engagement_depth", labels[4]) + } +} + +func TestCombinedFeatureLabels(t *testing.T) { + labels := CombinedFeatureLabels() + if len(labels) != 14 { + t.Fatalf("expected 14 labels, got %d", len(labels)) + } + // First 6 are grammar, next 8 are heuristic. + if labels[6] != "compliance_markers" { + t.Errorf("labels[6] = %q, want compliance_markers", labels[6]) + } +}