LEM/pkg/lem/cluster.go
Snider c701c2e0af feat(lem): integrate Poindexter for spatial score indexing and analytics
- Add feature vector extraction (6D grammar, 8D heuristic, 14D combined)
- Add KDTree ScoreIndex with cosine distance for probe clustering
- Add score distribution analytics (percentiles, variance, skewness)
- Add grammar-profile dedup filtering to distill pipeline
- Add spatial gap detection (FindGaps) for coverage analysis
- Wire analytics into coverage CLI (PrintScoreAnalytics)

New files: features.go, cluster.go, analytics.go + tests
Modified: distill.go (dedup filter), coverage.go (analytics output)
Dep: github.com/Snider/Poindexter

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-22 21:26:06 +00:00

240 lines
5.9 KiB
Go

package lem
import (
"cmp"
"errors"
"fmt"
"slices"
poindexter "github.com/Snider/Poindexter"
)
// ScoredEntry pairs a response ID with its grammar scores for indexing.
// It is the value type stored in a ScoreIndex: NewScoreIndex and Insert use ID
// as the tree point ID and derive the point coordinates from Grammar via
// GrammarFeatures.
type ScoredEntry struct {
// ID identifies the scored response; it is also used as the KDPoint ID.
ID string
// Domain is carried along with the entry but is not read by the code in this file.
// NOTE(review): presumably a content/probe domain label — confirm with callers.
Domain string
// Grammar holds the grammar scores that GrammarFeatures converts to coordinates.
Grammar GrammarScore
}
// ScoreIndex wraps a Poindexter KDTree over grammar feature vectors.
// Construct it with NewScoreIndex: every method calls through tree without a
// nil check.
type ScoreIndex struct {
// tree holds one point per ScoredEntry, with the entry itself as the point value.
tree *poindexter.KDTree[ScoredEntry]
}
// NewScoreIndex indexes entries in a Poindexter KDTree configured with the
// cosine distance metric. Each entry becomes one tree point whose ID is the
// entry ID, whose coordinates are GrammarFeatures(entry.Grammar) (the 6D
// grammar feature vector), and whose value is the entry itself.
//
// Coordinates are passed through raw, without normalization: cosine distance
// is angle-based, so magnitude differences between vectors are tolerated.
//
// It returns an error when entries is empty or when the tree cannot be built;
// the tree error is wrapped so callers can still inspect it.
func NewScoreIndex(entries []ScoredEntry) (*ScoreIndex, error) {
	if len(entries) == 0 {
		return nil, errors.New("lem: no entries to index")
	}

	// One KD point per entry, in input order.
	pts := make([]poindexter.KDPoint[ScoredEntry], 0, len(entries))
	for _, entry := range entries {
		pts = append(pts, poindexter.KDPoint[ScoredEntry]{
			ID:     entry.ID,
			Coords: GrammarFeatures(entry.Grammar),
			Value:  entry,
		})
	}

	kd, err := poindexter.NewKDTree(pts, poindexter.WithMetric(poindexter.CosineDistance{}))
	if err != nil {
		return nil, fmt.Errorf("lem: build tree: %w", err)
	}
	return &ScoreIndex{tree: kd}, nil
}
// Len reports how many entries the underlying tree currently holds.
func (idx *ScoreIndex) Len() int {
	size := idx.tree.Len()
	return size
}
// Nearest looks up the indexed entry closest to query.
//
// It returns the entry, its distance from query as reported by the tree, and
// true. When the tree reports no result it returns the zero ScoredEntry, a
// distance of 0, and false.
func (idx *ScoreIndex) Nearest(query []float64) (ScoredEntry, float64, bool) {
	hit, distance, found := idx.tree.Nearest(query)
	if found {
		return hit.Value, distance, true
	}
	return ScoredEntry{}, 0, false
}
// KNearest asks the tree for the k entries closest to query.
//
// The first result holds the matched entries and the second the distances the
// tree returned with them; entry i corresponds to point i of the tree result.
// The entries slice is always non-nil, even when nothing matched.
func (idx *ScoreIndex) KNearest(query []float64, k int) ([]ScoredEntry, []float64) {
	found, distances := idx.tree.KNearest(query, k)

	// Unwrap the tree points into their stored entries.
	matches := make([]ScoredEntry, 0, len(found))
	for _, p := range found {
		matches = append(matches, p.Value)
	}
	return matches, distances
}
// Radius asks the tree for every entry within distance r of query.
//
// The first result holds the matched entries and the second the distances the
// tree returned with them; entry i corresponds to point i of the tree result.
// The entries slice is always non-nil, even when nothing matched.
func (idx *ScoreIndex) Radius(query []float64, r float64) ([]ScoredEntry, []float64) {
	found, distances := idx.tree.Radius(query, r)

	// Unwrap the tree points into their stored entries.
	matches := make([]ScoredEntry, 0, len(found))
	for _, p := range found {
		matches = append(matches, p.Value)
	}
	return matches, distances
}
// IsDuplicate reports whether the nearest indexed entry lies at or within
// threshold distance of query. It is false when the tree has no nearest
// result. Intended for the distill step, to reject near-identical outputs.
func (idx *ScoreIndex) IsDuplicate(query []float64, threshold float64) bool {
	_, nearest, found := idx.tree.Nearest(query)
	if !found {
		return false
	}
	return nearest <= threshold
}
// Insert adds entry to the index as a new tree point, using the entry ID as
// the point ID and GrammarFeatures(entry.Grammar) as its coordinates.
//
// It returns an error naming the entry ID when the tree rejects the point.
func (idx *ScoreIndex) Insert(entry ScoredEntry) error {
	inserted := idx.tree.Insert(poindexter.KDPoint[ScoredEntry]{
		ID:     entry.ID,
		Coords: GrammarFeatures(entry.Grammar),
		Value:  entry,
	})
	if inserted {
		return nil
	}
	return fmt.Errorf("lem: failed to insert %s (duplicate ID?)", entry.ID)
}
// Points returns the entries stored in the index, in the order the tree
// reports its points. The result is a fresh, non-nil slice.
func (idx *ScoreIndex) Points() []ScoredEntry {
	stored := idx.tree.Points()

	all := make([]ScoredEntry, 0, len(stored))
	for _, p := range stored {
		all = append(all, p.Value)
	}
	return all
}
// featureRange holds the min/max for one axis of the feature space.
// FindGaps fills one per axis from the observed feature values, and sampleGrid
// interpolates probe coordinates between the two bounds (both inclusive).
type featureRange struct{ min, max float64 }
// GapReport describes a region of quality-space with poor coverage.
// FindGaps produces one report per probe point for which the index returned
// at least one neighbour.
type GapReport struct {
// Probe is the sample point coordinates in grammar feature space.
Probe []float64
// AvgDistance is the arithmetic mean of the distances from Probe to the
// nearest indexed entries returned for it (at most k of them).
AvgDistance float64
// NearestIDs lists the IDs of those nearest entries, in the order the
// index returned them.
NearestIDs []string
}
// FindGaps probes the grammar feature space on a regular grid and measures how
// far each probe is from the indexed entries, so that sparsely covered regions
// stand out.
//
// The grid spans the per-axis [min, max] of the entries' 6D grammar features
// with 3 steps per axis (3^6 = 729 probes). For each probe the k nearest
// entries are looked up (k is capped at len(entries)) and their distances are
// averaged; probes for which the index returns no distances are skipped.
//
// The reports are sorted by AvgDistance descending, i.e. worst coverage first.
// It returns nil when fewer than two entries are given or when the index
// cannot be built.
func FindGaps(entries []ScoredEntry, k int) []GapReport {
	if len(entries) < 2 {
		return nil
	}
	index, err := NewScoreIndex(entries)
	if err != nil {
		return nil
	}

	const (
		dim   = 6 // axes of the grammar feature vector
		steps = 3 // grid points per axis
	)

	// Seed every axis with the first entry, then widen with the rest.
	bounds := make([]featureRange, dim)
	seed := GrammarFeatures(entries[0].Grammar)
	for axis := range bounds {
		bounds[axis] = featureRange{min: seed[axis], max: seed[axis]}
	}
	for _, entry := range entries[1:] {
		feat := GrammarFeatures(entry.Grammar)
		for axis := range bounds {
			if v := feat[axis]; v < bounds[axis].min {
				bounds[axis].min = v
			}
			if v := feat[axis]; v > bounds[axis].max {
				bounds[axis].max = v
			}
		}
	}

	grid := sampleGrid(bounds, steps, dim)

	// Never ask for more neighbours than there are entries.
	k = min(k, len(entries))

	var reports []GapReport
	for _, point := range grid {
		near, distances := index.KNearest(point, k)
		if len(distances) == 0 {
			continue
		}

		var sum float64
		for _, d := range distances {
			sum += d
		}

		nearIDs := make([]string, 0, len(near))
		for _, e := range near {
			nearIDs = append(nearIDs, e.ID)
		}

		reports = append(reports, GapReport{
			Probe:       point,
			AvgDistance: sum / float64(len(distances)),
			NearestIDs:  nearIDs,
		})
	}

	// Largest average distance first: those are the worst-covered regions.
	slices.SortFunc(reports, func(x, y GapReport) int {
		return cmp.Compare(y.AvgDistance, x.AvgDistance)
	})
	return reports
}
// sampleGrid generates probe points on a regular grid across the feature
// space: every axis is divided into steps evenly spaced values covering its
// [min, max] range (both ends included), and one probe is emitted for every
// combination of per-axis values, steps^dim probes in total.
//
// Only the first dim entries of ranges are used. Probes are ordered with
// axis 0 varying slowest and the last axis varying fastest, and each probe is
// an independent slice of length dim.
//
// It returns nil when dim is not positive, when steps is below 2 (a single
// step cannot span a range), or when ranges supplies fewer than dim axes.
func sampleGrid(ranges []featureRange, steps, dim int) [][]float64 {
	if dim <= 0 || steps < 2 || len(ranges) < dim {
		return nil
	}
	// Ignore surplus axes instead of indexing past the per-axis table.
	ranges = ranges[:dim]

	axisValues := make([][]float64, dim)
	total := 1
	for i, r := range ranges {
		vals := make([]float64, steps)
		for j := range vals {
			vals[j] = r.min + (r.max-r.min)*float64(j)/float64(steps-1)
		}
		axisValues[i] = vals
		total *= steps
	}

	probes := make([][]float64, 0, total)
	// position[axis] is the index into axisValues[axis] for the current probe;
	// it is advanced like an odometer whose last digit turns fastest.
	position := make([]int, dim)
	for range total {
		probe := make([]float64, dim)
		for axis, p := range position {
			probe[axis] = axisValues[axis][p]
		}
		probes = append(probes, probe)

		for axis := dim - 1; axis >= 0; axis-- {
			position[axis]++
			if position[axis] < steps {
				break
			}
			position[axis] = 0 // carry into the next slower axis
		}
	}
	return probes
}