package lem

import (
	"cmp"
	"errors"
	"fmt"
	"slices"

	poindexter "github.com/Snider/Poindexter"
)

// ScoredEntry pairs a response ID with its grammar scores for indexing.
type ScoredEntry struct {
	ID      string
	Domain  string
	Grammar GrammarScore
}

// ScoreIndex wraps a Poindexter KDTree over grammar feature vectors.
type ScoreIndex struct {
	tree *poindexter.KDTree[ScoredEntry]
}

// NewScoreIndex builds a KDTree from scored entries using cosine distance
// on 6D grammar feature vectors. Raw coordinates are used (no normalization)
// because cosine distance is angle-based and handles magnitude differences.
func NewScoreIndex(entries []ScoredEntry) (*ScoreIndex, error) {
	if len(entries) == 0 {
		return nil, errors.New("lem: no entries to index")
	}
	pts := make([]poindexter.KDPoint[ScoredEntry], len(entries))
	for i := range entries {
		pts[i] = poindexter.KDPoint[ScoredEntry]{
			ID:     entries[i].ID,
			Coords: GrammarFeatures(entries[i].Grammar),
			Value:  entries[i],
		}
	}
	tree, err := poindexter.NewKDTree(pts,
		poindexter.WithMetric(poindexter.CosineDistance{}),
	)
	if err != nil {
		return nil, fmt.Errorf("lem: build tree: %w", err)
	}
	return &ScoreIndex{tree: tree}, nil
}

// Len returns the number of indexed entries.
func (idx *ScoreIndex) Len() int {
	return idx.tree.Len()
}

// Nearest finds the closest scored entry to the query vector.
func (idx *ScoreIndex) Nearest(query []float64) (ScoredEntry, float64, bool) {
	point, dist, found := idx.tree.Nearest(query)
	if !found {
		var zero ScoredEntry
		return zero, 0, false
	}
	return point.Value, dist, true
}

// KNearest finds the k closest scored entries to the query vector.
func (idx *ScoreIndex) KNearest(query []float64, k int) ([]ScoredEntry, []float64) {
	points, dists := idx.tree.KNearest(query, k)
	out := make([]ScoredEntry, 0, len(points))
	for _, p := range points {
		out = append(out, p.Value)
	}
	return out, dists
}

// Radius finds all entries within distance r of the query vector.
func (idx *ScoreIndex) Radius(query []float64, r float64) ([]ScoredEntry, []float64) { pts, dists := idx.tree.Radius(query, r) entries := make([]ScoredEntry, len(pts)) for i, pt := range pts { entries[i] = pt.Value } return entries, dists } // IsDuplicate returns true if any indexed entry is within threshold distance // of the query vector. Use during distill to reject near-identical outputs. func (idx *ScoreIndex) IsDuplicate(query []float64, threshold float64) bool { _, dist, ok := idx.tree.Nearest(query) return ok && dist <= threshold } // Insert adds a new scored entry to the index. func (idx *ScoreIndex) Insert(entry ScoredEntry) error { features := GrammarFeatures(entry.Grammar) pt := poindexter.KDPoint[ScoredEntry]{ ID: entry.ID, Coords: features, Value: entry, } if !idx.tree.Insert(pt) { return fmt.Errorf("lem: failed to insert %s (duplicate ID?)", entry.ID) } return nil } // Points returns all indexed entries. func (idx *ScoreIndex) Points() []ScoredEntry { pts := idx.tree.Points() entries := make([]ScoredEntry, len(pts)) for i, pt := range pts { entries[i] = pt.Value } return entries } // featureRange holds the min/max for one axis. type featureRange struct{ min, max float64 } // GapReport describes a region of quality-space with poor coverage. type GapReport struct { // Probe is the sample point coordinates in grammar feature space. Probe []float64 // AvgDistance is the average distance to the k nearest indexed entries. AvgDistance float64 // NearestIDs lists the IDs of the k nearest entries. NearestIDs []string } // FindGaps samples the grammar feature space and identifies regions // where the k-nearest indexed entries are far away (poor coverage). // Returns gap reports sorted by AvgDistance descending (worst gaps first). func FindGaps(entries []ScoredEntry, k int) []GapReport { if len(entries) < 2 { return nil } idx, err := NewScoreIndex(entries) if err != nil { return nil } // Compute per-axis min/max for sampling range. 
dim := 6 ranges := make([]featureRange, dim) first := GrammarFeatures(entries[0].Grammar) for i := range dim { ranges[i] = featureRange{min: first[i], max: first[i]} } for _, e := range entries[1:] { f := GrammarFeatures(e.Grammar) for i := range dim { if f[i] < ranges[i].min { ranges[i].min = f[i] } if f[i] > ranges[i].max { ranges[i].max = f[i] } } } // Sample a grid of probe points across the feature space. // 3 steps per axis = 3^6 = 729 probe points. steps := 3 probes := sampleGrid(ranges, steps, dim) if k > len(entries) { k = len(entries) } var gaps []GapReport for _, probe := range probes { neighbours, dists := idx.KNearest(probe, k) if len(dists) == 0 { continue } avg := 0.0 for _, d := range dists { avg += d } avg /= float64(len(dists)) ids := make([]string, len(neighbours)) for i, n := range neighbours { ids[i] = n.ID } gaps = append(gaps, GapReport{ Probe: probe, AvgDistance: avg, NearestIDs: ids, }) } // Sort by worst coverage first. slices.SortFunc(gaps, func(a, b GapReport) int { return cmp.Compare(b.AvgDistance, a.AvgDistance) // descending }) return gaps } // sampleGrid generates probe points across the feature space // by stepping through each axis's [min, max] range. func sampleGrid(ranges []featureRange, steps, dim int) [][]float64 { if dim == 0 || steps < 2 { return nil } axisValues := make([][]float64, dim) for i, r := range ranges { vals := make([]float64, steps) for j := range steps { vals[j] = r.min + (r.max-r.min)*float64(j)/float64(steps-1) } axisValues[i] = vals } total := 1 for range dim { total *= steps } probes := make([][]float64, 0, total) current := make([]float64, dim) var generate func(axis int) generate = func(axis int) { if axis == dim { probe := make([]float64, dim) copy(probe, current) probes = append(probes, probe) return } for _, v := range axisValues[axis] { current[axis] = v generate(axis + 1) } } generate(0) return probes }