- Add feature vector extraction (6D grammar, 8D heuristic, 14D combined) - Add KDTree ScoreIndex with cosine distance for probe clustering - Add score distribution analytics (percentiles, variance, skewness) - Add grammar-profile dedup filtering to distill pipeline - Add spatial gap detection (FindGaps) for coverage analysis - Wire analytics into coverage CLI (PrintScoreAnalytics) New files: features.go, cluster.go, analytics.go + tests Modified: distill.go (dedup filter), coverage.go (analytics output) Dep: github.com/Snider/Poindexter Co-Authored-By: Virgil <virgil@lethean.io>
240 lines
5.9 KiB
Go
240 lines
5.9 KiB
Go
package lem
|
|
|
|
import (
|
|
"cmp"
|
|
"errors"
|
|
"fmt"
|
|
"slices"
|
|
|
|
poindexter "github.com/Snider/Poindexter"
|
|
)
|
|
|
|
// ScoredEntry pairs a response ID with its grammar scores for indexing.
type ScoredEntry struct {
	// ID uniquely identifies the response; it is also used as the KDPoint ID.
	ID string
	// Domain labels the response's domain (carried through, not indexed on).
	Domain string
	// Grammar holds the scores converted to a feature vector at index time.
	Grammar GrammarScore
}
|
|
|
|
// ScoreIndex wraps a Poindexter KDTree over grammar feature vectors.
type ScoreIndex struct {
	// tree holds one KDPoint per ScoredEntry; coordinates come from
	// GrammarFeatures and distances use the cosine metric.
	tree *poindexter.KDTree[ScoredEntry]
}
|
|
|
|
// NewScoreIndex builds a KDTree from scored entries using cosine distance
|
|
// on 6D grammar feature vectors. Raw coordinates are used (no normalization)
|
|
// because cosine distance is angle-based and handles magnitude differences.
|
|
func NewScoreIndex(entries []ScoredEntry) (*ScoreIndex, error) {
|
|
if len(entries) == 0 {
|
|
return nil, errors.New("lem: no entries to index")
|
|
}
|
|
|
|
points := make([]poindexter.KDPoint[ScoredEntry], len(entries))
|
|
for i, e := range entries {
|
|
points[i] = poindexter.KDPoint[ScoredEntry]{
|
|
ID: e.ID,
|
|
Coords: GrammarFeatures(e.Grammar),
|
|
Value: e,
|
|
}
|
|
}
|
|
|
|
tree, err := poindexter.NewKDTree(points,
|
|
poindexter.WithMetric(poindexter.CosineDistance{}),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("lem: build tree: %w", err)
|
|
}
|
|
|
|
return &ScoreIndex{tree: tree}, nil
|
|
}
|
|
|
|
// Len returns the number of indexed entries.
// Delegates directly to the underlying KDTree.
func (idx *ScoreIndex) Len() int {
	return idx.tree.Len()
}
|
|
|
|
// Nearest finds the closest scored entry to the query vector.
|
|
func (idx *ScoreIndex) Nearest(query []float64) (ScoredEntry, float64, bool) {
|
|
pt, dist, ok := idx.tree.Nearest(query)
|
|
if !ok {
|
|
return ScoredEntry{}, 0, false
|
|
}
|
|
return pt.Value, dist, true
|
|
}
|
|
|
|
// KNearest finds the k closest scored entries to the query vector.
|
|
func (idx *ScoreIndex) KNearest(query []float64, k int) ([]ScoredEntry, []float64) {
|
|
pts, dists := idx.tree.KNearest(query, k)
|
|
entries := make([]ScoredEntry, len(pts))
|
|
for i, pt := range pts {
|
|
entries[i] = pt.Value
|
|
}
|
|
return entries, dists
|
|
}
|
|
|
|
// Radius finds all entries within distance r of the query vector.
|
|
func (idx *ScoreIndex) Radius(query []float64, r float64) ([]ScoredEntry, []float64) {
|
|
pts, dists := idx.tree.Radius(query, r)
|
|
entries := make([]ScoredEntry, len(pts))
|
|
for i, pt := range pts {
|
|
entries[i] = pt.Value
|
|
}
|
|
return entries, dists
|
|
}
|
|
|
|
// IsDuplicate returns true if any indexed entry is within threshold distance
|
|
// of the query vector. Use during distill to reject near-identical outputs.
|
|
func (idx *ScoreIndex) IsDuplicate(query []float64, threshold float64) bool {
|
|
_, dist, ok := idx.tree.Nearest(query)
|
|
return ok && dist <= threshold
|
|
}
|
|
|
|
// Insert adds a new scored entry to the index.
|
|
func (idx *ScoreIndex) Insert(entry ScoredEntry) error {
|
|
features := GrammarFeatures(entry.Grammar)
|
|
pt := poindexter.KDPoint[ScoredEntry]{
|
|
ID: entry.ID,
|
|
Coords: features,
|
|
Value: entry,
|
|
}
|
|
if !idx.tree.Insert(pt) {
|
|
return fmt.Errorf("lem: failed to insert %s (duplicate ID?)", entry.ID)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Points returns all indexed entries.
|
|
func (idx *ScoreIndex) Points() []ScoredEntry {
|
|
pts := idx.tree.Points()
|
|
entries := make([]ScoredEntry, len(pts))
|
|
for i, pt := range pts {
|
|
entries[i] = pt.Value
|
|
}
|
|
return entries
|
|
}
|
|
|
|
// featureRange holds the min/max for one axis, used to bound grid sampling.
type featureRange struct{ min, max float64 }
|
|
|
|
// GapReport describes a region of quality-space with poor coverage.
type GapReport struct {
	// Probe is the sample point coordinates in grammar feature space.
	Probe []float64
	// AvgDistance is the average distance to the k nearest indexed entries.
	// Larger values indicate a worse-covered region.
	AvgDistance float64
	// NearestIDs lists the IDs of the k nearest entries.
	NearestIDs []string
}
|
|
|
|
// FindGaps samples the grammar feature space and identifies regions
|
|
// where the k-nearest indexed entries are far away (poor coverage).
|
|
// Returns gap reports sorted by AvgDistance descending (worst gaps first).
|
|
func FindGaps(entries []ScoredEntry, k int) []GapReport {
|
|
if len(entries) < 2 {
|
|
return nil
|
|
}
|
|
|
|
idx, err := NewScoreIndex(entries)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
// Compute per-axis min/max for sampling range.
|
|
dim := 6
|
|
ranges := make([]featureRange, dim)
|
|
first := GrammarFeatures(entries[0].Grammar)
|
|
for i := range dim {
|
|
ranges[i] = featureRange{min: first[i], max: first[i]}
|
|
}
|
|
for _, e := range entries[1:] {
|
|
f := GrammarFeatures(e.Grammar)
|
|
for i := range dim {
|
|
if f[i] < ranges[i].min {
|
|
ranges[i].min = f[i]
|
|
}
|
|
if f[i] > ranges[i].max {
|
|
ranges[i].max = f[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sample a grid of probe points across the feature space.
|
|
// 3 steps per axis = 3^6 = 729 probe points.
|
|
steps := 3
|
|
probes := sampleGrid(ranges, steps, dim)
|
|
|
|
if k > len(entries) {
|
|
k = len(entries)
|
|
}
|
|
|
|
var gaps []GapReport
|
|
for _, probe := range probes {
|
|
neighbours, dists := idx.KNearest(probe, k)
|
|
if len(dists) == 0 {
|
|
continue
|
|
}
|
|
avg := 0.0
|
|
for _, d := range dists {
|
|
avg += d
|
|
}
|
|
avg /= float64(len(dists))
|
|
|
|
ids := make([]string, len(neighbours))
|
|
for i, n := range neighbours {
|
|
ids[i] = n.ID
|
|
}
|
|
gaps = append(gaps, GapReport{
|
|
Probe: probe,
|
|
AvgDistance: avg,
|
|
NearestIDs: ids,
|
|
})
|
|
}
|
|
|
|
// Sort by worst coverage first.
|
|
slices.SortFunc(gaps, func(a, b GapReport) int {
|
|
return cmp.Compare(b.AvgDistance, a.AvgDistance) // descending
|
|
})
|
|
|
|
return gaps
|
|
}
|
|
|
|
// sampleGrid generates probe points across the feature space
|
|
// by stepping through each axis's [min, max] range.
|
|
func sampleGrid(ranges []featureRange, steps, dim int) [][]float64 {
|
|
if dim == 0 || steps < 2 {
|
|
return nil
|
|
}
|
|
|
|
axisValues := make([][]float64, dim)
|
|
for i, r := range ranges {
|
|
vals := make([]float64, steps)
|
|
for j := range steps {
|
|
vals[j] = r.min + (r.max-r.min)*float64(j)/float64(steps-1)
|
|
}
|
|
axisValues[i] = vals
|
|
}
|
|
|
|
total := 1
|
|
for range dim {
|
|
total *= steps
|
|
}
|
|
probes := make([][]float64, 0, total)
|
|
current := make([]float64, dim)
|
|
var generate func(axis int)
|
|
generate = func(axis int) {
|
|
if axis == dim {
|
|
probe := make([]float64, dim)
|
|
copy(probe, current)
|
|
probes = append(probes, probe)
|
|
return
|
|
}
|
|
for _, v := range axisValues[axis] {
|
|
current[axis] = v
|
|
generate(axis + 1)
|
|
}
|
|
}
|
|
generate(0)
|
|
|
|
return probes
|
|
}
|
|
|