- Add feature vector extraction (6D grammar, 8D heuristic, 14D combined) - Add KDTree ScoreIndex with cosine distance for probe clustering - Add score distribution analytics (percentiles, variance, skewness) - Add grammar-profile dedup filtering to distill pipeline - Add spatial gap detection (FindGaps) for coverage analysis - Wire analytics into coverage CLI (PrintScoreAnalytics) New files: features.go, cluster.go, analytics.go + tests Modified: distill.go (dedup filter), coverage.go (analytics output) Dep: github.com/Snider/Poindexter Co-Authored-By: Virgil <virgil@lethean.io>
240 lines
5.9 KiB
Go
240 lines
5.9 KiB
Go
package lem
|
|
|
|
import (
|
|
"cmp"
|
|
"errors"
|
|
"fmt"
|
|
"slices"
|
|
|
|
poindexter "github.com/Snider/Poindexter"
|
|
)
|
|
|
|
// ScoredEntry pairs a response ID with its grammar scores for indexing.
type ScoredEntry struct {
	// ID uniquely identifies the response; it is also used as the KDPoint ID.
	ID string
	// Domain labels the response's domain (carried through, not indexed on).
	Domain string
	// Grammar holds the scores converted to a feature vector at index time.
	Grammar GrammarScore
}
|
|
|
|
// ScoreIndex wraps a Poindexter KDTree over grammar feature vectors.
type ScoreIndex struct {
	// tree holds one KDPoint per ScoredEntry; coordinates come from
	// GrammarFeatures and distances use the cosine metric.
	tree *poindexter.KDTree[ScoredEntry]
}
|
|
|
|
// NewScoreIndex builds a KDTree from scored entries using cosine distance
|
|
// on 6D grammar feature vectors. Raw coordinates are used (no normalization)
|
|
// because cosine distance is angle-based and handles magnitude differences.
|
|
func NewScoreIndex(entries []ScoredEntry) (*ScoreIndex, error) {
|
|
if len(entries) == 0 {
|
|
return nil, errors.New("lem: no entries to index")
|
|
}
|
|
|
|
points := make([]poindexter.KDPoint[ScoredEntry], len(entries))
|
|
for i, e := range entries {
|
|
points[i] = poindexter.KDPoint[ScoredEntry]{
|
|
ID: e.ID,
|
|
Coords: GrammarFeatures(e.Grammar),
|
|
Value: e,
|
|
}
|
|
}
|
|
|
|
tree, err := poindexter.NewKDTree(points,
|
|
poindexter.WithMetric(poindexter.CosineDistance{}),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("lem: build tree: %w", err)
|
|
}
|
|
|
|
return &ScoreIndex{tree: tree}, nil
|
|
}
|
|
|
|
// Len returns the number of indexed entries.
// Delegates directly to the underlying KDTree.
func (idx *ScoreIndex) Len() int {
	return idx.tree.Len()
}
|
|
|
|
// Nearest finds the closest scored entry to the query vector.
|
|
func (idx *ScoreIndex) Nearest(query []float64) (ScoredEntry, float64, bool) {
|
|
pt, dist, ok := idx.tree.Nearest(query)
|
|
if !ok {
|
|
return ScoredEntry{}, 0, false
|
|
}
|
|
return pt.Value, dist, true
|
|
}
|
|
|
|
// KNearest finds the k closest scored entries to the query vector.
|
|
func (idx *ScoreIndex) KNearest(query []float64, k int) ([]ScoredEntry, []float64) {
|
|
pts, dists := idx.tree.KNearest(query, k)
|
|
entries := make([]ScoredEntry, len(pts))
|
|
for i, pt := range pts {
|
|
entries[i] = pt.Value
|
|
}
|
|
return entries, dists
|
|
}
|
|
|
|
// Radius finds all entries within distance r of the query vector.
|
|
func (idx *ScoreIndex) Radius(query []float64, r float64) ([]ScoredEntry, []float64) {
|
|
pts, dists := idx.tree.Radius(query, r)
|
|
entries := make([]ScoredEntry, len(pts))
|
|
for i, pt := range pts {
|
|
entries[i] = pt.Value
|
|
}
|
|
return entries, dists
|
|
}
|
|
|
|
// IsDuplicate returns true if any indexed entry is within threshold distance
|
|
// of the query vector. Use during distill to reject near-identical outputs.
|
|
func (idx *ScoreIndex) IsDuplicate(query []float64, threshold float64) bool {
|
|
_, dist, ok := idx.tree.Nearest(query)
|
|
return ok && dist <= threshold
|
|
}
|
|
|
|
// Insert adds a new scored entry to the index.
|
|
func (idx *ScoreIndex) Insert(entry ScoredEntry) error {
|
|
features := GrammarFeatures(entry.Grammar)
|
|
pt := poindexter.KDPoint[ScoredEntry]{
|
|
ID: entry.ID,
|
|
Coords: features,
|
|
Value: entry,
|
|
}
|
|
if !idx.tree.Insert(pt) {
|
|
return fmt.Errorf("lem: failed to insert %s (duplicate ID?)", entry.ID)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Points returns all indexed entries.
|
|
func (idx *ScoreIndex) Points() []ScoredEntry {
|
|
pts := idx.tree.Points()
|
|
entries := make([]ScoredEntry, len(pts))
|
|
for i, pt := range pts {
|
|
entries[i] = pt.Value
|
|
}
|
|
return entries
|
|
}
|
|
|
|
// featureRange holds the min/max for one axis, used to bound grid sampling.
type featureRange struct{ min, max float64 }
|
|
|
|
// GapReport describes a region of quality-space with poor coverage.
type GapReport struct {
	// Probe is the sample point coordinates in grammar feature space.
	Probe []float64
	// AvgDistance is the average distance to the k nearest indexed entries.
	// Larger values indicate a worse-covered region.
	AvgDistance float64
	// NearestIDs lists the IDs of the k nearest entries.
	NearestIDs []string
}
|
|
|
|
// FindGaps samples the grammar feature space and identifies regions
|
|
// where the k-nearest indexed entries are far away (poor coverage).
|
|
// Returns gap reports sorted by AvgDistance descending (worst gaps first).
|
|
func FindGaps(entries []ScoredEntry, k int) []GapReport {
|
|
if len(entries) < 2 {
|
|
return nil
|
|
}
|
|
|
|
idx, err := NewScoreIndex(entries)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
// Compute per-axis min/max for sampling range.
|
|
dim := 6
|
|
ranges := make([]featureRange, dim)
|
|
first := GrammarFeatures(entries[0].Grammar)
|
|
for i := range dim {
|
|
ranges[i] = featureRange{min: first[i], max: first[i]}
|
|
}
|
|
for _, e := range entries[1:] {
|
|
f := GrammarFeatures(e.Grammar)
|
|
for i := range dim {
|
|
if f[i] < ranges[i].min {
|
|
ranges[i].min = f[i]
|
|
}
|
|
if f[i] > ranges[i].max {
|
|
ranges[i].max = f[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sample a grid of probe points across the feature space.
|
|
// 3 steps per axis = 3^6 = 729 probe points.
|
|
steps := 3
|
|
probes := sampleGrid(ranges, steps, dim)
|
|
|
|
if k > len(entries) {
|
|
k = len(entries)
|
|
}
|
|
|
|
var gaps []GapReport
|
|
for _, probe := range probes {
|
|
neighbours, dists := idx.KNearest(probe, k)
|
|
if len(dists) == 0 {
|
|
continue
|
|
}
|
|
avg := 0.0
|
|
for _, d := range dists {
|
|
avg += d
|
|
}
|
|
avg /= float64(len(dists))
|
|
|
|
ids := make([]string, len(neighbours))
|
|
for i, n := range neighbours {
|
|
ids[i] = n.ID
|
|
}
|
|
gaps = append(gaps, GapReport{
|
|
Probe: probe,
|
|
AvgDistance: avg,
|
|
NearestIDs: ids,
|
|
})
|
|
}
|
|
|
|
// Sort by worst coverage first.
|
|
slices.SortFunc(gaps, func(a, b GapReport) int {
|
|
return cmp.Compare(b.AvgDistance, a.AvgDistance) // descending
|
|
})
|
|
|
|
return gaps
|
|
}
|
|
|
|
// sampleGrid generates probe points across the feature space
|
|
// by stepping through each axis's [min, max] range.
|
|
func sampleGrid(ranges []featureRange, steps, dim int) [][]float64 {
|
|
if dim == 0 || steps < 2 {
|
|
return nil
|
|
}
|
|
|
|
axisValues := make([][]float64, dim)
|
|
for i, r := range ranges {
|
|
vals := make([]float64, steps)
|
|
for j := range steps {
|
|
vals[j] = r.min + (r.max-r.min)*float64(j)/float64(steps-1)
|
|
}
|
|
axisValues[i] = vals
|
|
}
|
|
|
|
total := 1
|
|
for range dim {
|
|
total *= steps
|
|
}
|
|
probes := make([][]float64, 0, total)
|
|
current := make([]float64, dim)
|
|
var generate func(axis int)
|
|
generate = func(axis int) {
|
|
if axis == dim {
|
|
probe := make([]float64, dim)
|
|
copy(probe, current)
|
|
probes = append(probes, probe)
|
|
return
|
|
}
|
|
for _, v := range axisValues[axis] {
|
|
current[axis] = v
|
|
generate(axis + 1)
|
|
}
|
|
}
|
|
generate(0)
|
|
|
|
return probes
|
|
}
|
|
|