LEM/pkg/lem/coverage.go
Snider c701c2e0af feat(lem): integrate Poindexter for spatial score indexing and analytics
- Add feature vector extraction (6D grammar, 8D heuristic, 14D combined)
- Add KDTree ScoreIndex with cosine distance for probe clustering
- Add score distribution analytics (percentiles, variance, skewness)
- Add grammar-profile dedup filtering to distill pipeline
- Add spatial gap detection (FindGaps) for coverage analysis
- Wire analytics into coverage CLI (PrintScoreAnalytics)

New files: features.go, cluster.go, analytics.go + tests
Modified: distill.go (dedup filter), coverage.go (analytics output)
Dep: github.com/Snider/Poindexter

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-22 21:26:06 +00:00

178 lines
5.3 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package lem
import (
"flag"
"fmt"
"log"
"os"
"strings"
)
// RunCoverage is the CLI entry point for the coverage command.
// Analyzes seed coverage and shows underrepresented areas.
//
// Flags:
//
//	--db  DuckDB database path; falls back to the LEM_DB environment
//	      variable when not given.
//
// The process exits with a non-zero status on any fatal error (missing
// database path, unreachable database, failed region query).
func RunCoverage(args []string) {
	fs := flag.NewFlagSet("coverage", flag.ExitOnError)
	dbPath := fs.String("db", "", "DuckDB database path (defaults to LEM_DB env)")
	if err := fs.Parse(args); err != nil {
		log.Fatalf("parse flags: %v", err)
	}
	if *dbPath == "" {
		*dbPath = os.Getenv("LEM_DB")
	}
	if *dbPath == "" {
		fmt.Fprintln(os.Stderr, "error: --db or LEM_DB required")
		os.Exit(1)
	}
	db, err := OpenDB(*dbPath)
	if err != nil {
		log.Fatalf("open db: %v", err)
	}
	defer db.Close()
	var total int
	if err := db.conn.QueryRow("SELECT count(*) FROM seeds").Scan(&total); err != nil {
		log.Fatalf("No seeds table. Run: lem import-all first")
	}
	fmt.Println("LEM Seed Coverage Analysis")
	fmt.Println("==================================================")
	fmt.Printf("\nTotal seeds: %d\n", total)
	// Region distribution: bucket raw region tags into coarse language
	// groups via LIKE patterns, smallest group first so gaps surface at
	// the top of the listing.
	fmt.Println("\nRegion distribution (underrepresented first):")
	rows, err := db.conn.Query(`
SELECT
CASE
WHEN region LIKE '%cn%' THEN 'cn (Chinese)'
WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en (English)'
WHEN region LIKE '%ru%' THEN 'ru (Russian)'
WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de (German)'
WHEN region LIKE '%es%' THEN 'es (Spanish)'
WHEN region LIKE '%fr%' THEN 'fr (French)'
WHEN region LIKE '%latam%' THEN 'latam (LatAm)'
WHEN region LIKE '%africa%' THEN 'africa'
WHEN region LIKE '%eu%' THEN 'eu (European)'
WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me (MidEast)'
WHEN region LIKE '%multi%' THEN 'multilingual'
WHEN region LIKE '%weak%' THEN 'weak-langs'
ELSE 'other'
END AS lang_group,
count(*) AS n,
count(DISTINCT domain) AS domains
FROM seeds GROUP BY lang_group ORDER BY n ASC
`)
	if err != nil {
		log.Fatalf("query regions: %v", err)
	}
	type regionRow struct {
		group   string
		n       int
		domains int
	}
	var regionRows []regionRow
	for rows.Next() {
		var r regionRow
		// BUG FIX: the Scan error was previously discarded, which would
		// have silently printed zero-valued rows on a scan failure.
		if err := rows.Scan(&r.group, &r.n, &r.domains); err != nil {
			rows.Close()
			log.Fatalf("scan region row: %v", err)
		}
		regionRows = append(regionRows, r)
	}
	// BUG FIX: rows.Err() was never checked, so an error that aborted
	// iteration mid-way would have been silently ignored.
	if err := rows.Err(); err != nil {
		rows.Close()
		log.Fatalf("iterate regions: %v", err)
	}
	rows.Close()
	// Guard against an empty seeds table: the original divided by
	// len(regionRows) unconditionally.
	if len(regionRows) > 0 {
		avg := float64(total) / float64(len(regionRows))
		for _, r := range regionRows {
			// Bar scales with group size relative to the average, capped at 40 chars.
			barLen := min(int(float64(r.n)/avg*10), 40)
			bar := strings.Repeat("#", barLen)
			gap := ""
			if float64(r.n) < avg*0.5 {
				gap = " <- UNDERREPRESENTED"
			}
			fmt.Printf(" %-22s %6d (%4d domains) %s%s\n", r.group, r.n, r.domains, bar, gap)
		}
	}
	// Top 10 domains. Best-effort: a query failure just truncates the section.
	fmt.Println("\nTop 10 domains (most seeds):")
	topRows, err := db.conn.Query(`
SELECT domain, count(*) AS n FROM seeds
WHERE domain != '' GROUP BY domain ORDER BY n DESC LIMIT 10
`)
	if err == nil {
		for topRows.Next() {
			var domain string
			var n int
			// BUG FIX: check the Scan error instead of printing zero values.
			if err := topRows.Scan(&domain, &n); err != nil {
				break
			}
			fmt.Printf(" %-40s %5d\n", domain, n)
		}
		topRows.Close()
	}
	// Bottom 10 domains (same best-effort policy as above).
	fmt.Println("\nBottom 10 domains (fewest seeds, min 5):")
	bottomRows, err := db.conn.Query(`
SELECT domain, count(*) AS n FROM seeds
WHERE domain != '' GROUP BY domain HAVING count(*) >= 5 ORDER BY n ASC LIMIT 10
`)
	if err == nil {
		for bottomRows.Next() {
			var domain string
			var n int
			// BUG FIX: check the Scan error instead of printing zero values.
			if err := bottomRows.Scan(&domain, &n); err != nil {
				break
			}
			fmt.Printf(" %-40s %5d\n", domain, n)
		}
		bottomRows.Close()
	}
	fmt.Println("\nSuggested expansion areas:")
	fmt.Println(" - Japanese, Korean, Thai, Vietnamese (no seeds found)")
	fmt.Println(" - Hindi/Urdu, Bengali, Tamil (South Asian)")
	fmt.Println(" - Swahili, Yoruba, Amharic (Sub-Saharan Africa)")
	fmt.Println(" - Indigenous languages (Quechua, Nahuatl, Aymara)")
}
// PrintScoreAnalytics prints score distribution statistics and gap analysis
// for a set of scored entries. Use after scoring responses with grammar v3.
func PrintScoreAnalytics(entries []ScoredEntry) {
	// Nothing to report for an empty batch.
	if len(entries) == 0 {
		fmt.Println("No scored entries to analyse.")
		return
	}
	summary := ScoreSummary(entries)

	// Composite-score distribution across all entries.
	fmt.Println("\nGrammar Score Distribution")
	fmt.Println("==================================================")
	fmt.Printf(" Entries: %d\n", summary.Total)
	comp := summary.CompositeStats
	fmt.Printf(" Mean: %.1f\n", comp.Mean)
	fmt.Printf(" Median: %.1f\n", comp.Median)
	fmt.Printf(" StdDev: %.1f\n", comp.StdDev)
	fmt.Printf(" Range: %.1f %.1f\n", comp.Min, comp.Max)
	fmt.Printf(" P25: %.1f\n", comp.P25)
	fmt.Printf(" P75: %.1f\n", comp.P75)
	fmt.Printf(" P90: %.1f\n", comp.P90)
	fmt.Printf(" Skewness: %.2f\n", comp.Skewness)

	// One row of summary statistics per scoring axis.
	fmt.Println("\nPer-Axis Statistics")
	fmt.Println("--------------------------------------------------")
	fmt.Printf(" %-20s %8s %8s %8s %8s\n", "Feature", "Mean", "StdDev", "Min", "Max")
	for _, axis := range summary.AxisStats {
		fmt.Printf(" %-20s %8.3f %8.3f %8.3f %8.3f\n",
			axis.Name, axis.Stats.Mean, axis.Stats.StdDev, axis.Stats.Min, axis.Stats.Max)
	}

	// Spatial gap analysis needs at least three entries; the guard above
	// also means the neighbour count passed to FindGaps is always 3.
	if len(entries) < 3 {
		return
	}
	gaps := FindGaps(entries, 3)
	if len(gaps) == 0 {
		return
	}
	fmt.Println("\nTop 10 Coverage Gaps (worst first)")
	fmt.Println("--------------------------------------------------")
	limit := min(10, len(gaps))
	for i := 0; i < limit; i++ {
		gap := gaps[i]
		fmt.Printf(" #%d avg_dist=%.4f nearest=%v\n", i+1, gap.AvgDistance, gap.NearestIDs)
	}
}