LEM/pkg/lem/coverage.go
Snider 56eda1a081 refactor: migrate all 25 commands from passthrough to cobra framework
Replace passthrough() + stdlib flag.FlagSet anti-pattern with proper
cobra integration. Every Run* function now takes a typed *Opts struct
and returns error. Flags registered via cli.StringFlag/IntFlag/etc.
Commands participate in Core lifecycle with full cobra flag parsing.

- 6 command groups: gen, score, data, export, infra, mon
- 25 commands converted, 0 passthrough() calls remain
- Delete passthrough() helper from lem.go
- Update export_test.go to use ExportOpts struct

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-23 03:32:53 +00:00

174 lines
5.1 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package lem
import (
"fmt"
"os"
"strings"
)
// CoverageOpts carries the settings accepted by the coverage command.
type CoverageOpts struct {
	// DB is the path to the DuckDB database file. When left empty,
	// RunCoverage falls back to the LEM_DB environment variable.
	DB string
}
// RunCoverage analyses seed coverage and shows underrepresented areas.
//
// It opens the database named by cfg.DB (falling back to the LEM_DB
// environment variable when cfg.DB is empty) and prints to stdout: the total
// seed count, a per-language-group distribution with a bar chart, the ten
// domains with the most seeds, the ten domains with the fewest seeds (among
// those with at least five), and a static list of suggested expansion areas.
//
// It returns an error when no database path is configured, the database
// cannot be opened, the seeds table cannot be counted, or the region
// distribution cannot be queried or read. The two domain listings are
// best-effort: a failing query leaves its section empty rather than aborting
// the report.
func RunCoverage(cfg CoverageOpts) error {
	if cfg.DB == "" {
		cfg.DB = os.Getenv("LEM_DB")
	}
	if cfg.DB == "" {
		return fmt.Errorf("--db or LEM_DB required")
	}

	db, err := OpenDB(cfg.DB)
	if err != nil {
		return fmt.Errorf("open db: %w", err)
	}
	defer db.Close()

	var total int
	if err := db.conn.QueryRow("SELECT count(*) FROM seeds").Scan(&total); err != nil {
		// Keep the actionable hint, but wrap the cause so failures other than
		// a missing table are still diagnosable.
		return fmt.Errorf("no seeds table — run: lem import-all first (%w)", err)
	}

	fmt.Println("LEM Seed Coverage Analysis")
	fmt.Println("==================================================")
	fmt.Printf("\nTotal seeds: %d\n", total)

	// Region distribution.
	fmt.Println("\nRegion distribution (underrepresented first):")
	rows, err := db.conn.Query(`
		SELECT
			CASE
				WHEN region LIKE '%cn%' THEN 'cn (Chinese)'
				WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en (English)'
				WHEN region LIKE '%ru%' THEN 'ru (Russian)'
				WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de (German)'
				WHEN region LIKE '%es%' THEN 'es (Spanish)'
				WHEN region LIKE '%fr%' THEN 'fr (French)'
				WHEN region LIKE '%latam%' THEN 'latam (LatAm)'
				WHEN region LIKE '%africa%' THEN 'africa'
				WHEN region LIKE '%eu%' THEN 'eu (European)'
				WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me (MidEast)'
				WHEN region LIKE '%multi%' THEN 'multilingual'
				WHEN region LIKE '%weak%' THEN 'weak-langs'
				ELSE 'other'
			END AS lang_group,
			count(*) AS n,
			count(DISTINCT domain) AS domains
		FROM seeds GROUP BY lang_group ORDER BY n ASC
	`)
	if err != nil {
		return fmt.Errorf("query regions: %w", err)
	}

	type regionRow struct {
		group   string
		n       int
		domains int
	}
	var regionRows []regionRow
	for rows.Next() {
		var r regionRow
		if err := rows.Scan(&r.group, &r.n, &r.domains); err != nil {
			rows.Close()
			return fmt.Errorf("scan region row: %w", err)
		}
		regionRows = append(regionRows, r)
	}
	// Next returning false can mean either exhaustion or failure; only Err
	// tells the two apart.
	iterErr := rows.Err()
	rows.Close()
	if iterErr != nil {
		return fmt.Errorf("read region rows: %w", iterErr)
	}

	// Skip the chart entirely when there are no groups so the average is
	// never computed as 0/0.
	if len(regionRows) > 0 {
		avg := float64(total) / float64(len(regionRows))
		for _, r := range regionRows {
			// Ten '#' per "average-sized" group, capped at 40 characters.
			barLen := min(int(float64(r.n)/avg*10), 40)
			bar := strings.Repeat("#", barLen)
			gap := ""
			if float64(r.n) < avg*0.5 {
				gap = " <- UNDERREPRESENTED"
			}
			fmt.Printf(" %-22s %6d (%4d domains) %s%s\n", r.group, r.n, r.domains, bar, gap)
		}
	}

	// printDomains runs a query yielding (domain, count) rows and prints one
	// line per row. It is deliberately best-effort: a query failure prints
	// nothing, and a row that cannot be scanned is skipped instead of being
	// printed as an empty domain with a zero count.
	printDomains := func(query string) {
		domainRows, err := db.conn.Query(query)
		if err != nil {
			return
		}
		defer domainRows.Close()
		for domainRows.Next() {
			var domain string
			var n int
			if err := domainRows.Scan(&domain, &n); err != nil {
				continue
			}
			fmt.Printf(" %-40s %5d\n", domain, n)
		}
	}

	// Top 10 domains.
	fmt.Println("\nTop 10 domains (most seeds):")
	printDomains(`
		SELECT domain, count(*) AS n FROM seeds
		WHERE domain != '' GROUP BY domain ORDER BY n DESC LIMIT 10
	`)

	// Bottom 10 domains.
	fmt.Println("\nBottom 10 domains (fewest seeds, min 5):")
	printDomains(`
		SELECT domain, count(*) AS n FROM seeds
		WHERE domain != '' GROUP BY domain HAVING count(*) >= 5 ORDER BY n ASC LIMIT 10
	`)

	fmt.Println("\nSuggested expansion areas:")
	fmt.Println(" - Japanese, Korean, Thai, Vietnamese (no seeds found)")
	fmt.Println(" - Hindi/Urdu, Bengali, Tamil (South Asian)")
	fmt.Println(" - Swahili, Yoruba, Amharic (Sub-Saharan Africa)")
	fmt.Println(" - Indigenous languages (Quechua, Nahuatl, Aymara)")
	return nil
}
// PrintScoreAnalytics writes a statistics report for a set of scored entries
// to stdout: the composite score distribution, a per-axis table, and, when
// at least three entries are supplied, up to ten coverage gaps. Use after
// scoring responses with grammar v3. With no entries it prints a short notice
// and returns.
func PrintScoreAnalytics(entries []ScoredEntry) {
	const (
		heavyRule = "=================================================="
		lightRule = "--------------------------------------------------"
	)

	count := len(entries)
	if count == 0 {
		fmt.Println("No scored entries to analyse.")
		return
	}

	summary := ScoreSummary(entries)

	// Composite score distribution.
	fmt.Println("\nGrammar Score Distribution")
	fmt.Println(heavyRule)
	fmt.Printf(" Entries: %d\n", summary.Total)
	dist := summary.CompositeStats
	fmt.Printf(" Mean: %.1f\n", dist.Mean)
	fmt.Printf(" Median: %.1f\n", dist.Median)
	fmt.Printf(" StdDev: %.1f\n", dist.StdDev)
	fmt.Printf(" Range: %.1f %.1f\n", dist.Min, dist.Max)
	fmt.Printf(" P25: %.1f\n", dist.P25)
	fmt.Printf(" P75: %.1f\n", dist.P75)
	fmt.Printf(" P90: %.1f\n", dist.P90)
	fmt.Printf(" Skewness: %.2f\n", dist.Skewness)

	// One table row per scored axis.
	fmt.Println("\nPer-Axis Statistics")
	fmt.Println(lightRule)
	fmt.Printf(" %-20s %8s %8s %8s %8s\n", "Feature", "Mean", "StdDev", "Min", "Max")
	for _, axis := range summary.AxisStats {
		s := axis.Stats
		fmt.Printf(" %-20s %8.3f %8.3f %8.3f %8.3f\n",
			axis.Name, s.Mean, s.StdDev, s.Min, s.Max)
	}

	// Gap analysis is the final section and is only attempted with three or
	// more entries.
	if count < 3 {
		return
	}
	k := min(3, count)
	worst := FindGaps(entries, k)
	if len(worst) == 0 {
		return
	}

	fmt.Println("\nTop 10 Coverage Gaps (worst first)")
	fmt.Println(lightRule)
	shown := min(10, len(worst))
	for idx, gap := range worst[:shown] {
		fmt.Printf(" #%d avg_dist=%.4f nearest=%v\n", idx+1, gap.AvgDistance, gap.NearestIDs)
	}
}