LEM/pkg/lem/coverage.go

179 lines
5.3 KiB
Go
Raw Normal View History

package lem
import (
"flag"
"fmt"
"log"
"os"
"strings"
)
// RunCoverage is the CLI entry point for the coverage command.
//
// It parses args for a --db flag (falling back to the LEM_DB environment
// variable), opens the database, and prints to stdout:
//   - the total number of rows in the seeds table,
//   - a per-language-group region distribution with a bar chart, flagging
//     groups whose count is below half of the per-group average,
//   - the ten domains with the most seeds,
//   - the ten domains with the fewest seeds among those with at least five,
//   - a static list of suggested expansion areas.
//
// A missing database path exits with status 1. Failure to open the database,
// count seeds, or read the region distribution terminates via log.Fatalf.
// Failures in the top/bottom domain listings are logged and the report
// continues, since those sections are supplementary.
func RunCoverage(args []string) {
	fs := flag.NewFlagSet("coverage", flag.ExitOnError)
	dbPath := fs.String("db", "", "DuckDB database path (defaults to LEM_DB env)")
	if err := fs.Parse(args); err != nil {
		log.Fatalf("parse flags: %v", err)
	}
	if *dbPath == "" {
		*dbPath = os.Getenv("LEM_DB")
	}
	if *dbPath == "" {
		fmt.Fprintln(os.Stderr, "error: --db or LEM_DB required")
		os.Exit(1)
	}

	db, err := OpenDB(*dbPath)
	if err != nil {
		log.Fatalf("open db: %v", err)
	}
	defer db.Close()

	var total int
	if err := db.conn.QueryRow("SELECT count(*) FROM seeds").Scan(&total); err != nil {
		// Keep the hint, but also surface the underlying error: the failure
		// is not necessarily a missing table.
		log.Fatalf("No seeds table. Run: lem import-all first (%v)", err)
	}

	fmt.Println("LEM Seed Coverage Analysis")
	fmt.Println("==================================================")
	fmt.Printf("\nTotal seeds: %d\n", total)

	// Region distribution. The CASE expression buckets free-form region
	// strings into language groups; the first matching WHEN wins, so the
	// order of the branches matters.
	fmt.Println("\nRegion distribution (underrepresented first):")
	rows, err := db.conn.Query(`
		SELECT
			CASE
				WHEN region LIKE '%cn%' THEN 'cn (Chinese)'
				WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en (English)'
				WHEN region LIKE '%ru%' THEN 'ru (Russian)'
				WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de (German)'
				WHEN region LIKE '%es%' THEN 'es (Spanish)'
				WHEN region LIKE '%fr%' THEN 'fr (French)'
				WHEN region LIKE '%latam%' THEN 'latam (LatAm)'
				WHEN region LIKE '%africa%' THEN 'africa'
				WHEN region LIKE '%eu%' THEN 'eu (European)'
				WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me (MidEast)'
				WHEN region LIKE '%multi%' THEN 'multilingual'
				WHEN region LIKE '%weak%' THEN 'weak-langs'
				ELSE 'other'
			END AS lang_group,
			count(*) AS n,
			count(DISTINCT domain) AS domains
		FROM seeds GROUP BY lang_group ORDER BY n ASC
	`)
	if err != nil {
		log.Fatalf("query regions: %v", err)
	}

	type regionRow struct {
		group   string
		n       int
		domains int
	}
	var regionRows []regionRow
	for rows.Next() {
		var r regionRow
		if err := rows.Scan(&r.group, &r.n, &r.domains); err != nil {
			rows.Close()
			log.Fatalf("scan region row: %v", err)
		}
		regionRows = append(regionRows, r)
	}
	// Next returning false may mean an iteration error rather than the end
	// of the result set.
	if err := rows.Err(); err != nil {
		rows.Close()
		log.Fatalf("read region rows: %v", err)
	}
	rows.Close()

	if len(regionRows) > 0 {
		// Average seeds per language group; used to scale the bars and to
		// decide which groups are flagged.
		avg := float64(total) / float64(len(regionRows))
		for _, r := range regionRows {
			// Ten '#' per average-sized group, capped at 40. Guard against a
			// non-positive average so strings.Repeat never sees a bad count.
			barLen := 0
			if avg > 0 {
				barLen = max(0, min(int(float64(r.n)/avg*10), 40))
			}
			bar := strings.Repeat("#", barLen)
			gap := ""
			if float64(r.n) < avg*0.5 {
				gap = " <- UNDERREPRESENTED"
			}
			fmt.Printf(" %-22s %6d (%4d domains) %s%s\n", r.group, r.n, r.domains, bar, gap)
		}
	}

	// printDomainCounts runs a (domain, count) query and prints one line per
	// row. Errors are logged rather than fatal so the rest of the report is
	// still produced.
	printDomainCounts := func(label, query string) {
		domainRows, err := db.conn.Query(query)
		if err != nil {
			log.Printf("query %s: %v", label, err)
			return
		}
		defer domainRows.Close()
		for domainRows.Next() {
			var domain string
			var n int
			if err := domainRows.Scan(&domain, &n); err != nil {
				log.Printf("scan %s row: %v", label, err)
				return
			}
			fmt.Printf(" %-40s %5d\n", domain, n)
		}
		if err := domainRows.Err(); err != nil {
			log.Printf("read %s rows: %v", label, err)
		}
	}

	// Top 10 domains.
	fmt.Println("\nTop 10 domains (most seeds):")
	printDomainCounts("top domains", `
		SELECT domain, count(*) AS n FROM seeds
		WHERE domain != '' GROUP BY domain ORDER BY n DESC LIMIT 10
	`)

	// Bottom 10 domains; domains with fewer than five seeds are excluded by
	// the HAVING clause.
	fmt.Println("\nBottom 10 domains (fewest seeds, min 5):")
	printDomainCounts("bottom domains", `
		SELECT domain, count(*) AS n FROM seeds
		WHERE domain != '' GROUP BY domain HAVING count(*) >= 5 ORDER BY n ASC LIMIT 10
	`)

	// Static suggestions: this list is hard-coded and not derived from the
	// queries above.
	fmt.Println("\nSuggested expansion areas:")
	fmt.Println(" - Japanese, Korean, Thai, Vietnamese (no seeds found)")
	fmt.Println(" - Hindi/Urdu, Bengali, Tamil (South Asian)")
	fmt.Println(" - Swahili, Yoruba, Amharic (Sub-Saharan Africa)")
	fmt.Println(" - Indigenous languages (Quechua, Nahuatl, Aymara)")
}
// PrintScoreAnalytics writes a human-readable report for a batch of scored
// entries to stdout: composite score statistics, a per-axis statistics table,
// and, when at least three entries are supplied, up to ten coverage gaps as
// returned by FindGaps. With no entries it prints a short notice and returns.
func PrintScoreAnalytics(entries []ScoredEntry) {
	if len(entries) == 0 {
		fmt.Println("No scored entries to analyse.")
		return
	}

	summary := ScoreSummary(entries)

	// Composite score block.
	fmt.Println("\nGrammar Score Distribution")
	fmt.Println("==================================================")
	fmt.Printf(" Entries: %d\n", summary.Total)
	composite := summary.CompositeStats
	fmt.Printf(" Mean: %.1f\n", composite.Mean)
	fmt.Printf(" Median: %.1f\n", composite.Median)
	fmt.Printf(" StdDev: %.1f\n", composite.StdDev)
	fmt.Printf(" Range: %.1f %.1f\n", composite.Min, composite.Max)
	fmt.Printf(" P25: %.1f\n", composite.P25)
	fmt.Printf(" P75: %.1f\n", composite.P75)
	fmt.Printf(" P90: %.1f\n", composite.P90)
	fmt.Printf(" Skewness: %.2f\n", composite.Skewness)

	// Per-axis table: header row followed by one row per axis.
	fmt.Println("\nPer-Axis Statistics")
	fmt.Println("--------------------------------------------------")
	fmt.Printf(" %-20s %8s %8s %8s %8s\n", "Feature", "Mean", "StdDev", "Min", "Max")
	for _, axis := range summary.AxisStats {
		st := axis.Stats
		fmt.Printf(" %-20s %8.3f %8.3f %8.3f %8.3f\n",
			axis.Name, st.Mean, st.StdDev, st.Min, st.Max)
	}

	// Gap analysis is only attempted with three or more entries.
	if len(entries) < 3 {
		return
	}
	// The second argument is 3: with len(entries) >= 3 that is what
	// min(3, len(entries)) evaluates to.
	gaps := FindGaps(entries, 3)
	if len(gaps) == 0 {
		return
	}

	fmt.Println("\nTop 10 Coverage Gaps (worst first)")
	fmt.Println("--------------------------------------------------")
	shown := min(10, len(gaps))
	for rank := 1; rank <= shown; rank++ {
		gap := gaps[rank-1]
		fmt.Printf(" #%d avg_dist=%.4f nearest=%v\n", rank, gap.AvgDistance, gap.NearestIDs)
	}
}