Replace passthrough() + stdlib flag.FlagSet anti-pattern with proper cobra integration.

Every Run* function now takes a typed *Opts struct and returns error. Flags are registered via cli.StringFlag/IntFlag/etc. Commands participate in the Core lifecycle with full cobra flag parsing.

- 6 command groups: gen, score, data, export, infra, mon
- 25 commands converted, 0 passthrough() calls remain
- Delete passthrough() helper from lem.go
- Update export_test.go to use ExportOpts struct

Co-Authored-By: Virgil <virgil@lethean.io>
174 lines
5.1 KiB
Go
174 lines
5.1 KiB
Go
package lem
|
||
|
||
import (
|
||
"fmt"
|
||
"os"
|
||
"strings"
|
||
)
|
||
|
||
// CoverageOpts carries the settings accepted by the coverage command.
type CoverageOpts struct {
	// DB is the DuckDB database path (defaults to LEM_DB env).
	DB string
}
|
||
|
||
// RunCoverage analyses seed coverage and shows underrepresented areas.
|
||
func RunCoverage(cfg CoverageOpts) error {
|
||
if cfg.DB == "" {
|
||
cfg.DB = os.Getenv("LEM_DB")
|
||
}
|
||
if cfg.DB == "" {
|
||
return fmt.Errorf("--db or LEM_DB required")
|
||
}
|
||
|
||
db, err := OpenDB(cfg.DB)
|
||
if err != nil {
|
||
return fmt.Errorf("open db: %w", err)
|
||
}
|
||
defer db.Close()
|
||
|
||
var total int
|
||
if err := db.conn.QueryRow("SELECT count(*) FROM seeds").Scan(&total); err != nil {
|
||
return fmt.Errorf("no seeds table — run: lem import-all first")
|
||
}
|
||
|
||
fmt.Println("LEM Seed Coverage Analysis")
|
||
fmt.Println("==================================================")
|
||
fmt.Printf("\nTotal seeds: %d\n", total)
|
||
|
||
// Region distribution.
|
||
fmt.Println("\nRegion distribution (underrepresented first):")
|
||
rows, err := db.conn.Query(`
|
||
SELECT
|
||
CASE
|
||
WHEN region LIKE '%cn%' THEN 'cn (Chinese)'
|
||
WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en (English)'
|
||
WHEN region LIKE '%ru%' THEN 'ru (Russian)'
|
||
WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de (German)'
|
||
WHEN region LIKE '%es%' THEN 'es (Spanish)'
|
||
WHEN region LIKE '%fr%' THEN 'fr (French)'
|
||
WHEN region LIKE '%latam%' THEN 'latam (LatAm)'
|
||
WHEN region LIKE '%africa%' THEN 'africa'
|
||
WHEN region LIKE '%eu%' THEN 'eu (European)'
|
||
WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me (MidEast)'
|
||
WHEN region LIKE '%multi%' THEN 'multilingual'
|
||
WHEN region LIKE '%weak%' THEN 'weak-langs'
|
||
ELSE 'other'
|
||
END AS lang_group,
|
||
count(*) AS n,
|
||
count(DISTINCT domain) AS domains
|
||
FROM seeds GROUP BY lang_group ORDER BY n ASC
|
||
`)
|
||
if err != nil {
|
||
return fmt.Errorf("query regions: %w", err)
|
||
}
|
||
|
||
type regionRow struct {
|
||
group string
|
||
n int
|
||
domains int
|
||
}
|
||
var regionRows []regionRow
|
||
for rows.Next() {
|
||
var r regionRow
|
||
rows.Scan(&r.group, &r.n, &r.domains)
|
||
regionRows = append(regionRows, r)
|
||
}
|
||
rows.Close()
|
||
|
||
avg := float64(total) / float64(len(regionRows))
|
||
for _, r := range regionRows {
|
||
barLen := min(int(float64(r.n)/avg*10), 40)
|
||
bar := strings.Repeat("#", barLen)
|
||
gap := ""
|
||
if float64(r.n) < avg*0.5 {
|
||
gap = " <- UNDERREPRESENTED"
|
||
}
|
||
fmt.Printf(" %-22s %6d (%4d domains) %s%s\n", r.group, r.n, r.domains, bar, gap)
|
||
}
|
||
|
||
// Top 10 domains.
|
||
fmt.Println("\nTop 10 domains (most seeds):")
|
||
topRows, err := db.conn.Query(`
|
||
SELECT domain, count(*) AS n FROM seeds
|
||
WHERE domain != '' GROUP BY domain ORDER BY n DESC LIMIT 10
|
||
`)
|
||
if err == nil {
|
||
for topRows.Next() {
|
||
var domain string
|
||
var n int
|
||
topRows.Scan(&domain, &n)
|
||
fmt.Printf(" %-40s %5d\n", domain, n)
|
||
}
|
||
topRows.Close()
|
||
}
|
||
|
||
// Bottom 10 domains.
|
||
fmt.Println("\nBottom 10 domains (fewest seeds, min 5):")
|
||
bottomRows, err := db.conn.Query(`
|
||
SELECT domain, count(*) AS n FROM seeds
|
||
WHERE domain != '' GROUP BY domain HAVING count(*) >= 5 ORDER BY n ASC LIMIT 10
|
||
`)
|
||
if err == nil {
|
||
for bottomRows.Next() {
|
||
var domain string
|
||
var n int
|
||
bottomRows.Scan(&domain, &n)
|
||
fmt.Printf(" %-40s %5d\n", domain, n)
|
||
}
|
||
bottomRows.Close()
|
||
}
|
||
|
||
fmt.Println("\nSuggested expansion areas:")
|
||
fmt.Println(" - Japanese, Korean, Thai, Vietnamese (no seeds found)")
|
||
fmt.Println(" - Hindi/Urdu, Bengali, Tamil (South Asian)")
|
||
fmt.Println(" - Swahili, Yoruba, Amharic (Sub-Saharan Africa)")
|
||
fmt.Println(" - Indigenous languages (Quechua, Nahuatl, Aymara)")
|
||
|
||
return nil
|
||
}
|
||
|
||
// PrintScoreAnalytics prints score distribution statistics and gap analysis
|
||
// for a set of scored entries. Use after scoring responses with grammar v3.
|
||
func PrintScoreAnalytics(entries []ScoredEntry) {
|
||
if len(entries) == 0 {
|
||
fmt.Println("No scored entries to analyse.")
|
||
return
|
||
}
|
||
|
||
report := ScoreSummary(entries)
|
||
|
||
fmt.Println("\nGrammar Score Distribution")
|
||
fmt.Println("==================================================")
|
||
fmt.Printf(" Entries: %d\n", report.Total)
|
||
cs := report.CompositeStats
|
||
fmt.Printf(" Mean: %.1f\n", cs.Mean)
|
||
fmt.Printf(" Median: %.1f\n", cs.Median)
|
||
fmt.Printf(" StdDev: %.1f\n", cs.StdDev)
|
||
fmt.Printf(" Range: %.1f – %.1f\n", cs.Min, cs.Max)
|
||
fmt.Printf(" P25: %.1f\n", cs.P25)
|
||
fmt.Printf(" P75: %.1f\n", cs.P75)
|
||
fmt.Printf(" P90: %.1f\n", cs.P90)
|
||
fmt.Printf(" Skewness: %.2f\n", cs.Skewness)
|
||
|
||
fmt.Println("\nPer-Axis Statistics")
|
||
fmt.Println("--------------------------------------------------")
|
||
fmt.Printf(" %-20s %8s %8s %8s %8s\n", "Feature", "Mean", "StdDev", "Min", "Max")
|
||
for _, ax := range report.AxisStats {
|
||
fmt.Printf(" %-20s %8.3f %8.3f %8.3f %8.3f\n",
|
||
ax.Name, ax.Stats.Mean, ax.Stats.StdDev, ax.Stats.Min, ax.Stats.Max)
|
||
}
|
||
|
||
// Gap analysis.
|
||
if len(entries) >= 3 {
|
||
gaps := FindGaps(entries, min(3, len(entries)))
|
||
if len(gaps) > 0 {
|
||
fmt.Println("\nTop 10 Coverage Gaps (worst first)")
|
||
fmt.Println("--------------------------------------------------")
|
||
limit := min(10, len(gaps))
|
||
for i := range limit {
|
||
g := gaps[i]
|
||
fmt.Printf(" #%d avg_dist=%.4f nearest=%v\n", i+1, g.AvgDistance, g.NearestIDs)
|
||
}
|
||
}
|
||
}
|
||
}
|