package lem import ( "fmt" "os" "strings" ) // CoverageOpts holds configuration for the coverage command. type CoverageOpts struct { DB string // DuckDB database path (defaults to LEM_DB env) } // RunCoverage analyses seed coverage and shows underrepresented areas. func RunCoverage(cfg CoverageOpts) error { if cfg.DB == "" { cfg.DB = os.Getenv("LEM_DB") } if cfg.DB == "" { return fmt.Errorf("--db or LEM_DB required") } db, err := OpenDB(cfg.DB) if err != nil { return fmt.Errorf("open db: %w", err) } defer db.Close() var total int if err := db.conn.QueryRow("SELECT count(*) FROM seeds").Scan(&total); err != nil { return fmt.Errorf("no seeds table — run: lem import-all first") } fmt.Println("LEM Seed Coverage Analysis") fmt.Println("==================================================") fmt.Printf("\nTotal seeds: %d\n", total) // Region distribution. fmt.Println("\nRegion distribution (underrepresented first):") rows, err := db.conn.Query(` SELECT CASE WHEN region LIKE '%cn%' THEN 'cn (Chinese)' WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en (English)' WHEN region LIKE '%ru%' THEN 'ru (Russian)' WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de (German)' WHEN region LIKE '%es%' THEN 'es (Spanish)' WHEN region LIKE '%fr%' THEN 'fr (French)' WHEN region LIKE '%latam%' THEN 'latam (LatAm)' WHEN region LIKE '%africa%' THEN 'africa' WHEN region LIKE '%eu%' THEN 'eu (European)' WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me (MidEast)' WHEN region LIKE '%multi%' THEN 'multilingual' WHEN region LIKE '%weak%' THEN 'weak-langs' ELSE 'other' END AS lang_group, count(*) AS n, count(DISTINCT domain) AS domains FROM seeds GROUP BY lang_group ORDER BY n ASC `) if err != nil { return fmt.Errorf("query regions: %w", err) } type regionRow struct { group string n int domains int } var regionRows []regionRow for rows.Next() { var r regionRow rows.Scan(&r.group, &r.n, &r.domains) regionRows = append(regionRows, r) } rows.Close() avg := float64(total) / float64(len(regionRows)) for _, r := range regionRows { barLen := min(int(float64(r.n)/avg*10), 40) bar := strings.Repeat("#", barLen) gap := "" if float64(r.n) < avg*0.5 { gap = " <- UNDERREPRESENTED" } fmt.Printf(" %-22s %6d (%4d domains) %s%s\n", r.group, r.n, r.domains, bar, gap) } // Top 10 domains. fmt.Println("\nTop 10 domains (most seeds):") topRows, err := db.conn.Query(` SELECT domain, count(*) AS n FROM seeds WHERE domain != '' GROUP BY domain ORDER BY n DESC LIMIT 10 `) if err == nil { for topRows.Next() { var domain string var n int topRows.Scan(&domain, &n) fmt.Printf(" %-40s %5d\n", domain, n) } topRows.Close() } // Bottom 10 domains. fmt.Println("\nBottom 10 domains (fewest seeds, min 5):") bottomRows, err := db.conn.Query(` SELECT domain, count(*) AS n FROM seeds WHERE domain != '' GROUP BY domain HAVING count(*) >= 5 ORDER BY n ASC LIMIT 10 `) if err == nil { for bottomRows.Next() { var domain string var n int bottomRows.Scan(&domain, &n) fmt.Printf(" %-40s %5d\n", domain, n) } bottomRows.Close() } fmt.Println("\nSuggested expansion areas:") fmt.Println(" - Japanese, Korean, Thai, Vietnamese (no seeds found)") fmt.Println(" - Hindi/Urdu, Bengali, Tamil (South Asian)") fmt.Println(" - Swahili, Yoruba, Amharic (Sub-Saharan Africa)") fmt.Println(" - Indigenous languages (Quechua, Nahuatl, Aymara)") return nil } // PrintScoreAnalytics prints score distribution statistics and gap analysis // for a set of scored entries. Use after scoring responses with grammar v3. func PrintScoreAnalytics(entries []ScoredEntry) { if len(entries) == 0 { fmt.Println("No scored entries to analyse.") return } report := ScoreSummary(entries) fmt.Println("\nGrammar Score Distribution") fmt.Println("==================================================") fmt.Printf(" Entries: %d\n", report.Total) cs := report.CompositeStats fmt.Printf(" Mean: %.1f\n", cs.Mean) fmt.Printf(" Median: %.1f\n", cs.Median) fmt.Printf(" StdDev: %.1f\n", cs.StdDev) fmt.Printf(" Range: %.1f – %.1f\n", cs.Min, cs.Max) fmt.Printf(" P25: %.1f\n", cs.P25) fmt.Printf(" P75: %.1f\n", cs.P75) fmt.Printf(" P90: %.1f\n", cs.P90) fmt.Printf(" Skewness: %.2f\n", cs.Skewness) fmt.Println("\nPer-Axis Statistics") fmt.Println("--------------------------------------------------") fmt.Printf(" %-20s %8s %8s %8s %8s\n", "Feature", "Mean", "StdDev", "Min", "Max") for _, ax := range report.AxisStats { fmt.Printf(" %-20s %8.3f %8.3f %8.3f %8.3f\n", ax.Name, ax.Stats.Mean, ax.Stats.StdDev, ax.Stats.Min, ax.Stats.Max) } // Gap analysis. if len(entries) >= 3 { gaps := FindGaps(entries, min(3, len(entries))) if len(gaps) > 0 { fmt.Println("\nTop 10 Coverage Gaps (worst first)") fmt.Println("--------------------------------------------------") limit := min(10, len(gaps)) for i := range limit { g := gaps[i] fmt.Printf(" #%d avg_dist=%.4f nearest=%v\n", i+1, g.AvgDistance, g.NearestIDs) } } } }