package ml

import (
	"fmt"
	"io"
	"strings"
)

// regionRow holds a single row from the region distribution query.
type regionRow struct {
	group   string // normalized language-group label (the query's lang_group column)
	n       int    // number of seeds in the group
	domains int    // number of distinct domains in the group
}

// PrintCoverage analyzes seed coverage by region and domain, printing
// a report with bar chart visualization and gap recommendations.
//
// The report is written to w and consists of the total seed count, a
// per-region bar chart (smallest groups first), the ten domains with the
// most seeds, the ten domains with the fewest seeds (among those with at
// least five), and a fixed list of suggested expansion areas.
//
// An error is returned only when the total-count or region queries fail.
// The top/bottom domain sections are best-effort: a failing query is noted
// in the report and does not abort it. Write errors on w are not checked.
func PrintCoverage(db *DB, w io.Writer) error {
	const (
		barScale      = 10  // '#' characters drawn for a region sitting exactly at the average
		barMax        = 40  // hard cap on bar width
		underrepRatio = 0.5 // regions below this fraction of the average are flagged
	)

	rows, err := db.QueryRows("SELECT count(*) AS total FROM seeds")
	if err != nil {
		return fmt.Errorf("count seeds: %w (run: core ml import-all first)", err)
	}
	if len(rows) == 0 {
		return fmt.Errorf("no seeds table found (run: core ml import-all first)")
	}
	total := toInt(rows[0]["total"])

	fmt.Fprintln(w, "LEM Seed Coverage Analysis")
	fmt.Fprintln(w, "==================================================")
	fmt.Fprintf(w, "\nTotal seeds: %d\n", total)

	// Region distribution.
	regionRows, err := queryRegionDistribution(db)
	if err != nil {
		return fmt.Errorf("query regions: %w", err)
	}
	fmt.Fprintln(w, "\nRegion distribution (underrepresented first):")

	// Average seeds per region. Left at zero when there are no regions so the
	// division below is never performed with a zero denominator.
	var avg float64
	if len(regionRows) > 0 {
		avg = float64(total) / float64(len(regionRows))
	}
	for _, r := range regionRows {
		// Bar length is proportional to the region's share of the average.
		// Guard against avg <= 0 and clamp to [0, barMax]: strings.Repeat
		// panics on a negative count, and converting an infinite or NaN
		// float to int is implementation-defined.
		barLen := 0
		if avg > 0 {
			barLen = int(float64(r.n) / avg * barScale)
		}
		if barLen < 0 {
			barLen = 0
		}
		if barLen > barMax {
			barLen = barMax
		}
		bar := strings.Repeat("#", barLen)

		gap := ""
		if float64(r.n) < avg*underrepRatio {
			gap = " <- UNDERREPRESENTED"
		}
		fmt.Fprintf(w, " %-22s %6d (%4d domains) %s%s\n", r.group, r.n, r.domains, bar, gap)
	}

	// Top 10 domains.
	fmt.Fprintln(w, "\nTop 10 domains (most seeds):")
	printDomainCounts(db, w, `
		SELECT domain, count(*) AS n
		FROM seeds
		WHERE domain != ''
		GROUP BY domain
		ORDER BY n DESC
		LIMIT 10
	`)

	// Bottom 10 domains.
	fmt.Fprintln(w, "\nBottom 10 domains (fewest seeds, min 5):")
	printDomainCounts(db, w, `
		SELECT domain, count(*) AS n
		FROM seeds
		WHERE domain != ''
		GROUP BY domain
		HAVING count(*) >= 5
		ORDER BY n ASC
		LIMIT 10
	`)

	// NOTE(review): these suggestions are static text; nothing above checks
	// that the listed languages are actually absent from the seeds table.
	fmt.Fprintln(w, "\nSuggested expansion areas:")
	fmt.Fprintln(w, " - Japanese, Korean, Thai, Vietnamese (no seeds found)")
	fmt.Fprintln(w, " - Hindi/Urdu, Bengali, Tamil (South Asian)")
	fmt.Fprintln(w, " - Swahili, Yoruba, Amharic (Sub-Saharan Africa)")
	fmt.Fprintln(w, " - Indigenous languages (Quechua, Nahuatl, Aymara)")

	return nil
}

// printDomainCounts runs query, which is expected to yield "domain" and "n"
// columns, and writes one formatted line per result row to w.
//
// The section is best-effort: if the query fails, a single line describing
// the failure is written instead of rows, so the report does not show a
// header followed by unexplained silence.
func printDomainCounts(db *DB, w io.Writer, query string) {
	rows, err := db.QueryRows(query)
	if err != nil {
		fmt.Fprintf(w, " (query failed: %v)\n", err)
		return
	}
	for _, row := range rows {
		fmt.Fprintf(w, " %-40s %5d\n", strVal(row, "domain"), toInt(row["n"]))
	}
}

// queryRegionDistribution returns seed counts grouped by normalized language
// region, ordered ascending (underrepresented first).
//
// Each seed's region value is bucketed by the first matching LIKE pattern in
// the CASE expression below, so the order of the WHEN clauses matters; values
// matching no pattern fall into 'other'. Any query error is returned as-is
// with a nil slice.
func queryRegionDistribution(db *DB) ([]regionRow, error) {
	// NOTE(review): in the English branch, '%para%' already matches every
	// value that '%en_para%' matches ('_' is a single-character wildcard in
	// LIKE), so that pattern is redundant. Left unchanged to keep the
	// grouping identical.
	rows, err := db.QueryRows(`
		SELECT
			CASE
				WHEN region LIKE '%cn%' THEN 'cn (Chinese)'
				WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en (English)'
				WHEN region LIKE '%ru%' THEN 'ru (Russian)'
				WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de (German)'
				WHEN region LIKE '%es%' THEN 'es (Spanish)'
				WHEN region LIKE '%fr%' THEN 'fr (French)'
				WHEN region LIKE '%latam%' THEN 'latam (LatAm)'
				WHEN region LIKE '%africa%' THEN 'africa'
				WHEN region LIKE '%eu%' THEN 'eu (European)'
				WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me (MidEast)'
				WHEN region LIKE '%multi%' THEN 'multilingual'
				WHEN region LIKE '%weak%' THEN 'weak-langs'
				ELSE 'other'
			END AS lang_group,
			count(*) AS n,
			count(DISTINCT domain) AS domains
		FROM seeds
		GROUP BY lang_group
		ORDER BY n ASC
	`)
	if err != nil {
		return nil, err
	}

	result := make([]regionRow, 0, len(rows))
	for _, row := range rows {
		result = append(result, regionRow{
			group:   strVal(row, "lang_group"),
			n:       toInt(row["n"]),
			domains: toInt(row["domains"]),
		})
	}
	return result, nil
}