// LEM/pkg/lem/coverage.go

package lem

import (
	"fmt"
	"os"
	"strings"
)

// CoverageOpts configures RunCoverage.
type CoverageOpts struct {
	// DB is the DuckDB database path. When it is empty, RunCoverage falls
	// back to the LEM_DB environment variable.
	DB string
}

// RunCoverage prints a seed coverage report to stdout for the database named
// by cfg.DB, falling back to the LEM_DB environment variable when cfg.DB is
// empty.
//
// The report contains, in order: the total seed count, the seed distribution
// per language group (smallest group first, with groups below half the
// average flagged as underrepresented), the ten domains with the most seeds,
// the ten domains with the fewest seeds among those having at least five, and
// a fixed list of suggested expansion areas.
//
// It returns an error when no database path is configured, the database
// cannot be opened, the seeds table cannot be counted, or the region
// distribution cannot be read. Failures in the top/bottom domain listings are
// best-effort: they are reported on stderr and do not abort the report.
func RunCoverage(cfg CoverageOpts) error {
	if cfg.DB == "" {
		cfg.DB = os.Getenv("LEM_DB")
	}
	if cfg.DB == "" {
		return fmt.Errorf("--db or LEM_DB required")
	}

	db, err := OpenDB(cfg.DB)
	if err != nil {
		return fmt.Errorf("open db: %w", err)
	}
	defer db.Close()

	var total int
	if err := db.conn.QueryRow("SELECT count(*) FROM seeds").Scan(&total); err != nil {
		// Keep the actionable hint, but preserve the underlying cause so
		// failures other than a missing table remain diagnosable.
		return fmt.Errorf("no seeds table — run: lem import-all first: %w", err)
	}

	fmt.Println("LEM Seed Coverage Analysis")
	fmt.Println("==================================================")
	fmt.Printf("\nTotal seeds: %d\n", total)

	// Region distribution. The CASE arms are evaluated top to bottom, so a
	// region matching several patterns lands in the first matching group.
	fmt.Println("\nRegion distribution (underrepresented first):")
	rows, err := db.conn.Query(`
		SELECT
			CASE
				WHEN region LIKE '%cn%' THEN 'cn (Chinese)'
				WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en (English)'
				WHEN region LIKE '%ru%' THEN 'ru (Russian)'
				WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de (German)'
				WHEN region LIKE '%es%' THEN 'es (Spanish)'
				WHEN region LIKE '%fr%' THEN 'fr (French)'
				WHEN region LIKE '%latam%' THEN 'latam (LatAm)'
				WHEN region LIKE '%africa%' THEN 'africa'
				WHEN region LIKE '%eu%' THEN 'eu (European)'
				WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me (MidEast)'
				WHEN region LIKE '%multi%' THEN 'multilingual'
				WHEN region LIKE '%weak%' THEN 'weak-langs'
				ELSE 'other'
			END AS lang_group,
			count(*) AS n,
			count(DISTINCT domain) AS domains
		FROM seeds GROUP BY lang_group ORDER BY n ASC
	`)
	if err != nil {
		return fmt.Errorf("query regions: %w", err)
	}

	type regionRow struct {
		group   string
		n       int
		domains int
	}
	var regionRows []regionRow
	for rows.Next() {
		var r regionRow
		if err := rows.Scan(&r.group, &r.n, &r.domains); err != nil {
			rows.Close()
			return fmt.Errorf("scan region row: %w", err)
		}
		regionRows = append(regionRows, r)
	}
	// Close before issuing further queries; Err remains valid after Close
	// and reports whether iteration stopped early.
	rows.Close()
	if err := rows.Err(); err != nil {
		return fmt.Errorf("read regions: %w", err)
	}

	// Average seeds per group; left at zero when there are no groups so the
	// division below is never performed with a zero denominator.
	var avg float64
	if len(regionRows) > 0 {
		avg = float64(total) / float64(len(regionRows))
	}
	for _, r := range regionRows {
		// Bar is scaled so that an average-sized group gets 10 marks,
		// capped at 40.
		barLen := 0
		if avg > 0 {
			barLen = min(int(float64(r.n)/avg*10), 40)
		}
		bar := strings.Repeat("#", barLen)
		gap := ""
		if float64(r.n) < avg*0.5 {
			gap = "  <- UNDERREPRESENTED"
		}
		fmt.Printf("  %-22s %6d  (%4d domains)  %s%s\n", r.group, r.n, r.domains, bar, gap)
	}

	// printDomains runs a (domain, count) query and prints one line per row.
	// It stops at the first scan or iteration error and returns it.
	printDomains := func(query string) error {
		domainRows, err := db.conn.Query(query)
		if err != nil {
			return err
		}
		defer domainRows.Close()
		for domainRows.Next() {
			var domain string
			var n int
			if err := domainRows.Scan(&domain, &n); err != nil {
				return err
			}
			fmt.Printf("  %-40s %5d\n", domain, n)
		}
		return domainRows.Err()
	}

	// Top 10 domains. Best-effort: a failure here is reported but does not
	// abort the rest of the report.
	fmt.Println("\nTop 10 domains (most seeds):")
	if err := printDomains(`
		SELECT domain, count(*) AS n FROM seeds
		WHERE domain != '' GROUP BY domain ORDER BY n DESC LIMIT 10
	`); err != nil {
		fmt.Fprintf(os.Stderr, "warning: top domains: %v\n", err)
	}

	// Bottom 10 domains, restricted to domains with at least five seeds.
	fmt.Println("\nBottom 10 domains (fewest seeds, min 5):")
	if err := printDomains(`
		SELECT domain, count(*) AS n FROM seeds
		WHERE domain != '' GROUP BY domain HAVING count(*) >= 5 ORDER BY n ASC LIMIT 10
	`); err != nil {
		fmt.Fprintf(os.Stderr, "warning: bottom domains: %v\n", err)
	}

	// NOTE: this list is static text; it is not derived from the queries
	// above.
	fmt.Println("\nSuggested expansion areas:")
	fmt.Println("  - Japanese, Korean, Thai, Vietnamese (no seeds found)")
	fmt.Println("  - Hindi/Urdu, Bengali, Tamil (South Asian)")
	fmt.Println("  - Swahili, Yoruba, Amharic (Sub-Saharan Africa)")
	fmt.Println("  - Indigenous languages (Quechua, Nahuatl, Aymara)")

	return nil
}

// PrintScoreAnalytics writes a plain-text report about the given scored
// entries to stdout: composite score distribution, one statistics line per
// axis, and — when at least three entries are supplied — up to ten coverage
// gaps as returned by FindGaps. Intended for use after scoring responses with
// grammar v3. With no entries it prints a short notice and returns.
func PrintScoreAnalytics(entries []ScoredEntry) {
	if len(entries) == 0 {
		fmt.Println("No scored entries to analyse.")
		return
	}

	summary := ScoreSummary(entries)
	composite := summary.CompositeStats

	// Composite score distribution.
	fmt.Println("\nGrammar Score Distribution")
	fmt.Println("==================================================")
	fmt.Printf("  Entries:   %d\n", summary.Total)
	fmt.Printf("  Mean:      %.1f\n", composite.Mean)
	fmt.Printf("  Median:    %.1f\n", composite.Median)
	fmt.Printf("  StdDev:    %.1f\n", composite.StdDev)
	fmt.Printf("  Range:     %.1f – %.1f\n", composite.Min, composite.Max)
	fmt.Printf("  P25:       %.1f\n", composite.P25)
	fmt.Printf("  P75:       %.1f\n", composite.P75)
	fmt.Printf("  P90:       %.1f\n", composite.P90)
	fmt.Printf("  Skewness:  %.2f\n", composite.Skewness)

	// Per-axis table: header row followed by one row per axis.
	fmt.Println("\nPer-Axis Statistics")
	fmt.Println("--------------------------------------------------")
	fmt.Printf("  %-20s %8s %8s %8s %8s\n", "Feature", "Mean", "StdDev", "Min", "Max")
	for _, axis := range summary.AxisStats {
		s := axis.Stats
		fmt.Printf("  %-20s %8.3f %8.3f %8.3f %8.3f\n",
			axis.Name, s.Mean, s.StdDev, s.Min, s.Max)
	}

	// Gap analysis is skipped for fewer than three entries.
	// k is the second argument to FindGaps; presumably a neighbour count —
	// confirm against FindGaps.
	const k = 3
	if len(entries) < k {
		return
	}
	gaps := FindGaps(entries, k)
	if len(gaps) == 0 {
		return
	}

	fmt.Println("\nTop 10 Coverage Gaps (worst first)")
	fmt.Println("--------------------------------------------------")
	shown := len(gaps)
	if shown > 10 {
		shown = 10
	}
	for i := 0; i < shown; i++ {
		gap := gaps[i]
		fmt.Printf("  #%d  avg_dist=%.4f  nearest=%v\n", i+1, gap.AvgDistance, gap.NearestIDs)
	}
}