Complete conversion of pipeline.py into Go `lem` CLI: - import-all: bulk import all LEM data into DuckDB from M3 - consolidate: pull worker JSONLs, merge, deduplicate - normalize: seeds → deduplicated expansion_prompts table - approve: filter scored expansions → training JSONL - tier-score: heuristic/judge tiered expansion scoring - expand-status: expansion pipeline progress from DuckDB - inventory: DuckDB table counts and summary - coverage: seed coverage gap analysis - seed-influx: bootstrap InfluxDB from DuckDB golden_gen - query: ad-hoc SQL against DuckDB 22 commands total, 49 Go files. Replaces entire pipeline.py. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
135 lines
3.7 KiB
Go
135 lines
3.7 KiB
Go
package lem
|
|
|
|
import (
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"strings"
|
|
)
|
|
|
|
// RunCoverage is the CLI entry point for the coverage command.
|
|
// Analyzes seed coverage and shows underrepresented areas.
|
|
func RunCoverage(args []string) {
|
|
fs := flag.NewFlagSet("coverage", flag.ExitOnError)
|
|
dbPath := fs.String("db", "", "DuckDB database path (defaults to LEM_DB env)")
|
|
|
|
if err := fs.Parse(args); err != nil {
|
|
log.Fatalf("parse flags: %v", err)
|
|
}
|
|
|
|
if *dbPath == "" {
|
|
*dbPath = os.Getenv("LEM_DB")
|
|
}
|
|
if *dbPath == "" {
|
|
fmt.Fprintln(os.Stderr, "error: --db or LEM_DB required")
|
|
os.Exit(1)
|
|
}
|
|
|
|
db, err := OpenDB(*dbPath)
|
|
if err != nil {
|
|
log.Fatalf("open db: %v", err)
|
|
}
|
|
defer db.Close()
|
|
|
|
var total int
|
|
if err := db.conn.QueryRow("SELECT count(*) FROM seeds").Scan(&total); err != nil {
|
|
log.Fatalf("No seeds table. Run: lem import-all first")
|
|
}
|
|
|
|
fmt.Println("LEM Seed Coverage Analysis")
|
|
fmt.Println("==================================================")
|
|
fmt.Printf("\nTotal seeds: %d\n", total)
|
|
|
|
// Region distribution.
|
|
fmt.Println("\nRegion distribution (underrepresented first):")
|
|
rows, err := db.conn.Query(`
|
|
SELECT
|
|
CASE
|
|
WHEN region LIKE '%cn%' THEN 'cn (Chinese)'
|
|
WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en (English)'
|
|
WHEN region LIKE '%ru%' THEN 'ru (Russian)'
|
|
WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de (German)'
|
|
WHEN region LIKE '%es%' THEN 'es (Spanish)'
|
|
WHEN region LIKE '%fr%' THEN 'fr (French)'
|
|
WHEN region LIKE '%latam%' THEN 'latam (LatAm)'
|
|
WHEN region LIKE '%africa%' THEN 'africa'
|
|
WHEN region LIKE '%eu%' THEN 'eu (European)'
|
|
WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me (MidEast)'
|
|
WHEN region LIKE '%multi%' THEN 'multilingual'
|
|
WHEN region LIKE '%weak%' THEN 'weak-langs'
|
|
ELSE 'other'
|
|
END AS lang_group,
|
|
count(*) AS n,
|
|
count(DISTINCT domain) AS domains
|
|
FROM seeds GROUP BY lang_group ORDER BY n ASC
|
|
`)
|
|
if err != nil {
|
|
log.Fatalf("query regions: %v", err)
|
|
}
|
|
|
|
type regionRow struct {
|
|
group string
|
|
n int
|
|
domains int
|
|
}
|
|
var regionRows []regionRow
|
|
for rows.Next() {
|
|
var r regionRow
|
|
rows.Scan(&r.group, &r.n, &r.domains)
|
|
regionRows = append(regionRows, r)
|
|
}
|
|
rows.Close()
|
|
|
|
avg := float64(total) / float64(len(regionRows))
|
|
for _, r := range regionRows {
|
|
barLen := int(float64(r.n) / avg * 10)
|
|
if barLen > 40 {
|
|
barLen = 40
|
|
}
|
|
bar := strings.Repeat("#", barLen)
|
|
gap := ""
|
|
if float64(r.n) < avg*0.5 {
|
|
gap = " <- UNDERREPRESENTED"
|
|
}
|
|
fmt.Printf(" %-22s %6d (%4d domains) %s%s\n", r.group, r.n, r.domains, bar, gap)
|
|
}
|
|
|
|
// Top 10 domains.
|
|
fmt.Println("\nTop 10 domains (most seeds):")
|
|
topRows, err := db.conn.Query(`
|
|
SELECT domain, count(*) AS n FROM seeds
|
|
WHERE domain != '' GROUP BY domain ORDER BY n DESC LIMIT 10
|
|
`)
|
|
if err == nil {
|
|
for topRows.Next() {
|
|
var domain string
|
|
var n int
|
|
topRows.Scan(&domain, &n)
|
|
fmt.Printf(" %-40s %5d\n", domain, n)
|
|
}
|
|
topRows.Close()
|
|
}
|
|
|
|
// Bottom 10 domains.
|
|
fmt.Println("\nBottom 10 domains (fewest seeds, min 5):")
|
|
bottomRows, err := db.conn.Query(`
|
|
SELECT domain, count(*) AS n FROM seeds
|
|
WHERE domain != '' GROUP BY domain HAVING count(*) >= 5 ORDER BY n ASC LIMIT 10
|
|
`)
|
|
if err == nil {
|
|
for bottomRows.Next() {
|
|
var domain string
|
|
var n int
|
|
bottomRows.Scan(&domain, &n)
|
|
fmt.Printf(" %-40s %5d\n", domain, n)
|
|
}
|
|
bottomRows.Close()
|
|
}
|
|
|
|
fmt.Println("\nSuggested expansion areas:")
|
|
fmt.Println(" - Japanese, Korean, Thai, Vietnamese (no seeds found)")
|
|
fmt.Println(" - Hindi/Urdu, Bengali, Tamil (South Asian)")
|
|
fmt.Println(" - Swahili, Yoruba, Amharic (Sub-Saharan Africa)")
|
|
fmt.Println(" - Indigenous languages (Quechua, Nahuatl, Aymara)")
|
|
}
|