LEM/pkg/lem/normalize.go
Claude 91ee389377
feat: convert all pipeline.py commands to Go
Complete conversion of pipeline.py into Go `lem` CLI:
- import-all: bulk import all LEM data into DuckDB from M3
- consolidate: pull worker JSONLs, merge, deduplicate
- normalize: seeds → deduplicated expansion_prompts table
- approve: filter scored expansions → training JSONL
- tier-score: heuristic/judge tiered expansion scoring
- expand-status: expansion pipeline progress from DuckDB
- inventory: DuckDB table counts and summary
- coverage: seed coverage gap analysis
- seed-influx: bootstrap InfluxDB from DuckDB golden_gen
- query: ad-hoc SQL against DuckDB

22 commands total, 49 Go files. Replaces entire pipeline.py.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 17:12:03 +00:00

148 lines
4 KiB
Go

package lem
import (
"flag"
"fmt"
"log"
"os"
)
// RunNormalize is the CLI entry point for the normalize command.
//
// It rebuilds the expansion_prompts table from the seeds table:
//
//   - seeds shorter than --min-length are skipped,
//   - duplicate prompts within seeds collapse to the row with the lowest seed_id,
//   - prompts already present in the prompts or golden_set tables are excluded,
//   - each remaining row gets a priority equal to the rank of its domain by
//     row count, so the least-represented domains receive priority 1.
//
// Flags (parsed from args):
//
//	--db          DuckDB database path; falls back to the LEM_DB environment variable.
//	--min-length  minimum prompt length (default 50).
//
// The function prints a summary to stdout. It terminates the process on
// unrecoverable errors (missing database path, open failure, missing seeds
// table, failure to drop or create expansion_prompts); failures in priority
// assignment or in the summary queries are logged as warnings only.
func RunNormalize(args []string) {
	fs := flag.NewFlagSet("normalize", flag.ExitOnError)
	dbPath := fs.String("db", "", "DuckDB database path (defaults to LEM_DB env)")
	minLen := fs.Int("min-length", 50, "Minimum prompt length in characters")
	if err := fs.Parse(args); err != nil {
		log.Fatalf("parse flags: %v", err)
	}

	if *dbPath == "" {
		*dbPath = os.Getenv("LEM_DB")
	}
	if *dbPath == "" {
		fmt.Fprintln(os.Stderr, "error: --db or LEM_DB required")
		os.Exit(1)
	}

	db, err := OpenDBReadWrite(*dbPath)
	if err != nil {
		log.Fatalf("open db: %v", err)
	}
	defer db.Close()

	// Check source tables. The underlying error is included because the
	// query can fail for reasons other than a missing table.
	var seedCount int
	if err := db.conn.QueryRow("SELECT count(*) FROM seeds").Scan(&seedCount); err != nil {
		log.Fatalf("No seeds table. Run: lem import-all first (%v)", err)
	}
	fmt.Printf("Seeds table: %d rows\n", seedCount)

	// Drop and recreate expansion_prompts.
	if _, err := db.conn.Exec("DROP TABLE IF EXISTS expansion_prompts"); err != nil {
		log.Fatalf("drop expansion_prompts: %v", err)
	}

	// Deduplicate: keep one seed per distinct prompt, then remove seeds whose
	// prompt already appears in prompts or golden_set. minLen is an int, so
	// formatting it into the statement with %d cannot inject SQL.
	_, err = db.conn.Exec(fmt.Sprintf(`
		CREATE TABLE expansion_prompts AS
		WITH unique_seeds AS (
			SELECT
				ROW_NUMBER() OVER (ORDER BY region, domain, seed_id) AS idx,
				seed_id,
				region,
				domain,
				prompt
			FROM (
				SELECT DISTINCT ON (prompt)
					seed_id, region, domain, prompt
				FROM seeds
				WHERE length(prompt) >= %d
				ORDER BY prompt, seed_id
			)
		),
		existing_prompts AS (
			SELECT prompt FROM prompts
			UNION ALL
			SELECT prompt FROM golden_set
		)
		SELECT
			us.idx,
			us.seed_id,
			us.region,
			us.domain,
			'en' AS language,
			us.prompt,
			'' AS prompt_en,
			0 AS priority,
			'pending' AS status
		FROM unique_seeds us
		WHERE NOT EXISTS (
			SELECT 1 FROM existing_prompts ep
			WHERE ep.prompt = us.prompt
		)
	`, *minLen))
	if err != nil {
		log.Fatalf("create expansion_prompts: %v", err)
	}

	// Collect summary statistics. These are informational only, so a failure
	// is reported as a warning and the corresponding value stays at zero.
	var total, domains, regions int
	stats := []struct {
		query string
		dest  *int
	}{
		{"SELECT count(*) FROM expansion_prompts", &total},
		{"SELECT count(DISTINCT domain) FROM expansion_prompts", &domains},
		{"SELECT count(DISTINCT region) FROM expansion_prompts", &regions},
	}
	for _, s := range stats {
		if err := db.conn.QueryRow(s.query).Scan(s.dest); err != nil {
			log.Printf("warning: summary query %q failed: %v", s.query, err)
		}
	}

	// Assign priority based on domain coverage. The rank must be computed
	// over ALL domains in an inner subquery and only then filtered down to the
	// row's own domain: window functions are evaluated after WHERE, so ranking
	// and filtering in the same SELECT would rank a single row and always
	// yield 1.
	_, err = db.conn.Exec(`
		UPDATE expansion_prompts SET priority = (
			SELECT ranked.domain_rank
			FROM (
				SELECT domain, RANK() OVER (ORDER BY cnt ASC) AS domain_rank
				FROM (
					SELECT domain, count(*) AS cnt
					FROM expansion_prompts GROUP BY domain
				) domain_counts
			) ranked
			WHERE ranked.domain = expansion_prompts.domain
		)
	`)
	if err != nil {
		log.Printf("warning: priority assignment failed: %v", err)
	}

	fmt.Printf("\nExpansion Prompts: %d\n", total)
	fmt.Printf(" Domains: %d\n", domains)
	fmt.Printf(" Regions: %d\n", regions)

	// Show region distribution. CASE branches are evaluated in order, so a
	// region matching several patterns is counted under the first match.
	fmt.Println("\n By region group:")
	rows, err := db.conn.Query(`
		SELECT
			CASE
				WHEN region LIKE '%cn%' THEN 'cn'
				WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en'
				WHEN region LIKE '%ru%' THEN 'ru'
				WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de'
				WHEN region LIKE '%es%' THEN 'es'
				WHEN region LIKE '%fr%' THEN 'fr'
				WHEN region LIKE '%latam%' THEN 'latam'
				WHEN region LIKE '%africa%' THEN 'africa'
				WHEN region LIKE '%eu%' THEN 'eu'
				WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me'
				ELSE 'other'
			END AS lang_group,
			count(*) AS n
		FROM expansion_prompts GROUP BY lang_group ORDER BY n DESC
	`)
	if err != nil {
		log.Printf("warning: region distribution query failed: %v", err)
	} else {
		for rows.Next() {
			var group string
			var n int
			if err := rows.Scan(&group, &n); err != nil {
				log.Printf("warning: scan region distribution row: %v", err)
				continue
			}
			fmt.Printf(" %-15s %6d\n", group, n)
		}
		// Next returns false on both exhaustion and failure; Err tells them apart.
		if err := rows.Err(); err != nil {
			log.Printf("warning: reading region distribution: %v", err)
		}
		rows.Close()
	}

	fmt.Printf("\nNormalization complete: %d expansion prompts from %d seeds\n", total, seedCount)
}