Complete conversion of pipeline.py into Go `lem` CLI: - import-all: bulk import all LEM data into DuckDB from M3 - consolidate: pull worker JSONLs, merge, deduplicate - normalize: seeds → deduplicated expansion_prompts table - approve: filter scored expansions → training JSONL - tier-score: heuristic/judge tiered expansion scoring - expand-status: expansion pipeline progress from DuckDB - inventory: DuckDB table counts and summary - coverage: seed coverage gap analysis - seed-influx: bootstrap InfluxDB from DuckDB golden_gen - query: ad-hoc SQL against DuckDB 22 commands total, 49 Go files. Replaces entire pipeline.py. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
148 lines
4 KiB
Go
148 lines
4 KiB
Go
package lem
|
|
|
|
import (
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
)
|
|
|
|
// RunNormalize is the CLI entry point for the normalize command.
|
|
// Normalizes seeds into the expansion_prompts table, deduplicating against
|
|
// the golden set and existing prompts. Assigns priority based on domain
|
|
// coverage (underrepresented domains first).
|
|
func RunNormalize(args []string) {
|
|
fs := flag.NewFlagSet("normalize", flag.ExitOnError)
|
|
dbPath := fs.String("db", "", "DuckDB database path (defaults to LEM_DB env)")
|
|
minLen := fs.Int("min-length", 50, "Minimum prompt length in characters")
|
|
|
|
if err := fs.Parse(args); err != nil {
|
|
log.Fatalf("parse flags: %v", err)
|
|
}
|
|
|
|
if *dbPath == "" {
|
|
*dbPath = os.Getenv("LEM_DB")
|
|
}
|
|
if *dbPath == "" {
|
|
fmt.Fprintln(os.Stderr, "error: --db or LEM_DB required")
|
|
os.Exit(1)
|
|
}
|
|
|
|
db, err := OpenDBReadWrite(*dbPath)
|
|
if err != nil {
|
|
log.Fatalf("open db: %v", err)
|
|
}
|
|
defer db.Close()
|
|
|
|
// Check source tables.
|
|
var seedCount int
|
|
if err := db.conn.QueryRow("SELECT count(*) FROM seeds").Scan(&seedCount); err != nil {
|
|
log.Fatalf("No seeds table. Run: lem import-all first")
|
|
}
|
|
fmt.Printf("Seeds table: %d rows\n", seedCount)
|
|
|
|
// Drop and recreate expansion_prompts.
|
|
_, err = db.conn.Exec("DROP TABLE IF EXISTS expansion_prompts")
|
|
if err != nil {
|
|
log.Fatalf("drop expansion_prompts: %v", err)
|
|
}
|
|
|
|
// Deduplicate: remove seeds whose prompt already appears in prompts or golden_set.
|
|
_, err = db.conn.Exec(fmt.Sprintf(`
|
|
CREATE TABLE expansion_prompts AS
|
|
WITH unique_seeds AS (
|
|
SELECT
|
|
ROW_NUMBER() OVER (ORDER BY region, domain, seed_id) AS idx,
|
|
seed_id,
|
|
region,
|
|
domain,
|
|
prompt
|
|
FROM (
|
|
SELECT DISTINCT ON (prompt)
|
|
seed_id, region, domain, prompt
|
|
FROM seeds
|
|
WHERE length(prompt) >= %d
|
|
ORDER BY prompt, seed_id
|
|
)
|
|
),
|
|
existing_prompts AS (
|
|
SELECT prompt FROM prompts
|
|
UNION ALL
|
|
SELECT prompt FROM golden_set
|
|
)
|
|
SELECT
|
|
us.idx,
|
|
us.seed_id,
|
|
us.region,
|
|
us.domain,
|
|
'en' AS language,
|
|
us.prompt,
|
|
'' AS prompt_en,
|
|
0 AS priority,
|
|
'pending' AS status
|
|
FROM unique_seeds us
|
|
WHERE NOT EXISTS (
|
|
SELECT 1 FROM existing_prompts ep
|
|
WHERE ep.prompt = us.prompt
|
|
)
|
|
`, *minLen))
|
|
if err != nil {
|
|
log.Fatalf("create expansion_prompts: %v", err)
|
|
}
|
|
|
|
var total, domains, regions int
|
|
db.conn.QueryRow("SELECT count(*) FROM expansion_prompts").Scan(&total)
|
|
db.conn.QueryRow("SELECT count(DISTINCT domain) FROM expansion_prompts").Scan(&domains)
|
|
db.conn.QueryRow("SELECT count(DISTINCT region) FROM expansion_prompts").Scan(®ions)
|
|
|
|
// Assign priority based on domain coverage.
|
|
_, err = db.conn.Exec(`
|
|
UPDATE expansion_prompts SET priority = (
|
|
SELECT RANK() OVER (ORDER BY cnt ASC)
|
|
FROM (
|
|
SELECT domain, count(*) AS cnt
|
|
FROM expansion_prompts GROUP BY domain
|
|
) domain_counts
|
|
WHERE domain_counts.domain = expansion_prompts.domain
|
|
)
|
|
`)
|
|
if err != nil {
|
|
log.Printf("warning: priority assignment failed: %v", err)
|
|
}
|
|
|
|
fmt.Printf("\nExpansion Prompts: %d\n", total)
|
|
fmt.Printf(" Domains: %d\n", domains)
|
|
fmt.Printf(" Regions: %d\n", regions)
|
|
|
|
// Show region distribution.
|
|
fmt.Println("\n By region group:")
|
|
rows, err := db.conn.Query(`
|
|
SELECT
|
|
CASE
|
|
WHEN region LIKE '%cn%' THEN 'cn'
|
|
WHEN region LIKE '%en-%' OR region LIKE '%en_para%' OR region LIKE '%para%' THEN 'en'
|
|
WHEN region LIKE '%ru%' THEN 'ru'
|
|
WHEN region LIKE '%de%' AND region NOT LIKE '%deten%' THEN 'de'
|
|
WHEN region LIKE '%es%' THEN 'es'
|
|
WHEN region LIKE '%fr%' THEN 'fr'
|
|
WHEN region LIKE '%latam%' THEN 'latam'
|
|
WHEN region LIKE '%africa%' THEN 'africa'
|
|
WHEN region LIKE '%eu%' THEN 'eu'
|
|
WHEN region LIKE '%me%' AND region NOT LIKE '%premium%' THEN 'me'
|
|
ELSE 'other'
|
|
END AS lang_group,
|
|
count(*) AS n
|
|
FROM expansion_prompts GROUP BY lang_group ORDER BY n DESC
|
|
`)
|
|
if err == nil {
|
|
for rows.Next() {
|
|
var group string
|
|
var n int
|
|
rows.Scan(&group, &n)
|
|
fmt.Printf(" %-15s %6d\n", group, n)
|
|
}
|
|
rows.Close()
|
|
}
|
|
|
|
fmt.Printf("\nNormalization complete: %d expansion prompts from %d seeds\n", total, seedCount)
|
|
}
|