LEM/pkg/lem/seed_influx.go
Claude 91ee389377
feat: convert all pipeline.py commands to Go
Complete conversion of pipeline.py into Go `lem` CLI:
- import-all: bulk import all LEM data into DuckDB from M3
- consolidate: pull worker JSONLs, merge, deduplicate
- normalize: seeds → deduplicated expansion_prompts table
- approve: filter scored expansions → training JSONL
- tier-score: heuristic/judge tiered expansion scoring
- expand-status: expansion pipeline progress from DuckDB
- inventory: DuckDB table counts and summary
- coverage: seed coverage gap analysis
- seed-influx: bootstrap InfluxDB from DuckDB golden_gen
- query: ad-hoc SQL against DuckDB

22 commands total, 49 Go files. Replaces entire pipeline.py.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 17:12:03 +00:00


package lem

import (
	"flag"
	"fmt"
	"log"
	"os"
	"strings"
)

// RunSeedInflux is the CLI entry point for the seed-influx command.
// Seeds InfluxDB golden_gen measurement from DuckDB golden_set data.
// One-time migration tool for bootstrapping InfluxDB from existing data.
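//
// A typical invocation might look like the following; the database path and
// InfluxDB coordinates are illustrative placeholders rather than values taken
// from this repository:
//
//	lem seed-influx --db ./lem.duckdb --influx http://localhost:8086 --influx-db lem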
func RunSeedInflux(args []string) {
	fs := flag.NewFlagSet("seed-influx", flag.ExitOnError)
	dbPath := fs.String("db", "", "DuckDB database path (defaults to LEM_DB env)")
	influxURL := fs.String("influx", "", "InfluxDB URL")
	influxDB := fs.String("influx-db", "", "InfluxDB database name")
	force := fs.Bool("force", false, "Re-seed even if InfluxDB already has data")
	batchSize := fs.Int("batch-size", 500, "Lines per InfluxDB write batch")
	if err := fs.Parse(args); err != nil {
		log.Fatalf("parse flags: %v", err)
	}
	if *dbPath == "" {
		*dbPath = os.Getenv("LEM_DB")
	}
	if *dbPath == "" {
		fmt.Fprintln(os.Stderr, "error: --db or LEM_DB required")
		os.Exit(1)
	}

	db, err := OpenDB(*dbPath)
	if err != nil {
		log.Fatalf("open db: %v", err)
	}
	defer db.Close()

	var total int
	if err := db.conn.QueryRow("SELECT count(*) FROM golden_set").Scan(&total); err != nil {
log.Fatalf("No golden_set table. Run ingest first.")
}

	influx := NewInfluxClient(*influxURL, *influxDB)

	// Check existing count in InfluxDB.
	existing := 0
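	// QuerySQL is assumed to return rows as decoded JSON-style maps, which is
	// why the aggregate count below arrives as a float64 rather than an int.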
	rows, err := influx.QuerySQL("SELECT count(DISTINCT i) AS n FROM gold_gen")
	if err == nil && len(rows) > 0 {
		if n, ok := rows[0]["n"].(float64); ok {
			existing = int(n)
		}
	}

	fmt.Printf("DuckDB has %d records, InfluxDB golden_gen has %d\n", total, existing)
	if existing >= total && !*force {
		fmt.Println("InfluxDB already has all records. Use --force to re-seed.")
		return
	}

	// Read all rows.
	dbRows, err := db.conn.Query(`
		SELECT idx, seed_id, domain, voice, gen_time, char_count
		FROM golden_set ORDER BY idx
	`)
	if err != nil {
		log.Fatalf("query golden_set: %v", err)
	}
	defer dbRows.Close()
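
	// Stream rows out of DuckDB and write them to InfluxDB in batches of
	// --batch-size line-protocol entries, printing progress whenever the
	// running total lands on a multiple of 2000.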
	var lines []string
	written := 0
	for dbRows.Next() {
		var idx, charCount int
		var seedID, domain, voice string
		var genTime float64
		if err := dbRows.Scan(&idx, &seedID, &domain, &voice, &genTime, &charCount); err != nil {
			log.Fatalf("scan: %v", err)
		}
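
		// Build one line-protocol entry per record: measurement, tag set, then
		// field set. Double quotes in seed_id are escaped for the string field,
		// and the trailing "i" on chars marks an integer field. escapeLp
		// (defined elsewhere in this package) is assumed to handle tag-value
		// escaping.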
		sid := strings.ReplaceAll(seedID, `"`, `\"`)
		lp := fmt.Sprintf(`gold_gen,i=%d,w=migration,d=%s,v=%s seed_id="%s",gen_time=%.1f,chars=%di`,
			idx, escapeLp(domain), escapeLp(voice), sid, genTime, charCount)
		lines = append(lines, lp)

		if len(lines) >= *batchSize {
			if err := influx.WriteLp(lines); err != nil {
				log.Fatalf("write batch at %d: %v", written, err)
			}
			written += len(lines)
			lines = lines[:0]
			if written%2000 == 0 {
				fmt.Printf(" Seeded %d/%d records\n", written, total)
			}
		}
	}

	if len(lines) > 0 {
		if err := influx.WriteLp(lines); err != nil {
			log.Fatalf("flush: %v", err)
		}
		written += len(lines)
	}

	fmt.Printf("Seeded %d golden_gen records into InfluxDB\n", written)
}