Ports all remaining LEM pipeline commands from pkg/lem into core ml, eliminating the standalone LEM CLI dependency. Each command is split into reusable business logic (pkg/ml/) and a thin cobra wrapper (internal/cmd/ml/). New commands: query, inventory, metrics, ingest, normalize, seed-influx, consolidate, import-all, approve, publish, coverage. Adds Path(), Exec(), QueryRowScan() convenience methods to DB type. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
147 lines
4 KiB
Go
147 lines
4 KiB
Go
package ml
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
)
|
|
|
|
// TargetTotal is the golden set target size used for progress reporting.
|
|
const TargetTotal = 15000 // untyped constant; gatherDetails converts it to float64 to compute the golden_set percentage
|
|
|
|
// tableOrder lists the tables PrintInventory reports on, in the order their
// rows are printed. Tables missing from the row-count map are skipped; tables
// not listed here are never printed.
var tableOrder = []string{
	"golden_set",
	"expansion_prompts",
	"seeds",
	"prompts",
	"training_examples",
	"gemini_responses",
	"benchmark_questions",
	"benchmark_results",
	"validations",
	"checkpoint_scores",
	"probe_results",
	"scoring_results",
}
|
|
|
|
// tableDetail carries the free-form annotations that accompany a table's row
// count in the inventory report.
type tableDetail struct {
	// notes are rendered by PrintInventory in parentheses after the row
	// count, joined with ", ".
	notes []string
}
|
|
|
|
// PrintInventory queries all known DuckDB tables and prints a formatted
|
|
// inventory with row counts, detail breakdowns, and a grand total.
|
|
func PrintInventory(db *DB, w io.Writer) error {
|
|
counts, err := db.TableCounts()
|
|
if err != nil {
|
|
return fmt.Errorf("table counts: %w", err)
|
|
}
|
|
|
|
details := gatherDetails(db, counts)
|
|
|
|
fmt.Fprintln(w, "DuckDB Inventory")
|
|
fmt.Fprintln(w, strings.Repeat("-", 52))
|
|
|
|
grand := 0
|
|
for _, table := range tableOrder {
|
|
count, ok := counts[table]
|
|
if !ok {
|
|
continue
|
|
}
|
|
grand += count
|
|
fmt.Fprintf(w, " %-24s %8d rows", table, count)
|
|
|
|
if d, has := details[table]; has && len(d.notes) > 0 {
|
|
fmt.Fprintf(w, " (%s)", strings.Join(d.notes, ", "))
|
|
}
|
|
fmt.Fprintln(w)
|
|
}
|
|
|
|
fmt.Fprintln(w, strings.Repeat("-", 52))
|
|
fmt.Fprintf(w, " %-24s %8d rows\n", "TOTAL", grand)
|
|
|
|
return nil
|
|
}
|
|
|
|
// gatherDetails runs per-table detail queries and returns annotations keyed
|
|
// by table name. Errors on individual queries are silently ignored so the
|
|
// inventory always prints.
|
|
func gatherDetails(db *DB, counts map[string]int) map[string]*tableDetail {
|
|
details := make(map[string]*tableDetail)
|
|
|
|
// golden_set: progress toward target
|
|
if count, ok := counts["golden_set"]; ok {
|
|
pct := float64(count) / float64(TargetTotal) * 100
|
|
details["golden_set"] = &tableDetail{
|
|
notes: []string{fmt.Sprintf("%.1f%% of %d target", pct, TargetTotal)},
|
|
}
|
|
}
|
|
|
|
// training_examples: distinct sources
|
|
if _, ok := counts["training_examples"]; ok {
|
|
rows, err := db.QueryRows("SELECT COUNT(DISTINCT source) AS n FROM training_examples")
|
|
if err == nil && len(rows) > 0 {
|
|
n := toInt(rows[0]["n"])
|
|
details["training_examples"] = &tableDetail{
|
|
notes: []string{fmt.Sprintf("%d sources", n)},
|
|
}
|
|
}
|
|
}
|
|
|
|
// prompts: distinct domains and voices
|
|
if _, ok := counts["prompts"]; ok {
|
|
d := &tableDetail{}
|
|
rows, err := db.QueryRows("SELECT COUNT(DISTINCT domain) AS n FROM prompts")
|
|
if err == nil && len(rows) > 0 {
|
|
d.notes = append(d.notes, fmt.Sprintf("%d domains", toInt(rows[0]["n"])))
|
|
}
|
|
rows, err = db.QueryRows("SELECT COUNT(DISTINCT voice) AS n FROM prompts")
|
|
if err == nil && len(rows) > 0 {
|
|
d.notes = append(d.notes, fmt.Sprintf("%d voices", toInt(rows[0]["n"])))
|
|
}
|
|
if len(d.notes) > 0 {
|
|
details["prompts"] = d
|
|
}
|
|
}
|
|
|
|
// gemini_responses: group by source_model
|
|
if _, ok := counts["gemini_responses"]; ok {
|
|
rows, err := db.QueryRows(
|
|
"SELECT source_model, COUNT(*) AS n FROM gemini_responses GROUP BY source_model ORDER BY n DESC",
|
|
)
|
|
if err == nil && len(rows) > 0 {
|
|
var parts []string
|
|
for _, row := range rows {
|
|
model := strVal(row, "source_model")
|
|
n := toInt(row["n"])
|
|
if model != "" {
|
|
parts = append(parts, fmt.Sprintf("%s:%d", model, n))
|
|
}
|
|
}
|
|
if len(parts) > 0 {
|
|
details["gemini_responses"] = &tableDetail{notes: parts}
|
|
}
|
|
}
|
|
}
|
|
|
|
// benchmark_results: distinct source categories
|
|
if _, ok := counts["benchmark_results"]; ok {
|
|
rows, err := db.QueryRows("SELECT COUNT(DISTINCT source) AS n FROM benchmark_results")
|
|
if err == nil && len(rows) > 0 {
|
|
n := toInt(rows[0]["n"])
|
|
details["benchmark_results"] = &tableDetail{
|
|
notes: []string{fmt.Sprintf("%d categories", n)},
|
|
}
|
|
}
|
|
}
|
|
|
|
return details
|
|
}
|
|
|
|
// toInt converts a value scanned from a query result into an int.
//
// DuckDB returns integers as int64 (not float64 like InfluxDB), so both of
// those are handled. The remaining built-in integer widths, the unsigned
// types and float32 are accepted as well, so that a count delivered in one of
// those types is not silently reported as 0.
//
// Floating-point values are truncated toward zero. Unsigned values larger
// than the maximum int wrap around, as with any Go integer conversion. Any
// other type, including nil, yields 0.
func toInt(v interface{}) int {
	switch n := v.(type) {
	case int:
		return n
	case int64:
		return int(n)
	case int32:
		return int(n)
	case int16:
		return int(n)
	case int8:
		return int(n)
	case uint:
		return int(n)
	case uint64:
		return int(n)
	case uint32:
		return int(n)
	case uint16:
		return int(n)
	case uint8:
		return int(n)
	case float64:
		return int(n)
	case float32:
		return int(n)
	default:
		return 0
	}
}
|