cli/pkg/ml/inventory.go
Claude 1f3a1bcc47 feat: port 11 LEM data management commands into core ml
Ports all remaining LEM pipeline commands from pkg/lem into core ml,
eliminating the standalone LEM CLI dependency. Each command is split
into reusable business logic (pkg/ml/) and a thin cobra wrapper
(internal/cmd/ml/).

New commands: query, inventory, metrics, ingest, normalize, seed-influx,
consolidate, import-all, approve, publish, coverage.

Adds Path(), Exec(), QueryRowScan() convenience methods to DB type.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 05:53:52 +00:00

147 lines
4 KiB
Go

package ml
import (
"fmt"
"io"
"strings"
)
// TargetTotal is the target number of rows for the golden_set table. The
// inventory report shows the current golden_set row count as a percentage of
// this value.
const TargetTotal = 15000
// tableOrder lists the tables shown by the inventory report, in the order
// they are printed. Tables absent from this list are not reported.
var tableOrder = []string{
	"golden_set",
	"expansion_prompts",
	"seeds",
	"prompts",
	"training_examples",
	"gemini_responses",
	"benchmark_questions",
	"benchmark_results",
	"validations",
	"checkpoint_scores",
	"probe_results",
	"scoring_results",
}
// tableDetail holds extra context for a single table beyond its row count.
type tableDetail struct {
// notes are short annotations for the table. PrintInventory joins them with
// ", " and prints them in parentheses after the row count.
notes []string
}
// PrintInventory writes a row-count report for the DuckDB tables to w.
//
// Tables are printed in tableOrder; a table with no entry in the counts
// returned by db.TableCounts is skipped. A row is followed by a parenthesised,
// comma-separated annotation when gatherDetails supplied notes for it, and
// the report ends with a TOTAL line summing the printed counts.
//
// The only error returned is a failure of db.TableCounts; errors writing to w
// are not checked.
func PrintInventory(db *DB, w io.Writer) error {
	counts, err := db.TableCounts()
	if err != nil {
		return fmt.Errorf("table counts: %w", err)
	}

	annotations := gatherDetails(db, counts)
	rule := strings.Repeat("-", 52)

	fmt.Fprintln(w, "DuckDB Inventory")
	fmt.Fprintln(w, rule)

	var total int
	for _, name := range tableOrder {
		if rowCount, present := counts[name]; present {
			total += rowCount
			fmt.Fprintf(w, " %-24s %8d rows", name, rowCount)
			if extra, found := annotations[name]; found && len(extra.notes) > 0 {
				fmt.Fprintf(w, " (%s)", strings.Join(extra.notes, ", "))
			}
			// Terminate the row whether or not an annotation was printed.
			fmt.Fprintln(w)
		}
	}

	fmt.Fprintln(w, rule)
	fmt.Fprintf(w, " %-24s %8d rows\n", "TOTAL", total)
	return nil
}
// gatherDetails builds the per-table annotations shown next to row counts.
//
// Only tables present in counts are considered:
//   - golden_set: percentage of TargetTotal reached
//   - training_examples: number of distinct sources
//   - prompts: number of distinct domains and voices
//   - gemini_responses: row count per non-empty source_model, largest first
//   - benchmark_results: number of distinct sources, labelled "categories"
//
// A query that fails or returns no rows simply contributes no note, so the
// inventory can always be printed.
func gatherDetails(db *DB, counts map[string]int) map[string]*tableDetail {
	out := make(map[string]*tableDetail)

	// present reports whether the table has an entry in counts.
	present := func(table string) bool {
		_, ok := counts[table]
		return ok
	}
	// firstN runs a query and reads the "n" column of its first row.
	firstN := func(query string) (int, bool) {
		rows, err := db.QueryRows(query)
		if err != nil || len(rows) == 0 {
			return 0, false
		}
		return toInt(rows[0]["n"]), true
	}

	if have, ok := counts["golden_set"]; ok {
		progress := float64(have) / float64(TargetTotal) * 100
		out["golden_set"] = &tableDetail{
			notes: []string{fmt.Sprintf("%.1f%% of %d target", progress, TargetTotal)},
		}
	}

	if present("training_examples") {
		if n, ok := firstN("SELECT COUNT(DISTINCT source) AS n FROM training_examples"); ok {
			out["training_examples"] = &tableDetail{
				notes: []string{fmt.Sprintf("%d sources", n)},
			}
		}
	}

	if present("prompts") {
		var notes []string
		if n, ok := firstN("SELECT COUNT(DISTINCT domain) AS n FROM prompts"); ok {
			notes = append(notes, fmt.Sprintf("%d domains", n))
		}
		if n, ok := firstN("SELECT COUNT(DISTINCT voice) AS n FROM prompts"); ok {
			notes = append(notes, fmt.Sprintf("%d voices", n))
		}
		if len(notes) > 0 {
			out["prompts"] = &tableDetail{notes: notes}
		}
	}

	if present("gemini_responses") {
		rows, err := db.QueryRows(
			"SELECT source_model, COUNT(*) AS n FROM gemini_responses GROUP BY source_model ORDER BY n DESC",
		)
		if err == nil {
			var perModel []string
			for _, row := range rows {
				name := strVal(row, "source_model")
				n := toInt(row["n"])
				if name == "" {
					// Rows without a model name are left out of the breakdown.
					continue
				}
				perModel = append(perModel, fmt.Sprintf("%s:%d", name, n))
			}
			if len(perModel) > 0 {
				out["gemini_responses"] = &tableDetail{notes: perModel}
			}
		}
	}

	if present("benchmark_results") {
		if n, ok := firstN("SELECT COUNT(DISTINCT source) AS n FROM benchmark_results"); ok {
			out["benchmark_results"] = &tableDetail{
				notes: []string{fmt.Sprintf("%d categories", n)},
			}
		}
	}

	return out
}
// toInt converts a numeric value read from a query result row to int.
//
// DuckDB returns integers as int64 (not float64 like InfluxDB), but every
// built-in Go integer and float type is accepted so that a value arriving in
// a narrower, unsigned, or floating-point type is not silently reported as 0.
// Floats are truncated toward zero. Values outside the range of int wrap as
// in an ordinary Go conversion. Any non-numeric value, including nil,
// yields 0.
func toInt(v interface{}) int {
	switch n := v.(type) {
	case int:
		return n
	case int8:
		return int(n)
	case int16:
		return int(n)
	case int32:
		return int(n)
	case int64:
		return int(n)
	case uint:
		return int(n)
	case uint8:
		return int(n)
	case uint16:
		return int(n)
	case uint32:
		return int(n)
	case uint64:
		return int(n)
	case float32:
		return int(n)
	case float64:
		return int(n)
	default:
		return 0
	}
}