2026-02-22 18:55:57 +00:00
|
|
|
package lemcmd
|
|
|
|
|
|
|
|
|
|
import (
|
2026-02-22 23:01:41 +00:00
|
|
|
"forge.lthn.ai/core/cli/pkg/cli"
|
2026-02-22 18:55:57 +00:00
|
|
|
"forge.lthn.ai/lthn/lem/pkg/lem"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
func addDataCommands(root *cli.Command) {
|
|
|
|
|
dataGroup := cli.NewGroup("data", "Data management commands", "Import, consolidate, normalise, and approve training data.")
|
|
|
|
|
|
2026-02-23 03:32:53 +00:00
|
|
|
// import-all — Import ALL LEM data into DuckDB from M3.
|
|
|
|
|
var importCfg lem.ImportOpts
|
|
|
|
|
importCmd := cli.NewCommand("import-all", "Import ALL LEM data into DuckDB from M3", "",
|
|
|
|
|
func(cmd *cli.Command, args []string) error {
|
|
|
|
|
return lem.RunImport(importCfg)
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
cli.StringFlag(importCmd, &importCfg.DB, "db", "", "", "DuckDB database path (defaults to LEM_DB env)")
|
|
|
|
|
cli.BoolFlag(importCmd, &importCfg.SkipM3, "skip-m3", "", false, "Skip pulling data from M3")
|
|
|
|
|
cli.StringFlag(importCmd, &importCfg.DataDir, "data-dir", "", "", "Local data directory (defaults to db directory)")
|
|
|
|
|
dataGroup.AddCommand(importCmd)
|
|
|
|
|
|
|
|
|
|
// consolidate — Pull worker JSONLs from M3, merge, deduplicate.
|
|
|
|
|
var consolidateCfg lem.ConsolidateOpts
|
|
|
|
|
consolidateCmd := cli.NewCommand("consolidate", "Pull worker JSONLs from M3, merge, deduplicate", "",
|
|
|
|
|
func(cmd *cli.Command, args []string) error {
|
|
|
|
|
return lem.RunConsolidate(consolidateCfg)
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
cli.StringFlag(consolidateCmd, &consolidateCfg.Host, "host", "", "m3", "SSH host for remote files")
|
|
|
|
|
cli.StringFlag(consolidateCmd, &consolidateCfg.Remote, "remote", "", "/Volumes/Data/lem/responses", "Remote directory for JSONL files")
|
|
|
|
|
cli.StringFlag(consolidateCmd, &consolidateCfg.Pattern, "pattern", "", "gold*.jsonl", "File glob pattern")
|
|
|
|
|
cli.StringFlag(consolidateCmd, &consolidateCfg.OutputDir, "output", "o", "", "Output directory (defaults to ./responses)")
|
|
|
|
|
cli.StringFlag(consolidateCmd, &consolidateCfg.Merged, "merged", "", "", "Merged output file (defaults to gold-merged.jsonl in output dir)")
|
|
|
|
|
dataGroup.AddCommand(consolidateCmd)
|
|
|
|
|
|
|
|
|
|
// normalize — Normalise seeds to deduplicated expansion prompts.
|
|
|
|
|
var normalizeCfg lem.NormalizeOpts
|
|
|
|
|
normalizeCmd := cli.NewCommand("normalize", "Normalise seeds to deduplicated expansion prompts", "",
|
|
|
|
|
func(cmd *cli.Command, args []string) error {
|
|
|
|
|
return lem.RunNormalize(normalizeCfg)
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
cli.StringFlag(normalizeCmd, &normalizeCfg.DB, "db", "", "", "DuckDB database path (defaults to LEM_DB env)")
|
|
|
|
|
cli.IntFlag(normalizeCmd, &normalizeCfg.MinLen, "min-length", "", 50, "Minimum prompt length in characters")
|
|
|
|
|
dataGroup.AddCommand(normalizeCmd)
|
|
|
|
|
|
|
|
|
|
// approve — Filter scored expansions to training JSONL.
|
|
|
|
|
var approveCfg lem.ApproveOpts
|
|
|
|
|
approveCmd := cli.NewCommand("approve", "Filter scored expansions to training JSONL", "",
|
|
|
|
|
func(cmd *cli.Command, args []string) error {
|
|
|
|
|
return lem.RunApprove(approveCfg)
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
cli.StringFlag(approveCmd, &approveCfg.DB, "db", "", "", "DuckDB database path (defaults to LEM_DB env)")
|
|
|
|
|
cli.StringFlag(approveCmd, &approveCfg.Output, "output", "o", "", "Output JSONL file (defaults to expansion-approved.jsonl in db dir)")
|
|
|
|
|
cli.Float64Flag(approveCmd, &approveCfg.Threshold, "threshold", "", 6.0, "Min judge average to approve")
|
|
|
|
|
dataGroup.AddCommand(approveCmd)
|
2026-02-22 18:55:57 +00:00
|
|
|
|
|
|
|
|
root.AddCommand(dataGroup)
|
|
|
|
|
}
|