package lemcmd import ( "forge.lthn.ai/core/cli/pkg/cli" "forge.lthn.ai/lthn/lem/pkg/lem" ) func addDataCommands(root *cli.Command) { dataGroup := cli.NewGroup("data", "Data management commands", "Import, consolidate, normalise, and approve training data.") // import-all — Import ALL LEM data into DuckDB from M3. var importCfg lem.ImportOpts importCmd := cli.NewCommand("import-all", "Import ALL LEM data into DuckDB from M3", "", func(cmd *cli.Command, args []string) error { return lem.RunImport(importCfg) }, ) cli.StringFlag(importCmd, &importCfg.DB, "db", "", "", "DuckDB database path (defaults to LEM_DB env)") cli.BoolFlag(importCmd, &importCfg.SkipM3, "skip-m3", "", false, "Skip pulling data from M3") cli.StringFlag(importCmd, &importCfg.DataDir, "data-dir", "", "", "Local data directory (defaults to db directory)") dataGroup.AddCommand(importCmd) // consolidate — Pull worker JSONLs from M3, merge, deduplicate. var consolidateCfg lem.ConsolidateOpts consolidateCmd := cli.NewCommand("consolidate", "Pull worker JSONLs from M3, merge, deduplicate", "", func(cmd *cli.Command, args []string) error { return lem.RunConsolidate(consolidateCfg) }, ) cli.StringFlag(consolidateCmd, &consolidateCfg.Host, "host", "", "m3", "SSH host for remote files") cli.StringFlag(consolidateCmd, &consolidateCfg.Remote, "remote", "", "/Volumes/Data/lem/responses", "Remote directory for JSONL files") cli.StringFlag(consolidateCmd, &consolidateCfg.Pattern, "pattern", "", "gold*.jsonl", "File glob pattern") cli.StringFlag(consolidateCmd, &consolidateCfg.OutputDir, "output", "o", "", "Output directory (defaults to ./responses)") cli.StringFlag(consolidateCmd, &consolidateCfg.Merged, "merged", "", "", "Merged output file (defaults to gold-merged.jsonl in output dir)") dataGroup.AddCommand(consolidateCmd) // normalize — Normalise seeds to deduplicated expansion prompts. var normalizeCfg lem.NormalizeOpts normalizeCmd := cli.NewCommand("normalize", "Normalise seeds to deduplicated expansion prompts", "", func(cmd *cli.Command, args []string) error { return lem.RunNormalize(normalizeCfg) }, ) cli.StringFlag(normalizeCmd, &normalizeCfg.DB, "db", "", "", "DuckDB database path (defaults to LEM_DB env)") cli.IntFlag(normalizeCmd, &normalizeCfg.MinLen, "min-length", "", 50, "Minimum prompt length in characters") dataGroup.AddCommand(normalizeCmd) // approve — Filter scored expansions to training JSONL. var approveCfg lem.ApproveOpts approveCmd := cli.NewCommand("approve", "Filter scored expansions to training JSONL", "", func(cmd *cli.Command, args []string) error { return lem.RunApprove(approveCfg) }, ) cli.StringFlag(approveCmd, &approveCfg.DB, "db", "", "", "DuckDB database path (defaults to LEM_DB env)") cli.StringFlag(approveCmd, &approveCfg.Output, "output", "o", "", "Output JSONL file (defaults to expansion-approved.jsonl in db dir)") cli.Float64Flag(approveCmd, &approveCfg.Threshold, "threshold", "", 6.0, "Min judge average to approve") dataGroup.AddCommand(approveCmd) root.AddCommand(dataGroup) }