package lemcmd import ( "forge.lthn.ai/core/cli/pkg/cli" "forge.lthn.ai/lthn/lem/pkg/lem" ) func addExportCommands(root *cli.Command) { exportGroup := cli.NewGroup("export", "Export and publish commands", "Export training data to JSONL, Parquet, HuggingFace, and PEFT formats.") // jsonl — export golden set to training-format JSONL splits. var exportCfg lem.ExportOpts jsonlCmd := cli.NewCommand("jsonl", "Export golden set to training-format JSONL splits", "", func(cmd *cli.Command, args []string) error { return lem.RunExport(exportCfg) }, ) cli.StringFlag(jsonlCmd, &exportCfg.DBPath, "db", "", "", "DuckDB database path (primary source; defaults to LEM_DB env)") cli.StringFlag(jsonlCmd, &exportCfg.Input, "input", "i", "", "Input golden set JSONL file (fallback if --db not set)") cli.StringFlag(jsonlCmd, &exportCfg.OutputDir, "output-dir", "o", "", "Output directory for training files (required)") cli.IntFlag(jsonlCmd, &exportCfg.TrainPct, "train-pct", "", 90, "Training set percentage") cli.IntFlag(jsonlCmd, &exportCfg.ValidPct, "valid-pct", "", 5, "Validation set percentage") cli.IntFlag(jsonlCmd, &exportCfg.TestPct, "test-pct", "", 5, "Test set percentage") cli.Int64Flag(jsonlCmd, &exportCfg.Seed, "seed", "", 42, "Random seed for shuffling") cli.IntFlag(jsonlCmd, &exportCfg.MinChars, "min-chars", "", 50, "Minimum response character count") exportGroup.AddCommand(jsonlCmd) // parquet — export JSONL training splits to Parquet. var parquetCfg lem.ParquetOpts parquetCmd := cli.NewCommand("parquet", "Export JSONL training splits to Parquet", "", func(cmd *cli.Command, args []string) error { return lem.RunParquet(parquetCfg) }, ) cli.StringFlag(parquetCmd, &parquetCfg.Input, "input", "i", "", "Directory containing train.jsonl, valid.jsonl, test.jsonl (required)") cli.StringFlag(parquetCmd, &parquetCfg.Output, "output", "o", "", "Output directory for Parquet files (defaults to input/parquet)") exportGroup.AddCommand(parquetCmd) // publish — push Parquet files to HuggingFace dataset repo. var publishCfg lem.PublishOpts publishCmd := cli.NewCommand("publish", "Push Parquet files to HuggingFace dataset repo", "", func(cmd *cli.Command, args []string) error { return lem.RunPublish(publishCfg) }, ) cli.StringFlag(publishCmd, &publishCfg.Input, "input", "i", "", "Directory containing Parquet files (required)") cli.StringFlag(publishCmd, &publishCfg.Repo, "repo", "", "lthn/LEM-golden-set", "HuggingFace dataset repo ID") cli.BoolFlag(publishCmd, &publishCfg.Public, "public", "", false, "Make dataset public") cli.StringFlag(publishCmd, &publishCfg.Token, "token", "", "", "HuggingFace API token (defaults to HF_TOKEN env)") cli.BoolFlag(publishCmd, &publishCfg.DryRun, "dry-run", "", false, "Show what would be uploaded without uploading") exportGroup.AddCommand(publishCmd) // convert — convert MLX LoRA adapter to PEFT format. var convertCfg lem.ConvertOpts convertCmd := cli.NewCommand("convert", "Convert MLX LoRA adapter to PEFT format", "", func(cmd *cli.Command, args []string) error { return lem.RunConvert(convertCfg) }, ) cli.StringFlag(convertCmd, &convertCfg.Input, "input", "i", "", "Path to MLX .safetensors file (required)") cli.StringFlag(convertCmd, &convertCfg.Config, "config", "c", "", "Path to MLX adapter_config.json (required)") cli.StringFlag(convertCmd, &convertCfg.Output, "output", "o", "./peft_output", "Output directory for PEFT adapter") cli.StringFlag(convertCmd, &convertCfg.BaseModel, "base-model", "", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "HuggingFace base model ID") exportGroup.AddCommand(convertCmd) root.AddCommand(exportGroup) }