package lem

import (
	"bufio"
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"math/rand"
	"os"
	"strings"
)

// ChatMessage is a single message in the chat training format.
type ChatMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

// TrainingExample is a single training example in chat JSONL format.
type TrainingExample struct {
	Messages []ChatMessage `json:"messages"`
}

// RunExport is the CLI entry point for the export command.
func RunExport(args []string) {
	fs := flag.NewFlagSet("export", flag.ExitOnError)
	dbPath := fs.String("db", "", "DuckDB database path (primary source)")
	input := fs.String("input", "", "Input golden set JSONL file (fallback if --db not set)")
	outputDir := fs.String("output-dir", "", "Output directory for training files (required)")
	trainPct := fs.Int("train-pct", 90, "Training set percentage")
	validPct := fs.Int("valid-pct", 5, "Validation set percentage")
	testPct := fs.Int("test-pct", 5, "Test set percentage")
	seed := fs.Int64("seed", 42, "Random seed for shuffling")
	minChars := fs.Int("min-chars", 50, "Minimum response character count")
	if err := fs.Parse(args); err != nil {
		log.Fatalf("parse flags: %v", err)
	}

	// Check LEM_DB env as default for --db.
	if *dbPath == "" {
		*dbPath = os.Getenv("LEM_DB")
	}
	if *dbPath == "" && *input == "" {
		fmt.Fprintln(os.Stderr, "error: --db or --input is required (set LEM_DB env for default)")
		fs.Usage()
		os.Exit(1)
	}
	if *outputDir == "" {
		fmt.Fprintln(os.Stderr, "error: --output-dir is required")
		fs.Usage()
		os.Exit(1)
	}
	if err := validatePercentages(*trainPct, *validPct, *testPct); err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(1)
	}

	var responses []Response
	if *dbPath != "" {
		// Primary: read from DuckDB golden_set table.
		db, err := OpenDB(*dbPath)
		if err != nil {
			log.Fatalf("open db: %v", err)
		}
		defer db.Close()

		rows, err := db.QueryGoldenSet(*minChars)
		if err != nil {
			log.Fatalf("query golden_set: %v", err)
		}
		log.Printf("loaded %d golden set rows from %s (min_chars=%d)", len(rows), *dbPath, *minChars)

		// Convert GoldenSetRow → Response for the shared pipeline.
		for _, r := range rows {
			responses = append(responses, Response{
				ID:       r.SeedID,
				Domain:   r.Domain,
				Prompt:   r.Prompt,
				Response: r.Response,
				Model:    r.Voice, // voice maps to the "model" slot for tracking
			})
		}
	} else {
		// Fallback: read from JSONL file.
		var err error
		responses, err = ReadResponses(*input)
		if err != nil {
			log.Fatalf("read responses: %v", err)
		}
		log.Printf("loaded %d responses from %s", len(responses), *input)
	}

	// Filter out bad responses (DuckDB already filters by char_count, but
	// JSONL input needs filtering, and both need ERROR: prefix check).
	filtered := filterResponses(responses)
	log.Printf("filtered to %d valid responses (removed %d)", len(filtered), len(responses)-len(filtered))

	// Split into train/valid/test.
	train, valid, test := splitData(filtered, *trainPct, *validPct, *testPct, *seed)

	// Create output directory.
	if err := os.MkdirAll(*outputDir, 0755); err != nil {
		log.Fatalf("create output dir: %v", err)
	}

	// Write output files.
	for _, split := range []struct {
		name string
		data []Response
	}{
		{"train.jsonl", train},
		{"valid.jsonl", valid},
		{"test.jsonl", test},
	} {
		path := *outputDir + "/" + split.name
		if err := writeTrainingJSONL(path, split.data); err != nil {
			log.Fatalf("write %s: %v", split.name, err)
		}
	}
	fmt.Printf("Exported: %d train / %d valid / %d test\n", len(train), len(valid), len(test))
}

// validatePercentages checks that train+valid+test percentages sum to 100
// and that none are negative.
func validatePercentages(trainPct, validPct, testPct int) error {
	if trainPct < 0 || validPct < 0 || testPct < 0 {
		return fmt.Errorf("percentages must be non-negative: train=%d, valid=%d, test=%d", trainPct, validPct, testPct)
	}
	sum := trainPct + validPct + testPct
	if sum != 100 {
		return fmt.Errorf("percentages must sum to 100, got %d (train=%d + valid=%d + test=%d)", sum, trainPct, validPct, testPct)
	}
	return nil
}

// filterResponses removes responses with empty content, an "ERROR:" prefix,
// or a response shorter than 50 characters.
func filterResponses(responses []Response) []Response {
	var filtered []Response
	for _, r := range responses {
		if r.Response == "" {
			continue
		}
		if strings.HasPrefix(r.Response, "ERROR:") {
			continue
		}
		if len(r.Response) < 50 {
			continue
		}
		filtered = append(filtered, r)
	}
	return filtered
}

// splitData shuffles responses with a deterministic seed and splits them
// into train, valid, and test sets by the given percentages.
func splitData(responses []Response, trainPct, validPct, testPct int, seed int64) (train, valid, test []Response) {
	// Make a copy to avoid mutating the input.
	shuffled := make([]Response, len(responses))
	copy(shuffled, responses)

	// Shuffle with a deterministic seed.
	rng := rand.New(rand.NewSource(seed))
	rng.Shuffle(len(shuffled), func(i, j int) {
		shuffled[i], shuffled[j] = shuffled[j], shuffled[i]
	})

	n := len(shuffled)
	trainN := n * trainPct / 100
	validN := n * validPct / 100
	// Test gets the remainder to ensure no items are lost.
	_ = testPct

	train = shuffled[:trainN]
	valid = shuffled[trainN : trainN+validN]
	test = shuffled[trainN+validN:]
	return train, valid, test
}

// writeTrainingJSONL writes responses in chat JSONL format suitable for
// MLX LoRA fine-tuning. Each line contains a TrainingExample with user
// and assistant messages.
func writeTrainingJSONL(path string, responses []Response) error {
	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("create %s: %w", path, err)
	}
	defer f.Close()

	w := bufio.NewWriter(f)
	for _, r := range responses {
		example := TrainingExample{
			Messages: []ChatMessage{
				{Role: "user", Content: r.Prompt},
				{Role: "assistant", Content: r.Response},
			},
		}
		data, err := json.Marshal(example)
		if err != nil {
			return fmt.Errorf("marshal example: %w", err)
		}
		if _, err := w.Write(data); err != nil {
			return fmt.Errorf("write line: %w", err)
		}
		if _, err := w.WriteString("\n"); err != nil {
			return fmt.Errorf("write newline: %w", err)
		}
	}
	// Flush explicitly so a failed flush surfaces as an error instead of being
	// silently dropped by a deferred call.
	if err := w.Flush(); err != nil {
		return fmt.Errorf("flush %s: %w", path, err)
	}
	return nil
}
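
// exampleExportLine is an illustrative sketch added for documentation only; it
// is not called by the export pipeline, and the prompt/response arguments are
// placeholders rather than real training data. It shows the JSON shape that
// writeTrainingJSONL emits for a single Response: one user message carrying the
// prompt and one assistant message carrying the response. With the struct tags
// defined above, a marshaled line looks like:
//
//	{"messages":[{"role":"user","content":"<prompt>"},{"role":"assistant","content":"<response>"}]}
func exampleExportLine(prompt, response string) (string, error) {
	example := TrainingExample{
		Messages: []ChatMessage{
			{Role: "user", Content: prompt},
			{Role: "assistant", Content: response},
		},
	}
	data, err := json.Marshal(example)
	if err != nil {
		return "", fmt.Errorf("marshal example: %w", err)
	}
	return string(data), nil
}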