LEM/pkg/lem/export.go
Snider 56eda1a081 refactor: migrate all 25 commands from passthrough to cobra framework
Replace passthrough() + stdlib flag.FlagSet anti-pattern with proper
cobra integration. Every Run* function now takes a typed *Opts struct
and returns error. Flags registered via cli.StringFlag/IntFlag/etc.
Commands participate in Core lifecycle with full cobra flag parsing.

- 6 command groups: gen, score, data, export, infra, mon
- 25 commands converted, 0 passthrough() calls remain
- Delete passthrough() helper from lem.go
- Update export_test.go to use ExportOpts struct

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-23 03:32:53 +00:00

216 lines
6.1 KiB
Go

package lem
import (
	"bufio"
	"encoding/json"
	"fmt"
	"log"
	"math/rand"
	"os"
	"path/filepath"
	"strings"
)
// ExportOpts holds configuration for the JSONL export command.
//
// RunExport reads from the DuckDB database when DBPath (or the LEM_DB
// environment variable) is set, and from the Input file otherwise.
// TrainPct, ValidPct and TestPct must be non-negative and sum to 100.
type ExportOpts struct {
DBPath string // DuckDB database path (primary source); falls back to LEM_DB env
Input string // Input golden set JSONL file (fallback, used only when no database path is set)
OutputDir string // Output directory for train.jsonl, valid.jsonl and test.jsonl (required)
TrainPct int // Training set percentage
ValidPct int // Validation set percentage
TestPct int // Test set percentage; the test split receives whatever train and valid leave over
Seed int64 // Random seed for shuffling; the same seed and input reproduce the same split
MinChars int // Minimum response character count, passed to the golden_set query; not read when loading from Input
}
// ChatMessage is a single message in the chat training format.
// It is serialized to JSON with the keys "role" and "content".
type ChatMessage struct {
Role string `json:"role"` // speaker of the message; this file writes "user" and "assistant"
Content string `json:"content"` // text of the message
}
// TrainingExample is a single training example in chat JSONL format.
// It is serialized as a JSON object holding one "messages" array; this file
// writes one example per output line, with the user prompt followed by the
// assistant response.
type TrainingExample struct {
Messages []ChatMessage `json:"messages"` // messages in conversation order
}
// RunExport is the CLI entry point for the export command.
//
// It loads responses from the DuckDB golden_set table when cfg.DBPath (or the
// LEM_DB environment variable) is set, and from the JSONL file cfg.Input
// otherwise. The responses are filtered, shuffled with cfg.Seed, split by the
// configured percentages and written as train.jsonl, valid.jsonl and
// test.jsonl under cfg.OutputDir, which is created if it does not exist.
//
// An error is returned when neither source is configured, when the output
// directory is missing from cfg, when the percentages are invalid, or when
// loading or writing fails.
func RunExport(cfg ExportOpts) error {
	// Check LEM_DB env as default for --db.
	if cfg.DBPath == "" {
		cfg.DBPath = os.Getenv("LEM_DB")
	}
	if cfg.DBPath == "" && cfg.Input == "" {
		return fmt.Errorf("--db or --input is required (set LEM_DB env for default)")
	}
	if cfg.OutputDir == "" {
		return fmt.Errorf("--output-dir is required")
	}
	if err := validatePercentages(cfg.TrainPct, cfg.ValidPct, cfg.TestPct); err != nil {
		return fmt.Errorf("invalid percentages: %w", err)
	}

	var responses []Response
	if cfg.DBPath != "" {
		// Primary: read from DuckDB golden_set table.
		db, err := OpenDB(cfg.DBPath)
		if err != nil {
			return fmt.Errorf("open db: %w", err)
		}
		defer db.Close()

		rows, err := db.QueryGoldenSet(cfg.MinChars)
		if err != nil {
			return fmt.Errorf("query golden_set: %w", err)
		}
		log.Printf("loaded %d golden set rows from %s (min_chars=%d)", len(rows), cfg.DBPath, cfg.MinChars)

		// Convert GoldenSetRow → Response for the shared pipeline. The final
		// length is known, so allocate once instead of growing by append.
		responses = make([]Response, 0, len(rows))
		for _, r := range rows {
			responses = append(responses, Response{
				ID:       r.SeedID,
				Domain:   r.Domain,
				Prompt:   r.Prompt,
				Response: r.Response,
				Model:    r.Voice, // voice maps to the "model" slot for tracking
			})
		}
	} else {
		// Fallback: read from JSONL file.
		var err error
		responses, err = ReadResponses(cfg.Input)
		if err != nil {
			return fmt.Errorf("read responses: %w", err)
		}
		log.Printf("loaded %d responses from %s", len(responses), cfg.Input)
	}

	// Drop unusable responses from either source. Note that cfg.MinChars is
	// only handed to the golden_set query above; JSONL input is limited
	// solely by the fixed checks inside filterResponses.
	filtered := filterResponses(responses)
	log.Printf("filtered to %d valid responses (removed %d)", len(filtered), len(responses)-len(filtered))

	// Split into train/valid/test.
	train, valid, test := splitData(filtered, cfg.TrainPct, cfg.ValidPct, cfg.TestPct, cfg.Seed)

	// Create output directory.
	if err := os.MkdirAll(cfg.OutputDir, 0755); err != nil {
		return fmt.Errorf("create output dir: %w", err)
	}

	// Write output files.
	splits := []struct {
		name string
		data []Response
	}{
		{"train.jsonl", train},
		{"valid.jsonl", valid},
		{"test.jsonl", test},
	}
	for _, split := range splits {
		// filepath.Join uses the OS separator and cleans duplicate slashes.
		path := filepath.Join(cfg.OutputDir, split.name)
		if err := writeTrainingJSONL(path, split.data); err != nil {
			return fmt.Errorf("write %s: %w", split.name, err)
		}
	}

	fmt.Printf("Exported: %d train / %d valid / %d test\n", len(train), len(valid), len(test))
	return nil
}
// validatePercentages reports an error unless the three split percentages
// are all non-negative and add up to exactly 100. The negativity check runs
// first, so a negative value is reported even when the sum happens to be 100.
func validatePercentages(trainPct, validPct, testPct int) error {
	switch total := trainPct + validPct + testPct; {
	case trainPct < 0, validPct < 0, testPct < 0:
		return fmt.Errorf("percentages must be non-negative: train=%d, valid=%d, test=%d", trainPct, validPct, testPct)
	case total != 100:
		return fmt.Errorf("percentages must sum to 100, got %d (train=%d + valid=%d + test=%d)", total, trainPct, validPct, testPct)
	default:
		return nil
	}
}
// filterResponses returns the responses worth exporting, in their original
// order. A response is dropped when its text is empty, starts with "ERROR:",
// or is shorter than 50 bytes (len counts bytes, not runes). The result is
// nil when nothing survives.
func filterResponses(responses []Response) []Response {
	var kept []Response
	for i := range responses {
		text := responses[i].Response
		usable := text != "" && !strings.HasPrefix(text, "ERROR:") && len(text) >= 50
		if usable {
			kept = append(kept, responses[i])
		}
	}
	return kept
}
// splitData shuffles a copy of responses with a deterministic seed and splits
// it into train, valid and test sets.
//
// train receives len*trainPct/100 items and valid len*validPct/100 items
// (integer division); test receives everything that is left, so no item is
// ever lost to rounding. The input slice is not modified. Percentages are
// expected to have been validated by the caller: values whose train+valid
// share exceeds 100, or negative values, make the slicing panic.
//
// The three results share one backing array but each is capped to its own
// length, so appending to one of them allocates instead of overwriting the
// start of the next.
func splitData(responses []Response, trainPct, validPct, testPct int, seed int64) (train, valid, test []Response) {
	// Work on a copy so the caller's slice keeps its order.
	shuffled := make([]Response, len(responses))
	copy(shuffled, responses)

	// A fixed seed makes the shuffle, and therefore the split, reproducible.
	rng := rand.New(rand.NewSource(seed))
	rng.Shuffle(len(shuffled), func(i, j int) {
		shuffled[i], shuffled[j] = shuffled[j], shuffled[i]
	})

	n := len(shuffled)
	trainEnd := n * trainPct / 100
	validEnd := trainEnd + n*validPct/100

	// testPct is deliberately unused: test takes the remainder.
	_ = testPct

	// Full slice expressions limit capacity to prevent cross-split aliasing.
	train = shuffled[:trainEnd:trainEnd]
	valid = shuffled[trainEnd:validEnd:validEnd]
	test = shuffled[validEnd:]
	return train, valid, test
}
// writeTrainingJSONL writes responses in chat JSONL format suitable for
// MLX LoRA fine-tuning. The file at path is created (or truncated) and
// receives one JSON-encoded TrainingExample per line, each holding the prompt
// as a "user" message followed by the response as an "assistant" message.
//
// Errors from creating, marshalling, writing, flushing and closing are all
// returned. Flush and close are checked explicitly because buffered data only
// reaches the file there; ignoring them could leave a truncated file while
// reporting success.
func writeTrainingJSONL(path string, responses []Response) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("create %s: %w", path, err)
	}
	// Always close the file, but only surface the close error when nothing
	// earlier has already failed.
	defer func() {
		if closeErr := f.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("close %s: %w", path, closeErr)
		}
	}()

	w := bufio.NewWriter(f)
	for _, r := range responses {
		example := TrainingExample{
			Messages: []ChatMessage{
				{Role: "user", Content: r.Prompt},
				{Role: "assistant", Content: r.Response},
			},
		}
		data, marshalErr := json.Marshal(example)
		if marshalErr != nil {
			return fmt.Errorf("marshal example: %w", marshalErr)
		}
		if _, writeErr := w.Write(data); writeErr != nil {
			return fmt.Errorf("write line: %w", writeErr)
		}
		if writeErr := w.WriteByte('\n'); writeErr != nil {
			return fmt.Errorf("write newline: %w", writeErr)
		}
	}

	// Push the remaining buffered bytes to the file before it is closed.
	if flushErr := w.Flush(); flushErr != nil {
		return fmt.Errorf("flush %s: %w", path, flushErr)
	}
	return nil
}