Replace passthrough() + stdlib flag.FlagSet anti-pattern with proper cobra integration.

Every Run* function now takes a typed *Opts struct and returns error. Flags are registered via cli.StringFlag/IntFlag/etc. Commands participate in the Core lifecycle with full cobra flag parsing.

- 6 command groups: gen, score, data, export, infra, mon
- 25 commands converted, 0 passthrough() calls remain
- Delete passthrough() helper from lem.go
- Update export_test.go to use ExportOpts struct

Co-Authored-By: Virgil <virgil@lethean.io>
143 lines
3.3 KiB
Go
143 lines
3.3 KiB
Go
package lem
|
|
|
|
import (
|
|
"bufio"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// ConsolidateOpts holds configuration for the consolidate command.
//
// Empty OutputDir and Merged values are replaced with defaults by
// RunConsolidate; the other fields are used as given.
type ConsolidateOpts struct {
	Host      string // SSH host that holds the remote files
	Remote    string // Remote directory containing the JSONL files
	Pattern   string // File glob pattern; used for the remote listing and again for the local merge
	OutputDir string // Local directory for pulled files (defaults to "responses")
	Merged    string // Merged output file (defaults to gold-merged.jsonl in the parent of OutputDir)
}
|
|
|
|
// RunConsolidate is the CLI entry point for the consolidate command.
|
|
// Pulls all worker JSONLs from M3, merges them, deduplicates on idx,
|
|
// and writes a single merged file.
|
|
func RunConsolidate(cfg ConsolidateOpts) error {
|
|
remoteHost := cfg.Host
|
|
remotePath := cfg.Remote
|
|
pattern := cfg.Pattern
|
|
outputDir := cfg.OutputDir
|
|
merged := cfg.Merged
|
|
|
|
if outputDir == "" {
|
|
outputDir = "responses"
|
|
}
|
|
if err := os.MkdirAll(outputDir, 0755); err != nil {
|
|
return fmt.Errorf("create output dir: %v", err)
|
|
}
|
|
|
|
// List remote files.
|
|
fmt.Println("Pulling responses from remote...")
|
|
listCmd := exec.Command("ssh", remoteHost, fmt.Sprintf("ls %s/%s", remotePath, pattern))
|
|
listOutput, err := listCmd.Output()
|
|
if err != nil {
|
|
return fmt.Errorf("list remote files: %v", err)
|
|
}
|
|
|
|
remoteFiles := strings.Split(strings.TrimSpace(string(listOutput)), "\n")
|
|
var validFiles []string
|
|
for _, f := range remoteFiles {
|
|
f = strings.TrimSpace(f)
|
|
if f != "" {
|
|
validFiles = append(validFiles, f)
|
|
}
|
|
}
|
|
fmt.Printf(" Found %d JSONL files on %s\n", len(validFiles), remoteHost)
|
|
|
|
// Pull files.
|
|
for _, rf := range validFiles {
|
|
local := filepath.Join(outputDir, filepath.Base(rf))
|
|
scpCmd := exec.Command("scp", fmt.Sprintf("%s:%s", remoteHost, rf), local)
|
|
if err := scpCmd.Run(); err != nil {
|
|
log.Printf("warning: failed to pull %s: %v", rf, err)
|
|
continue
|
|
}
|
|
|
|
// Count lines.
|
|
f, err := os.Open(local)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
lines := 0
|
|
scanner := bufio.NewScanner(f)
|
|
for scanner.Scan() {
|
|
lines++
|
|
}
|
|
f.Close()
|
|
fmt.Printf(" %s: %d records\n", filepath.Base(rf), lines)
|
|
}
|
|
|
|
// Merge and deduplicate on idx.
|
|
seen := make(map[int]json.RawMessage)
|
|
skipped := 0
|
|
|
|
matches, _ := filepath.Glob(filepath.Join(outputDir, pattern))
|
|
sort.Strings(matches)
|
|
|
|
for _, local := range matches {
|
|
f, err := os.Open(local)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
scanner := bufio.NewScanner(f)
|
|
scanner.Buffer(make([]byte, 1024*1024), 1024*1024)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
var rec struct {
|
|
Idx *int `json:"idx"`
|
|
}
|
|
if err := json.Unmarshal([]byte(line), &rec); err != nil {
|
|
skipped++
|
|
continue
|
|
}
|
|
if rec.Idx == nil {
|
|
skipped++
|
|
continue
|
|
}
|
|
if _, exists := seen[*rec.Idx]; !exists {
|
|
seen[*rec.Idx] = json.RawMessage(line)
|
|
}
|
|
}
|
|
f.Close()
|
|
}
|
|
|
|
if skipped > 0 {
|
|
fmt.Printf(" Skipped %d records without idx\n", skipped)
|
|
}
|
|
|
|
// Sort by idx and write merged file.
|
|
if merged == "" {
|
|
merged = filepath.Join(outputDir, "..", "gold-merged.jsonl")
|
|
}
|
|
|
|
idxs := make([]int, 0, len(seen))
|
|
for idx := range seen {
|
|
idxs = append(idxs, idx)
|
|
}
|
|
sort.Ints(idxs)
|
|
|
|
f, err := os.Create(merged)
|
|
if err != nil {
|
|
return fmt.Errorf("create merged file: %v", err)
|
|
}
|
|
for _, idx := range idxs {
|
|
f.Write(seen[idx])
|
|
f.WriteString("\n")
|
|
}
|
|
f.Close()
|
|
|
|
fmt.Printf("\nMerged: %d unique examples → %s\n", len(seen), merged)
|
|
return nil
|
|
}
|