1
0
Fork 0
forked from lthn/LEM
LEM/pkg/lem/consolidate.go
Snider 56eda1a081 refactor: migrate all 25 commands from passthrough to cobra framework
Replace passthrough() + stdlib flag.FlagSet anti-pattern with proper
cobra integration. Every Run* function now takes a typed *Opts struct
and returns error. Flags registered via cli.StringFlag/IntFlag/etc.
Commands participate in Core lifecycle with full cobra flag parsing.

- 6 command groups: gen, score, data, export, infra, mon
- 25 commands converted, 0 passthrough() calls remain
- Delete passthrough() helper from lem.go
- Update export_test.go to use ExportOpts struct

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-23 03:32:53 +00:00

143 lines
3.3 KiB
Go

package lem
import (
"bufio"
"encoding/json"
"fmt"
"log"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
)
// ConsolidateOpts holds configuration for the consolidate command.
//
// It is consumed by RunConsolidate, which lists and copies JSONL files from
// a remote host over ssh/scp and merges them into a single file.
type ConsolidateOpts struct {
	// Host is the SSH host the remote files are listed on and copied from.
	Host string
	// Remote is the remote directory containing the JSONL files.
	Remote string
	// Pattern is the file glob pattern. It is applied both to the remote
	// listing (under Remote) and to the local files in OutputDir.
	Pattern string
	// OutputDir is the local directory pulled files are written to
	// (defaults to ./responses).
	OutputDir string
	// Merged is the merged output file. When empty it defaults to
	// gold-merged.jsonl in the parent directory of OutputDir.
	Merged string
}
// consolidateMaxLineBytes is the largest single JSONL record (in bytes) the
// line scanners in this file accept. Counting and merging share the limit so
// that both passes agree on what a readable file is.
const consolidateMaxLineBytes = 1024 * 1024

// RunConsolidate is the CLI entry point for the consolidate command.
// Pulls all worker JSONLs from the remote host, merges them, deduplicates
// on idx, and writes a single merged file sorted by idx.
//
// Steps:
//  1. Create cfg.OutputDir (default "responses").
//  2. List cfg.Remote/cfg.Pattern on cfg.Host via ssh and scp every listed
//     file into the output directory. A file that fails to copy is logged
//     and skipped.
//  3. Read every local file in the output directory matching cfg.Pattern, in
//     sorted name order, keeping the first record seen for each idx. Lines
//     that are not valid JSON or have no "idx" field are counted as skipped.
//  4. Write the surviving records, ordered by idx, to cfg.Merged (default
//     gold-merged.jsonl in the parent of the output directory).
//
// It returns an error, wrapping the underlying cause, when the output
// directory cannot be created, the remote listing fails, the pattern is
// malformed, or the merged file cannot be written. Per-file pull and read
// problems are logged as warnings and do not abort the run.
func RunConsolidate(cfg ConsolidateOpts) error {
	outputDir := cfg.OutputDir
	if outputDir == "" {
		outputDir = "responses"
	}
	if err := os.MkdirAll(outputDir, 0755); err != nil {
		return fmt.Errorf("create output dir: %w", err)
	}

	// List remote files. The command string is handed to ssh unquoted so the
	// remote side can expand the glob in cfg.Pattern.
	fmt.Println("Pulling responses from remote...")
	listCmd := exec.Command("ssh", cfg.Host, fmt.Sprintf("ls %s/%s", cfg.Remote, cfg.Pattern))
	listOutput, err := listCmd.Output()
	if err != nil {
		return fmt.Errorf("list remote files: %w", err)
	}
	var remoteFiles []string
	for _, name := range strings.Split(string(listOutput), "\n") {
		if name = strings.TrimSpace(name); name != "" {
			remoteFiles = append(remoteFiles, name)
		}
	}
	fmt.Printf(" Found %d JSONL files on %s\n", len(remoteFiles), cfg.Host)

	// Pull files and report how many records each one holds.
	for _, rf := range remoteFiles {
		base := filepath.Base(rf)
		local := filepath.Join(outputDir, base)
		scpCmd := exec.Command("scp", fmt.Sprintf("%s:%s", cfg.Host, rf), local)
		if err := scpCmd.Run(); err != nil {
			log.Printf("warning: failed to pull %s: %v", rf, err)
			continue
		}
		records, err := consolidateCountRecords(local)
		if err != nil {
			log.Printf("warning: failed to count records in %s: %v", local, err)
			continue
		}
		fmt.Printf(" %s: %d records\n", base, records)
	}

	// Merge and deduplicate on idx. Files are visited in sorted order so the
	// record that wins for a duplicated idx is deterministic.
	matches, err := filepath.Glob(filepath.Join(outputDir, cfg.Pattern))
	if err != nil {
		return fmt.Errorf("glob local files: %w", err)
	}
	sort.Strings(matches)

	seen := make(map[int]json.RawMessage)
	skipped := 0
	for _, local := range matches {
		n, err := consolidateMergeFile(local, seen)
		skipped += n
		if err != nil {
			// Best effort: keep whatever was read before the failure.
			log.Printf("warning: failed to read %s: %v", local, err)
		}
	}
	if skipped > 0 {
		fmt.Printf(" Skipped %d records without idx\n", skipped)
	}

	// Sort by idx and write merged file.
	merged := cfg.Merged
	if merged == "" {
		merged = filepath.Join(outputDir, "..", "gold-merged.jsonl")
	}
	if err := consolidateWriteMerged(merged, seen); err != nil {
		return err
	}

	fmt.Printf("\nMerged: %d unique examples → %s\n", len(seen), merged)
	return nil
}

// consolidateCountRecords returns the number of lines in the file at path.
// On a read error it returns the lines counted so far together with the error.
func consolidateCountRecords(path string) (int, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 0, 64*1024), consolidateMaxLineBytes)
	records := 0
	for scanner.Scan() {
		records++
	}
	return records, scanner.Err()
}

// consolidateMergeFile reads the JSONL file at path and stores each record in
// seen under its idx, unless that idx is already present. It returns how many
// lines were skipped because they were not valid JSON or lacked an idx, plus
// any error that stopped the file from being read to the end.
func consolidateMergeFile(path string, seen map[int]json.RawMessage) (int, error) {
	f, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 0, 64*1024), consolidateMaxLineBytes)
	skipped := 0
	for scanner.Scan() {
		line := scanner.Bytes()
		var rec struct {
			Idx *int `json:"idx"`
		}
		if err := json.Unmarshal(line, &rec); err != nil || rec.Idx == nil {
			skipped++
			continue
		}
		if _, exists := seen[*rec.Idx]; !exists {
			// scanner.Bytes is only valid until the next Scan, so copy it.
			seen[*rec.Idx] = append(json.RawMessage(nil), line...)
		}
	}
	return skipped, scanner.Err()
}

// consolidateWriteMerged writes the records in seen to path, one per line,
// ordered by ascending idx. Any create, write, flush, or close failure is
// returned so a truncated merged file is never reported as success.
func consolidateWriteMerged(path string, seen map[int]json.RawMessage) (err error) {
	idxs := make([]int, 0, len(seen))
	for idx := range seen {
		idxs = append(idxs, idx)
	}
	sort.Ints(idxs)

	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("create merged file: %w", err)
	}
	defer func() {
		if cerr := f.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("close merged file: %w", cerr)
		}
	}()

	w := bufio.NewWriter(f)
	for _, idx := range idxs {
		if _, werr := w.Write(seen[idx]); werr != nil {
			return fmt.Errorf("write merged file: %w", werr)
		}
		if werr := w.WriteByte('\n'); werr != nil {
			return fmt.Errorf("write merged file: %w", werr)
		}
	}
	if ferr := w.Flush(); ferr != nil {
		return fmt.Errorf("write merged file: %w", ferr)
	}
	return nil
}