1
0
Fork 0
forked from lthn/LEM
LEM/pkg/lem/consolidate.go
Claude 91ee389377
feat: convert all pipeline.py commands to Go
Complete conversion of pipeline.py into Go `lem` CLI:
- import-all: bulk import all LEM data into DuckDB from M3
- consolidate: pull worker JSONLs, merge, deduplicate
- normalize: seeds → deduplicated expansion_prompts table
- approve: filter scored expansions → training JSONL
- tier-score: heuristic/judge tiered expansion scoring
- expand-status: expansion pipeline progress from DuckDB
- inventory: DuckDB table counts and summary
- coverage: seed coverage gap analysis
- seed-influx: bootstrap InfluxDB from DuckDB golden_gen
- query: ad-hoc SQL against DuckDB

22 commands total, 49 Go files. Replaces entire pipeline.py.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 17:12:03 +00:00

139 lines
3.4 KiB
Go

package lem
import (
"bufio"
"encoding/json"
"flag"
"fmt"
"log"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
)
// consolidateMaxLineBytes caps the size of a single JSONL record read by the
// consolidate command. A longer line makes the scan of that file stop with an
// error, which is reported as a warning instead of being dropped silently.
const consolidateMaxLineBytes = 64 * 1024 * 1024

// RunConsolidate is the CLI entry point for the consolidate command.
//
// It lists the JSONL files matching -pattern under -remote on the SSH host
// (default "m3"), copies each one into the output directory with scp, then
// merges every local file matching the pattern, deduplicating on the integer
// "idx" field, and writes the records sorted by idx to a single merged file.
//
// Files are merged in lexical path order and the first record seen for a given
// idx wins. Lines that are not valid JSON or carry no "idx" are counted and
// skipped. Files that cannot be pulled or read produce a warning; failures to
// list the remote files, expand the local pattern or write the merged file
// terminate the process via log.Fatalf.
func RunConsolidate(args []string) {
	fs := flag.NewFlagSet("consolidate", flag.ExitOnError)
	remoteHost := fs.String("host", "m3", "SSH host for remote files")
	remotePath := fs.String("remote", "/Volumes/Data/lem/responses", "Remote directory for JSONL files")
	pattern := fs.String("pattern", "gold*.jsonl", "File glob pattern")
	outputDir := fs.String("output", "", "Output directory (defaults to ./responses)")
	merged := fs.String("merged", "", "Merged output file (defaults to gold-merged.jsonl in the parent of the output dir)")
	if err := fs.Parse(args); err != nil {
		log.Fatalf("parse flags: %v", err)
	}

	if *outputDir == "" {
		*outputDir = "responses"
	}
	// The default merged file lives next to the output directory, not inside
	// it, so it is never picked up by the local glob on a later run.
	if *merged == "" {
		*merged = filepath.Join(*outputDir, "..", "gold-merged.jsonl")
	}
	if err := os.MkdirAll(*outputDir, 0755); err != nil {
		log.Fatalf("create output dir: %v", err)
	}

	// List remote files. The pattern is expanded by the remote shell.
	fmt.Println("Pulling responses from remote...")
	listCmd := exec.Command("ssh", *remoteHost, fmt.Sprintf("ls %s/%s", *remotePath, *pattern))
	listOutput, err := listCmd.Output()
	if err != nil {
		log.Fatalf("list remote files: %v", err)
	}
	var validFiles []string
	for _, f := range strings.Split(string(listOutput), "\n") {
		if f = strings.TrimSpace(f); f != "" {
			validFiles = append(validFiles, f)
		}
	}
	fmt.Printf(" Found %d JSONL files on %s\n", len(validFiles), *remoteHost)

	// Pull files and report how many records each one holds.
	for _, rf := range validFiles {
		name := filepath.Base(rf)
		local := filepath.Join(*outputDir, name)
		scpCmd := exec.Command("scp", fmt.Sprintf("%s:%s", *remoteHost, rf), local)
		if err := scpCmd.Run(); err != nil {
			log.Printf("warning: failed to pull %s: %v", rf, err)
			continue
		}
		lines := 0
		if err := consolidateScanLines(local, func([]byte) { lines++ }); err != nil {
			log.Printf("warning: count records in %s: %v", local, err)
			continue
		}
		fmt.Printf(" %s: %d records\n", name, lines)
	}

	// Merge and deduplicate on idx.
	matches, err := filepath.Glob(filepath.Join(*outputDir, *pattern))
	if err != nil {
		log.Fatalf("glob local files: %v", err)
	}
	sort.Strings(matches)

	seen := make(map[int]json.RawMessage)
	skipped := 0
	for _, local := range matches {
		err := consolidateScanLines(local, func(line []byte) {
			var rec struct {
				Idx *int `json:"idx"`
			}
			if err := json.Unmarshal(line, &rec); err != nil || rec.Idx == nil {
				skipped++
				return
			}
			if _, exists := seen[*rec.Idx]; !exists {
				// Copy: the scanner reuses its buffer on the next line.
				seen[*rec.Idx] = append(json.RawMessage(nil), line...)
			}
		})
		if err != nil {
			log.Printf("warning: read %s: %v", local, err)
		}
	}
	if skipped > 0 {
		fmt.Printf(" Skipped %d records without idx\n", skipped)
	}

	// Sort by idx and write merged file.
	if err := consolidateWriteMerged(*merged, seen); err != nil {
		log.Fatalf("%v", err)
	}
	fmt.Printf("\nMerged: %d unique examples → %s\n", len(seen), *merged)
}

// consolidateScanLines calls fn once per line of the file at path. The slice
// passed to fn is only valid until fn returns. It returns the open error or
// the scanner error (for example a line longer than consolidateMaxLineBytes).
func consolidateScanLines(path string, fn func(line []byte)) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 1024*1024), consolidateMaxLineBytes)
	for scanner.Scan() {
		fn(scanner.Bytes())
	}
	return scanner.Err()
}

// consolidateWriteMerged writes records to path, one per line, in ascending
// key order, and reports any create, write or close failure.
func consolidateWriteMerged(path string, records map[int]json.RawMessage) error {
	idxs := make([]int, 0, len(records))
	for idx := range records {
		idxs = append(idxs, idx)
	}
	sort.Ints(idxs)

	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("create merged file: %w", err)
	}
	w := bufio.NewWriter(f)
	for _, idx := range idxs {
		// bufio.Writer remembers the first write error; Flush reports it.
		w.Write(records[idx])
		w.WriteByte('\n')
	}
	if err := w.Flush(); err != nil {
		f.Close()
		return fmt.Errorf("write merged file: %w", err)
	}
	if err := f.Close(); err != nil {
		return fmt.Errorf("close merged file: %w", err)
	}
	return nil
}