1
0
Fork 0
forked from lthn/LEM
LEM/pkg/lem/parquet.go
Snider 3606ff994b fix: memory, error handling, and signal improvements across pkg/lem
- Stream parquet export rows instead of unbounded memory allocation
- Replace QueryGoldenSet/QueryExpansionPrompts with iter.Seq2 iterators
- Remove legacy runtime.GC() calls from distill (go-mlx handles cleanup)
- Replace log.Fatalf with error return in tier_score.go
- Add SIGINT/SIGTERM signal handling to agent and worker daemon loops
- Add error checks for unchecked db.conn.Exec in import.go and tier_score.go
- Update tests for iterator-based database methods

Co-Authored-By: Gemini <noreply@google.com>
Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-23 04:46:51 +00:00

157 lines
3.7 KiB
Go

package lem
import (
"bufio"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/parquet-go/parquet-go"
)
// ParquetOpts carries the settings for the Parquet export command.
type ParquetOpts struct {
	// Input is the directory containing train.jsonl, valid.jsonl and
	// test.jsonl. It is required.
	Input string
	// Output is the directory that receives the Parquet files. When empty
	// it defaults to the "parquet" subdirectory of Input.
	Output string
}
// ParquetRow is the schema for exported Parquet files.
// Every column is a string; the struct tags give the Parquet column names.
type ParquetRow struct {
// Prompt is set by exportSplitParquet to the first non-empty "user" message content.
Prompt string `parquet:"prompt"`
// Response is set by exportSplitParquet to the first non-empty "assistant" message content.
Response string `parquet:"response"`
// System is set by exportSplitParquet to the first non-empty "system" message content.
System string `parquet:"system"`
// Messages holds the record's full message list re-encoded as a JSON string.
Messages string `parquet:"messages"`
}
// RunParquet implements the parquet CLI command.
//
// For each of the train, valid and test splits it looks for <split>.jsonl
// inside cfg.Input and passes it to exportSplitParquet, which writes
// <split>.parquet into the output directory (cfg.Output, or
// cfg.Input/parquet when cfg.Output is empty). The output directory is
// created if needed. Splits whose JSONL file does not exist are reported and
// skipped.
//
// It returns an error when cfg.Input is empty, when the output directory
// cannot be created, or as soon as one split fails to export; otherwise it
// prints the total number of exported rows and returns nil.
func RunParquet(cfg ParquetOpts) error {
	if cfg.Input == "" {
		return fmt.Errorf("--input is required (directory with JSONL splits)")
	}

	dest := cfg.Output
	if dest == "" {
		dest = filepath.Join(cfg.Input, "parquet")
	}
	if mkErr := os.MkdirAll(dest, 0755); mkErr != nil {
		return fmt.Errorf("create output dir: %w", mkErr)
	}

	fmt.Printf("Exporting Parquet from %s → %s\n", cfg.Input, dest)

	var exported int
	splits := [...]string{"train", "valid", "test"}
	for i := range splits {
		name := splits[i]
		src := filepath.Join(cfg.Input, name+".jsonl")

		// Only a definite "does not exist" skips the split; any other stat
		// failure is left for exportSplitParquet to surface when it opens
		// the file.
		_, statErr := os.Stat(src)
		if os.IsNotExist(statErr) {
			fmt.Printf(" Skip: %s.jsonl not found\n", name)
			continue
		}

		rows, err := exportSplitParquet(src, dest, name)
		if err != nil {
			return fmt.Errorf("export %s: %w", name, err)
		}
		exported += rows
	}

	fmt.Printf("\nTotal: %d rows exported\n", exported)
	return nil
}
// exportSplitParquet converts one JSONL split into <outputDir>/<split>.parquet
// using snappy compression.
//
// Every non-blank line of jsonlPath is decoded as an object with a "messages"
// array. The first non-empty user, assistant and system message contents
// become the prompt, response and system columns, and the whole array is
// re-encoded as JSON into the messages column. Rows are written one at a time
// as they are read. Lines that are not valid JSON are skipped; the number of
// skipped lines is reported once the input has been read.
//
// It returns the number of rows written. When no row was written it prints a
// skip notice and returns 0 with a nil error (the output file has still been
// created). Errors opening, reading, encoding, writing or closing are
// returned wrapped with context, together with the rows written so far.
func exportSplitParquet(jsonlPath, outputDir, split string) (int, error) {
	// Upper bound for a single JSONL line; longer lines make the scanner
	// stop with an error, which is returned below.
	const maxLineBytes = 1024 * 1024

	f, err := os.Open(jsonlPath)
	if err != nil {
		return 0, fmt.Errorf("open %s: %w", jsonlPath, err)
	}
	defer f.Close()

	outPath := filepath.Join(outputDir, split+".parquet")
	out, err := os.Create(outPath)
	if err != nil {
		return 0, fmt.Errorf("create %s: %w", outPath, err)
	}
	// Safety net for early returns. The success path closes out explicitly
	// and checks the error; a second Close on an *os.File only yields an
	// error, which is ignored here.
	defer out.Close()

	writer := parquet.NewGenericWriter[ParquetRow](out,
		parquet.Compression(&parquet.Snappy),
	)
	// Best-effort finalisation on early-return paths only, so the writer is
	// never closed twice.
	writerClosed := false
	defer func() {
		if !writerClosed {
			_ = writer.Close()
		}
	}()

	count, skipped := 0, 0
	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, maxLineBytes), maxLineBytes)
	for scanner.Scan() {
		text := strings.TrimSpace(scanner.Text())
		if text == "" {
			continue
		}

		var data struct {
			Messages []ChatMessage `json:"messages"`
		}
		if err := json.Unmarshal([]byte(text), &data); err != nil {
			// Tolerate malformed lines, but keep track so the user is told.
			skipped++
			continue
		}

		// Only the first non-empty message of each role feeds the flat columns.
		var prompt, response, system string
		for _, m := range data.Messages {
			switch m.Role {
			case "user":
				if prompt == "" {
					prompt = m.Content
				}
			case "assistant":
				if response == "" {
					response = m.Content
				}
			case "system":
				if system == "" {
					system = m.Content
				}
			}
		}

		msgsJSON, err := json.Marshal(data.Messages)
		if err != nil {
			return count, fmt.Errorf("marshal messages: %w", err)
		}

		row := ParquetRow{
			Prompt:   prompt,
			Response: response,
			System:   system,
			Messages: string(msgsJSON),
		}
		if _, err := writer.Write([]ParquetRow{row}); err != nil {
			return count, fmt.Errorf("write parquet row: %w", err)
		}
		count++
	}
	if err := scanner.Err(); err != nil {
		return count, fmt.Errorf("scan %s: %w", jsonlPath, err)
	}

	if skipped > 0 {
		fmt.Printf(" Warning: %s — skipped %d malformed line(s)\n", split, skipped)
	}
	if count == 0 {
		fmt.Printf(" Skip: %s — no data\n", split)
		return 0, nil
	}

	// Mark first so that a failing Close is not retried by the deferred call.
	writerClosed = true
	if err := writer.Close(); err != nil {
		return count, fmt.Errorf("close parquet writer: %w", err)
	}
	if err := out.Close(); err != nil {
		return count, fmt.Errorf("close file: %w", err)
	}

	// The size is informational only: a failed stat must not panic or fail
	// an export that has already been written successfully.
	info, err := os.Stat(outPath)
	if err != nil {
		fmt.Printf(" %s.parquet: %d rows (size unavailable: %v)\n", split, count, err)
		return count, nil
	}
	sizeMB := float64(info.Size()) / 1024 / 1024
	fmt.Printf(" %s.parquet: %d rows (%.1f MB)\n", split, count, sizeMB)
	return count, nil
}