LEM/cmd/dedup-check/main.go
Snider f75458bce6 refactor: apply go fix modernizers for Go 1.26
Automated fixes: interface{} → any, range-over-int, t.Context(),
wg.Go(), strings.SplitSeq, strings.Builder, slices.Contains,
maps helpers, min/max builtins.

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-22 21:00:17 +00:00

261 lines
5.5 KiB
Go

// dedup-check scans JSONL training files for duplicate prompts.
// Reports prompts that are identical after whitespace normalisation, both
// within a single file and across files.
package main
import (
"bufio"
"encoding/json"
"fmt"
"log"
"os"
"path/filepath"
"strings"
)
// entry is one prompt-bearing record together with the place it was found.
type entry struct {
	File   string // path of the file the record was read from
	Line   int    // line number (JSONL) or 1-based element index (JSON array)
	SeedID string // value of "seed_id", falling back to "id"; empty if neither is a string
	Voice  string // value of the "voice" field; empty if absent
	Domain string // value of the "domain" field; empty if absent
	Prompt string // prompt text as found in the record, before normalisation
}
// main scans the files named on the command line and prints a duplicate-prompt
// report to stdout.
//
// Each argument is either a file, which is read as-is, or a directory, which is
// walked recursively for *.jsonl and *.json files. Prompts are grouped by their
// normalised text; a group with two or more entries counts as a duplicate, and
// groups whose entries come from more than one file are listed individually, in
// the order their prompt was first seen, up to maxShown groups. Paths and files
// that cannot be read are logged and skipped rather than aborting the run. With
// no arguments, usage is printed to stderr and the process exits with status 1.
func main() {
	// Display limits for the cross-file listing.
	const (
		maxShown     = 50  // cross-file groups listed before summarising the rest
		previewRunes = 100 // runes of each prompt shown in the listing
	)

	if len(os.Args) < 2 {
		fmt.Fprintf(os.Stderr, "Usage: dedup-check <dir-or-file> [...]\n")
		fmt.Fprintf(os.Stderr, "\nScans JSONL/JSON files for duplicate prompts.\n")
		fmt.Fprintf(os.Stderr, "Reports exact duplicates and shows which files contain them.\n")
		os.Exit(1)
	}

	files := collectFiles(os.Args[1:])
	log.Printf("scanning %d files", len(files))

	// Map: normalised prompt → list of entries. order remembers the keys in
	// first-seen order: map iteration order is random, and the report (including
	// which groups fall under the display cap) should be the same on every run.
	exact := make(map[string][]entry)
	var order []string
	total := 0
	for _, f := range files {
		entries, err := readEntries(f)
		if err != nil {
			log.Printf("skip %s: %v", f, err)
			continue
		}
		for _, e := range entries {
			key := normalise(e.Prompt)
			if _, seen := exact[key]; !seen {
				order = append(order, key)
			}
			exact[key] = append(exact[key], e)
			total++
		}
	}

	// Tally duplicate groups and remember which of them span several files.
	dupeGroups := 0
	dupeEntries := 0
	var crossFileKeys []string
	for _, key := range order {
		entries := exact[key]
		if len(entries) < 2 {
			continue
		}
		dupeGroups++
		dupeEntries += len(entries)
		if spansFiles(entries) {
			crossFileKeys = append(crossFileKeys, key)
		}
	}
	crossFile := len(crossFileKeys)

	fmt.Printf("\n=== Dedup Report ===\n")
	fmt.Printf("Files scanned: %d\n", len(files))
	fmt.Printf("Total prompts: %d\n", total)
	fmt.Printf("Unique prompts: %d\n", len(exact))
	fmt.Printf("Duplicate groups: %d\n", dupeGroups)
	fmt.Printf("Duplicate entries: %d\n", dupeEntries)
	fmt.Printf("Cross-file dupes: %d (same prompt in different files)\n", crossFile)

	if crossFile > 0 {
		fmt.Printf("\n--- Cross-File Duplicates ---\n")
		for i, key := range crossFileKeys {
			if i == maxShown {
				fmt.Printf("\n... and %d more cross-file groups\n", crossFile-maxShown)
				break
			}
			fmt.Printf("\n[%d] %q\n", i+1, truncateRunes(key, previewRunes))
			for _, e := range exact[key] {
				seedInfo := ""
				if e.SeedID != "" {
					seedInfo = fmt.Sprintf(" seed=%s", e.SeedID)
				}
				if e.Voice != "" {
					seedInfo += fmt.Sprintf(" voice=%s", e.Voice)
				}
				fmt.Printf(" %s:%d%s\n", e.File, e.Line, seedInfo)
			}
		}
	}

	if dupeGroups > 0 && crossFile == 0 {
		fmt.Printf("\nAll duplicates are within the same file (no cross-file conflicts).\n")
	}
	if dupeGroups == 0 {
		fmt.Printf("\nNo duplicates found.\n")
	}
}

// collectFiles expands the command-line arguments into the list of files to
// scan. Directories are walked recursively for *.jsonl and *.json files; any
// other argument is taken as a file regardless of its extension. Paths that
// cannot be inspected are logged and skipped. A file reachable through more
// than one argument is listed once, because scanning it twice would report
// every one of its prompts as a duplicate of itself.
func collectFiles(args []string) []string {
	var files []string
	seen := make(map[string]struct{})
	add := func(path string) {
		// Compare cleaned paths so "./dir/a.jsonl" and "dir/a.jsonl" coincide.
		key := filepath.Clean(path)
		if _, dup := seen[key]; dup {
			return
		}
		seen[key] = struct{}{}
		files = append(files, path)
	}

	for _, arg := range args {
		info, err := os.Stat(arg)
		if err != nil {
			log.Printf("skip %s: %v", arg, err)
			continue
		}
		if !info.IsDir() {
			add(arg)
			continue
		}
		err = filepath.Walk(arg, func(path string, fi os.FileInfo, err error) error {
			if err != nil {
				// Best effort: report the unreadable path and keep walking.
				log.Printf("skip %s: %v", path, err)
				return nil
			}
			if !fi.IsDir() && (strings.HasSuffix(path, ".jsonl") || strings.HasSuffix(path, ".json")) {
				add(path)
			}
			return nil
		})
		if err != nil {
			log.Printf("skip %s: %v", arg, err)
		}
	}
	return files
}

// spansFiles reports whether the entries come from at least two different files.
func spansFiles(entries []entry) bool {
	for i := 1; i < len(entries); i++ {
		if entries[i].File != entries[0].File {
			return true
		}
	}
	return false
}

// truncateRunes returns s unchanged when it holds at most limit runes, and
// otherwise its first limit runes followed by "...". Cutting on a rune boundary
// keeps multi-byte characters intact in the preview.
func truncateRunes(s string, limit int) string {
	count := 0
	for i := range s {
		if count == limit {
			return s[:i] + "..."
		}
		count++
	}
	return s
}
// readEntries loads every prompt-bearing record from the file at path.
//
// Two layouts are accepted: a JSON array of objects (chosen when the first
// non-whitespace byte is '[') and JSONL with one object per line. For arrays,
// entry.Line is the 1-based element index; for JSONL it is the 1-based line
// number in the file as stored on disk. Records without a prompt are dropped,
// and JSONL lines that do not decode as a JSON object are skipped.
//
// An empty or whitespace-only file yields (nil, nil). An error is returned when
// the file cannot be read, when an array fails to parse, or when scanning the
// lines fails (for example a line longer than the 4 MiB limit); in that case no
// entries are returned, so a partially read file is never mistaken for a
// complete one.
func readEntries(path string) ([]entry, error) {
	// Longest JSONL line the scanner will accept.
	const maxLineBytes = 4 * 1024 * 1024

	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	raw := string(data)
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil, nil
	}

	// Try as JSON array first.
	if trimmed[0] == '[' {
		var arr []map[string]any
		if err := json.Unmarshal(data, &arr); err != nil {
			return nil, fmt.Errorf("parse JSON array: %w", err)
		}
		var entries []entry
		for i, obj := range arr {
			if e, ok := entryFromObject(path, i+1, obj); ok {
				entries = append(entries, e)
			}
		}
		return entries, nil
	}

	// JSONL. Scan the untrimmed text so that line numbers match the file even
	// when it starts with blank lines.
	var entries []entry
	scanner := bufio.NewScanner(strings.NewReader(raw))
	scanner.Buffer(make([]byte, maxLineBytes), maxLineBytes)
	lineNo := 0
	for scanner.Scan() {
		lineNo++
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		var obj map[string]any
		if err := json.Unmarshal([]byte(line), &obj); err != nil {
			// Best effort: a malformed line does not invalidate the rest.
			continue
		}
		if e, ok := entryFromObject(path, lineNo, obj); ok {
			entries = append(entries, e)
		}
	}
	// Scan stops silently on an over-long line; surface that instead of
	// returning whatever happened to precede it.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("scan JSONL: %w", err)
	}
	return entries, nil
}

// entryFromObject builds the entry for one decoded record found at position pos
// of the file at path. The prompt comes from the "prompt" field or, failing
// that, from the messages format. The second result is false when the record
// carries no prompt.
func entryFromObject(path string, pos int, obj map[string]any) (entry, bool) {
	prompt := strVal(obj, "prompt")
	if prompt == "" {
		// Try messages format.
		prompt = extractFromMessages(obj)
	}
	if prompt == "" {
		return entry{}, false
	}
	return entry{
		File:   path,
		Line:   pos,
		SeedID: strVal(obj, "seed_id", "id"),
		Voice:  strVal(obj, "voice"),
		Domain: strVal(obj, "domain"),
		Prompt: prompt,
	}, true
}
// extractFromMessages returns the content of the first message whose role is
// "user" in a record that uses the messages training format. It returns ""
// when "messages" is missing or is not an array, when no element is an object
// with role "user", or when that message's content is not a string.
func extractFromMessages(obj map[string]any) string {
	// A missing key yields a nil value, which fails the assertion just like a
	// value of the wrong type does.
	messages, isList := obj["messages"].([]any)
	if !isList {
		return ""
	}
	for _, item := range messages {
		if msg, isObject := item.(map[string]any); isObject && strVal(msg, "role") == "user" {
			return strVal(msg, "content")
		}
	}
	return ""
}
// strVal looks up keys in obj, in the order given, and returns the first value
// that is a string. Keys that are absent or hold a non-string value are passed
// over; "" is returned when none of the keys yields a string.
func strVal(obj map[string]any, keys ...string) string {
	for i := range keys {
		// An absent key reads as a nil value, which is not a string either.
		if text, isString := obj[keys[i]].(string); isString {
			return text
		}
	}
	return ""
}
// normalise produces the comparison key for a prompt: leading and trailing
// whitespace is removed and every interior run of whitespace becomes a single
// space, so prompts that differ only in spacing compare equal.
func normalise(s string) string {
	var key strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			key.WriteByte(' ')
		}
		key.WriteString(word)
	}
	return key.String()
}