Automated fixes: interface{} → any, range-over-int, t.Context(),
wg.Go(), strings.SplitSeq, strings.Builder, slices.Contains,
maps helpers, min/max builtins.
Co-Authored-By: Virgil <virgil@lethean.io>
261 lines
5.5 KiB
Go
261 lines
5.5 KiB
Go
// dedup-check scans JSONL training files for duplicate prompts.
|
|
// Reports prompts that match exactly after whitespace normalisation, within and across files.
|
|
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// entry is one prompt-bearing record read from an input file, together with
// enough provenance to point the user back at where it came from.
type entry struct {
	// File is the path the record was read from.
	File string
	// Line is the 1-based position of the record in File: the line number
	// for JSONL input, or the element index for JSON-array input.
	Line int
	// SeedID is the record's "seed_id" (or, failing that, "id") value, if any.
	SeedID string
	// Voice is the record's "voice" value, if any.
	Voice string
	// Domain is the record's "domain" value, if any.
	Domain string
	// Prompt is the raw prompt text before normalisation.
	Prompt string
}
|
|
|
|
func main() {
|
|
if len(os.Args) < 2 {
|
|
fmt.Fprintf(os.Stderr, "Usage: dedup-check <dir-or-file> [...]\n")
|
|
fmt.Fprintf(os.Stderr, "\nScans JSONL/JSON files for duplicate prompts.\n")
|
|
fmt.Fprintf(os.Stderr, "Reports exact duplicates and shows which files contain them.\n")
|
|
os.Exit(1)
|
|
}
|
|
|
|
var files []string
|
|
for _, arg := range os.Args[1:] {
|
|
info, err := os.Stat(arg)
|
|
if err != nil {
|
|
log.Printf("skip %s: %v", arg, err)
|
|
continue
|
|
}
|
|
if info.IsDir() {
|
|
filepath.Walk(arg, func(path string, fi os.FileInfo, err error) error {
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
if !fi.IsDir() && (strings.HasSuffix(path, ".jsonl") || strings.HasSuffix(path, ".json")) {
|
|
files = append(files, path)
|
|
}
|
|
return nil
|
|
})
|
|
} else {
|
|
files = append(files, arg)
|
|
}
|
|
}
|
|
|
|
log.Printf("scanning %d files", len(files))
|
|
|
|
// Map: normalised prompt → list of entries.
|
|
exact := make(map[string][]entry)
|
|
total := 0
|
|
|
|
for _, f := range files {
|
|
entries, err := readEntries(f)
|
|
if err != nil {
|
|
log.Printf("skip %s: %v", f, err)
|
|
continue
|
|
}
|
|
for _, e := range entries {
|
|
key := normalise(e.Prompt)
|
|
exact[key] = append(exact[key], e)
|
|
total++
|
|
}
|
|
}
|
|
|
|
// Report duplicates.
|
|
dupeGroups := 0
|
|
dupeEntries := 0
|
|
crossFile := 0
|
|
|
|
for _, entries := range exact {
|
|
if len(entries) < 2 {
|
|
continue
|
|
}
|
|
dupeGroups++
|
|
dupeEntries += len(entries)
|
|
|
|
// Check if duplicates span multiple files.
|
|
fileSet := make(map[string]bool)
|
|
for _, e := range entries {
|
|
fileSet[e.File] = true
|
|
}
|
|
if len(fileSet) > 1 {
|
|
crossFile++
|
|
}
|
|
}
|
|
|
|
fmt.Printf("\n=== Dedup Report ===\n")
|
|
fmt.Printf("Files scanned: %d\n", len(files))
|
|
fmt.Printf("Total prompts: %d\n", total)
|
|
fmt.Printf("Unique prompts: %d\n", len(exact))
|
|
fmt.Printf("Duplicate groups: %d\n", dupeGroups)
|
|
fmt.Printf("Duplicate entries: %d\n", dupeEntries)
|
|
fmt.Printf("Cross-file dupes: %d (same prompt in different files)\n", crossFile)
|
|
|
|
if crossFile > 0 {
|
|
fmt.Printf("\n--- Cross-File Duplicates ---\n")
|
|
shown := 0
|
|
for prompt, entries := range exact {
|
|
if len(entries) < 2 {
|
|
continue
|
|
}
|
|
fileSet := make(map[string]bool)
|
|
for _, e := range entries {
|
|
fileSet[e.File] = true
|
|
}
|
|
if len(fileSet) < 2 {
|
|
continue
|
|
}
|
|
|
|
shown++
|
|
if shown > 50 {
|
|
fmt.Printf("\n... and %d more cross-file groups\n", crossFile-50)
|
|
break
|
|
}
|
|
|
|
preview := prompt
|
|
if len(preview) > 100 {
|
|
preview = preview[:100] + "..."
|
|
}
|
|
fmt.Printf("\n[%d] %q\n", shown, preview)
|
|
for _, e := range entries {
|
|
seedInfo := ""
|
|
if e.SeedID != "" {
|
|
seedInfo = fmt.Sprintf(" seed=%s", e.SeedID)
|
|
}
|
|
if e.Voice != "" {
|
|
seedInfo += fmt.Sprintf(" voice=%s", e.Voice)
|
|
}
|
|
fmt.Printf(" %s:%d%s\n", e.File, e.Line, seedInfo)
|
|
}
|
|
}
|
|
}
|
|
|
|
if dupeGroups > 0 && crossFile == 0 {
|
|
fmt.Printf("\nAll duplicates are within the same file (no cross-file conflicts).\n")
|
|
}
|
|
|
|
if dupeGroups == 0 {
|
|
fmt.Printf("\nNo duplicates found.\n")
|
|
}
|
|
}
|
|
|
|
func readEntries(path string) ([]entry, error) {
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
text := strings.TrimSpace(string(data))
|
|
if text == "" {
|
|
return nil, nil
|
|
}
|
|
|
|
// Try as JSON array first.
|
|
if text[0] == '[' {
|
|
var arr []map[string]any
|
|
if err := json.Unmarshal(data, &arr); err != nil {
|
|
return nil, fmt.Errorf("parse JSON array: %w", err)
|
|
}
|
|
var entries []entry
|
|
for i, obj := range arr {
|
|
prompt := strVal(obj, "prompt")
|
|
if prompt == "" {
|
|
// Try messages format.
|
|
prompt = extractFromMessages(obj)
|
|
}
|
|
if prompt == "" {
|
|
continue
|
|
}
|
|
entries = append(entries, entry{
|
|
File: path,
|
|
Line: i + 1,
|
|
SeedID: strVal(obj, "seed_id", "id"),
|
|
Voice: strVal(obj, "voice"),
|
|
Domain: strVal(obj, "domain"),
|
|
Prompt: prompt,
|
|
})
|
|
}
|
|
return entries, nil
|
|
}
|
|
|
|
// JSONL.
|
|
var entries []entry
|
|
scanner := bufio.NewScanner(strings.NewReader(text))
|
|
scanner.Buffer(make([]byte, 4*1024*1024), 4*1024*1024)
|
|
lineNo := 0
|
|
for scanner.Scan() {
|
|
lineNo++
|
|
line := strings.TrimSpace(scanner.Text())
|
|
if line == "" {
|
|
continue
|
|
}
|
|
var obj map[string]any
|
|
if err := json.Unmarshal([]byte(line), &obj); err != nil {
|
|
continue
|
|
}
|
|
prompt := strVal(obj, "prompt")
|
|
if prompt == "" {
|
|
prompt = extractFromMessages(obj)
|
|
}
|
|
if prompt == "" {
|
|
continue
|
|
}
|
|
entries = append(entries, entry{
|
|
File: path,
|
|
Line: lineNo,
|
|
SeedID: strVal(obj, "seed_id", "id"),
|
|
Voice: strVal(obj, "voice"),
|
|
Domain: strVal(obj, "domain"),
|
|
Prompt: prompt,
|
|
})
|
|
}
|
|
return entries, nil
|
|
}
|
|
|
|
// extractFromMessages returns the content of the first message whose role is
// "user" in obj["messages"]. It returns "" when "messages" is absent or not
// an array, when no element is an object with role "user", or when that
// first user message has no string "content".
func extractFromMessages(obj map[string]any) string {
	list, isList := obj["messages"].([]any)
	if !isList {
		return ""
	}
	for _, item := range list {
		msg, isObj := item.(map[string]any)
		if !isObj {
			continue
		}
		if role, _ := msg["role"].(string); role != "user" {
			continue
		}
		// Only the first user message is considered, even if its content
		// turns out to be missing or not a string.
		content, _ := msg["content"].(string)
		return content
	}
	return ""
}
|
|
|
|
// strVal returns the value of the first key in keys that is present in obj
// and holds a string. Keys that are missing or hold a non-string value are
// passed over; a present empty string is returned as-is. The result is ""
// when no key qualifies.
func strVal(obj map[string]any, keys ...string) string {
	for _, name := range keys {
		// Asserting on a missing key yields (zero, false), so one step
		// covers both "absent" and "wrong type".
		if text, isString := obj[name].(string); isString {
			return text
		}
	}
	return ""
}
|
|
|
|
// normalise produces the comparison key for a prompt: leading and trailing
// whitespace is removed and every internal run of whitespace is collapsed to
// a single space. Case and punctuation are left untouched.
func normalise(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
|