package rag import ( "crypto/md5" "fmt" "iter" "path/filepath" "slices" "strings" ) // ChunkConfig holds chunking configuration. type ChunkConfig struct { Size int // Characters per chunk Overlap int // Overlap between chunks } // DefaultChunkConfig returns default chunking configuration. func DefaultChunkConfig() ChunkConfig { return ChunkConfig{ Size: 500, Overlap: 50, } } // Chunk represents a text chunk with metadata. type Chunk struct { Text string Section string Index int } // ChunkMarkdown splits markdown text into chunks by sections and paragraphs. // Preserves context with configurable overlap. When a paragraph exceeds the // configured Size, it is split at sentence boundaries. Overlap is aligned to // word boundaries to avoid splitting mid-word. func ChunkMarkdown(text string, cfg ChunkConfig) []Chunk { return slices.Collect(ChunkMarkdownSeq(text, cfg)) } // ChunkMarkdownSeq returns an iterator that yields document chunks from markdown text. func ChunkMarkdownSeq(text string, cfg ChunkConfig) iter.Seq[Chunk] { if cfg.Size <= 0 { cfg.Size = 500 } if cfg.Overlap < 0 || cfg.Overlap >= cfg.Size { cfg.Overlap = 0 } return func(yield func(Chunk) bool) { chunkIndex := 0 // Split by ## headers for section := range splitBySectionsSeq(text) { section = strings.TrimSpace(section) if section == "" { continue } // Extract section title lines := strings.SplitN(section, "\n", 2) title := "" if strings.HasPrefix(lines[0], "#") { title = strings.TrimLeft(lines[0], "#") title = strings.TrimSpace(title) } // If section is small enough, yield as-is if len(section) <= cfg.Size { if !yield(Chunk{ Text: section, Section: title, Index: chunkIndex, }) { return } chunkIndex++ continue } // Otherwise, chunk by paragraphs currentChunk := "" for para := range splitByParagraphsSeq(section) { para = strings.TrimSpace(para) if para == "" { continue } // If the paragraph itself exceeds Size, split at sentence // boundaries and treat each sentence (or group of sentences) // as a separate sub-paragraph. for sp := range yieldSubParas(para, cfg.Size) { sp = strings.TrimSpace(sp) if sp == "" { continue } if len(currentChunk)+len(sp)+2 <= cfg.Size { if currentChunk != "" { currentChunk += "\n\n" + sp } else { currentChunk = sp } } else { if currentChunk != "" { if !yield(Chunk{ Text: strings.TrimSpace(currentChunk), Section: title, Index: chunkIndex, }) { return } chunkIndex++ } // Start new chunk with overlap from previous, // aligned to the nearest word boundary. currentChunk = overlapPrefix(currentChunk, cfg.Overlap, sp) } } } // Don't forget the last chunk of the section if strings.TrimSpace(currentChunk) != "" { if !yield(Chunk{ Text: strings.TrimSpace(currentChunk), Section: title, Index: chunkIndex, }) { return } chunkIndex++ } } } } func yieldSubParas(para string, size int) iter.Seq[string] { return func(yield func(string) bool) { if len(para) <= size { yield(para) return } for s := range splitBySentencesSeq(para) { if !yield(s) { return } } } } // overlapPrefix builds the start of a new chunk by taking word-boundary-aligned // overlap text from the previous chunk and prepending it to the new paragraph. func overlapPrefix(prevChunk string, overlap int, newPara string) string { if overlap <= 0 { return newPara } runes := []rune(prevChunk) if len(runes) <= overlap { return newPara } // Slice from the end of the previous chunk overlapRunes := runes[len(runes)-overlap:] // Align to the nearest word boundary: find the first space within the // overlap slice and start after it to avoid a partial leading word. overlapText := string(overlapRunes) if idx := strings.IndexByte(overlapText, ' '); idx >= 0 { overlapText = overlapText[idx+1:] } if overlapText == "" { return newPara } return overlapText + "\n\n" + newPara } // splitBySentences splits text at sentence boundaries (". ", "? ", "! "). // Returns the original text in a single-element slice when no boundaries are found. func splitBySentences(text string) []string { return slices.Collect(splitBySentencesSeq(text)) } // splitBySentencesSeq returns an iterator that yields sentences split at // boundaries (". ", "? ", "! "). func splitBySentencesSeq(text string) iter.Seq[string] { return func(yield func(string) bool) { remaining := text for len(remaining) > 0 { // Find the earliest sentence boundary bestIdx := -1 var bestSep string for _, sep := range []string{". ", "? ", "! "} { idx := strings.Index(remaining, sep) if idx >= 0 && (bestIdx < 0 || idx < bestIdx) { bestIdx = idx bestSep = sep } } if bestIdx < 0 { // No more boundaries — yield remainder if not empty if s := strings.TrimSpace(remaining); s != "" { if !yield(s) { return } } break } // Include the punctuation mark in the sentence, but not the trailing space sentence := remaining[:bestIdx+len(bestSep)-1] if s := strings.TrimSpace(sentence); s != "" { if !yield(s) { return } } remaining = remaining[bestIdx+len(bestSep):] } } } // splitBySections splits text by ## headers while preserving the header with its content. func splitBySections(text string) []string { return slices.Collect(splitBySectionsSeq(text)) } // splitBySectionsSeq returns an iterator that yields text sections split by ## headers. func splitBySectionsSeq(text string) iter.Seq[string] { return func(yield func(string) bool) { var currentSection strings.Builder for line := range strings.SplitSeq(text, "\n") { // Check if this line is a ## header if strings.HasPrefix(line, "## ") { // Yield previous section if exists if currentSection.Len() > 0 { if !yield(currentSection.String()) { return } currentSection.Reset() } } currentSection.WriteString(line) currentSection.WriteString("\n") } // Don't forget the last section if currentSection.Len() > 0 { yield(currentSection.String()) } } } // splitByParagraphs splits text by double newlines. func splitByParagraphs(text string) []string { return slices.Collect(splitByParagraphsSeq(text)) } // splitByParagraphsSeq returns an iterator that yields paragraphs split by double newlines. func splitByParagraphsSeq(text string) iter.Seq[string] { return func(yield func(string) bool) { // Replace multiple newlines with a marker, then split normalized := text for strings.Contains(normalized, "\n\n\n") { normalized = strings.ReplaceAll(normalized, "\n\n\n", "\n\n") } for s := range strings.SplitSeq(normalized, "\n\n") { if !yield(s) { return } } } } // Category determines the document category from file path. func Category(path string) string { lower := strings.ToLower(path) switch { case strings.Contains(lower, "flux") || strings.Contains(lower, "ui/component"): return "ui-component" case strings.Contains(lower, "brand") || strings.Contains(lower, "mascot"): return "brand" case strings.Contains(lower, "brief"): return "product-brief" case strings.Contains(lower, "help") || strings.Contains(lower, "draft"): return "help-doc" case strings.Contains(lower, "task") || strings.Contains(lower, "plan"): return "task" case strings.Contains(lower, "architecture") || strings.Contains(lower, "migration"): return "architecture" default: return "documentation" } } // ChunkID generates a unique ID for a chunk. func ChunkID(path string, index int, text string) string { // Use first 100 runes of text for uniqueness (rune-safe for UTF-8) runes := []rune(text) if len(runes) > 100 { runes = runes[:100] } textPart := string(runes) data := fmt.Sprintf("%s:%d:%s", path, index, textPart) hash := md5.Sum([]byte(data)) return fmt.Sprintf("%x", hash) } // FileExtensions returns the file extensions to process. func FileExtensions() []string { return []string{".md", ".markdown", ".txt"} } // ShouldProcess checks if a file should be processed based on extension. func ShouldProcess(path string) bool { ext := strings.ToLower(filepath.Ext(path)) return slices.Contains(FileExtensions(), ext) }