Add ChunkMarkdownSeq, QuerySeq, KeywordFilterSeq, ListCollectionsSeq iterators for streaming. Use slices.SortFunc, slices.Contains, slices.Collect in mock/query/keyword. Range-over-int in benchmarks. Co-Authored-By: Gemini <noreply@google.com> Co-Authored-By: Virgil <virgil@lethean.io>
322 lines · 8.3 KiB · Go
package rag
|
|
|
|
import (
|
|
"crypto/md5"
|
|
"fmt"
|
|
"iter"
|
|
"path/filepath"
|
|
"slices"
|
|
"strings"
|
|
)
|
|
|
|
// ChunkConfig holds chunking configuration.
type ChunkConfig struct {
	Size    int // Characters per chunk; values <= 0 fall back to 500 in ChunkMarkdownSeq
	Overlap int // Overlap between chunks, in characters; clamped to 0 when negative or >= Size
}
|
|
|
|
// DefaultChunkConfig returns default chunking configuration.
|
|
func DefaultChunkConfig() ChunkConfig {
|
|
return ChunkConfig{
|
|
Size: 500,
|
|
Overlap: 50,
|
|
}
|
|
}
|
|
|
|
// Chunk represents a text chunk with metadata.
type Chunk struct {
	Text    string // Chunk body, trimmed of surrounding whitespace
	Section string // Title of the enclosing "##" section; "" when the section has no header
	Index   int    // Zero-based position of this chunk within the source document
}
|
|
|
|
// ChunkMarkdown splits markdown text into chunks by sections and paragraphs.
|
|
// Preserves context with configurable overlap. When a paragraph exceeds the
|
|
// configured Size, it is split at sentence boundaries. Overlap is aligned to
|
|
// word boundaries to avoid splitting mid-word.
|
|
func ChunkMarkdown(text string, cfg ChunkConfig) []Chunk {
|
|
return slices.Collect(ChunkMarkdownSeq(text, cfg))
|
|
}
|
|
|
|
// ChunkMarkdownSeq returns an iterator that yields document chunks from markdown text.
//
// Sections are delimited by "## " headers. A section that fits within
// cfg.Size is yielded as a single chunk; a larger section is packed
// paragraph by paragraph (and, for oversized paragraphs, sentence by
// sentence) into chunks of at most cfg.Size characters, carrying up to
// cfg.Overlap characters of word-aligned context between consecutive
// chunks. Chunk.Index increases monotonically across the whole document.
func ChunkMarkdownSeq(text string, cfg ChunkConfig) iter.Seq[Chunk] {
	// Sanitize configuration: fall back to a 500-char size, and disable
	// overlap when it is negative or at least as large as a whole chunk.
	if cfg.Size <= 0 {
		cfg.Size = 500
	}
	if cfg.Overlap < 0 || cfg.Overlap >= cfg.Size {
		cfg.Overlap = 0
	}

	return func(yield func(Chunk) bool) {
		chunkIndex := 0

		// Split by ## headers
		for section := range splitBySectionsSeq(text) {
			section = strings.TrimSpace(section)
			if section == "" {
				continue
			}

			// Extract section title from a leading header line, if any.
			lines := strings.SplitN(section, "\n", 2)
			title := ""
			if strings.HasPrefix(lines[0], "#") {
				title = strings.TrimLeft(lines[0], "#")
				title = strings.TrimSpace(title)
			}

			// If section is small enough, yield as-is
			if len(section) <= cfg.Size {
				if !yield(Chunk{
					Text:    section,
					Section: title,
					Index:   chunkIndex,
				}) {
					return
				}
				chunkIndex++
				continue
			}

			// Otherwise, chunk by paragraphs
			currentChunk := ""
			for para := range splitByParagraphsSeq(section) {
				para = strings.TrimSpace(para)
				if para == "" {
					continue
				}

				// If the paragraph itself exceeds Size, split at sentence
				// boundaries and treat each sentence (or group of sentences)
				// as a separate sub-paragraph.
				for sp := range yieldSubParas(para, cfg.Size) {
					sp = strings.TrimSpace(sp)
					if sp == "" {
						continue
					}

					// +2 accounts for the "\n\n" separator inserted below.
					if len(currentChunk)+len(sp)+2 <= cfg.Size {
						if currentChunk != "" {
							currentChunk += "\n\n" + sp
						} else {
							currentChunk = sp
						}
					} else {
						// Sub-paragraph does not fit: flush the accumulated
						// chunk before starting a fresh one.
						if currentChunk != "" {
							if !yield(Chunk{
								Text:    strings.TrimSpace(currentChunk),
								Section: title,
								Index:   chunkIndex,
							}) {
								return
							}
							chunkIndex++
						}
						// Start new chunk with overlap from previous,
						// aligned to the nearest word boundary.
						// (currentChunk still holds the just-flushed text.)
						currentChunk = overlapPrefix(currentChunk, cfg.Overlap, sp)
					}
				}
			}

			// Don't forget the last chunk of the section
			if strings.TrimSpace(currentChunk) != "" {
				if !yield(Chunk{
					Text:    strings.TrimSpace(currentChunk),
					Section: title,
					Index:   chunkIndex,
				}) {
					return
				}
				chunkIndex++
			}
		}
	}
}
|
|
|
|
func yieldSubParas(para string, size int) iter.Seq[string] {
|
|
return func(yield func(string) bool) {
|
|
if len(para) <= size {
|
|
yield(para)
|
|
return
|
|
}
|
|
for s := range splitBySentencesSeq(para) {
|
|
if !yield(s) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// overlapPrefix builds the start of a new chunk by taking word-boundary-aligned
// overlap text from the end of the previous chunk and prepending it (separated
// by a blank line) to the new paragraph. It returns newPara unchanged when
// overlap is disabled, the previous chunk is too short, or nothing remains
// after word alignment.
func overlapPrefix(prevChunk string, overlap int, newPara string) string {
	if overlap <= 0 {
		return newPara
	}

	prev := []rune(prevChunk)
	if len(prev) <= overlap {
		return newPara
	}

	// Take the last `overlap` runes, then drop any leading partial word by
	// cutting at the first space.
	tail := string(prev[len(prev)-overlap:])
	if _, after, found := strings.Cut(tail, " "); found {
		tail = after
	}

	if tail == "" {
		return newPara
	}
	return tail + "\n\n" + newPara
}
|
|
|
|
// splitBySentences splits text at sentence boundaries (". ", "? ", "! ").
|
|
// Returns the original text in a single-element slice when no boundaries are found.
|
|
func splitBySentences(text string) []string {
|
|
return slices.Collect(splitBySentencesSeq(text))
|
|
}
|
|
|
|
// splitBySentencesSeq returns an iterator that yields sentences split at
|
|
// boundaries (". ", "? ", "! ").
|
|
func splitBySentencesSeq(text string) iter.Seq[string] {
|
|
return func(yield func(string) bool) {
|
|
remaining := text
|
|
|
|
for len(remaining) > 0 {
|
|
// Find the earliest sentence boundary
|
|
bestIdx := -1
|
|
var bestSep string
|
|
for _, sep := range []string{". ", "? ", "! "} {
|
|
idx := strings.Index(remaining, sep)
|
|
if idx >= 0 && (bestIdx < 0 || idx < bestIdx) {
|
|
bestIdx = idx
|
|
bestSep = sep
|
|
}
|
|
}
|
|
|
|
if bestIdx < 0 {
|
|
// No more boundaries — yield remainder if not empty
|
|
if s := strings.TrimSpace(remaining); s != "" {
|
|
if !yield(s) {
|
|
return
|
|
}
|
|
}
|
|
break
|
|
}
|
|
|
|
// Include the punctuation mark in the sentence, but not the trailing space
|
|
sentence := remaining[:bestIdx+len(bestSep)-1]
|
|
if s := strings.TrimSpace(sentence); s != "" {
|
|
if !yield(s) {
|
|
return
|
|
}
|
|
}
|
|
remaining = remaining[bestIdx+len(bestSep):]
|
|
}
|
|
}
|
|
}
|
|
|
|
// splitBySections splits text by ## headers while preserving the header with its content.
|
|
func splitBySections(text string) []string {
|
|
return slices.Collect(splitBySectionsSeq(text))
|
|
}
|
|
|
|
// splitBySectionsSeq returns an iterator that yields text sections split by ## headers.
|
|
func splitBySectionsSeq(text string) iter.Seq[string] {
|
|
return func(yield func(string) bool) {
|
|
var currentSection strings.Builder
|
|
for line := range strings.SplitSeq(text, "\n") {
|
|
// Check if this line is a ## header
|
|
if strings.HasPrefix(line, "## ") {
|
|
// Yield previous section if exists
|
|
if currentSection.Len() > 0 {
|
|
if !yield(currentSection.String()) {
|
|
return
|
|
}
|
|
currentSection.Reset()
|
|
}
|
|
}
|
|
currentSection.WriteString(line)
|
|
currentSection.WriteString("\n")
|
|
}
|
|
|
|
// Don't forget the last section
|
|
if currentSection.Len() > 0 {
|
|
yield(currentSection.String())
|
|
}
|
|
}
|
|
}
|
|
|
|
// splitByParagraphs splits text by double newlines.
|
|
func splitByParagraphs(text string) []string {
|
|
return slices.Collect(splitByParagraphsSeq(text))
|
|
}
|
|
|
|
// splitByParagraphsSeq returns an iterator that yields paragraphs split by double newlines.
|
|
func splitByParagraphsSeq(text string) iter.Seq[string] {
|
|
return func(yield func(string) bool) {
|
|
// Replace multiple newlines with a marker, then split
|
|
normalized := text
|
|
for strings.Contains(normalized, "\n\n\n") {
|
|
normalized = strings.ReplaceAll(normalized, "\n\n\n", "\n\n")
|
|
}
|
|
for s := range strings.SplitSeq(normalized, "\n\n") {
|
|
if !yield(s) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Category determines the document category from file path. Matching is
// case-insensitive; the first matching rule wins, and paths matching no rule
// fall back to "documentation".
func Category(path string) string {
	p := strings.ToLower(path)
	has := func(sub string) bool { return strings.Contains(p, sub) }

	switch {
	case has("flux"), has("ui/component"):
		return "ui-component"
	case has("brand"), has("mascot"):
		return "brand"
	case has("brief"):
		return "product-brief"
	case has("help"), has("draft"):
		return "help-doc"
	case has("task"), has("plan"):
		return "task"
	case has("architecture"), has("migration"):
		return "architecture"
	}
	return "documentation"
}
|
|
|
|
// ChunkID generates a unique ID for a chunk: the MD5 hex digest of the path,
// chunk index, and the first 100 runes of the chunk text. MD5 is used purely
// as a stable content key, not for security.
func ChunkID(path string, index int, text string) string {
	// Rune-based truncation keeps multi-byte UTF-8 sequences intact.
	prefix := []rune(text)
	if len(prefix) > 100 {
		prefix = prefix[:100]
	}
	sum := md5.Sum([]byte(fmt.Sprintf("%s:%d:%s", path, index, string(prefix))))
	return fmt.Sprintf("%x", sum)
}
|
|
|
|
// FileExtensions returns the file extensions to process. Each entry is
// lowercase and includes the leading dot.
func FileExtensions() []string {
	exts := make([]string, 0, 3)
	return append(exts, ".md", ".markdown", ".txt")
}
|
|
|
|
// ShouldProcess checks if a file should be processed based on extension.
|
|
func ShouldProcess(path string) bool {
|
|
ext := strings.ToLower(filepath.Ext(path))
|
|
return slices.Contains(FileExtensions(), ext)
|
|
}
|