go-rag/keyword.go
Snider d8fd067a8c feat: Phase 3 enhancements — sentence splitting, collection helpers, keyword filter, benchmarks
3.1: Sentence-aware chunk splitting at ". ", "? ", "! " boundaries when
paragraphs exceed ChunkConfig.Size. Overlap now aligns to word boundaries
to avoid mid-word splits.

3.2: VectorStore interface gains ListCollections and CollectionInfo methods.
New collections.go with ListCollections, DeleteCollection, CollectionStats
helpers returning backend-agnostic CollectionInfo. Mock updated accordingly.

3.3: KeywordFilter re-ranks QueryResults by boosting scores for keyword
matches (case-insensitive, +10% per keyword). QueryConfig.Keywords flag
enables automatic extraction and filtering.

3.4: Mock-only benchmarks for chunking, query, ingest, formatting, and
keyword filtering.

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-20 08:02:00 +00:00

60 lines
1.6 KiB
Go

package rag
import (
"sort"
"strings"
)
// KeywordFilter re-ranks query results by boosting scores for results whose
// text contains one or more of the given keywords. Matching is
// case-insensitive using strings.Contains. Each distinct keyword that matches
// adds a 10% boost to the original score: score *= 1.0 + 0.1 * matchCount.
//
// Keywords are lowercased before matching; empty keywords are ignored and
// keywords that are equal after lowercasing count only once, so a repeated
// keyword cannot boost the same result several times.
//
// If results or keywords is empty, results is returned as-is. Otherwise a new
// slice is returned, sorted by boosted score descending; the Score values in
// the caller's slice are left untouched. Results whose boosted scores are
// equal keep the relative order they had in the input.
func KeywordFilter(results []QueryResult, keywords []string) []QueryResult {
	if len(keywords) == 0 || len(results) == 0 {
		return results
	}

	// Normalise keywords once: lowercase, drop empties, collapse duplicates.
	lowerKeywords := make([]string, 0, len(keywords))
	seen := make(map[string]struct{}, len(keywords))
	for _, kw := range keywords {
		kw = strings.ToLower(kw)
		if kw == "" {
			continue
		}
		if _, dup := seen[kw]; dup {
			continue
		}
		seen[kw] = struct{}{}
		lowerKeywords = append(lowerKeywords, kw)
	}

	// Work on a copy so the boost does not modify the caller's slice.
	boosted := make([]QueryResult, len(results))
	copy(boosted, results)

	for i := range boosted {
		lowerText := strings.ToLower(boosted[i].Text)
		matchCount := 0
		for _, kw := range lowerKeywords {
			if strings.Contains(lowerText, kw) {
				matchCount++
			}
		}
		if matchCount > 0 {
			boosted[i].Score *= 1.0 + 0.1*float32(matchCount)
		}
	}

	// Stable sort: ties keep their incoming order instead of being shuffled
	// by the sorting algorithm.
	sort.SliceStable(boosted, func(i, j int) bool {
		return boosted[i].Score > boosted[j].Score
	})
	return boosted
}
// extractKeywords splits query text into individual keywords for filtering.
//
// The query is lowercased and split on whitespace. ASCII punctuation is
// trimmed from both ends of each word so that "work?" yields "work";
// punctuation inside a word ("node.js") is kept. Words shorter than 3
// characters (counted as Unicode code points, not bytes) are discarded as
// they tend to be noise. Each keyword is returned once, in order of first
// appearance. The result is nil when no keyword qualifies.
func extractKeywords(query string) []string {
	// Punctuation stripped from the start and end of every word.
	const punctuation = ".,;:!?\"'()[]{}"

	words := strings.Fields(strings.ToLower(query))
	seen := make(map[string]struct{}, len(words))

	var keywords []string
	for _, w := range words {
		w = strings.Trim(w, punctuation)
		// Count runes rather than bytes so multi-byte characters are not
		// mistaken for several characters.
		if len([]rune(w)) < 3 {
			continue
		}
		if _, dup := seen[w]; dup {
			continue
		}
		seen[w] = struct{}{}
		keywords = append(keywords, w)
	}
	return keywords
}