go-help/search.go
Snider fc758a832b feat(search): add English stemmer for improved search recall
Implement lightweight Porter-style suffix stripping in stemmer.go covering
plurals (-sses, -ies, -s), verb forms (-ed, -ing, -eed), and derivational
suffixes (-ational, -tional, -fulness, -ness, -ment, -ation, -ously,
-ively, -ably, -ally, -izer, -ingly). Words under 4 chars are unchanged
and results are guaranteed at least 2 chars.

tokenize() now emits both raw and stemmed forms so the index contains both.
Search() distinguishes stem-only matches (scoreStemWord=0.7) from exact
matches (1.0), keeping stemmed results slightly below raw hits.

22 stem unit tests, 5 search integration tests, and BenchmarkStem with
100 words. All existing tests pass with no regressions.

Co-Authored-By: Virgil <virgil@lethean.io>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 08:07:50 +00:00

573 lines
13 KiB
Go

package help
import (
	"regexp"
	"slices"
	"sort"
	"strings"
	"unicode"
)
// Scoring weights for search result ranking. Per-word scores accumulate
// once per matching topic; boosts are added once per topic (or section)
// when the corresponding condition holds.
const (
	scoreExactWord    = 1.0  // Exact word match in the index
	scorePrefixWord   = 0.5  // Prefix/partial word match
	scoreFuzzyWord    = 0.3  // Fuzzy (Levenshtein) match
	scoreStemWord     = 0.7  // Stemmed word match (between exact and prefix)
	scoreTitleBoost   = 10.0 // Query word appears in topic title
	scoreSectionBoost = 5.0  // Query word appears in section title
	scoreTagBoost     = 3.0  // Query word appears in topic tags
	scorePhraseBoost  = 8.0  // Exact phrase match in content
	scoreAllWords     = 2.0  // All query words present (multi-word bonus)
	fuzzyMaxDistance  = 2    // Maximum edit distance for fuzzy matching
)
// SearchResult represents a single search match: the topic it occurred in,
// optionally the specific section, the combined relevance score, and a
// short snippet of surrounding text for display.
type SearchResult struct {
	Topic   *Topic
	Section *Section // nil if topic-level match
	Score   float64  // Combined relevance score (see the scoring constants)
	Snippet string   // Context around the match, with **bold** highlighting
}
// searchIndex provides full-text search over help topics via an inverted
// index mapping each indexed word to the topic IDs that contain it.
type searchIndex struct {
	topics map[string]*Topic   // topicID -> Topic
	index  map[string][]string // word -> []topicID (inverted index)
}
// newSearchIndex returns an empty index, ready to accept topics via Add.
func newSearchIndex() *searchIndex {
	idx := new(searchIndex)
	idx.topics = make(map[string]*Topic)
	idx.index = make(map[string][]string)
	return idx
}
// Add indexes every searchable field of topic — title, body content,
// section titles and contents, and tags — mapping each word to the
// topic's ID.
func (i *searchIndex) Add(topic *Topic) {
	i.topics[topic.ID] = topic

	// indexText tokenizes one field and records every word.
	indexText := func(text string) {
		for _, word := range tokenize(text) {
			i.addToIndex(word, topic.ID)
		}
	}

	indexText(topic.Title)
	indexText(topic.Content)
	for _, section := range topic.Sections {
		indexText(section.Title)
		indexText(section.Content)
	}
	for _, tag := range topic.Tags {
		indexText(tag)
	}
}
// addToIndex records that word appears in the topic with the given ID.
// The mapping is skipped if it is already present, so the posting list
// stays duplicate-free.
func (i *searchIndex) addToIndex(word, topicID string) {
	// slices.Contains replaces the hand-rolled duplicate scan.
	if slices.Contains(i.index[word], topicID) {
		return
	}
	i.index[word] = append(i.index[word], topicID)
}
// Search finds topics matching the query and returns results sorted by
// descending score. Supports:
// - Single and multi-word keyword queries
// - Quoted phrase search (e.g. `"rate limit"`)
// - Fuzzy matching via Levenshtein distance for typo tolerance
// - Prefix matching for partial words
//
// Because tokenize emits both raw and stemmed word forms, stem-derived
// query words are detected and scored at scoreStemWord, below exact hits.
func (i *searchIndex) Search(query string) []*SearchResult {
	// Extract quoted phrases before tokenizing the rest of the query.
	phrases, stripped := extractPhrases(query)
	queryWords := tokenize(stripped)
	if len(queryWords) == 0 && len(phrases) == 0 {
		return nil
	}
	// Accumulated relevance score per topic ID.
	scores := make(map[string]float64)
	// Build set of stemmed query variants for stem-aware scoring.
	// queryWords already contains the stems (tokenize emits them); this
	// set only marks which entries are stems rather than typed words.
	stemmedWords := make(map[string]bool)
	for _, word := range queryWords {
		if s := stem(word); s != word {
			stemmedWords[s] = true
		}
	}
	for _, word := range queryWords {
		isStem := stemmedWords[word]
		// Exact matches — score stems lower than raw words.
		if topicIDs, ok := i.index[word]; ok {
			sc := scoreExactWord
			if isStem {
				sc = scoreStemWord
			}
			for _, topicID := range topicIDs {
				scores[topicID] += sc
			}
		}
		// Prefix matches (partial word matching). Scans the whole index;
		// acceptable for help-sized corpora.
		for indexWord, topicIDs := range i.index {
			if strings.HasPrefix(indexWord, word) && indexWord != word {
				for _, topicID := range topicIDs {
					scores[topicID] += scorePrefixWord
				}
			}
		}
		// Fuzzy matches (Levenshtein distance). Skipped for words under
		// 3 chars, where a distance of 2 would match almost anything.
		if len(word) >= 3 {
			for indexWord, topicIDs := range i.index {
				if indexWord == word {
					continue // Already scored as exact match
				}
				if strings.HasPrefix(indexWord, word) {
					continue // Already scored as prefix match
				}
				dist := levenshtein(word, indexWord)
				if dist > 0 && dist <= fuzzyMaxDistance {
					for _, topicID := range topicIDs {
						scores[topicID] += scoreFuzzyWord
					}
				}
			}
		}
	}
	// Pre-compile case-insensitive regexes once for snippet highlighting.
	var res []*regexp.Regexp
	for _, word := range queryWords {
		if len(word) >= 2 {
			if re, err := regexp.Compile("(?i)" + regexp.QuoteMeta(word)); err == nil {
				res = append(res, re)
			}
		}
	}
	// Also add phrase regexes for highlighting.
	for _, phrase := range phrases {
		if re, err := regexp.Compile("(?i)" + regexp.QuoteMeta(phrase)); err == nil {
			res = append(res, re)
		}
	}
	// Phrase matching: boost topics whose combined title/content text
	// (including all sections) contains the exact phrase, case-insensitively.
	for _, phrase := range phrases {
		phraseLower := strings.ToLower(phrase)
		for topicID, topic := range i.topics {
			text := strings.ToLower(topic.Title + " " + topic.Content)
			for _, section := range topic.Sections {
				text += " " + strings.ToLower(section.Title+" "+section.Content)
			}
			if strings.Contains(text, phraseLower) {
				scores[topicID] += scorePhraseBoost
			}
		}
	}
	// Build results with title/section/tag boosts and snippet extraction.
	var results []*SearchResult
	for topicID, score := range scores {
		topic := i.topics[topicID]
		if topic == nil {
			continue
		}
		// Title boost: applied once if any query word appears in the title.
		titleLower := strings.ToLower(topic.Title)
		titleMatchCount := 0
		for _, word := range queryWords {
			if strings.Contains(titleLower, word) {
				titleMatchCount++
			}
		}
		if titleMatchCount > 0 {
			score += scoreTitleBoost
		}
		// Tag boost: applied once per matching tag (break exits the inner
		// word loop only).
		for _, tag := range topic.Tags {
			tagLower := strings.ToLower(tag)
			for _, word := range queryWords {
				if tagLower == word || strings.Contains(tagLower, word) {
					score += scoreTagBoost
					break
				}
			}
		}
		// Multi-word bonus: if all query words are present in the topic's
		// title+content (sections are not consulted here).
		if len(queryWords) > 1 {
			allPresent := true
			fullText := strings.ToLower(topic.Title + " " + topic.Content)
			for _, word := range queryWords {
				if !strings.Contains(fullText, word) {
					allPresent = false
					break
				}
			}
			if allPresent {
				score += scoreAllWords
			}
		}
		// Find the best-matching section and extract a display snippet.
		section, snippet := i.findBestMatch(topic, queryWords, res)
		// Section title boost: applied once if any query word appears in
		// the chosen section's title.
		if section != nil {
			sectionTitleLower := strings.ToLower(section.Title)
			hasSectionTitleMatch := false
			for _, word := range queryWords {
				if strings.Contains(sectionTitleLower, word) {
					hasSectionTitleMatch = true
					break
				}
			}
			if hasSectionTitleMatch {
				score += scoreSectionBoost
			}
		}
		results = append(results, &SearchResult{
			Topic:   topic,
			Section: section,
			Score:   score,
			Snippet: snippet,
		})
	}
	// Sort by score (highest first); ties break alphabetically by title
	// for deterministic output.
	sort.Slice(results, func(a, b int) bool {
		if results[a].Score != results[b].Score {
			return results[a].Score > results[b].Score
		}
		return results[a].Topic.Title < results[b].Topic.Title
	})
	return results
}
// extractPhrases pulls quoted substrings out of the query. It returns the
// trimmed, non-empty phrases alongside the query text with every quoted
// segment (quotes included) removed.
// For example: `hello "rate limit" world` returns
// phrases=["rate limit"], remaining="hello  world".
func extractPhrases(query string) (phrases []string, remaining string) {
	re := regexp.MustCompile(`"([^"]+)"`)
	for _, m := range re.FindAllStringSubmatch(query, -1) {
		if phrase := strings.TrimSpace(m[1]); phrase != "" {
			phrases = append(phrases, phrase)
		}
	}
	return phrases, re.ReplaceAllString(query, "")
}
// levenshtein computes the edit distance between two strings. Used for
// fuzzy matching to tolerate typos in search queries. Runs in
// O(len(a)*len(b)) time with a single rolling row of memory.
func levenshtein(a, b string) int {
	src := []rune(a)
	dst := []rune(b)
	if len(src) == 0 {
		return len(dst)
	}
	if len(dst) == 0 {
		return len(src)
	}
	// row[j] holds the distance between src[:i] and dst[:j] for the row
	// being filled; diag carries the value row[j] had on the previous row.
	row := make([]int, len(dst)+1)
	for j := range row {
		row[j] = j
	}
	for i, sr := range src {
		diag := row[0]
		row[0] = i + 1
		for j, dr := range dst {
			sub := diag // substitution
			if sr != dr {
				sub++
			}
			diag = row[j+1]
			best := row[j+1] + 1 // deletion
			if ins := row[j] + 1; ins < best {
				best = ins // insertion
			}
			if sub < best {
				best = sub
			}
			row[j+1] = best
		}
	}
	return row[len(dst)]
}
// min3 returns the smallest of three integers.
func min3(a, b, c int) int {
	m := a
	if b < m {
		m = b
	}
	if c < m {
		m = c
	}
	return m
}
// findBestMatch picks the section whose title/content best matches the
// query words (title hits weigh double) and extracts a display snippet.
// Returns (nil, snippet) for topic-level matches.
func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string, res []*regexp.Regexp) (*Section, string) {
	var best *Section
	var snippet string
	topScore := 0
	// A topic-title hit seeds the snippet from the topic body; a matching
	// section may still replace it below.
	if countMatches(topic.Title, queryWords) > 0 {
		snippet = extractSnippet(topic.Content, res)
	}
	// Scan every section, keeping the highest-scoring one.
	for idx := range topic.Sections {
		sec := &topic.Sections[idx]
		titleHits := countMatches(sec.Title, queryWords)
		bodyHits := countMatches(sec.Content, queryWords)
		combined := titleHits*2 + bodyHits // title matches worth more
		if combined <= topScore {
			continue
		}
		topScore = combined
		best = sec
		if bodyHits > 0 {
			snippet = extractSnippet(sec.Content, res)
		} else {
			// Title-only hit: show the section opening without highlighting.
			snippet = extractSnippet(sec.Content, nil)
		}
	}
	// Fall back to the topic content when nothing produced a snippet.
	if snippet == "" && topic.Content != "" {
		snippet = extractSnippet(topic.Content, res)
	}
	return best, snippet
}
// tokenize splits text into lowercase words for indexing and searching.
// Words are maximal runs of letters/digits; anything shorter than two
// bytes is dropped. For each word the stemmed variant is also emitted
// (when it differs) so the index holds both raw and stemmed forms.
func tokenize(text string) []string {
	isSeparator := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsDigit(r)
	}
	var out []string
	for _, w := range strings.FieldsFunc(strings.ToLower(text), isSeparator) {
		if len(w) < 2 {
			continue // too short to be a useful index term
		}
		out = append(out, w)
		if stemmed := stem(w); stemmed != w {
			out = append(out, stemmed)
		}
	}
	return out
}
// countMatches reports how many of the query words occur (as substrings,
// case-insensitively) in text. queryWords are assumed already lowercase.
func countMatches(text string, queryWords []string) int {
	lowered := strings.ToLower(text)
	hits := 0
	for _, q := range queryWords {
		if strings.Contains(lowered, q) {
			hits++
		}
	}
	return hits
}
// extractSnippet extracts a short (~150 rune) snippet around the first
// regex match in content and bold-highlights all matches. With no
// regexes it falls back to the first non-empty, non-heading line.
func extractSnippet(content string, res []*regexp.Regexp) string {
	if content == "" {
		return ""
	}
	const snippetLen = 150 // maximum snippet length in runes
	// If no regexes, return start of content without highlighting.
	if len(res) == 0 {
		lines := strings.Split(content, "\n")
		for _, line := range lines {
			line = strings.TrimSpace(line)
			// Skip blank lines and markdown-style "#" headings.
			if line != "" && !strings.HasPrefix(line, "#") {
				runes := []rune(line)
				if len(runes) > snippetLen {
					return string(runes[:snippetLen]) + "..."
				}
				return line
			}
		}
		return ""
	}
	// Find the earliest match position across all regexes (byte-based).
	matchPos := -1
	for _, re := range res {
		loc := re.FindStringIndex(content)
		if loc != nil && (matchPos == -1 || loc[0] < matchPos) {
			matchPos = loc[0]
		}
	}
	// Convert to runes for safe slicing — byte slicing could split a
	// multi-byte UTF-8 sequence.
	runes := []rune(content)
	runeLen := len(runes)
	var start, end int
	if matchPos == -1 {
		// No match found, use start of content.
		start = 0
		end = snippetLen
		if end > runeLen {
			end = runeLen
		}
	} else {
		// Convert byte position to rune position.
		matchRunePos := len([]rune(content[:matchPos]))
		// Extract snippet around match (rune-based): begin ~50 runes
		// before the match so it appears with leading context.
		start = matchRunePos - 50
		if start < 0 {
			start = 0
		}
		end = start + snippetLen
		if end > runeLen {
			end = runeLen
		}
	}
	snippet := string(runes[start:end])
	// Trim cut edges to word boundaries and mark truncation with "...".
	prefix := ""
	suffix := ""
	if start > 0 {
		// Drop the (likely partial) first word.
		if idx := strings.Index(snippet, " "); idx != -1 {
			snippet = snippet[idx+1:]
			prefix = "..."
		}
	}
	if end < runeLen {
		// Drop the (likely partial) last word.
		if idx := strings.LastIndex(snippet, " "); idx != -1 {
			snippet = snippet[:idx]
			suffix = "..."
		}
	}
	snippet = strings.TrimSpace(snippet)
	if snippet == "" {
		return ""
	}
	// Bold-highlight query matches within the trimmed snippet.
	highlighted := highlight(snippet, res)
	return prefix + highlighted + suffix
}
// highlight wraps matches in **bold**.
func highlight(text string, res []*regexp.Regexp) string {
if len(res) == 0 {
return text
}
type match struct {
start, end int
}
var matches []match
for _, re := range res {
indices := re.FindAllStringIndex(text, -1)
for _, idx := range indices {
matches = append(matches, match{idx[0], idx[1]})
}
}
if len(matches) == 0 {
return text
}
// Sort matches by start position
sort.Slice(matches, func(i, j int) bool {
if matches[i].start != matches[j].start {
return matches[i].start < matches[j].start
}
return matches[i].end > matches[j].end
})
// Merge overlapping or adjacent matches
var merged []match
if len(matches) > 0 {
curr := matches[0]
for i := 1; i < len(matches); i++ {
if matches[i].start <= curr.end {
if matches[i].end > curr.end {
curr.end = matches[i].end
}
} else {
merged = append(merged, curr)
curr = matches[i]
}
}
merged = append(merged, curr)
}
// Build highlighted string from back to front to avoid position shifts
result := text
for i := len(merged) - 1; i >= 0; i-- {
m := merged[i]
result = result[:m.end] + "**" + result[m.end:]
result = result[:m.start] + "**" + result[m.start:]
}
return result
}