Phase 0: Push test coverage from 92.1% to 100% by adding catalog_test.go and targeted tests for all uncovered branches in search.go. Add BenchmarkSearch with 150 topics (~745us/op baseline). Phase 1: Implement three search improvements: - Levenshtein-based fuzzy matching (max distance 2, words >= 3 chars) - Quoted phrase search via extractPhrases() with +8.0 boost - Tag boost (+3.0) and multi-word bonus (+2.0) scoring - Named scoring constants replacing magic numbers All changes are backward-compatible; Search() signature unchanged. Co-Authored-By: Charon <developers@lethean.io>
549 lines
13 KiB
Go
549 lines
13 KiB
Go
package help
|
|
|
|
import (
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// Scoring weights for search result ranking. The base word-match
// scores (exact/prefix/fuzzy) accumulate once per query word per
// topic; the boost constants are added at most once per topic (tag
// boost: once per matching tag) when the corresponding condition
// holds in Search.
const (
	scoreExactWord    = 1.0  // Exact word match in the index
	scorePrefixWord   = 0.5  // Prefix/partial word match
	scoreFuzzyWord    = 0.3  // Fuzzy (Levenshtein) match
	scoreTitleBoost   = 10.0 // Query word appears in topic title
	scoreSectionBoost = 5.0  // Query word appears in section title
	scoreTagBoost     = 3.0  // Query word appears in topic tags
	scorePhraseBoost  = 8.0  // Exact phrase match in content
	scoreAllWords     = 2.0  // All query words present (multi-word bonus)

	fuzzyMaxDistance = 2 // Maximum edit distance for fuzzy matching
)
|
|
|
|
// SearchResult represents a single search match, ready for display.
type SearchResult struct {
	Topic   *Topic
	Section *Section // nil if the match is topic-level rather than in a specific section
	Score   float64  // Ranking score; higher is a better match
	Snippet string   // Context around the match, with **bold** highlighting applied
}
|
|
|
|
// searchIndex provides full-text search over help topics using a
// simple inverted word index.
type searchIndex struct {
	topics map[string]*Topic   // topicID -> Topic
	index  map[string][]string // word -> IDs of topics containing that word (deduplicated)
}
|
|
|
|
// newSearchIndex creates a new empty search index.
|
|
func newSearchIndex() *searchIndex {
|
|
return &searchIndex{
|
|
topics: make(map[string]*Topic),
|
|
index: make(map[string][]string),
|
|
}
|
|
}
|
|
|
|
// Add indexes a topic for searching.
|
|
func (i *searchIndex) Add(topic *Topic) {
|
|
i.topics[topic.ID] = topic
|
|
|
|
// Index title words with boost
|
|
for _, word := range tokenize(topic.Title) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
|
|
// Index content words
|
|
for _, word := range tokenize(topic.Content) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
|
|
// Index section titles and content
|
|
for _, section := range topic.Sections {
|
|
for _, word := range tokenize(section.Title) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
for _, word := range tokenize(section.Content) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
}
|
|
|
|
// Index tags
|
|
for _, tag := range topic.Tags {
|
|
for _, word := range tokenize(tag) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
}
|
|
}
|
|
|
|
// addToIndex adds a word-to-topic mapping.
|
|
func (i *searchIndex) addToIndex(word, topicID string) {
|
|
// Avoid duplicates
|
|
for _, id := range i.index[word] {
|
|
if id == topicID {
|
|
return
|
|
}
|
|
}
|
|
i.index[word] = append(i.index[word], topicID)
|
|
}
|
|
|
|
// Search finds topics matching the query. Supports:
// - Single and multi-word keyword queries
// - Quoted phrase search (e.g. `"rate limit"`)
// - Fuzzy matching via Levenshtein distance for typo tolerance
// - Prefix matching for partial words
//
// Results are sorted by descending score, with ties broken
// alphabetically by topic title for deterministic output. Returns nil
// when the query contains neither indexable words nor phrases.
func (i *searchIndex) Search(query string) []*SearchResult {
	// Extract quoted phrases before tokenising.
	phrases, stripped := extractPhrases(query)

	queryWords := tokenize(stripped)
	if len(queryWords) == 0 && len(phrases) == 0 {
		return nil
	}

	// Track scores per topic
	scores := make(map[string]float64)

	for _, word := range queryWords {
		// Exact matches
		if topicIDs, ok := i.index[word]; ok {
			for _, topicID := range topicIDs {
				scores[topicID] += scoreExactWord
			}
		}

		// Prefix matches (partial word matching). Note this scans the
		// whole index once per query word — fine for catalog-sized
		// indexes (see BenchmarkSearch baseline).
		for indexWord, topicIDs := range i.index {
			if strings.HasPrefix(indexWord, word) && indexWord != word {
				for _, topicID := range topicIDs {
					scores[topicID] += scorePrefixWord
				}
			}
		}

		// Fuzzy matches (Levenshtein distance). Only attempted for
		// words of at least 3 bytes so very short words don't fuzz
		// into unrelated terms.
		if len(word) >= 3 {
			for indexWord, topicIDs := range i.index {
				if indexWord == word {
					continue // Already scored as exact match
				}
				if strings.HasPrefix(indexWord, word) {
					continue // Already scored as prefix match
				}
				dist := levenshtein(word, indexWord)
				if dist > 0 && dist <= fuzzyMaxDistance {
					for _, topicID := range topicIDs {
						scores[topicID] += scoreFuzzyWord
					}
				}
			}
		}
	}

	// Pre-compile regexes once here so snippet extraction and
	// highlighting below don't recompile per topic.
	var res []*regexp.Regexp
	for _, word := range queryWords {
		if len(word) >= 2 {
			if re, err := regexp.Compile("(?i)" + regexp.QuoteMeta(word)); err == nil {
				res = append(res, re)
			}
		}
	}
	// Also add phrase regexes for highlighting
	for _, phrase := range phrases {
		if re, err := regexp.Compile("(?i)" + regexp.QuoteMeta(phrase)); err == nil {
			res = append(res, re)
		}
	}

	// Phrase matching: boost topics that contain the exact phrase.
	// Checks title, content, and all section titles/contents.
	for _, phrase := range phrases {
		phraseLower := strings.ToLower(phrase)
		for topicID, topic := range i.topics {
			text := strings.ToLower(topic.Title + " " + topic.Content)
			for _, section := range topic.Sections {
				text += " " + strings.ToLower(section.Title+" "+section.Content)
			}
			if strings.Contains(text, phraseLower) {
				scores[topicID] += scorePhraseBoost
			}
		}
	}

	// Build results with title/section/tag boosts and snippet extraction
	var results []*SearchResult
	for topicID, score := range scores {
		topic := i.topics[topicID]
		if topic == nil {
			continue
		}

		// Title boost: applied once if any query word appears in the
		// title (the count is not multiplied into the score).
		titleLower := strings.ToLower(topic.Title)
		titleMatchCount := 0
		for _, word := range queryWords {
			if strings.Contains(titleLower, word) {
				titleMatchCount++
			}
		}
		if titleMatchCount > 0 {
			score += scoreTitleBoost
		}

		// Tag boost: applied once per matching tag (break stops the
		// inner word loop, not the tag loop).
		for _, tag := range topic.Tags {
			tagLower := strings.ToLower(tag)
			for _, word := range queryWords {
				if tagLower == word || strings.Contains(tagLower, word) {
					score += scoreTagBoost
					break
				}
			}
		}

		// Multi-word bonus: if all query words are present in the topic.
		// NOTE(review): only title+content are consulted here, not
		// sections or tags — confirm that is intentional.
		if len(queryWords) > 1 {
			allPresent := true
			fullText := strings.ToLower(topic.Title + " " + topic.Content)
			for _, word := range queryWords {
				if !strings.Contains(fullText, word) {
					allPresent = false
					break
				}
			}
			if allPresent {
				score += scoreAllWords
			}
		}

		// Find matching section and extract snippet
		section, snippet := i.findBestMatch(topic, queryWords, res)

		// Section title boost
		if section != nil {
			sectionTitleLower := strings.ToLower(section.Title)
			hasSectionTitleMatch := false
			for _, word := range queryWords {
				if strings.Contains(sectionTitleLower, word) {
					hasSectionTitleMatch = true
					break
				}
			}
			if hasSectionTitleMatch {
				score += scoreSectionBoost
			}
		}

		results = append(results, &SearchResult{
			Topic:   topic,
			Section: section,
			Score:   score,
			Snippet: snippet,
		})
	}

	// Sort by score (highest first); ties broken by title so output
	// order is stable across runs despite map iteration randomness.
	sort.Slice(results, func(a, b int) bool {
		if results[a].Score != results[b].Score {
			return results[a].Score > results[b].Score
		}
		return results[a].Topic.Title < results[b].Topic.Title
	})

	return results
}
|
|
|
|
// phraseRe matches one double-quoted phrase; the capture group holds
// the text between the quotes. Compiled once at package scope so
// Search does not pay a regexp compilation on every query.
var phraseRe = regexp.MustCompile(`"([^"]+)"`)

// extractPhrases pulls quoted substrings from the query and returns them
// alongside the remaining query text with the quoted parts removed.
// For example: `hello "rate limit" world` returns
// phrases=["rate limit"], remaining="hello  world" (the leftover
// whitespace is collapsed later by tokenize).
//
// Phrases that are empty after trimming whitespace are dropped.
func extractPhrases(query string) (phrases []string, remaining string) {
	for _, m := range phraseRe.FindAllStringSubmatch(query, -1) {
		if phrase := strings.TrimSpace(m[1]); phrase != "" {
			phrases = append(phrases, phrase)
		}
	}
	return phrases, phraseRe.ReplaceAllString(query, "")
}
|
|
|
|
// levenshtein computes the edit distance (insertions, deletions,
// substitutions) between two strings, compared rune-by-rune.
// Used for fuzzy matching to tolerate typos in search queries.
func levenshtein(a, b string) int {
	ra := []rune(a)
	rb := []rune(b)

	if len(ra) == 0 {
		return len(rb)
	}
	if len(rb) == 0 {
		return len(ra)
	}

	// Single rolling row: row[j] holds dist(a[:i], b[:j]) and is
	// updated in place, carrying the diagonal value separately.
	row := make([]int, len(rb)+1)
	for j := range row {
		row[j] = j
	}

	for i, ca := range ra {
		diag := row[0] // dist(a[:i], b[:0]) before overwrite
		row[0] = i + 1
		for j, cb := range rb {
			del := row[j+1] + 1 // delete ca
			ins := row[j] + 1   // insert cb
			sub := diag         // substitute ca -> cb
			if ca != cb {
				sub++
			}
			diag = row[j+1]

			best := del
			if ins < best {
				best = ins
			}
			if sub < best {
				best = sub
			}
			row[j+1] = best
		}
	}

	return row[len(rb)]
}
|
|
|
|
// min3 returns the smallest of three integers.
func min3(a, b, c int) int {
	m := a
	if b < m {
		m = b
	}
	if c < m {
		m = c
	}
	return m
}
|
|
|
|
// findBestMatch finds the section with the best match and extracts a snippet.
|
|
func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string, res []*regexp.Regexp) (*Section, string) {
|
|
var bestSection *Section
|
|
var bestSnippet string
|
|
bestScore := 0
|
|
|
|
// Check topic title
|
|
titleScore := countMatches(topic.Title, queryWords)
|
|
if titleScore > 0 {
|
|
bestSnippet = extractSnippet(topic.Content, res)
|
|
}
|
|
|
|
// Check sections
|
|
for idx := range topic.Sections {
|
|
section := &topic.Sections[idx]
|
|
sectionScore := countMatches(section.Title, queryWords)
|
|
contentScore := countMatches(section.Content, queryWords)
|
|
totalScore := sectionScore*2 + contentScore // Title matches worth more
|
|
|
|
if totalScore > bestScore {
|
|
bestScore = totalScore
|
|
bestSection = section
|
|
if contentScore > 0 {
|
|
bestSnippet = extractSnippet(section.Content, res)
|
|
} else {
|
|
bestSnippet = extractSnippet(section.Content, nil)
|
|
}
|
|
}
|
|
}
|
|
|
|
// If no section matched, use topic content
|
|
if bestSnippet == "" && topic.Content != "" {
|
|
bestSnippet = extractSnippet(topic.Content, res)
|
|
}
|
|
|
|
return bestSection, bestSnippet
|
|
}
|
|
|
|
// tokenize splits text into lowercase words for indexing and searching.
// Runs of letters and digits form words; every other rune is a
// separator. Words shorter than two bytes are dropped.
func tokenize(text string) []string {
	var (
		words []string
		cur   strings.Builder
	)

	// flush appends the word accumulated so far (if long enough) and
	// resets the builder for the next one.
	flush := func() {
		if cur.Len() >= 2 {
			words = append(words, cur.String())
		}
		cur.Reset()
	}

	for _, r := range strings.ToLower(text) {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			cur.WriteRune(r)
			continue
		}
		flush()
	}
	flush() // don't forget the trailing word

	return words
}
|
|
|
|
// countMatches reports how many of queryWords occur, case-insensitively,
// as substrings of text. Each query word counts at most once.
func countMatches(text string, queryWords []string) int {
	haystack := strings.ToLower(text)
	total := 0
	for _, needle := range queryWords {
		if strings.Contains(haystack, needle) {
			total++
		}
	}
	return total
}
|
|
|
|
// extractSnippet extracts a short snippet (up to ~150 runes) around the
// first regexp match in content and highlights matches with **bold**.
// When res is empty it instead returns the first non-empty,
// non-heading line of content, truncated without highlighting.
// Returns "" for empty content or when nothing usable is found.
func extractSnippet(content string, res []*regexp.Regexp) string {
	if content == "" {
		return ""
	}

	const snippetLen = 150

	// If no regexes, return start of content without highlighting
	if len(res) == 0 {
		lines := strings.Split(content, "\n")
		for _, line := range lines {
			line = strings.TrimSpace(line)
			// Skip blank lines and markdown-style headings.
			if line != "" && !strings.HasPrefix(line, "#") {
				runes := []rune(line)
				if len(runes) > snippetLen {
					return string(runes[:snippetLen]) + "..."
				}
				return line
			}
		}
		return ""
	}

	// Find first match position (byte-based)
	matchPos := -1
	for _, re := range res {
		loc := re.FindStringIndex(content)
		if loc != nil && (matchPos == -1 || loc[0] < matchPos) {
			matchPos = loc[0]
		}
	}

	// Convert to runes so slicing cannot split a multi-byte character.
	runes := []rune(content)
	runeLen := len(runes)

	var start, end int
	if matchPos == -1 {
		// No match found, use start of content
		start = 0
		end = snippetLen
		if end > runeLen {
			end = runeLen
		}
	} else {
		// Convert byte position to rune position
		matchRunePos := len([]rune(content[:matchPos]))

		// Extract snippet around match (rune-based): up to 50 runes of
		// leading context, snippetLen runes total.
		start = matchRunePos - 50
		if start < 0 {
			start = 0
		}

		end = start + snippetLen
		if end > runeLen {
			end = runeLen
		}
	}

	snippet := string(runes[start:end])

	// Trim to word boundaries and mark any truncation with "..." so
	// the snippet never starts or ends mid-word.
	prefix := ""
	suffix := ""
	if start > 0 {
		if idx := strings.Index(snippet, " "); idx != -1 {
			snippet = snippet[idx+1:]
			prefix = "..."
		}
	}
	if end < runeLen {
		if idx := strings.LastIndex(snippet, " "); idx != -1 {
			snippet = snippet[:idx]
			suffix = "..."
		}
	}

	snippet = strings.TrimSpace(snippet)
	if snippet == "" {
		return ""
	}

	// Apply highlighting
	highlighted := highlight(snippet, res)

	return prefix + highlighted + suffix
}
|
|
|
|
// highlight wraps every regexp match in text with **…** markers.
// Overlapping and adjacent matches are merged first so markers never
// nest or touch. Returns text unchanged when there are no matches.
func highlight(text string, res []*regexp.Regexp) string {
	if len(res) == 0 {
		return text
	}

	// Collect all match spans as [start, end) byte offsets.
	type span struct{ start, end int }
	var spans []span
	for _, re := range res {
		for _, loc := range re.FindAllStringIndex(text, -1) {
			spans = append(spans, span{loc[0], loc[1]})
		}
	}
	if len(spans) == 0 {
		return text
	}

	// Order by start ascending; longer spans first on ties.
	sort.Slice(spans, func(i, j int) bool {
		if spans[i].start == spans[j].start {
			return spans[i].end > spans[j].end
		}
		return spans[i].start < spans[j].start
	})

	// Coalesce overlapping or touching spans in place.
	merged := spans[:1]
	for _, s := range spans[1:] {
		last := &merged[len(merged)-1]
		if s.start <= last.end {
			if s.end > last.end {
				last.end = s.end
			}
			continue
		}
		merged = append(merged, s)
	}

	// Assemble the result left to right; no position shifting needed
	// because we copy the gaps between spans verbatim.
	var b strings.Builder
	pos := 0
	for _, s := range merged {
		b.WriteString(text[pos:s.start])
		b.WriteString("**")
		b.WriteString(text[s.start:s.end])
		b.WriteString("**")
		pos = s.end
	}
	b.WriteString(text[pos:])
	return b.String()
}
|