feat(help): add full-text search functionality
Implements #139: full-text search for help topics.

- Add searchIndex with inverted index for fast lookups
- Add tokenize() for case-insensitive word extraction
- Add Search() with relevance ranking:
  - Exact word matches score 1.0
  - Prefix matches score 0.5
  - Title matches get 2.0 boost
- Add snippet extraction for search result context
- Add section-level matching for precise results
- Add comprehensive tests following _Good/_Bad naming

Search features:

- Case-insensitive matching
- Partial word matching (prefix)
- Title boost (matches in title rank higher)
- Section-level results
- Snippet extraction with context

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
df7ff9f128
commit
2b68a26a1b
2 changed files with 542 additions and 0 deletions
277
pkg/help/search.go
Normal file
277
pkg/help/search.go
Normal file
|
|
@ -0,0 +1,277 @@
|
|||
package help
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// SearchResult represents a search match.
|
||||
type SearchResult struct {
|
||||
Topic *Topic
|
||||
Section *Section // nil if topic-level match
|
||||
Score float64
|
||||
Snippet string // Context around match
|
||||
}
|
||||
|
||||
// searchIndex provides full-text search.
|
||||
type searchIndex struct {
|
||||
topics map[string]*Topic // topicID -> Topic
|
||||
index map[string]map[string]bool // word -> set of topicIDs
|
||||
}
|
||||
|
||||
// newSearchIndex creates a new empty search index.
|
||||
func newSearchIndex() *searchIndex {
|
||||
return &searchIndex{
|
||||
topics: make(map[string]*Topic),
|
||||
index: make(map[string]map[string]bool),
|
||||
}
|
||||
}
|
||||
|
||||
// Add indexes a topic for searching.
|
||||
func (i *searchIndex) Add(topic *Topic) {
|
||||
i.topics[topic.ID] = topic
|
||||
|
||||
// Index title words with boost
|
||||
for _, word := range tokenize(topic.Title) {
|
||||
i.addToIndex(word, topic.ID)
|
||||
}
|
||||
|
||||
// Index content words
|
||||
for _, word := range tokenize(topic.Content) {
|
||||
i.addToIndex(word, topic.ID)
|
||||
}
|
||||
|
||||
// Index section titles and content
|
||||
for _, section := range topic.Sections {
|
||||
for _, word := range tokenize(section.Title) {
|
||||
i.addToIndex(word, topic.ID)
|
||||
}
|
||||
for _, word := range tokenize(section.Content) {
|
||||
i.addToIndex(word, topic.ID)
|
||||
}
|
||||
}
|
||||
|
||||
// Index tags
|
||||
for _, tag := range topic.Tags {
|
||||
for _, word := range tokenize(tag) {
|
||||
i.addToIndex(word, topic.ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// addToIndex adds a word-to-topic mapping.
|
||||
func (i *searchIndex) addToIndex(word, topicID string) {
|
||||
if i.index[word] == nil {
|
||||
i.index[word] = make(map[string]bool)
|
||||
}
|
||||
i.index[word][topicID] = true
|
||||
}
|
||||
|
||||
// Search finds topics matching the query.
|
||||
func (i *searchIndex) Search(query string) []*SearchResult {
|
||||
queryWords := tokenize(query)
|
||||
if len(queryWords) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Track scores per topic
|
||||
scores := make(map[string]float64)
|
||||
|
||||
for _, word := range queryWords {
|
||||
// Exact matches
|
||||
if topicIDs, ok := i.index[word]; ok {
|
||||
for topicID := range topicIDs {
|
||||
scores[topicID] += 1.0
|
||||
}
|
||||
}
|
||||
|
||||
// Prefix matches (partial word matching)
|
||||
for indexWord, topicIDs := range i.index {
|
||||
if strings.HasPrefix(indexWord, word) && indexWord != word {
|
||||
for topicID := range topicIDs {
|
||||
scores[topicID] += 0.5 // Lower score for partial matches
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build results with title boost and snippet extraction
|
||||
var results []*SearchResult
|
||||
for topicID, score := range scores {
|
||||
topic := i.topics[topicID]
|
||||
if topic == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Title boost: if query words appear in title
|
||||
titleLower := strings.ToLower(topic.Title)
|
||||
for _, word := range queryWords {
|
||||
if strings.Contains(titleLower, word) {
|
||||
score += 2.0 // Title matches are worth more
|
||||
}
|
||||
}
|
||||
|
||||
// Find matching section and extract snippet
|
||||
section, snippet := i.findBestMatch(topic, queryWords)
|
||||
|
||||
results = append(results, &SearchResult{
|
||||
Topic: topic,
|
||||
Section: section,
|
||||
Score: score,
|
||||
Snippet: snippet,
|
||||
})
|
||||
}
|
||||
|
||||
// Sort by score (highest first)
|
||||
sort.Slice(results, func(a, b int) bool {
|
||||
return results[a].Score > results[b].Score
|
||||
})
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// findBestMatch finds the section with the best match and extracts a snippet.
|
||||
func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section, string) {
|
||||
var bestSection *Section
|
||||
var bestSnippet string
|
||||
bestScore := 0
|
||||
|
||||
// Check topic title
|
||||
titleScore := countMatches(topic.Title, queryWords)
|
||||
if titleScore > 0 {
|
||||
bestSnippet = extractSnippet(topic.Content, queryWords)
|
||||
}
|
||||
|
||||
// Check sections
|
||||
for idx := range topic.Sections {
|
||||
section := &topic.Sections[idx]
|
||||
sectionScore := countMatches(section.Title, queryWords)
|
||||
contentScore := countMatches(section.Content, queryWords)
|
||||
totalScore := sectionScore*2 + contentScore // Title matches worth more
|
||||
|
||||
if totalScore > bestScore {
|
||||
bestScore = totalScore
|
||||
bestSection = section
|
||||
if contentScore > 0 {
|
||||
bestSnippet = extractSnippet(section.Content, queryWords)
|
||||
} else {
|
||||
bestSnippet = extractSnippet(section.Content, nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If no section matched, use topic content
|
||||
if bestSnippet == "" && topic.Content != "" {
|
||||
bestSnippet = extractSnippet(topic.Content, queryWords)
|
||||
}
|
||||
|
||||
return bestSection, bestSnippet
|
||||
}
|
||||
|
||||
// tokenize lowercases text and splits it into words for indexing and
// searching. A word is a maximal run of Unicode letters and digits; every
// other character acts as a separator. Words shorter than two bytes are
// dropped, which filters out single ASCII characters. It returns nil when no
// word qualifies.
func tokenize(text string) []string {
	isSeparator := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsDigit(r)
	}

	var words []string
	for _, field := range strings.FieldsFunc(strings.ToLower(text), isSeparator) {
		if len(field) < 2 {
			continue
		}
		words = append(words, field)
	}
	return words
}
|
||||
|
||||
// countMatches reports how many entries of queryWords occur as a substring of
// the lowercased text. The query words are compared as given, so they are
// expected to be lowercase already.
func countMatches(text string, queryWords []string) int {
	haystack := strings.ToLower(text)

	hits := 0
	for idx := range queryWords {
		if !strings.Contains(haystack, queryWords[idx]) {
			continue
		}
		hits++
	}
	return hits
}
|
||||
|
||||
// extractSnippet returns a short excerpt of content for display next to a
// search result.
//
// With no query words it returns the first non-blank line that does not start
// with "#", cut to 150 characters. Otherwise it locates the earliest
// case-insensitive occurrence of any query word and returns up to 150
// characters starting about 50 characters before it, trimmed to word
// boundaries and marked with "..." where text was cut. If no query word
// occurs, the beginning of content is returned. Empty content yields "".
//
// All positions and lengths are counted in runes rather than bytes.
// Lowercasing can change the byte length of a string, so a byte offset found
// in the lowercased text is not a valid offset into the original text and
// could lead to an out-of-range slice; rune positions stay aligned and never
// split a multi-byte character.
func extractSnippet(content string, queryWords []string) string {
	if content == "" {
		return ""
	}

	const (
		snippetLen = 150 // maximum snippet size, in runes
		leadLen    = 50  // context kept before the match, in runes
	)

	// head returns s cut to snippetLen runes, with an ellipsis when cut.
	head := func(s string) string {
		r := []rune(s)
		if len(r) > snippetLen {
			return string(r[:snippetLen]) + "..."
		}
		return s
	}

	// If no query words, return the first line of real text, skipping blank
	// lines and lines starting with "#".
	if len(queryWords) == 0 {
		for _, line := range strings.Split(content, "\n") {
			line = strings.TrimSpace(line)
			if line == "" || strings.HasPrefix(line, "#") {
				continue
			}
			return head(line)
		}
		return ""
	}

	// Lowercase rune by rune so that position n in the lowered text is the
	// same character as position n in the original text.
	runes := []rune(content)
	lowered := make([]rune, len(runes))
	for idx, r := range runes {
		lowered[idx] = unicode.ToLower(r)
	}
	loweredText := string(lowered)

	// Find the earliest match of any query word, as a rune position.
	matchPos := -1
	for _, word := range queryWords {
		bytePos := strings.Index(loweredText, word)
		if bytePos == -1 {
			continue
		}
		pos := len([]rune(loweredText[:bytePos]))
		if matchPos == -1 || pos < matchPos {
			matchPos = pos
		}
	}

	if matchPos == -1 {
		// No match found, return start of content.
		return head(content)
	}

	// Extract a window around the match.
	start := matchPos - leadLen
	if start < 0 {
		start = 0
	}
	end := start + snippetLen
	if end > len(runes) {
		end = len(runes)
	}
	snippet := string(runes[start:end])

	// Trim to word boundaries on whichever side was cut.
	if start > 0 {
		if idx := strings.Index(snippet, " "); idx != -1 {
			snippet = "..." + snippet[idx+1:]
		}
	}
	if end < len(runes) {
		if idx := strings.LastIndex(snippet, " "); idx != -1 {
			snippet = snippet[:idx] + "..."
		}
	}

	return strings.TrimSpace(snippet)
}
|
||||
265
pkg/help/search_test.go
Normal file
265
pkg/help/search_test.go
Normal file
|
|
@ -0,0 +1,265 @@
|
|||
package help
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestTokenize_Good(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected []string
|
||||
}{
|
||||
{
|
||||
name: "simple words",
|
||||
input: "hello world",
|
||||
expected: []string{"hello", "world"},
|
||||
},
|
||||
{
|
||||
name: "mixed case",
|
||||
input: "Hello World",
|
||||
expected: []string{"hello", "world"},
|
||||
},
|
||||
{
|
||||
name: "with punctuation",
|
||||
input: "Hello, world! How are you?",
|
||||
expected: []string{"hello", "world", "how", "are", "you"},
|
||||
},
|
||||
{
|
||||
name: "single characters filtered",
|
||||
input: "a b c hello d",
|
||||
expected: []string{"hello"},
|
||||
},
|
||||
{
|
||||
name: "numbers included",
|
||||
input: "version 2 release",
|
||||
expected: []string{"version", "release"},
|
||||
},
|
||||
{
|
||||
name: "alphanumeric",
|
||||
input: "v2.0 and config123",
|
||||
expected: []string{"v2", "and", "config123"},
|
||||
},
|
||||
{
|
||||
name: "empty string",
|
||||
input: "",
|
||||
expected: nil,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := tokenize(tt.input)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSearchIndex_Add_Good(t *testing.T) {
|
||||
idx := newSearchIndex()
|
||||
|
||||
topic := &Topic{
|
||||
ID: "getting-started",
|
||||
Title: "Getting Started",
|
||||
Content: "Welcome to the guide.",
|
||||
Tags: []string{"intro", "setup"},
|
||||
Sections: []Section{
|
||||
{ID: "installation", Title: "Installation", Content: "Install the CLI."},
|
||||
},
|
||||
}
|
||||
|
||||
idx.Add(topic)
|
||||
|
||||
// Verify topic is stored
|
||||
assert.NotNil(t, idx.topics["getting-started"])
|
||||
|
||||
// Verify words are indexed
|
||||
assert.Contains(t, idx.index["getting"], "getting-started")
|
||||
assert.Contains(t, idx.index["started"], "getting-started")
|
||||
assert.Contains(t, idx.index["welcome"], "getting-started")
|
||||
assert.Contains(t, idx.index["guide"], "getting-started")
|
||||
assert.Contains(t, idx.index["intro"], "getting-started")
|
||||
assert.Contains(t, idx.index["setup"], "getting-started")
|
||||
assert.Contains(t, idx.index["installation"], "getting-started")
|
||||
assert.Contains(t, idx.index["cli"], "getting-started")
|
||||
}
|
||||
|
||||
func TestSearchIndex_Search_Good(t *testing.T) {
|
||||
idx := newSearchIndex()
|
||||
|
||||
// Add test topics
|
||||
idx.Add(&Topic{
|
||||
ID: "getting-started",
|
||||
Title: "Getting Started",
|
||||
Content: "Welcome to the CLI guide. This covers installation and setup.",
|
||||
Tags: []string{"intro"},
|
||||
})
|
||||
|
||||
idx.Add(&Topic{
|
||||
ID: "configuration",
|
||||
Title: "Configuration",
|
||||
Content: "Configure the CLI using environment variables.",
|
||||
})
|
||||
|
||||
idx.Add(&Topic{
|
||||
ID: "commands",
|
||||
Title: "Commands Reference",
|
||||
Content: "List of all available commands.",
|
||||
})
|
||||
|
||||
t.Run("single word query", func(t *testing.T) {
|
||||
results := idx.Search("configuration")
|
||||
assert.NotEmpty(t, results)
|
||||
assert.Equal(t, "configuration", results[0].Topic.ID)
|
||||
})
|
||||
|
||||
t.Run("multi-word query", func(t *testing.T) {
|
||||
results := idx.Search("cli guide")
|
||||
assert.NotEmpty(t, results)
|
||||
// Should match getting-started (has both "cli" and "guide")
|
||||
assert.Equal(t, "getting-started", results[0].Topic.ID)
|
||||
})
|
||||
|
||||
t.Run("title boost", func(t *testing.T) {
|
||||
results := idx.Search("commands")
|
||||
assert.NotEmpty(t, results)
|
||||
// "commands" appears in title of commands topic
|
||||
assert.Equal(t, "commands", results[0].Topic.ID)
|
||||
})
|
||||
|
||||
t.Run("partial word matching", func(t *testing.T) {
|
||||
results := idx.Search("config")
|
||||
assert.NotEmpty(t, results)
|
||||
// Should match "configuration" and "configure"
|
||||
foundConfig := false
|
||||
for _, r := range results {
|
||||
if r.Topic.ID == "configuration" {
|
||||
foundConfig = true
|
||||
break
|
||||
}
|
||||
}
|
||||
assert.True(t, foundConfig, "Should find configuration topic with prefix match")
|
||||
})
|
||||
|
||||
t.Run("no results", func(t *testing.T) {
|
||||
results := idx.Search("nonexistent")
|
||||
assert.Empty(t, results)
|
||||
})
|
||||
|
||||
t.Run("empty query", func(t *testing.T) {
|
||||
results := idx.Search("")
|
||||
assert.Nil(t, results)
|
||||
})
|
||||
}
|
||||
|
||||
func TestSearchIndex_Search_Good_WithSections(t *testing.T) {
|
||||
idx := newSearchIndex()
|
||||
|
||||
idx.Add(&Topic{
|
||||
ID: "installation",
|
||||
Title: "Installation Guide",
|
||||
Content: "Overview of installation process.",
|
||||
Sections: []Section{
|
||||
{
|
||||
ID: "linux",
|
||||
Title: "Linux Installation",
|
||||
Content: "Run apt-get install core on Debian.",
|
||||
},
|
||||
{
|
||||
ID: "macos",
|
||||
Title: "macOS Installation",
|
||||
Content: "Use brew install core on macOS.",
|
||||
},
|
||||
{
|
||||
ID: "windows",
|
||||
Title: "Windows Installation",
|
||||
Content: "Download the installer from the website.",
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
t.Run("matches section content", func(t *testing.T) {
|
||||
results := idx.Search("debian")
|
||||
assert.NotEmpty(t, results)
|
||||
assert.Equal(t, "installation", results[0].Topic.ID)
|
||||
// Should identify the Linux section as best match
|
||||
if results[0].Section != nil {
|
||||
assert.Equal(t, "linux", results[0].Section.ID)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("matches section title", func(t *testing.T) {
|
||||
results := idx.Search("windows")
|
||||
assert.NotEmpty(t, results)
|
||||
assert.Equal(t, "installation", results[0].Topic.ID)
|
||||
})
|
||||
}
|
||||
|
||||
func TestExtractSnippet_Good(t *testing.T) {
|
||||
content := `This is the first paragraph with some introduction text.
|
||||
|
||||
Here is more content that talks about installation and setup.
|
||||
The installation process is straightforward.
|
||||
|
||||
Finally, some closing remarks about the configuration.`
|
||||
|
||||
t.Run("finds match and extracts context", func(t *testing.T) {
|
||||
snippet := extractSnippet(content, []string{"installation"})
|
||||
assert.Contains(t, snippet, "installation")
|
||||
assert.True(t, len(snippet) <= 200, "Snippet should be reasonably short")
|
||||
})
|
||||
|
||||
t.Run("no query words returns start", func(t *testing.T) {
|
||||
snippet := extractSnippet(content, nil)
|
||||
assert.Contains(t, snippet, "first paragraph")
|
||||
})
|
||||
|
||||
t.Run("empty content", func(t *testing.T) {
|
||||
snippet := extractSnippet("", []string{"test"})
|
||||
assert.Empty(t, snippet)
|
||||
})
|
||||
}
|
||||
|
||||
func TestCountMatches_Good(t *testing.T) {
|
||||
tests := []struct {
|
||||
text string
|
||||
words []string
|
||||
expected int
|
||||
}{
|
||||
{"Hello world", []string{"hello"}, 1},
|
||||
{"Hello world", []string{"hello", "world"}, 2},
|
||||
{"Hello world", []string{"foo", "bar"}, 0},
|
||||
{"The quick brown fox", []string{"quick", "fox", "dog"}, 2},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
result := countMatches(tt.text, tt.words)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSearchResult_Score_Good(t *testing.T) {
|
||||
idx := newSearchIndex()
|
||||
|
||||
// Topic with query word in title should score higher
|
||||
idx.Add(&Topic{
|
||||
ID: "topic-in-title",
|
||||
Title: "Installation Guide",
|
||||
Content: "Some content here.",
|
||||
})
|
||||
|
||||
idx.Add(&Topic{
|
||||
ID: "topic-in-content",
|
||||
Title: "Some Other Topic",
|
||||
Content: "This covers installation steps.",
|
||||
})
|
||||
|
||||
results := idx.Search("installation")
|
||||
assert.Len(t, results, 2)
|
||||
|
||||
// Title match should score higher
|
||||
assert.Equal(t, "topic-in-title", results[0].Topic.ID)
|
||||
assert.Greater(t, results[0].Score, results[1].Score)
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue