From 3724228e190455f12248aa88b7a16fbe87ebf91b Mon Sep 17 00:00:00 2001 From: Snider Date: Sun, 1 Feb 2026 23:30:30 +0000 Subject: [PATCH] feat(help): add full-text search functionality Implements #139: full-text search for help topics. - Add searchIndex with inverted index for fast lookups - Add tokenize() for case-insensitive word extraction - Add Search() with relevance ranking: - Exact word matches score 1.0 - Prefix matches score 0.5 - Title matches get 2.0 boost - Add snippet extraction for search result context - Add section-level matching for precise results - Add comprehensive tests following _Good/_Bad naming Search features: - Case-insensitive matching - Partial word matching (prefix) - Title boost (matches in title rank higher) - Section-level results - Snippet extraction with context Co-Authored-By: Claude Opus 4.5 --- pkg/help/search.go | 277 ++++++++++++++++++++++++++++++++++++++++ pkg/help/search_test.go | 265 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 542 insertions(+) create mode 100644 pkg/help/search.go create mode 100644 pkg/help/search_test.go diff --git a/pkg/help/search.go b/pkg/help/search.go new file mode 100644 index 00000000..73f226a9 --- /dev/null +++ b/pkg/help/search.go @@ -0,0 +1,277 @@ +package help + +import ( + "sort" + "strings" + "unicode" +) + +// SearchResult represents a single search hit: the matched topic, the best-matching section (if any), a relevance score and a context snippet. +type SearchResult struct { + Topic *Topic + Section *Section // nil if topic-level match + Score float64 + Snippet string // Context around the match +} + +// searchIndex provides full-text search over topics using an inverted index from word to topic IDs. +type searchIndex struct { + topics map[string]*Topic // topicID -> Topic + index map[string]map[string]bool // word -> set of topicIDs +} + +// newSearchIndex creates a new empty search index with both maps allocated, ready for Add. +func newSearchIndex() *searchIndex { + return &searchIndex{ + topics: make(map[string]*Topic), + index: make(map[string]map[string]bool), + } +} + +// Add indexes a topic for searching.
+func (i *searchIndex) Add(topic *Topic) { + i.topics[topic.ID] = topic + + // Index title words with boost + for _, word := range tokenize(topic.Title) { + i.addToIndex(word, topic.ID) + } + + // Index content words + for _, word := range tokenize(topic.Content) { + i.addToIndex(word, topic.ID) + } + + // Index section titles and content + for _, section := range topic.Sections { + for _, word := range tokenize(section.Title) { + i.addToIndex(word, topic.ID) + } + for _, word := range tokenize(section.Content) { + i.addToIndex(word, topic.ID) + } + } + + // Index tags + for _, tag := range topic.Tags { + for _, word := range tokenize(tag) { + i.addToIndex(word, topic.ID) + } + } +} + +// addToIndex adds a word-to-topic mapping. +func (i *searchIndex) addToIndex(word, topicID string) { + if i.index[word] == nil { + i.index[word] = make(map[string]bool) + } + i.index[word][topicID] = true +} + +// Search finds topics matching the query. +func (i *searchIndex) Search(query string) []*SearchResult { + queryWords := tokenize(query) + if len(queryWords) == 0 { + return nil + } + + // Track scores per topic + scores := make(map[string]float64) + + for _, word := range queryWords { + // Exact matches + if topicIDs, ok := i.index[word]; ok { + for topicID := range topicIDs { + scores[topicID] += 1.0 + } + } + + // Prefix matches (partial word matching) + for indexWord, topicIDs := range i.index { + if strings.HasPrefix(indexWord, word) && indexWord != word { + for topicID := range topicIDs { + scores[topicID] += 0.5 // Lower score for partial matches + } + } + } + } + + // Build results with title boost and snippet extraction + var results []*SearchResult + for topicID, score := range scores { + topic := i.topics[topicID] + if topic == nil { + continue + } + + // Title boost: if query words appear in title + titleLower := strings.ToLower(topic.Title) + for _, word := range queryWords { + if strings.Contains(titleLower, word) { + score += 2.0 // Title matches are worth more 
+ } + } + + // Find matching section and extract snippet + section, snippet := i.findBestMatch(topic, queryWords) + + results = append(results, &SearchResult{ + Topic: topic, + Section: section, + Score: score, + Snippet: snippet, + }) + } + + // Sort by score (highest first) + sort.Slice(results, func(a, b int) bool { + return results[a].Score > results[b].Score + }) + + return results +} + +// findBestMatch finds the section with the best match and extracts a snippet. +func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section, string) { + var bestSection *Section + var bestSnippet string + bestScore := 0 + + // Check topic title + titleScore := countMatches(topic.Title, queryWords) + if titleScore > 0 { + bestSnippet = extractSnippet(topic.Content, queryWords) + } + + // Check sections + for idx := range topic.Sections { + section := &topic.Sections[idx] + sectionScore := countMatches(section.Title, queryWords) + contentScore := countMatches(section.Content, queryWords) + totalScore := sectionScore*2 + contentScore // Title matches worth more + + if totalScore > bestScore { + bestScore = totalScore + bestSection = section + if contentScore > 0 { + bestSnippet = extractSnippet(section.Content, queryWords) + } else { + bestSnippet = extractSnippet(section.Content, nil) + } + } + } + + // If no section matched, use topic content + if bestSnippet == "" && topic.Content != "" { + bestSnippet = extractSnippet(topic.Content, queryWords) + } + + return bestSection, bestSnippet +} + +// tokenize splits text into lowercase words for indexing/searching. 
+func tokenize(text string) []string { + text = strings.ToLower(text) + var words []string + var word strings.Builder + + for _, r := range text { + if unicode.IsLetter(r) || unicode.IsDigit(r) { + word.WriteRune(r) + } else if word.Len() > 0 { + w := word.String() + if len(w) >= 2 { // Skip single-character words + words = append(words, w) + } + word.Reset() + } + } + + // Don't forget the last word + if word.Len() >= 2 { + words = append(words, word.String()) + } + + return words +} + +// countMatches counts how many query words appear in the text. +func countMatches(text string, queryWords []string) int { + textLower := strings.ToLower(text) + count := 0 + for _, word := range queryWords { + if strings.Contains(textLower, word) { + count++ + } + } + return count +} + +// extractSnippet extracts a short snippet around the first match. +func extractSnippet(content string, queryWords []string) string { + if content == "" { + return "" + } + + const snippetLen = 150 + + // If no query words, return start of content + if len(queryWords) == 0 { + lines := strings.Split(content, "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if line != "" && !strings.HasPrefix(line, "#") { + if len(line) > snippetLen { + return line[:snippetLen] + "..." + } + return line + } + } + return "" + } + + // Find first match position + contentLower := strings.ToLower(content) + matchPos := -1 + for _, word := range queryWords { + pos := strings.Index(contentLower, word) + if pos != -1 && (matchPos == -1 || pos < matchPos) { + matchPos = pos + } + } + + if matchPos == -1 { + // No match found, return start of content + if len(content) > snippetLen { + return content[:snippetLen] + "..." 
+ } + return content + } + + // Extract snippet around match + start := matchPos - 50 + if start < 0 { + start = 0 + } + + end := start + snippetLen + if end > len(content) { + end = len(content) + } + + snippet := content[start:end] + + // Trim to word boundaries + if start > 0 { + if idx := strings.Index(snippet, " "); idx != -1 { + snippet = "..." + snippet[idx+1:] + } + } + if end < len(content) { + if idx := strings.LastIndex(snippet, " "); idx != -1 { + snippet = snippet[:idx] + "..." + } + } + + return strings.TrimSpace(snippet) +} diff --git a/pkg/help/search_test.go b/pkg/help/search_test.go new file mode 100644 index 00000000..bbe35cd6 --- /dev/null +++ b/pkg/help/search_test.go @@ -0,0 +1,265 @@ +package help + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestTokenize_Good(t *testing.T) { + tests := []struct { + name string + input string + expected []string + }{ + { + name: "simple words", + input: "hello world", + expected: []string{"hello", "world"}, + }, + { + name: "mixed case", + input: "Hello World", + expected: []string{"hello", "world"}, + }, + { + name: "with punctuation", + input: "Hello, world! 
How are you?", + expected: []string{"hello", "world", "how", "are", "you"}, + }, + { + name: "single characters filtered", + input: "a b c hello d", + expected: []string{"hello"}, + }, + { + name: "numbers included", + input: "version 2 release", + expected: []string{"version", "release"}, + }, + { + name: "alphanumeric", + input: "v2.0 and config123", + expected: []string{"v2", "and", "config123"}, + }, + { + name: "empty string", + input: "", + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tokenize(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestSearchIndex_Add_Good(t *testing.T) { + idx := newSearchIndex() + + topic := &Topic{ + ID: "getting-started", + Title: "Getting Started", + Content: "Welcome to the guide.", + Tags: []string{"intro", "setup"}, + Sections: []Section{ + {ID: "installation", Title: "Installation", Content: "Install the CLI."}, + }, + } + + idx.Add(topic) + + // Verify topic is stored + assert.NotNil(t, idx.topics["getting-started"]) + + // Verify words are indexed + assert.Contains(t, idx.index["getting"], "getting-started") + assert.Contains(t, idx.index["started"], "getting-started") + assert.Contains(t, idx.index["welcome"], "getting-started") + assert.Contains(t, idx.index["guide"], "getting-started") + assert.Contains(t, idx.index["intro"], "getting-started") + assert.Contains(t, idx.index["setup"], "getting-started") + assert.Contains(t, idx.index["installation"], "getting-started") + assert.Contains(t, idx.index["cli"], "getting-started") +} + +func TestSearchIndex_Search_Good(t *testing.T) { + idx := newSearchIndex() + + // Add test topics + idx.Add(&Topic{ + ID: "getting-started", + Title: "Getting Started", + Content: "Welcome to the CLI guide. 
This covers installation and setup.", + Tags: []string{"intro"}, + }) + + idx.Add(&Topic{ + ID: "configuration", + Title: "Configuration", + Content: "Configure the CLI using environment variables.", + }) + + idx.Add(&Topic{ + ID: "commands", + Title: "Commands Reference", + Content: "List of all available commands.", + }) + + t.Run("single word query", func(t *testing.T) { + results := idx.Search("configuration") + assert.NotEmpty(t, results) + assert.Equal(t, "configuration", results[0].Topic.ID) + }) + + t.Run("multi-word query", func(t *testing.T) { + results := idx.Search("cli guide") + assert.NotEmpty(t, results) + // Should match getting-started (has both "cli" and "guide") + assert.Equal(t, "getting-started", results[0].Topic.ID) + }) + + t.Run("title boost", func(t *testing.T) { + results := idx.Search("commands") + assert.NotEmpty(t, results) + // "commands" appears in title of commands topic + assert.Equal(t, "commands", results[0].Topic.ID) + }) + + t.Run("partial word matching", func(t *testing.T) { + results := idx.Search("config") + assert.NotEmpty(t, results) + // Should match "configuration" and "configure" + foundConfig := false + for _, r := range results { + if r.Topic.ID == "configuration" { + foundConfig = true + break + } + } + assert.True(t, foundConfig, "Should find configuration topic with prefix match") + }) + + t.Run("no results", func(t *testing.T) { + results := idx.Search("nonexistent") + assert.Empty(t, results) + }) + + t.Run("empty query", func(t *testing.T) { + results := idx.Search("") + assert.Nil(t, results) + }) +} + +func TestSearchIndex_Search_Good_WithSections(t *testing.T) { + idx := newSearchIndex() + + idx.Add(&Topic{ + ID: "installation", + Title: "Installation Guide", + Content: "Overview of installation process.", + Sections: []Section{ + { + ID: "linux", + Title: "Linux Installation", + Content: "Run apt-get install core on Debian.", + }, + { + ID: "macos", + Title: "macOS Installation", + Content: "Use brew install 
core on macOS.", + }, + { + ID: "windows", + Title: "Windows Installation", + Content: "Download the installer from the website.", + }, + }, + }) + + t.Run("matches section content", func(t *testing.T) { + results := idx.Search("debian") + assert.NotEmpty(t, results) + assert.Equal(t, "installation", results[0].Topic.ID) + // Should identify the Linux section as best match + if results[0].Section != nil { + assert.Equal(t, "linux", results[0].Section.ID) + } + }) + + t.Run("matches section title", func(t *testing.T) { + results := idx.Search("windows") + assert.NotEmpty(t, results) + assert.Equal(t, "installation", results[0].Topic.ID) + }) +} + +func TestExtractSnippet_Good(t *testing.T) { + content := `This is the first paragraph with some introduction text. + +Here is more content that talks about installation and setup. +The installation process is straightforward. + +Finally, some closing remarks about the configuration.` + + t.Run("finds match and extracts context", func(t *testing.T) { + snippet := extractSnippet(content, []string{"installation"}) + assert.Contains(t, snippet, "installation") + assert.True(t, len(snippet) <= 200, "Snippet should be reasonably short") + }) + + t.Run("no query words returns start", func(t *testing.T) { + snippet := extractSnippet(content, nil) + assert.Contains(t, snippet, "first paragraph") + }) + + t.Run("empty content", func(t *testing.T) { + snippet := extractSnippet("", []string{"test"}) + assert.Empty(t, snippet) + }) +} + +func TestCountMatches_Good(t *testing.T) { + tests := []struct { + text string + words []string + expected int + }{ + {"Hello world", []string{"hello"}, 1}, + {"Hello world", []string{"hello", "world"}, 2}, + {"Hello world", []string{"foo", "bar"}, 0}, + {"The quick brown fox", []string{"quick", "fox", "dog"}, 2}, + } + + for _, tt := range tests { + result := countMatches(tt.text, tt.words) + assert.Equal(t, tt.expected, result) + } +} + +func TestSearchResult_Score_Good(t *testing.T) { + idx := 
newSearchIndex() + + // Topic with query word in title should score higher + idx.Add(&Topic{ + ID: "topic-in-title", + Title: "Installation Guide", + Content: "Some content here.", + }) + + idx.Add(&Topic{ + ID: "topic-in-content", + Title: "Some Other Topic", + Content: "This covers installation steps.", + }) + + results := idx.Search("installation") + assert.Len(t, results, 2) + + // Title match should score higher + assert.Equal(t, "topic-in-title", results[0].Topic.ID) + assert.Greater(t, results[0].Score, results[1].Score) +}