feat(help): add full-text search functionality

Implements #139: full-text search for help topics.

- Add searchIndex with inverted index for fast lookups
- Add tokenize() for case-insensitive word extraction
- Add Search() with relevance ranking:
  - Exact word matches score 1.0
  - Prefix matches score 0.5
  - Title matches get 2.0 boost
- Add snippet extraction for search result context
- Add section-level matching for precise results
- Add comprehensive tests following _Good/_Bad naming

Search features:

- Case-insensitive matching
- Partial word matching (prefix)
- Title boost (matches in title rank higher)
- Section-level results
- Snippet extraction with context

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 23:30:30 +00:00 · 2026-02-01 23:30:30 +00:00 · 2b68a26a1b
commit 2b68a26a1b
parent df7ff9f128
2 changed files with 542 additions and 0 deletions
--- a/pkg/help/search.go
+++ b/pkg/help/search.go
@ -0,0 +1,277 @@
+package help
+
+import (
+	"sort"
+	"strings"
+	"unicode"
+)
+
+// SearchResult represents a search match: the topic that matched, optionally
+// the section inside it that matched best, a relevance score and a short
+// excerpt for display.
+type SearchResult struct {
+	// Topic is the help topic that matched the query.
+	Topic   *Topic
+	Section *Section // nil if topic-level match
+	// Score is the relevance score; Search orders results by descending Score.
+	Score   float64
+	Snippet string // Context around match
+}
+
+// searchIndex provides full-text search. It stores the indexed topics by ID
+// together with an inverted index that maps each lowercase word (as produced
+// by tokenize) to the set of topic IDs containing it.
+type searchIndex struct {
+	topics map[string]*Topic          // topicID -> Topic
+	index  map[string]map[string]bool // word -> set of topicIDs
+}
+
+// newSearchIndex returns a searchIndex whose topic map and inverted index are
+// both allocated and empty.
+func newSearchIndex() *searchIndex {
+	idx := new(searchIndex)
+	idx.topics = map[string]*Topic{}
+	idx.index = map[string]map[string]bool{}
+	return idx
+}
+
+// Add indexes a topic for searching.
+//
+// Every word from the topic's title, content, section titles, section
+// contents and tags is mapped to the topic ID. All of these sources are
+// indexed identically; no boost is recorded at index time.
+//
+// A nil topic is ignored. Adding a topic whose ID is already present replaces
+// the stored topic and drops the words indexed for the previous version, so
+// text that was removed from the topic no longer produces hits.
+func (i *searchIndex) Add(topic *Topic) {
+	if topic == nil {
+		return
+	}
+
+	// Re-adding an ID: purge its old postings first. Deleting map entries
+	// while ranging over the map is permitted in Go.
+	if _, replaced := i.topics[topic.ID]; replaced {
+		for word, topicIDs := range i.index {
+			delete(topicIDs, topic.ID)
+			if len(topicIDs) == 0 {
+				delete(i.index, word)
+			}
+		}
+	}
+	i.topics[topic.ID] = topic
+
+	// indexText records every token of text against this topic.
+	indexText := func(text string) {
+		for _, word := range tokenize(text) {
+			i.addToIndex(word, topic.ID)
+		}
+	}
+
+	indexText(topic.Title)
+	indexText(topic.Content)
+	for idx := range topic.Sections {
+		indexText(topic.Sections[idx].Title)
+		indexText(topic.Sections[idx].Content)
+	}
+	for _, tag := range topic.Tags {
+		indexText(tag)
+	}
+}
+
+// addToIndex records that topicID contains word, creating the word's topic
+// set on first use.
+func (i *searchIndex) addToIndex(word, topicID string) {
+	postings := i.index[word]
+	if postings == nil {
+		postings = make(map[string]bool)
+		i.index[word] = postings
+	}
+	postings[topicID] = true
+}
+
+// Search finds topics matching the query.
+func (i *searchIndex) Search(query string) []*SearchResult {
+	queryWords := tokenize(query)
+	if len(queryWords) == 0 {
+		return nil
+	}
+
+	// Track scores per topic
+	scores := make(map[string]float64)
+
+	for _, word := range queryWords {
+		// Exact matches
+		if topicIDs, ok := i.index[word]; ok {
+			for topicID := range topicIDs {
+				scores[topicID] += 1.0
+			}
+		}
+
+		// Prefix matches (partial word matching)
+		for indexWord, topicIDs := range i.index {
+			if strings.HasPrefix(indexWord, word) && indexWord != word {
+				for topicID := range topicIDs {
+					scores[topicID] += 0.5 // Lower score for partial matches
+				}
+			}
+		}
+	}
+
+	// Build results with title boost and snippet extraction
+	var results []*SearchResult
+	for topicID, score := range scores {
+		topic := i.topics[topicID]
+		if topic == nil {
+			continue
+		}
+
+		// Title boost: if query words appear in title
+		titleLower := strings.ToLower(topic.Title)
+		for _, word := range queryWords {
+			if strings.Contains(titleLower, word) {
+				score += 2.0 // Title matches are worth more
+			}
+		}
+
+		// Find matching section and extract snippet
+		section, snippet := i.findBestMatch(topic, queryWords)
+
+		results = append(results, &SearchResult{
+			Topic:   topic,
+			Section: section,
+			Score:   score,
+			Snippet: snippet,
+		})
+	}
+
+	// Sort by score (highest first)
+	sort.Slice(results, func(a, b int) bool {
+		return results[a].Score > results[b].Score
+	})
+
+	return results
+}
+
+// findBestMatch picks the section of topic that best matches queryWords and
+// produces a snippet for display.
+//
+// A section's rank is twice the number of query words found in its title plus
+// the number found in its content; the first section with the strictly
+// highest non-zero rank wins. The returned section is nil when no section
+// matched, in which case the snippet (if any) comes from the topic content.
+func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section, string) {
+	var (
+		winner  *Section
+		snippet string
+		topRank int
+	)
+
+	// A hit in the topic title seeds the snippet from the topic body.
+	if countMatches(topic.Title, queryWords) > 0 {
+		snippet = extractSnippet(topic.Content, queryWords)
+	}
+
+	for n := 0; n < len(topic.Sections); n++ {
+		candidate := &topic.Sections[n]
+		inTitle := countMatches(candidate.Title, queryWords)
+		inBody := countMatches(candidate.Content, queryWords)
+
+		rank := 2*inTitle + inBody // Title matches worth more
+		if rank <= topRank {
+			continue
+		}
+		topRank, winner = rank, candidate
+
+		// When only the section title matched there is nothing to centre
+		// the snippet on, so fall back to the start of the section.
+		words := queryWords
+		if inBody <= 0 {
+			words = nil
+		}
+		snippet = extractSnippet(candidate.Content, words)
+	}
+
+	// Nothing usable so far: use the topic content.
+	if snippet == "" && topic.Content != "" {
+		snippet = extractSnippet(topic.Content, queryWords)
+	}
+
+	return winner, snippet
+}
+
+// tokenize splits text into lowercase words for indexing/searching.
+//
+// A word is a maximal run of Unicode letters and digits; every other rune is
+// a separator. Words shorter than two characters are dropped. Length is
+// measured in runes rather than bytes, so a single non-ASCII letter such as
+// "é" is treated the same as a single ASCII letter. It returns nil when text
+// contains no words.
+func tokenize(text string) []string {
+	var (
+		words []string
+		word  strings.Builder
+		runes int // number of runes currently buffered in word
+	)
+
+	// flush emits the buffered word if it is long enough and starts a new one.
+	flush := func() {
+		if runes >= 2 { // Skip single-character words
+			words = append(words, word.String())
+		}
+		word.Reset()
+		runes = 0
+	}
+
+	for _, r := range strings.ToLower(text) {
+		if unicode.IsLetter(r) || unicode.IsDigit(r) {
+			word.WriteRune(r)
+			runes++
+			continue
+		}
+		flush()
+	}
+	flush() // Don't forget the last word
+
+	return words
+}
+
+// countMatches reports how many entries of queryWords occur as a substring of
+// the lowercased text. The query words are compared as given, so callers are
+// expected to pass them already lowercased.
+func countMatches(text string, queryWords []string) int {
+	haystack := strings.ToLower(text)
+	hits := 0
+	for idx := 0; idx < len(queryWords); idx++ {
+		if !strings.Contains(haystack, queryWords[idx]) {
+			continue
+		}
+		hits++
+	}
+	return hits
+}
+
+// extractSnippet extracts a short snippet around the first match.
+//
+// With no query words it returns the first non-blank line that does not start
+// with "#", truncated to 150 bytes. Otherwise it looks for the earliest
+// case-insensitive occurrence of any query word and returns up to 150 bytes
+// starting about 50 bytes before it, trimmed to word boundaries and marked
+// with "..." where text was cut. If no query word occurs, the start of
+// content is returned. Empty content yields "".
+//
+// All cut points are moved back to the start of a UTF-8 sequence so a snippet
+// never ends or begins in the middle of a multi-byte character.
+func extractSnippet(content string, queryWords []string) string {
+	if content == "" {
+		return ""
+	}
+
+	const snippetLen = 150
+
+	// If no query words, return start of content
+	if len(queryWords) == 0 {
+		for _, line := range strings.Split(content, "\n") {
+			line = strings.TrimSpace(line)
+			if line == "" || strings.HasPrefix(line, "#") {
+				continue
+			}
+			return snippetTruncate(line, snippetLen)
+		}
+		return ""
+	}
+
+	// Find first match position (an offset into the lowercased copy).
+	contentLower := strings.ToLower(content)
+	matchPos := -1
+	for _, word := range queryWords {
+		pos := strings.Index(contentLower, word)
+		if pos != -1 && (matchPos == -1 || pos < matchPos) {
+			matchPos = pos
+		}
+	}
+
+	if matchPos == -1 {
+		// No match found, return start of content
+		return snippetTruncate(content, snippetLen)
+	}
+
+	// Lowercasing can change the byte length of a rune, so translate the
+	// offset back into the original string before slicing it.
+	matchPos = snippetOrigOffset(content, contentLower, matchPos)
+
+	// Extract snippet around match
+	start := snippetRuneFloor(content, matchPos-50)
+	end := snippetRuneFloor(content, start+snippetLen)
+
+	snippet := content[start:end]
+
+	// Trim to word boundaries
+	if start > 0 {
+		if idx := strings.Index(snippet, " "); idx != -1 {
+			snippet = "..." + snippet[idx+1:]
+		}
+	}
+	if end < len(content) {
+		if idx := strings.LastIndex(snippet, " "); idx != -1 {
+			snippet = snippet[:idx] + "..."
+		}
+	}
+
+	return strings.TrimSpace(snippet)
+}
+
+// snippetTruncate returns s unchanged when it fits in limit bytes, otherwise
+// the longest prefix of at most limit bytes that ends on a rune boundary,
+// followed by "...".
+func snippetTruncate(s string, limit int) string {
+	if len(s) <= limit {
+		return s
+	}
+	return s[:snippetRuneFloor(s, limit)] + "..."
+}
+
+// snippetRuneFloor clamps pos to [0, len(s)] and moves it back to the nearest
+// start of a UTF-8 sequence.
+func snippetRuneFloor(s string, pos int) int {
+	if pos <= 0 {
+		return 0
+	}
+	if pos >= len(s) {
+		return len(s)
+	}
+	// Bytes of the form 10xxxxxx are continuation bytes. A sequence has at
+	// most three of them, so never back up further than that even when the
+	// input is not valid UTF-8.
+	for back := 0; back < 3 && pos > 0 && s[pos]&0xC0 == 0x80; back++ {
+		pos--
+	}
+	return pos
+}
+
+// snippetOrigOffset converts pos, a byte offset into lower (the result of
+// strings.ToLower(orig)), into the byte offset of the same rune in orig. The
+// two strings hold the same number of runes, so the conversion counts runes.
+func snippetOrigOffset(orig, lower string, pos int) int {
+	runes := 0
+	for range lower[:pos] {
+		runes++
+	}
+	for off := range orig {
+		if runes == 0 {
+			return off
+		}
+		runes--
+	}
+	return len(orig)
+}
--- a/pkg/help/search_test.go
+++ b/pkg/help/search_test.go
@ -0,0 +1,265 @@
+package help
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestTokenize_Good checks lowercasing, splitting on non-alphanumeric runes
+// and the dropping of single-character words.
+func TestTokenize_Good(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected []string
+	}{
+		{
+			name:     "simple words",
+			input:    "hello world",
+			expected: []string{"hello", "world"},
+		},
+		{
+			name:     "mixed case",
+			input:    "Hello World",
+			expected: []string{"hello", "world"},
+		},
+		{
+			name:     "with punctuation",
+			input:    "Hello, world! How are you?",
+			expected: []string{"hello", "world", "how", "are", "you"},
+		},
+		{
+			name:     "single characters filtered",
+			input:    "a b c hello d",
+			expected: []string{"hello"},
+		},
+		{
+			// A lone digit is a single-character word and is dropped too.
+			name:     "single digit filtered",
+			input:    "version 2 release",
+			expected: []string{"version", "release"},
+		},
+		{
+			name:     "multi-digit number kept",
+			input:    "version 20 release",
+			expected: []string{"version", "20", "release"},
+		},
+		{
+			name:     "alphanumeric",
+			input:    "v2.0 and config123",
+			expected: []string{"v2", "and", "config123"},
+		},
+		{
+			name:     "empty string",
+			input:    "",
+			expected: nil,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := tokenize(tt.input)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+func TestSearchIndex_Add_Good(t *testing.T) {
+	idx := newSearchIndex()
+
+	topic := &Topic{
+		ID:      "getting-started",
+		Title:   "Getting Started",
+		Content: "Welcome to the guide.",
+		Tags:    []string{"intro", "setup"},
+		Sections: []Section{
+			{ID: "installation", Title: "Installation", Content: "Install the CLI."},
+		},
+	}
+
+	idx.Add(topic)
+
+	// Verify topic is stored
+	assert.NotNil(t, idx.topics["getting-started"])
+
+	// Verify words are indexed
+	assert.Contains(t, idx.index["getting"], "getting-started")
+	assert.Contains(t, idx.index["started"], "getting-started")
+	assert.Contains(t, idx.index["welcome"], "getting-started")
+	assert.Contains(t, idx.index["guide"], "getting-started")
+	assert.Contains(t, idx.index["intro"], "getting-started")
+	assert.Contains(t, idx.index["setup"], "getting-started")
+	assert.Contains(t, idx.index["installation"], "getting-started")
+	assert.Contains(t, idx.index["cli"], "getting-started")
+}
+
+// TestSearchIndex_Search_Good exercises Search against three small topics:
+// single- and multi-word queries, prefix matching and the no-result cases.
+func TestSearchIndex_Search_Good(t *testing.T) {
+	idx := newSearchIndex()
+
+	// Add test topics
+	idx.Add(&Topic{
+		ID:      "getting-started",
+		Title:   "Getting Started",
+		Content: "Welcome to the CLI guide. This covers installation and setup.",
+		Tags:    []string{"intro"},
+	})
+
+	idx.Add(&Topic{
+		ID:      "configuration",
+		Title:   "Configuration",
+		Content: "Configure the CLI using environment variables.",
+	})
+
+	idx.Add(&Topic{
+		ID:      "commands",
+		Title:   "Commands Reference",
+		Content: "List of all available commands.",
+	})
+
+	// topHit runs the query and returns the best result. It stops the subtest
+	// with a normal failure when nothing matched, instead of letting an
+	// out-of-range index on the empty result slice panic.
+	topHit := func(t *testing.T, query string) *SearchResult {
+		t.Helper()
+		results := idx.Search(query)
+		if !assert.NotEmpty(t, results) {
+			t.FailNow()
+		}
+		return results[0]
+	}
+
+	t.Run("single word query", func(t *testing.T) {
+		assert.Equal(t, "configuration", topHit(t, "configuration").Topic.ID)
+	})
+
+	t.Run("multi-word query", func(t *testing.T) {
+		// Should match getting-started (has both "cli" and "guide")
+		assert.Equal(t, "getting-started", topHit(t, "cli guide").Topic.ID)
+	})
+
+	t.Run("title boost", func(t *testing.T) {
+		// "commands" appears in title of commands topic
+		assert.Equal(t, "commands", topHit(t, "commands").Topic.ID)
+	})
+
+	t.Run("partial word matching", func(t *testing.T) {
+		results := idx.Search("config")
+		assert.NotEmpty(t, results)
+		// Should match "configuration" and "configure"
+		foundConfig := false
+		for _, r := range results {
+			if r.Topic.ID == "configuration" {
+				foundConfig = true
+				break
+			}
+		}
+		assert.True(t, foundConfig, "Should find configuration topic with prefix match")
+	})
+
+	t.Run("no results", func(t *testing.T) {
+		results := idx.Search("nonexistent")
+		assert.Empty(t, results)
+	})
+
+	t.Run("empty query", func(t *testing.T) {
+		results := idx.Search("")
+		assert.Nil(t, results)
+	})
+}
+
+// TestSearchIndex_Search_Good_WithSections checks that words found only in a
+// section still surface the owning topic, and that the matching section is
+// reported on the result.
+func TestSearchIndex_Search_Good_WithSections(t *testing.T) {
+	idx := newSearchIndex()
+
+	idx.Add(&Topic{
+		ID:      "installation",
+		Title:   "Installation Guide",
+		Content: "Overview of installation process.",
+		Sections: []Section{
+			{
+				ID:      "linux",
+				Title:   "Linux Installation",
+				Content: "Run apt-get install core on Debian.",
+			},
+			{
+				ID:      "macos",
+				Title:   "macOS Installation",
+				Content: "Use brew install core on macOS.",
+			},
+			{
+				ID:      "windows",
+				Title:   "Windows Installation",
+				Content: "Download the installer from the website.",
+			},
+		},
+	})
+
+	t.Run("matches section content", func(t *testing.T) {
+		results := idx.Search("debian")
+		if !assert.NotEmpty(t, results) {
+			return // indexing an empty slice below would panic
+		}
+		assert.Equal(t, "installation", results[0].Topic.ID)
+		// Should identify the Linux section as best match. A nil Section is
+		// a failure here, not something to skip silently.
+		if assert.NotNil(t, results[0].Section) {
+			assert.Equal(t, "linux", results[0].Section.ID)
+		}
+	})
+
+	t.Run("matches section title", func(t *testing.T) {
+		results := idx.Search("windows")
+		if !assert.NotEmpty(t, results) {
+			return // indexing an empty slice below would panic
+		}
+		assert.Equal(t, "installation", results[0].Topic.ID)
+	})
+}
+
+// TestExtractSnippet_Good covers the three paths through extractSnippet: a
+// query word that matches, no query words at all, and empty content.
+func TestExtractSnippet_Good(t *testing.T) {
+	doc := `This is the first paragraph with some introduction text.
+
+Here is more content that talks about installation and setup.
+The installation process is straightforward.
+
+Finally, some closing remarks about the configuration.`
+
+	t.Run("finds match and extracts context", func(t *testing.T) {
+		got := extractSnippet(doc, []string{"installation"})
+		assert.Contains(t, got, "installation")
+		assert.True(t, len(got) <= 200, "Snippet should be reasonably short")
+	})
+
+	t.Run("no query words returns start", func(t *testing.T) {
+		got := extractSnippet(doc, nil)
+		assert.Contains(t, got, "first paragraph")
+	})
+
+	t.Run("empty content", func(t *testing.T) {
+		got := extractSnippet("", []string{"test"})
+		assert.Empty(t, got)
+	})
+}
+
+// TestCountMatches_Good checks that countMatches counts each query word that
+// occurs in the text, ignoring the case of the text.
+func TestCountMatches_Good(t *testing.T) {
+	cases := []struct {
+		text     string
+		words    []string
+		expected int
+	}{
+		{text: "Hello world", words: []string{"hello"}, expected: 1},
+		{text: "Hello world", words: []string{"hello", "world"}, expected: 2},
+		{text: "Hello world", words: []string{"foo", "bar"}, expected: 0},
+		{text: "The quick brown fox", words: []string{"quick", "fox", "dog"}, expected: 2},
+	}
+
+	for n := range cases {
+		got := countMatches(cases[n].text, cases[n].words)
+		assert.Equal(t, cases[n].expected, got)
+	}
+}
+
+// TestSearchResult_Score_Good checks that a topic with the query word in its
+// title outranks a topic that only mentions it in its content.
+func TestSearchResult_Score_Good(t *testing.T) {
+	idx := newSearchIndex()
+
+	// Topic with query word in title should score higher
+	idx.Add(&Topic{
+		ID:      "topic-in-title",
+		Title:   "Installation Guide",
+		Content: "Some content here.",
+	})
+
+	idx.Add(&Topic{
+		ID:      "topic-in-content",
+		Title:   "Some Other Topic",
+		Content: "This covers installation steps.",
+	})
+
+	results := idx.Search("installation")
+	if !assert.Len(t, results, 2) {
+		return // indexing results[0] and results[1] below would panic
+	}
+
+	// Title match should score higher
+	assert.Equal(t, "topic-in-title", results[0].Topic.ID)
+	assert.Greater(t, results[0].Score, results[1].Score)
+}