package help

import (
	"sort"
	"strings"
	"unicode"
)
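
// This file implements a small in-memory full-text search over help topics:
// tokenization, an inverted word index, scored lookup with prefix matching,
// and snippet extraction for presenting results.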

// SearchResult represents a search match.
type SearchResult struct {
	Topic   *Topic
	Section *Section // nil if topic-level match
	Score   float64
	Snippet string   // Context around match
}

// searchIndex provides full-text search.
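// It keeps an inverted index mapping each lowercased word to the set of
// topic IDs whose title, content, sections, or tags contain that word.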
type searchIndex struct {
	topics map[string]*Topic          // topicID -> Topic
	index  map[string]map[string]bool // word -> set of topicIDs
}

// newSearchIndex creates a new empty search index.
func newSearchIndex() *searchIndex {
	return &searchIndex{
		topics: make(map[string]*Topic),
		index:  make(map[string]map[string]bool),
	}
}

// Add indexes a topic for searching.
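// Words from the title, content, section titles and content, and tags all
// map to the topic's ID, so a hit on any of them surfaces the whole topic.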
func (i *searchIndex) Add(topic *Topic) {
	i.topics[topic.ID] = topic

	// Index title words (titles get an extra boost at query time in Search)
	for _, word := range tokenize(topic.Title) {
		i.addToIndex(word, topic.ID)
	}

	// Index content words
	for _, word := range tokenize(topic.Content) {
		i.addToIndex(word, topic.ID)
	}

	// Index section titles and content
	for _, section := range topic.Sections {
		for _, word := range tokenize(section.Title) {
			i.addToIndex(word, topic.ID)
		}
		for _, word := range tokenize(section.Content) {
			i.addToIndex(word, topic.ID)
		}
	}

	// Index tags
	for _, tag := range topic.Tags {
		for _, word := range tokenize(tag) {
			i.addToIndex(word, topic.ID)
		}
	}
}

// addToIndex adds a word-to-topic mapping.
func (i *searchIndex) addToIndex(word, topicID string) {
	if i.index[word] == nil {
		i.index[word] = make(map[string]bool)
	}
	i.index[word][topicID] = true
}

// Search finds topics matching the query, ranked by score (highest first).
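//
// Scoring, as implemented below: exact word matches add 1.0, prefix matches
// add 0.5, and each query word appearing in the topic title adds a further
// 2.0.
//
// Illustrative usage (a sketch; it assumes Topic's exported ID, Title, and
// Content fields, which this file already reads, can be set directly):
//
//	idx := newSearchIndex()
//	idx.Add(&Topic{ID: "install", Title: "Installation", Content: "How to install the tool."})
//	for _, r := range idx.Search("install") {
//		fmt.Println(r.Topic.Title, r.Score, r.Snippet)
//	}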
func (i *searchIndex) Search(query string) []*SearchResult {
	queryWords := tokenize(query)
	if len(queryWords) == 0 {
		return nil
	}

	// Track scores per topic
	scores := make(map[string]float64)

	for _, word := range queryWords {
		// Exact matches
		if topicIDs, ok := i.index[word]; ok {
			for topicID := range topicIDs {
				scores[topicID] += 1.0
			}
		}

		// Prefix matches (partial word matching)
		for indexWord, topicIDs := range i.index {
			if strings.HasPrefix(indexWord, word) && indexWord != word {
				for topicID := range topicIDs {
					scores[topicID] += 0.5 // Lower score for partial matches
				}
			}
		}
	}

	// Build results with title boost and snippet extraction
	var results []*SearchResult
	for topicID, score := range scores {
		topic := i.topics[topicID]
		if topic == nil {
			continue
		}

		// Title boost: if query words appear in title
		titleLower := strings.ToLower(topic.Title)
		for _, word := range queryWords {
			if strings.Contains(titleLower, word) {
				score += 2.0 // Title matches are worth more
			}
		}

		// Find matching section and extract snippet
		section, snippet := i.findBestMatch(topic, queryWords)

		results = append(results, &SearchResult{
			Topic:   topic,
			Section: section,
			Score:   score,
			Snippet: snippet,
		})
	}

	// Sort by score (highest first)
	sort.Slice(results, func(a, b int) bool {
		return results[a].Score > results[b].Score
	})

	return results
}

// findBestMatch finds the section with the best match and extracts a snippet.
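// Section title matches count double relative to section content matches;
// if no section matches, the snippet falls back to the topic's own content.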
func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section, string) {
	var bestSection *Section
	var bestSnippet string
	bestScore := 0

	// Check topic title
	titleScore := countMatches(topic.Title, queryWords)
	if titleScore > 0 {
		bestSnippet = extractSnippet(topic.Content, queryWords)
	}

	// Check sections
	for idx := range topic.Sections {
		section := &topic.Sections[idx]
		sectionScore := countMatches(section.Title, queryWords)
		contentScore := countMatches(section.Content, queryWords)
		totalScore := sectionScore*2 + contentScore // Title matches worth more

		if totalScore > bestScore {
			bestScore = totalScore
			bestSection = section
			if contentScore > 0 {
				bestSnippet = extractSnippet(section.Content, queryWords)
			} else {
				bestSnippet = extractSnippet(section.Content, nil)
			}
		}
	}

	// If no section matched, use topic content
	if bestSnippet == "" && topic.Content != "" {
		bestSnippet = extractSnippet(topic.Content, queryWords)
	}

	return bestSection, bestSnippet
}

// tokenize splits text into lowercase words for indexing/searching.
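// For example, tokenize("Hello, World!") returns []string{"hello", "world"};
// single-character words are dropped.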
func tokenize(text string) []string {
	text = strings.ToLower(text)
	var words []string
	var word strings.Builder

	for _, r := range text {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			word.WriteRune(r)
		} else if word.Len() > 0 {
			w := word.String()
			if len([]rune(w)) >= 2 { // Skip single-character words
				words = append(words, w)
			}
			word.Reset()
		}
	}

	// Don't forget the last word
	if w := word.String(); len([]rune(w)) >= 2 {
		words = append(words, w)
	}

	return words
}

// countMatches counts how many query words appear in the text.
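// Matching is a case-insensitive substring check, so
// countMatches("Getting Started", []string{"start", "stop"}) returns 1
// ("start" matches inside "Started").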
func countMatches(text string, queryWords []string) int {
	textLower := strings.ToLower(text)
	count := 0
	for _, word := range queryWords {
		if strings.Contains(textLower, word) {
			count++
		}
	}
	return count
}

// extractSnippet extracts a short snippet around the first match.
// Uses rune-based indexing to properly handle multi-byte UTF-8 characters.
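// The snippet starts roughly 50 runes before the first matching word, is
// about snippetLen runes long, and is trimmed to word boundaries with "..."
// markers where it was cut; with no query words it returns the first
// non-empty, non-heading line of the content.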
func extractSnippet(content string, queryWords []string) string {
	if content == "" {
		return ""
	}

	const snippetLen = 150

	// If no query words, return start of content
	if len(queryWords) == 0 {
		lines := strings.Split(content, "\n")
		for _, line := range lines {
			line = strings.TrimSpace(line)
			if line != "" && !strings.HasPrefix(line, "#") {
				runes := []rune(line)
				if len(runes) > snippetLen {
					return string(runes[:snippetLen]) + "..."
				}
				return line
			}
		}
		return ""
	}

	// Find first match position (byte-based for strings.Index)
	contentLower := strings.ToLower(content)
	matchPos := -1
	for _, word := range queryWords {
		pos := strings.Index(contentLower, word)
		if pos != -1 && (matchPos == -1 || pos < matchPos) {
			matchPos = pos
		}
	}

	// Convert to runes for safe slicing
	runes := []rune(content)
	runeLen := len(runes)

	if matchPos == -1 {
		// No match found, return start of content
		if runeLen > snippetLen {
			return string(runes[:snippetLen]) + "..."
		}
		return content
	}

	// Convert byte position to rune position (use same string as Index)
	matchRunePos := len([]rune(contentLower[:matchPos]))

	// Extract snippet around match (rune-based)
	start := matchRunePos - 50
	if start < 0 {
		start = 0
	}

	end := start + snippetLen
	if end > runeLen {
		end = runeLen
	}

	snippet := string(runes[start:end])

	// Trim to word boundaries
	if start > 0 {
		if idx := strings.Index(snippet, " "); idx != -1 {
			snippet = "..." + snippet[idx+1:]
		}
	}
	if end < runeLen {
		if idx := strings.LastIndex(snippet, " "); idx != -1 {
			snippet = snippet[:idx] + "..."
		}
	}

	return strings.TrimSpace(snippet)
}