cli/pkg/help/search.go
Snider 7fede9d563
feat(help): add markdown parsing and section extraction (#174)
* feat(help): add markdown parsing and section extraction

Implements #137: markdown parsing and section extraction for help system.

- Add Topic and Section types for help content structure
- Add Frontmatter type for YAML metadata parsing
- Add ParseTopic() to parse markdown files into Topic structs
- Add ExtractFrontmatter() to extract YAML frontmatter
- Add ExtractSections() to extract headings and content
- Add GenerateID() to create URL-safe anchor IDs
- Add comprehensive tests following _Good/_Bad naming convention

This is the foundation for the display-agnostic help system (#133).
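
GenerateID itself lives elsewhere in the package, so the following is a purely
illustrative sketch of the URL-safe anchor IDs the list above describes (the
name and the exact rules are assumptions; it reuses only the strings/unicode
imports already present in this file):

// Illustrative only, not the project's GenerateID: lowercase the heading
// and collapse runs of non-alphanumeric characters into single hyphens,
// e.g. "Proxy & TLS Settings" -> "proxy-tls-settings".
func generateIDSketch(heading string) string {
	var b strings.Builder
	pendingHyphen := false
	for _, r := range strings.ToLower(heading) {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			if pendingHyphen && b.Len() > 0 {
				b.WriteRune('-') // separator between words, never leading or trailing
			}
			b.WriteRune(r)
			pendingHyphen = false
		} else {
			pendingHyphen = true
		}
	}
	return b.String()
}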

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting

Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory
not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir().

This is the same fix applied to TestDevOps_Boot_Good_Success in 8effbda.
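
One plausible shape of that change, inside the test function (directory prefix
is illustrative; assumes only the standard os and testing packages):

// Manual temp dir: unlike t.TempDir(), the RemoveAll runs via t.Cleanup and
// its error is ignored, so a straggling file no longer fails the test.
dir, err := os.MkdirTemp("", "devops-boot-")
if err != nil {
	t.Fatal(err)
}
t.Cleanup(func() {
	_ = os.RemoveAll(dir) // best effort
})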

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(help): address CodeRabbit review feedback

- Add CRLF line ending support to frontmatter regex
- Add empty frontmatter block support
- Use filepath.Base/Ext for cross-platform path handling
- Add tests for CRLF and empty frontmatter cases
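
None of this lands in search.go; a hedged sketch of what the first three
bullets could look like in the parser (the actual regex and helper names may
differ):

// Hypothetical frontmatter pattern: tolerates \r\n line endings and an
// empty block between the two --- fences. Not the project's actual regex.
var frontmatterRe = regexp.MustCompile(`(?ms)\A---\r?\n(.*?)^---\r?\n`)

// Cross-platform topic ID from a path, per the filepath.Base/Ext bullet.
func topicIDFromPath(path string) string {
	base := filepath.Base(path)
	return strings.TrimSuffix(base, filepath.Ext(base))
}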

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat(help): add full-text search functionality (#175)

* fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting

Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory
not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir().

This is the same fix applied to TestDevOps_Boot_Good_Success in 8effbda.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat(help): add full-text search functionality

Implements #139: full-text search for help topics.

- Add searchIndex with inverted index for fast lookups
- Add tokenize() for case-insensitive word extraction
- Add Search() with relevance ranking:
  - Exact word matches score 1.0
  - Prefix matches score 0.5
  - Title matches get 2.0 boost
- Add snippet extraction for search result context
- Add section-level matching for precise results
- Add comprehensive tests following _Good/_Bad naming

Search features:
- Case-insensitive matching
- Partial word matching (prefix)
- Title boost (matches in title rank higher)
- Section-level results
- Snippet extraction with context
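
As a quick usage illustration of the index defined below (types and field
names are the ones used in this file; the sample Topic is made up, and the
example assumes a _test.go file in the help package with fmt imported):

// Build an index, add one sample topic, and run a search.
func ExampleSearch() {
	idx := newSearchIndex()
	idx.Add(&Topic{
		ID:      "proxy",
		Title:   "Proxy Settings",
		Content: "Configure an HTTP proxy for the CLI.",
		Sections: []Section{
			{Title: "Environment variables", Content: "Set HTTP_PROXY before running commands."},
		},
		Tags: []string{"network", "proxy"},
	})

	for _, r := range idx.Search("proxy") {
		// Score combines exact (1.0) and prefix (0.5) word hits with the 2.0 title boost;
		// Section is non-nil when one section matched best.
		fmt.Printf("%s %.1f %q\n", r.Topic.Title, r.Score, r.Snippet)
	}
}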

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(help): address CodeRabbit review feedback

- Add CRLF line ending support to frontmatter regex
- Add empty frontmatter block support
- Use filepath.Base/Ext for cross-platform path handling
- Add tests for CRLF and empty frontmatter cases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>

* fix(help): use rune-based slicing for UTF-8 safe snippets

Address CodeRabbit feedback: byte-based slicing can corrupt multi-byte
UTF-8 characters. Now uses rune-based indexing for snippet extraction.

- Convert content to []rune before slicing
- Convert byte position to rune position for match location
- Add UTF-8 validation tests with Japanese text
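
A minimal standalone demo of the underlying problem (not project code):

// Byte slicing can split a multi-byte UTF-8 character; rune slicing cannot.
package main

import "fmt"

func main() {
	s := "ヘルプを検索" // every character here is 3 bytes in UTF-8

	fmt.Println(s[:4])                 // one whole character plus a dangling byte of the next
	fmt.Println(string([]rune(s)[:4])) // "ヘルプを": whole characters only
}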

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(help): use correct string for byte-to-rune conversion in extractSnippet

strings.ToLower can change the byte length of certain Unicode characters
(e.g., the Kelvin sign K, U+212A, is 3 bytes but lowercases to the 1-byte k).
Since matchPos is a byte index from strings.Index(contentLower, word), the
byte-to-rune conversion must also use contentLower to keep the indices aligned.

Fixes CodeRabbit review feedback.
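
A standalone demonstration of why the conversion must use contentLower (not
project code; the sample string is arbitrary):

// strings.ToLower maps runes 1:1 but can change byte offsets, so a byte
// index found in the lowered string must be converted to a rune index
// against that same lowered string.
package main

import (
	"fmt"
	"strings"
)

func main() {
	content := "\u212Aelvin scale help"      // starts with KELVIN SIGN (3 bytes)
	contentLower := strings.ToLower(content) // "kelvin scale help" (starts with 1-byte k)

	pos := strings.Index(contentLower, "scale")
	fmt.Println(pos)                             // 7: byte index in contentLower
	fmt.Println(strings.Index(content, "scale")) // 9: byte offsets differ
	fmt.Println(len([]rune(contentLower[:pos]))) // 7: correct rune position
	fmt.Println(len([]rune(content[:pos])))      // 5: wrong if converted via content
}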

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 00:07:32 +00:00

package help

import (
	"sort"
	"strings"
	"unicode"
)

// SearchResult represents a search match.
type SearchResult struct {
	Topic   *Topic
	Section *Section // nil if topic-level match
	Score   float64
	Snippet string   // Context around match
}

// searchIndex provides full-text search.
type searchIndex struct {
	topics map[string]*Topic          // topicID -> Topic
	index  map[string]map[string]bool // word -> set of topicIDs
}

// newSearchIndex creates a new empty search index.
func newSearchIndex() *searchIndex {
	return &searchIndex{
		topics: make(map[string]*Topic),
		index:  make(map[string]map[string]bool),
	}
}

// Add indexes a topic for searching.
func (i *searchIndex) Add(topic *Topic) {
	i.topics[topic.ID] = topic

	// Index title words with boost
	for _, word := range tokenize(topic.Title) {
		i.addToIndex(word, topic.ID)
	}

	// Index content words
	for _, word := range tokenize(topic.Content) {
		i.addToIndex(word, topic.ID)
	}

	// Index section titles and content
	for _, section := range topic.Sections {
		for _, word := range tokenize(section.Title) {
			i.addToIndex(word, topic.ID)
		}
		for _, word := range tokenize(section.Content) {
			i.addToIndex(word, topic.ID)
		}
	}

	// Index tags
	for _, tag := range topic.Tags {
		for _, word := range tokenize(tag) {
			i.addToIndex(word, topic.ID)
		}
	}
}

// addToIndex adds a word-to-topic mapping.
func (i *searchIndex) addToIndex(word, topicID string) {
	if i.index[word] == nil {
		i.index[word] = make(map[string]bool)
	}
	i.index[word][topicID] = true
}

// Search finds topics matching the query.
func (i *searchIndex) Search(query string) []*SearchResult {
	queryWords := tokenize(query)
	if len(queryWords) == 0 {
		return nil
	}

	// Track scores per topic
	scores := make(map[string]float64)
	for _, word := range queryWords {
		// Exact matches
		if topicIDs, ok := i.index[word]; ok {
			for topicID := range topicIDs {
				scores[topicID] += 1.0
			}
		}

		// Prefix matches (partial word matching)
		for indexWord, topicIDs := range i.index {
			if strings.HasPrefix(indexWord, word) && indexWord != word {
				for topicID := range topicIDs {
					scores[topicID] += 0.5 // Lower score for partial matches
				}
			}
		}
	}

	// Build results with title boost and snippet extraction
	var results []*SearchResult
	for topicID, score := range scores {
		topic := i.topics[topicID]
		if topic == nil {
			continue
		}

		// Title boost: if query words appear in title
		titleLower := strings.ToLower(topic.Title)
		for _, word := range queryWords {
			if strings.Contains(titleLower, word) {
				score += 2.0 // Title matches are worth more
			}
		}

		// Find matching section and extract snippet
		section, snippet := i.findBestMatch(topic, queryWords)
		results = append(results, &SearchResult{
			Topic:   topic,
			Section: section,
			Score:   score,
			Snippet: snippet,
		})
	}

	// Sort by score (highest first)
	sort.Slice(results, func(a, b int) bool {
		return results[a].Score > results[b].Score
	})
	return results
}

// findBestMatch finds the section with the best match and extracts a snippet.
func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section, string) {
	var bestSection *Section
	var bestSnippet string
	bestScore := 0

	// Check topic title
	titleScore := countMatches(topic.Title, queryWords)
	if titleScore > 0 {
		bestSnippet = extractSnippet(topic.Content, queryWords)
	}

	// Check sections
	for idx := range topic.Sections {
		section := &topic.Sections[idx]
		sectionScore := countMatches(section.Title, queryWords)
		contentScore := countMatches(section.Content, queryWords)
		totalScore := sectionScore*2 + contentScore // Title matches worth more
		if totalScore > bestScore {
			bestScore = totalScore
			bestSection = section
			if contentScore > 0 {
				bestSnippet = extractSnippet(section.Content, queryWords)
			} else {
				bestSnippet = extractSnippet(section.Content, nil)
			}
		}
	}

	// If no section matched, use topic content
	if bestSnippet == "" && topic.Content != "" {
		bestSnippet = extractSnippet(topic.Content, queryWords)
	}
	return bestSection, bestSnippet
}

// tokenize splits text into lowercase words for indexing/searching.
func tokenize(text string) []string {
	text = strings.ToLower(text)
	var words []string
	var word strings.Builder
	for _, r := range text {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			word.WriteRune(r)
		} else if word.Len() > 0 {
			w := word.String()
			if len(w) >= 2 { // Skip single-character words
				words = append(words, w)
			}
			word.Reset()
		}
	}
	// Don't forget the last word
	if word.Len() >= 2 {
		words = append(words, word.String())
	}
	return words
}

// countMatches counts how many query words appear in the text.
func countMatches(text string, queryWords []string) int {
	textLower := strings.ToLower(text)
	count := 0
	for _, word := range queryWords {
		if strings.Contains(textLower, word) {
			count++
		}
	}
	return count
}

// extractSnippet extracts a short snippet around the first match.
// Uses rune-based indexing to properly handle multi-byte UTF-8 characters.
func extractSnippet(content string, queryWords []string) string {
	if content == "" {
		return ""
	}
	const snippetLen = 150

	// If no query words, return start of content
	if len(queryWords) == 0 {
		lines := strings.Split(content, "\n")
		for _, line := range lines {
			line = strings.TrimSpace(line)
			if line != "" && !strings.HasPrefix(line, "#") {
				runes := []rune(line)
				if len(runes) > snippetLen {
					return string(runes[:snippetLen]) + "..."
				}
				return line
			}
		}
		return ""
	}

	// Find first match position (byte-based for strings.Index)
	contentLower := strings.ToLower(content)
	matchPos := -1
	for _, word := range queryWords {
		pos := strings.Index(contentLower, word)
		if pos != -1 && (matchPos == -1 || pos < matchPos) {
			matchPos = pos
		}
	}

	// Convert to runes for safe slicing
	runes := []rune(content)
	runeLen := len(runes)
	if matchPos == -1 {
		// No match found, return start of content
		if runeLen > snippetLen {
			return string(runes[:snippetLen]) + "..."
		}
		return content
	}

	// Convert byte position to rune position (use same string as Index)
	matchRunePos := len([]rune(contentLower[:matchPos]))

	// Extract snippet around match (rune-based)
	start := matchRunePos - 50
	if start < 0 {
		start = 0
	}
	end := start + snippetLen
	if end > runeLen {
		end = runeLen
	}
	snippet := string(runes[start:end])

	// Trim to word boundaries
	if start > 0 {
		if idx := strings.Index(snippet, " "); idx != -1 {
			snippet = "..." + snippet[idx+1:]
		}
	}
	if end < runeLen {
		if idx := strings.LastIndex(snippet, " "); idx != -1 {
			snippet = snippet[:idx] + "..."
		}
	}
	return strings.TrimSpace(snippet)
}