* feat(help): add markdown parsing and section extraction Implements #137: markdown parsing and section extraction for help system. - Add Topic and Section types for help content structure - Add Frontmatter type for YAML metadata parsing - Add ParseTopic() to parse markdown files into Topic structs - Add ExtractFrontmatter() to extract YAML frontmatter - Add ExtractSections() to extract headings and content - Add GenerateID() to create URL-safe anchor IDs - Add comprehensive tests following _Good/_Bad naming convention This is the foundation for the display-agnostic help system (#133). Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir(). This is the same fix applied to TestDevOps_Boot_Good_Success in8effbda. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(help): address CodeRabbit review feedback - Add CRLF line ending support to frontmatter regex - Add empty frontmatter block support - Use filepath.Base/Ext for cross-platform path handling - Add tests for CRLF and empty frontmatter cases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * feat(help): add full-text search functionality (#175) * fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir(). This is the same fix applied to TestDevOps_Boot_Good_Success in8effbda. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * feat(help): add full-text search functionality Implements #139: full-text search for help topics. 
- Add searchIndex with inverted index for fast lookups - Add tokenize() for case-insensitive word extraction - Add Search() with relevance ranking: - Exact word matches score 1.0 - Prefix matches score 0.5 - Title matches get 2.0 boost - Add snippet extraction for search result context - Add section-level matching for precise results - Add comprehensive tests following _Good/_Bad naming Search features: - Case-insensitive matching - Partial word matching (prefix) - Title boost (matches in title rank higher) - Section-level results - Snippet extraction with context Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(help): address CodeRabbit review feedback - Add CRLF line ending support to frontmatter regex - Add empty frontmatter block support - Use filepath.Base/Ext for cross-platform path handling - Add tests for CRLF and empty frontmatter cases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> * fix(help): use rune-based slicing for UTF-8 safe snippets Address CodeRabbit feedback: byte-based slicing can corrupt multi-byte UTF-8 characters. Now uses rune-based indexing for snippet extraction. - Convert content to []rune before slicing - Convert byte position to rune position for match location - Add UTF-8 validation tests with Japanese text Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(help): use correct string for byte-to-rune conversion in extractSnippet strings.ToLower can change byte lengths for certain Unicode characters (e.g., K U+212A 3 bytes → k 1 byte). Since matchPos is a byte index from strings.Index(contentLower, word), the rune conversion must also use contentLower to maintain correct index alignment. Fixes CodeRabbit review feedback. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
286 lines
6.6 KiB
Go
286 lines
6.6 KiB
Go
package help
|
|
|
|
import (
|
|
"sort"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// SearchResult represents a search match.
//
// Results are produced by searchIndex.Search and returned sorted by
// Score, highest first.
type SearchResult struct {
	Topic *Topic // topic containing the match
	Section *Section // nil if topic-level match
	Score float64 // relevance score; higher ranks earlier in results
	Snippet string // Context around match
}
|
|
|
|
// searchIndex provides full-text search.
//
// It is an inverted index: each word produced by tokenize maps to the set
// of topic IDs that contain it, and the topics map resolves those IDs back
// to *Topic when building results. The zero value is not usable; construct
// with newSearchIndex.
type searchIndex struct {
	topics map[string]*Topic // topicID -> Topic
	index map[string]map[string]bool // word -> set of topicIDs
}
|
|
|
|
// newSearchIndex creates a new empty search index.
|
|
func newSearchIndex() *searchIndex {
|
|
return &searchIndex{
|
|
topics: make(map[string]*Topic),
|
|
index: make(map[string]map[string]bool),
|
|
}
|
|
}
|
|
|
|
// Add indexes a topic for searching.
|
|
func (i *searchIndex) Add(topic *Topic) {
|
|
i.topics[topic.ID] = topic
|
|
|
|
// Index title words with boost
|
|
for _, word := range tokenize(topic.Title) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
|
|
// Index content words
|
|
for _, word := range tokenize(topic.Content) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
|
|
// Index section titles and content
|
|
for _, section := range topic.Sections {
|
|
for _, word := range tokenize(section.Title) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
for _, word := range tokenize(section.Content) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
}
|
|
|
|
// Index tags
|
|
for _, tag := range topic.Tags {
|
|
for _, word := range tokenize(tag) {
|
|
i.addToIndex(word, topic.ID)
|
|
}
|
|
}
|
|
}
|
|
|
|
// addToIndex adds a word-to-topic mapping.
|
|
func (i *searchIndex) addToIndex(word, topicID string) {
|
|
if i.index[word] == nil {
|
|
i.index[word] = make(map[string]bool)
|
|
}
|
|
i.index[word][topicID] = true
|
|
}
|
|
|
|
// Search finds topics matching the query.
|
|
func (i *searchIndex) Search(query string) []*SearchResult {
|
|
queryWords := tokenize(query)
|
|
if len(queryWords) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// Track scores per topic
|
|
scores := make(map[string]float64)
|
|
|
|
for _, word := range queryWords {
|
|
// Exact matches
|
|
if topicIDs, ok := i.index[word]; ok {
|
|
for topicID := range topicIDs {
|
|
scores[topicID] += 1.0
|
|
}
|
|
}
|
|
|
|
// Prefix matches (partial word matching)
|
|
for indexWord, topicIDs := range i.index {
|
|
if strings.HasPrefix(indexWord, word) && indexWord != word {
|
|
for topicID := range topicIDs {
|
|
scores[topicID] += 0.5 // Lower score for partial matches
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build results with title boost and snippet extraction
|
|
var results []*SearchResult
|
|
for topicID, score := range scores {
|
|
topic := i.topics[topicID]
|
|
if topic == nil {
|
|
continue
|
|
}
|
|
|
|
// Title boost: if query words appear in title
|
|
titleLower := strings.ToLower(topic.Title)
|
|
for _, word := range queryWords {
|
|
if strings.Contains(titleLower, word) {
|
|
score += 2.0 // Title matches are worth more
|
|
}
|
|
}
|
|
|
|
// Find matching section and extract snippet
|
|
section, snippet := i.findBestMatch(topic, queryWords)
|
|
|
|
results = append(results, &SearchResult{
|
|
Topic: topic,
|
|
Section: section,
|
|
Score: score,
|
|
Snippet: snippet,
|
|
})
|
|
}
|
|
|
|
// Sort by score (highest first)
|
|
sort.Slice(results, func(a, b int) bool {
|
|
return results[a].Score > results[b].Score
|
|
})
|
|
|
|
return results
|
|
}
|
|
|
|
// findBestMatch finds the section with the best match and extracts a snippet.
|
|
func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section, string) {
|
|
var bestSection *Section
|
|
var bestSnippet string
|
|
bestScore := 0
|
|
|
|
// Check topic title
|
|
titleScore := countMatches(topic.Title, queryWords)
|
|
if titleScore > 0 {
|
|
bestSnippet = extractSnippet(topic.Content, queryWords)
|
|
}
|
|
|
|
// Check sections
|
|
for idx := range topic.Sections {
|
|
section := &topic.Sections[idx]
|
|
sectionScore := countMatches(section.Title, queryWords)
|
|
contentScore := countMatches(section.Content, queryWords)
|
|
totalScore := sectionScore*2 + contentScore // Title matches worth more
|
|
|
|
if totalScore > bestScore {
|
|
bestScore = totalScore
|
|
bestSection = section
|
|
if contentScore > 0 {
|
|
bestSnippet = extractSnippet(section.Content, queryWords)
|
|
} else {
|
|
bestSnippet = extractSnippet(section.Content, nil)
|
|
}
|
|
}
|
|
}
|
|
|
|
// If no section matched, use topic content
|
|
if bestSnippet == "" && topic.Content != "" {
|
|
bestSnippet = extractSnippet(topic.Content, queryWords)
|
|
}
|
|
|
|
return bestSection, bestSnippet
|
|
}
|
|
|
|
// tokenize splits text into lowercase words for indexing/searching.
//
// A word is a maximal run of letters and digits. Words shorter than two
// BYTES are dropped (so single ASCII characters are skipped, while a
// single multi-byte rune such as a CJK character is kept).
func tokenize(text string) []string {
	var tokens []string
	var cur strings.Builder

	// flush emits the accumulated word (if long enough) and resets.
	flush := func() {
		if cur.Len() >= 2 { // Skip single-character (byte-length) words
			tokens = append(tokens, cur.String())
		}
		cur.Reset()
	}

	for _, r := range strings.ToLower(text) {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			cur.WriteRune(r)
			continue
		}
		flush()
	}
	flush() // don't forget the trailing word

	return tokens
}
|
|
|
|
// countMatches counts how many query words appear in the text.
// Matching is case-insensitive on the text side; queryWords are assumed
// already lowercased (they come from tokenize).
func countMatches(text string, queryWords []string) int {
	lowered := strings.ToLower(text)
	matched := 0
	for _, q := range queryWords {
		if strings.Contains(lowered, q) {
			matched++
		}
	}
	return matched
}
|
|
|
|
// extractSnippet extracts a short snippet around the first match.
// Uses rune-based indexing to properly handle multi-byte UTF-8 characters.
//
// With nil/empty queryWords it returns the first non-blank line that is
// not a markdown heading, truncated to snippetLen runes. Otherwise it
// takes a window around the earliest query-word occurrence and trims the
// window edges to word boundaries, adding "..." where text was cut.
//
// Index-alignment note: matchPos is a BYTE offset into contentLower (from
// strings.Index). strings.ToLower can change byte lengths for some
// characters (e.g. K U+212A, 3 bytes -> k, 1 byte), so the byte->rune
// conversion below must use contentLower, not content, to stay aligned.
// The resulting rune position is then applied to content's runes; this
// assumes ToLower preserves rune counts — TODO confirm for exotic case
// mappings.
func extractSnippet(content string, queryWords []string) string {
	if content == "" {
		return ""
	}

	// Maximum snippet length, in runes.
	const snippetLen = 150

	// If no query words, return start of content
	if len(queryWords) == 0 {
		lines := strings.Split(content, "\n")
		for _, line := range lines {
			line = strings.TrimSpace(line)
			// Skip blank lines and markdown headings ("#...").
			if line != "" && !strings.HasPrefix(line, "#") {
				runes := []rune(line)
				if len(runes) > snippetLen {
					return string(runes[:snippetLen]) + "..."
				}
				return line
			}
		}
		return ""
	}

	// Find first match position (byte-based for strings.Index)
	contentLower := strings.ToLower(content)
	matchPos := -1
	for _, word := range queryWords {
		pos := strings.Index(contentLower, word)
		if pos != -1 && (matchPos == -1 || pos < matchPos) {
			matchPos = pos
		}
	}

	// Convert to runes for safe slicing
	runes := []rune(content)
	runeLen := len(runes)

	if matchPos == -1 {
		// No match found, return start of content
		if runeLen > snippetLen {
			return string(runes[:snippetLen]) + "..."
		}
		return content
	}

	// Convert byte position to rune position (use same string as Index)
	matchRunePos := len([]rune(contentLower[:matchPos]))

	// Extract snippet around match (rune-based). Start 50 runes before the
	// match so the hit sits near the front of the 150-rune window.
	start := matchRunePos - 50
	if start < 0 {
		start = 0
	}

	end := start + snippetLen
	if end > runeLen {
		end = runeLen
	}

	snippet := string(runes[start:end])

	// Trim to word boundaries
	if start > 0 {
		// Window began mid-text: drop the partial first word, mark the cut.
		if idx := strings.Index(snippet, " "); idx != -1 {
			snippet = "..." + snippet[idx+1:]
		}
	}
	if end < runeLen {
		// Window ended mid-text: drop the partial last word, mark the cut.
		if idx := strings.LastIndex(snippet, " "); idx != -1 {
			snippet = snippet[:idx] + "..."
		}
	}

	return strings.TrimSpace(snippet)
}
|