feat(help): add markdown parsing and section extraction (#174)

* feat(help): add markdown parsing and section extraction Implements #137: markdown parsing and section extraction for help system. - Add Topic and Section types for help content structure - Add Frontmatter type for YAML metadata parsing - Add ParseTopic() to parse markdown files into Topic structs - Add ExtractFrontmatter() to extract YAML frontmatter - Add ExtractSections() to extract headings and content - Add GenerateID() to create URL-safe anchor IDs - Add comprehensive tests following _Good/_Bad naming convention This is the foundation for the display-agnostic help system (#133). Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir(). This is the same fix applied to TestDevOps_Boot_Good_Success in 3423e48. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(help): address CodeRabbit review feedback - Add CRLF line ending support to frontmatter regex - Add empty frontmatter block support - Use filepath.Base/Ext for cross-platform path handling - Add tests for CRLF and empty frontmatter cases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * feat(help): add full-text search functionality (#175) * fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir(). This is the same fix applied to TestDevOps_Boot_Good_Success in 3423e48. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * feat(help): add full-text search functionality Implements #139: full-text search for help topics. 
- Add searchIndex with inverted index for fast lookups - Add tokenize() for case-insensitive word extraction - Add Search() with relevance ranking: - Exact word matches score 1.0 - Prefix matches score 0.5 - Title matches get 2.0 boost - Add snippet extraction for search result context - Add section-level matching for precise results - Add comprehensive tests following _Good/_Bad naming Search features: - Case-insensitive matching - Partial word matching (prefix) - Title boost (matches in title rank higher) - Section-level results - Snippet extraction with context Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(help): address CodeRabbit review feedback - Add CRLF line ending support to frontmatter regex - Add empty frontmatter block support - Use filepath.Base/Ext for cross-platform path handling - Add tests for CRLF and empty frontmatter cases Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> * fix(help): use rune-based slicing for UTF-8 safe snippets Address CodeRabbit feedback: byte-based slicing can corrupt multi-byte UTF-8 characters. Now uses rune-based indexing for snippet extraction. - Convert content to []rune before slicing - Convert byte position to rune position for match location - Add UTF-8 validation tests with Japanese text Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(help): use correct string for byte-to-rune conversion in extractSnippet strings.ToLower can change byte lengths for certain Unicode characters (e.g., K U+212A 3 bytes → k 1 byte). Since matchPos is a byte index from strings.Index(contentLower, word), the rune conversion must also use contentLower to maintain correct index alignment. Fixes CodeRabbit review feedback. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 00:07:32 +00:00 · 2026-02-02 00:07:32 +00:00 · 12779ef67c
commit 12779ef67c
parent 547c65f264
6 changed files with 1133 additions and 2 deletions
--- a/pkg/devops/devops_test.go
+++ b/pkg/devops/devops_test.go
@@ -699,12 +699,14 @@ func TestDevOps_Stop_Bad_ContainerNotRunning(t *testing.T) {
 }

 func TestDevOps_Boot_Good_FreshWithNoExisting(t *testing.T) {
-	tempDir := t.TempDir()
+	tempDir, err := os.MkdirTemp("", "devops-boot-fresh-*")
+	require.NoError(t, err)
+	t.Cleanup(func() { os.RemoveAll(tempDir) })
 	t.Setenv("CORE_IMAGES_DIR", tempDir)

 	// Create fake image
 	imagePath := filepath.Join(tempDir, ImageName())
-	err := os.WriteFile(imagePath, []byte("fake"), 0644)
+	err = os.WriteFile(imagePath, []byte("fake"), 0644)
 	require.NoError(t, err)

 	cfg := DefaultConfig()
--- a/pkg/help/parser.go
+++ b/pkg/help/parser.go
@@ -0,0 +1,174 @@
+package help
+
+import (
+	"path/filepath"
+	"regexp"
+	"strings"
+	"unicode"
+
+	"gopkg.in/yaml.v3"
+)
+
+var (
+	// frontmatterRegex matches a YAML frontmatter block at the very start of
+	// the input, delimited by "---" lines.
+	//
+	// Capture group 1 holds the YAML between the delimiters (empty for an
+	// empty block). Both LF and CRLF line endings are accepted. The closing
+	// delimiter must sit on a line of its own (optionally followed by spaces
+	// or tabs), so a "---" inside a value such as "title: a---b" does not end
+	// the block early. The optional group is lazy (??) so that an empty block
+	// ("---\n---\n") closes at the first delimiter instead of running on to a
+	// later "---" line in the body.
+	frontmatterRegex = regexp.MustCompile(`(?s)^---\r?\n(?:(.*?)\r?\n)??---[ \t]*(?:\r?\n|$)`)
+
+	// headingRegex matches an ATX markdown heading on a single line: one to
+	// six '#' characters (group 1), whitespace, then the heading text (group 2).
+	headingRegex = regexp.MustCompile(`^(#{1,6})\s+(.+)$`)
+)
+
+// ParseTopic parses the markdown content of a help file into a Topic.
+//
+// path is stored on the topic and supplies the fallback ID, derived from the
+// file name. content is the raw file content. Metadata is resolved as follows:
+//   - YAML frontmatter, when present and valid, provides Title, Tags, Related
+//     and Order; a non-empty frontmatter title also determines the ID.
+//   - Without a frontmatter title, the first level-1 heading provides Title
+//     and ID.
+//   - If neither yields a title, Title stays empty and the path-derived ID is
+//     kept.
+//
+// Tags and Related keep their empty-slice defaults when the frontmatter omits
+// them, so they encode as JSON arrays rather than null. The returned error is
+// currently always nil.
+func ParseTopic(path string, content []byte) (*Topic, error) {
+	topic := &Topic{
+		Path:     path,
+		ID:       GenerateID(pathToTitle(path)),
+		Sections: []Section{},
+		Tags:     []string{},
+		Related:  []string{},
+	}
+
+	// Extract YAML frontmatter if present; body is the content without it.
+	fm, body := ExtractFrontmatter(string(content))
+	if fm != nil {
+		topic.Title = fm.Title
+		topic.Order = fm.Order
+		// Only replace the defaults when the frontmatter actually lists
+		// values; assigning a nil slice would serialize as null, not [].
+		if fm.Tags != nil {
+			topic.Tags = fm.Tags
+		}
+		if fm.Related != nil {
+			topic.Related = fm.Related
+		}
+		if topic.Title != "" {
+			topic.ID = GenerateID(topic.Title)
+		}
+	}
+
+	topic.Content = body
+
+	// Extract sections from headings
+	topic.Sections = ExtractSections(body)
+
+	// If no title from frontmatter, fall back to the first H1.
+	if topic.Title == "" {
+		for _, s := range topic.Sections {
+			if s.Level == 1 {
+				topic.Title = s.Title
+				topic.ID = GenerateID(s.Title)
+				break
+			}
+		}
+	}
+
+	return topic, nil
+}
+
+// ExtractFrontmatter extracts YAML frontmatter from markdown content.
+// Returns the parsed frontmatter and the remaining content.
+func ExtractFrontmatter(content string) (*Frontmatter, string) {
+	match := frontmatterRegex.FindStringSubmatch(content)
+	if match == nil {
+		return nil, content
+	}
+
+	var fm Frontmatter
+	if err := yaml.Unmarshal([]byte(match[1]), &fm); err != nil {
+		// Invalid YAML, return content as-is
+		return nil, content
+	}
+
+	// Return content without frontmatter
+	body := content[len(match[0]):]
+	return &fm, body
+}
+
+// ExtractSections splits markdown content into sections, one per heading.
+//
+// Every ATX heading ("#" to "######") starts a new section. Section.Line is
+// the 1-indexed line of the heading within content, and Section.Content is
+// the whitespace-trimmed text between that heading and the next heading of
+// any level. Text before the first heading belongs to no section.
+//
+// Lines inside fenced code blocks (opened by ``` or ~~~) are never treated as
+// headings, so a shell comment such as "# install" in an example stays part
+// of the surrounding section's content. The returned slice is never nil.
+func ExtractSections(content string) []Section {
+	sections := []Section{}
+
+	// body collects the lines of the section currently being built.
+	var body []string
+	// fence holds the opening marker while inside a fenced code block.
+	fence := ""
+
+	// flush stores the collected lines on the most recent section.
+	flush := func() {
+		if n := len(sections); n > 0 {
+			sections[n-1].Content = strings.TrimSpace(strings.Join(body, "\n"))
+		}
+	}
+
+	for i, line := range strings.Split(content, "\n") {
+		trimmed := strings.TrimSpace(line)
+
+		switch {
+		case fence != "":
+			// Inside a code block: only a matching marker closes it.
+			if strings.HasPrefix(trimmed, fence) {
+				fence = ""
+			}
+		case strings.HasPrefix(trimmed, "```"):
+			fence = "```"
+		case strings.HasPrefix(trimmed, "~~~"):
+			fence = "~~~"
+		default:
+			if match := headingRegex.FindStringSubmatch(line); match != nil {
+				// Finish the previous section before starting the next.
+				flush()
+
+				title := strings.TrimSpace(match[2])
+				sections = append(sections, Section{
+					ID:    GenerateID(title),
+					Title: title,
+					Level: len(match[1]),
+					Line:  i + 1, // 1-indexed
+				})
+				body = body[:0]
+				continue
+			}
+		}
+
+		if len(sections) > 0 {
+			body = append(body, line)
+		}
+	}
+
+	// Save the last section's content.
+	flush()
+
+	return sections
+}
+
+// GenerateID derives a URL-safe anchor ID from a title.
+//
+// The title is lowercased; letters and digits (including non-ASCII ones) are
+// kept, each run of whitespace, hyphens and underscores becomes a single
+// hyphen, and every other character is dropped. The result never starts or
+// ends with a hyphen.
+//
+//	"Getting Started" -> "getting-started"
+func GenerateID(title string) string {
+	var id strings.Builder
+	// prevHyphen reports whether the last rune written was a hyphen.
+	prevHyphen := false
+
+	for _, ch := range strings.ToLower(title) {
+		switch {
+		case unicode.IsLetter(ch), unicode.IsDigit(ch):
+			id.WriteRune(ch)
+			prevHyphen = false
+		case unicode.IsSpace(ch), ch == '-', ch == '_':
+			// Collapse separator runs and never start with one.
+			if id.Len() > 0 && !prevHyphen {
+				id.WriteByte('-')
+				prevHyphen = true
+			}
+		}
+		// Anything else is skipped.
+	}
+
+	// At most one hyphen can be left dangling at the end.
+	return strings.TrimSuffix(id.String(), "-")
+}
+
+// pathToTitle converts a file path to a human-readable title.
+//
+// The directory and the file extension are dropped, hyphens and underscores
+// are treated as word separators, and each word is written with an uppercase
+// first character followed by lowercase. Casing works on whole characters, so
+// words that start with a multi-byte character are handled correctly.
+//
+//	"getting-started.md" -> "Getting Started"
+func pathToTitle(path string) string {
+	// Get filename without directory (cross-platform)
+	filename := filepath.Base(path)
+
+	// Remove extension (a no-op when there is none)
+	filename = strings.TrimSuffix(filename, filepath.Ext(filename))
+
+	// Replace hyphens/underscores with spaces
+	filename = strings.ReplaceAll(filename, "-", " ")
+	filename = strings.ReplaceAll(filename, "_", " ")
+
+	// Title case. strings.Fields never yields an empty word, so each word
+	// has at least one rune. Slice by rune, not by byte, to keep multi-byte
+	// first characters intact.
+	words := strings.Fields(filename)
+	for i, word := range words {
+		runes := []rune(word)
+		words[i] = strings.ToUpper(string(runes[:1])) + strings.ToLower(string(runes[1:]))
+	}
+
+	return strings.Join(words, " ")
+}
--- a/pkg/help/parser_test.go
+++ b/pkg/help/parser_test.go
@@ -0,0 +1,339 @@
+package help
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestGenerateID_Good is a table-driven check of GenerateID covering
+// lowercasing, separator collapsing, dropped punctuation, trimming and
+// non-ASCII letters.
+func TestGenerateID_Good(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "simple title",
+			input:    "Getting Started",
+			expected: "getting-started",
+		},
+		{
+			name:     "already lowercase",
+			input:    "installation",
+			expected: "installation",
+		},
+		{
+			name:     "multiple spaces",
+			input:    "Quick   Start   Guide",
+			expected: "quick-start-guide",
+		},
+		{
+			name:     "with numbers",
+			input:    "Chapter 1 Introduction",
+			expected: "chapter-1-introduction",
+		},
+		{
+			name:     "special characters",
+			input:    "What's New? (v2.0)",
+			expected: "whats-new-v20",
+		},
+		{
+			name:     "underscores",
+			input:    "config_file_reference",
+			expected: "config-file-reference",
+		},
+		{
+			name:     "hyphens preserved",
+			input:    "pre-commit hooks",
+			expected: "pre-commit-hooks",
+		},
+		{
+			name:     "leading trailing spaces",
+			input:    "  Trimmed Title  ",
+			expected: "trimmed-title",
+		},
+		{
+			name:     "unicode letters",
+			input:    "Configuración Básica",
+			expected: "configuración-básica",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := GenerateID(tt.input)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+// TestExtractFrontmatter_Good checks that every frontmatter field is decoded
+// and that the returned body still contains the markdown after the block.
+func TestExtractFrontmatter_Good(t *testing.T) {
+	content := `---
+title: Getting Started
+tags: [intro, setup]
+order: 1
+related:
+  - installation
+  - configuration
+---
+
+# Welcome
+
+This is the content.
+`
+
+	fm, body := ExtractFrontmatter(content)
+
+	assert.NotNil(t, fm)
+	assert.Equal(t, "Getting Started", fm.Title)
+	assert.Equal(t, []string{"intro", "setup"}, fm.Tags)
+	assert.Equal(t, 1, fm.Order)
+	assert.Equal(t, []string{"installation", "configuration"}, fm.Related)
+	assert.Contains(t, body, "# Welcome")
+	assert.Contains(t, body, "This is the content.")
+}
+
+// TestExtractFrontmatter_Good_NoFrontmatter checks that content without a
+// frontmatter block yields a nil Frontmatter and is returned unchanged.
+func TestExtractFrontmatter_Good_NoFrontmatter(t *testing.T) {
+	content := `# Just a Heading
+
+Some content here.
+`
+
+	fm, body := ExtractFrontmatter(content)
+
+	assert.Nil(t, fm)
+	assert.Equal(t, content, body)
+}
+
+// TestExtractFrontmatter_Good_CRLF checks that frontmatter delimited with
+// Windows-style CRLF line endings is recognised and decoded.
+func TestExtractFrontmatter_Good_CRLF(t *testing.T) {
+	// Content with CRLF line endings (Windows-style)
+	content := "---\r\ntitle: CRLF Test\r\n---\r\n\r\n# Content"
+
+	fm, body := ExtractFrontmatter(content)
+
+	assert.NotNil(t, fm)
+	assert.Equal(t, "CRLF Test", fm.Title)
+	assert.Contains(t, body, "# Content")
+}
+
+// TestExtractFrontmatter_Good_Empty checks that an empty frontmatter block
+// (two delimiter lines with nothing between them) yields a zero-valued,
+// non-nil Frontmatter and the body after the block.
+func TestExtractFrontmatter_Good_Empty(t *testing.T) {
+	// Empty frontmatter block
+	content := "---\n---\n# Content"
+
+	fm, body := ExtractFrontmatter(content)
+
+	// Empty frontmatter should parse successfully
+	assert.NotNil(t, fm)
+	assert.Equal(t, "", fm.Title)
+	assert.Contains(t, body, "# Content")
+}
+
+// TestExtractFrontmatter_Bad_InvalidYAML checks the fallback for a
+// frontmatter block whose YAML cannot be decoded: no Frontmatter is returned
+// and the input, delimiters included, comes back untouched.
+func TestExtractFrontmatter_Bad_InvalidYAML(t *testing.T) {
+	content := `---
+title: [invalid yaml
+---
+
+# Content
+`
+
+	fm, body := ExtractFrontmatter(content)
+
+	// Invalid YAML should return nil frontmatter and original content
+	assert.Nil(t, fm)
+	assert.Equal(t, content, body)
+}
+
+// TestExtractSections_Good checks ID, title, level, line number and content
+// of the sections extracted from a document with nested heading levels.
+func TestExtractSections_Good(t *testing.T) {
+	content := `# Main Title
+
+Introduction paragraph.
+
+## Installation
+
+Install instructions here.
+More details.
+
+### Prerequisites
+
+You need these things.
+
+## Configuration
+
+Config info here.
+`
+
+	sections := ExtractSections(content)
+
+	assert.Len(t, sections, 4)
+
+	// Main Title (H1)
+	assert.Equal(t, "main-title", sections[0].ID)
+	assert.Equal(t, "Main Title", sections[0].Title)
+	assert.Equal(t, 1, sections[0].Level)
+	assert.Equal(t, 1, sections[0].Line)
+	assert.Contains(t, sections[0].Content, "Introduction paragraph.")
+
+	// Installation (H2)
+	assert.Equal(t, "installation", sections[1].ID)
+	assert.Equal(t, "Installation", sections[1].Title)
+	assert.Equal(t, 2, sections[1].Level)
+	assert.Contains(t, sections[1].Content, "Install instructions here.")
+	assert.Contains(t, sections[1].Content, "More details.")
+
+	// Prerequisites (H3)
+	assert.Equal(t, "prerequisites", sections[2].ID)
+	assert.Equal(t, "Prerequisites", sections[2].Title)
+	assert.Equal(t, 3, sections[2].Level)
+	assert.Contains(t, sections[2].Content, "You need these things.")
+
+	// Configuration (H2)
+	assert.Equal(t, "configuration", sections[3].ID)
+	assert.Equal(t, "Configuration", sections[3].Title)
+	assert.Equal(t, 2, sections[3].Level)
+}
+
+// TestExtractSections_Good_AllHeadingLevels checks that headings of every
+// level from H1 to H6 are recognised and reported with the right level.
+func TestExtractSections_Good_AllHeadingLevels(t *testing.T) {
+	content := `# H1
+## H2
+### H3
+#### H4
+##### H5
+###### H6
+`
+
+	sections := ExtractSections(content)
+
+	assert.Len(t, sections, 6)
+	for i, level := range []int{1, 2, 3, 4, 5, 6} {
+		assert.Equal(t, level, sections[i].Level)
+	}
+}
+
+// TestExtractSections_Good_Empty checks that content without any heading
+// produces no sections.
+func TestExtractSections_Good_Empty(t *testing.T) {
+	content := `Just plain text.
+No headings here.
+`
+
+	sections := ExtractSections(content)
+
+	assert.Empty(t, sections)
+}
+
+// TestParseTopic_Good parses a document with frontmatter and headings and
+// checks that metadata comes from the frontmatter, sections come from the
+// headings, and the stored content excludes the frontmatter block.
+func TestParseTopic_Good(t *testing.T) {
+	content := []byte(`---
+title: Quick Start Guide
+tags: [intro, quickstart]
+order: 5
+related:
+  - installation
+---
+
+# Quick Start Guide
+
+Welcome to the guide.
+
+## First Steps
+
+Do this first.
+
+## Next Steps
+
+Then do this.
+`)
+
+	topic, err := ParseTopic("docs/quick-start.md", content)
+
+	assert.NoError(t, err)
+	assert.NotNil(t, topic)
+
+	// Check metadata from frontmatter
+	assert.Equal(t, "quick-start-guide", topic.ID)
+	assert.Equal(t, "Quick Start Guide", topic.Title)
+	assert.Equal(t, "docs/quick-start.md", topic.Path)
+	assert.Equal(t, []string{"intro", "quickstart"}, topic.Tags)
+	assert.Equal(t, []string{"installation"}, topic.Related)
+	assert.Equal(t, 5, topic.Order)
+
+	// Check sections
+	assert.Len(t, topic.Sections, 3)
+	assert.Equal(t, "quick-start-guide", topic.Sections[0].ID)
+	assert.Equal(t, "first-steps", topic.Sections[1].ID)
+	assert.Equal(t, "next-steps", topic.Sections[2].ID)
+
+	// Content should not include frontmatter
+	assert.NotContains(t, topic.Content, "---")
+	assert.Contains(t, topic.Content, "# Quick Start Guide")
+}
+
+// TestParseTopic_Good_NoFrontmatter checks that, without frontmatter, the
+// title and ID are taken from the first H1 heading.
+func TestParseTopic_Good_NoFrontmatter(t *testing.T) {
+	content := []byte(`# Getting Started
+
+This is a simple doc.
+
+## Installation
+
+Install it here.
+`)
+
+	topic, err := ParseTopic("getting-started.md", content)
+
+	assert.NoError(t, err)
+	assert.NotNil(t, topic)
+
+	// Title should come from first H1
+	assert.Equal(t, "Getting Started", topic.Title)
+	assert.Equal(t, "getting-started", topic.ID)
+
+	// Sections extracted
+	assert.Len(t, topic.Sections, 2)
+}
+
+// TestParseTopic_Good_NoHeadings checks that a document with frontmatter but
+// no headings gets its title and ID from the frontmatter and has no sections.
+func TestParseTopic_Good_NoHeadings(t *testing.T) {
+	content := []byte(`---
+title: Plain Content
+---
+
+Just some text without any headings.
+`)
+
+	topic, err := ParseTopic("plain.md", content)
+
+	assert.NoError(t, err)
+	assert.NotNil(t, topic)
+	assert.Equal(t, "Plain Content", topic.Title)
+	assert.Equal(t, "plain-content", topic.ID)
+	assert.Empty(t, topic.Sections)
+}
+
+// TestParseTopic_Good_IDFromPath checks the fallback for a document with
+// neither frontmatter nor headings: the ID is derived from the file name and
+// the title is left empty.
+func TestParseTopic_Good_IDFromPath(t *testing.T) {
+	content := []byte(`Just content, no frontmatter or headings.`)
+
+	topic, err := ParseTopic("commands/dev-workflow.md", content)
+
+	assert.NoError(t, err)
+	assert.NotNil(t, topic)
+
+	// Only the ID is derived from the path; the title stays empty
+	assert.Equal(t, "dev-workflow", topic.ID)
+	assert.Equal(t, "", topic.Title) // No title available
+}
+
+// TestPathToTitle_Good is a table-driven check of pathToTitle covering
+// directory and extension stripping, separator handling and title casing.
+func TestPathToTitle_Good(t *testing.T) {
+	tests := []struct {
+		path     string
+		expected string
+	}{
+		{"getting-started.md", "Getting Started"},
+		{"commands/dev.md", "Dev"},
+		{"path/to/file_name.md", "File Name"},
+		{"UPPERCASE.md", "Uppercase"},
+		{"no-extension", "No Extension"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.path, func(t *testing.T) {
+			result := pathToTitle(tt.path)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
--- a/pkg/help/search.go
+++ b/pkg/help/search.go
@@ -0,0 +1,286 @@
+package help
+
+import (
+	"sort"
+	"strings"
+	"unicode"
+)
+
+// SearchResult represents a single topic returned by a search.
+//
+// Score is the relevance used for ranking (higher is better). Section points
+// at the best-matching section within Topic, when one was found. Snippet is a
+// short excerpt of the matching text for display.
+type SearchResult struct {
+	Topic   *Topic
+	Section *Section // nil if topic-level match
+	Score   float64
+	Snippet string // Context around match
+}
+
+// searchIndex provides full-text search over help topics.
+//
+// It is an inverted index: each lowercase word maps to the set of IDs of the
+// topics that contain it, and topics maps those IDs back to the topics.
+type searchIndex struct {
+	topics map[string]*Topic          // topicID -> Topic
+	index  map[string]map[string]bool // word -> set of topicIDs
+}
+
+// newSearchIndex returns an empty search index with both of its maps
+// allocated, ready for Add and Search.
+func newSearchIndex() *searchIndex {
+	idx := new(searchIndex)
+	idx.topics = map[string]*Topic{}
+	idx.index = map[string]map[string]bool{}
+	return idx
+}
+
+// Add registers a topic and indexes all of its searchable text.
+//
+// The words of the topic title, topic content, every section title and
+// content, and every tag are mapped to the topic's ID. All sources are
+// indexed alike; no weighting happens here. A topic added under an ID that
+// already exists replaces the stored topic, but words indexed for the earlier
+// topic are not removed.
+func (i *searchIndex) Add(topic *Topic) {
+	id := topic.ID
+	i.topics[id] = topic
+
+	// Gather every piece of searchable text first, then index it in one pass.
+	texts := make([]string, 0, 2+2*len(topic.Sections)+len(topic.Tags))
+	texts = append(texts, topic.Title, topic.Content)
+	for n := range topic.Sections {
+		texts = append(texts, topic.Sections[n].Title, topic.Sections[n].Content)
+	}
+	texts = append(texts, topic.Tags...)
+
+	for _, text := range texts {
+		for _, word := range tokenize(text) {
+			i.addToIndex(word, id)
+		}
+	}
+}
+
+// addToIndex records that topicID contains word, creating the word's ID set
+// on first use.
+func (i *searchIndex) addToIndex(word, topicID string) {
+	ids := i.index[word]
+	if ids == nil {
+		ids = make(map[string]bool)
+		i.index[word] = ids
+	}
+	ids[topicID] = true
+}
+
+// Search finds topics matching the query and returns them ranked by score.
+//
+// The query is tokenized the same way as indexed text. For every query word a
+// topic earns:
+//   - 1.0 when the word itself is indexed for the topic,
+//   - 0.5 for each longer indexed word of the topic that starts with it,
+//   - 2.0 when the lowercased topic title contains it.
+//
+// Each result also carries the best-matching section (if any) and a snippet.
+// Results are sorted by descending score; equal scores are ordered by
+// ascending topic ID so that the output is deterministic even though the
+// scores are gathered from maps. Search returns nil when the query contains
+// no indexable words or nothing matches.
+func (i *searchIndex) Search(query string) []*SearchResult {
+	queryWords := tokenize(query)
+	if len(queryWords) == 0 {
+		return nil
+	}
+
+	// Track scores per topic
+	scores := make(map[string]float64)
+
+	for _, word := range queryWords {
+		// Exact matches
+		for topicID := range i.index[word] {
+			scores[topicID] += 1.0
+		}
+
+		// Prefix matches (partial word matching)
+		for indexWord, topicIDs := range i.index {
+			if indexWord != word && strings.HasPrefix(indexWord, word) {
+				for topicID := range topicIDs {
+					scores[topicID] += 0.5 // Lower score for partial matches
+				}
+			}
+		}
+	}
+
+	// Build results with title boost and snippet extraction
+	var results []*SearchResult
+	for topicID, score := range scores {
+		topic := i.topics[topicID]
+		if topic == nil {
+			continue
+		}
+
+		// Title boost: if query words appear in title
+		titleLower := strings.ToLower(topic.Title)
+		for _, word := range queryWords {
+			if strings.Contains(titleLower, word) {
+				score += 2.0 // Title matches are worth more
+			}
+		}
+
+		// Find matching section and extract snippet
+		section, snippet := i.findBestMatch(topic, queryWords)
+
+		results = append(results, &SearchResult{
+			Topic:   topic,
+			Section: section,
+			Score:   score,
+			Snippet: snippet,
+		})
+	}
+
+	// Sort by score (highest first). Map iteration order is random, so break
+	// ties on the topic ID to keep the ranking stable between calls.
+	sort.Slice(results, func(a, b int) bool {
+		if results[a].Score != results[b].Score {
+			return results[a].Score > results[b].Score
+		}
+		return results[a].Topic.ID < results[b].Topic.ID
+	})
+
+	return results
+}
+
+// findBestMatch picks the section of topic that best matches queryWords and
+// builds a snippet for the result.
+//
+// A section's rank is twice the number of query words found in its title plus
+// the number found in its content. The first section with the highest
+// non-zero rank wins; nil is returned when no section matches. The snippet
+// comes from the winning section, or from the topic content when no section
+// supplied one.
+func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section, string) {
+	var (
+		winner  *Section
+		snippet string
+		top     int
+	)
+
+	// A hit in the topic title seeds the snippet from the topic content.
+	if countMatches(topic.Title, queryWords) > 0 {
+		snippet = extractSnippet(topic.Content, queryWords)
+	}
+
+	for n := range topic.Sections {
+		candidate := &topic.Sections[n]
+		inTitle := countMatches(candidate.Title, queryWords)
+		inBody := countMatches(candidate.Content, queryWords)
+
+		// Title matches are worth more. Only a strictly better section
+		// replaces the current winner, so ties keep the earliest one.
+		rank := 2*inTitle + inBody
+		if rank <= top {
+			continue
+		}
+		top, winner = rank, candidate
+
+		words := queryWords
+		if inBody == 0 {
+			// Matched in the heading only: show the section's opening text.
+			words = nil
+		}
+		snippet = extractSnippet(candidate.Content, words)
+	}
+
+	// If no section supplied a snippet, use the topic content.
+	if snippet == "" && topic.Content != "" {
+		snippet = extractSnippet(topic.Content, queryWords)
+	}
+
+	return winner, snippet
+}
+
+// tokenize splits text into lowercase words for indexing and searching.
+//
+// A word is a maximal run of letters and digits; everything else separates
+// words. Words shorter than two bytes are dropped, which removes single ASCII
+// characters but keeps a lone multi-byte character. The result is nil when no
+// word qualifies.
+func tokenize(text string) []string {
+	isSeparator := func(r rune) bool {
+		return !unicode.IsLetter(r) && !unicode.IsDigit(r)
+	}
+
+	var words []string
+	for _, field := range strings.FieldsFunc(strings.ToLower(text), isSeparator) {
+		if len(field) >= 2 { // Skip single-character words
+			words = append(words, field)
+		}
+	}
+	return words
+}
+
+// countMatches reports how many of queryWords occur in text.
+//
+// text is lowercased before the comparison while the query words are used as
+// given, so callers are expected to pass lowercase words. Matching is by
+// substring, and each query word counts at most once.
+func countMatches(text string, queryWords []string) int {
+	haystack := strings.ToLower(text)
+	hits := 0
+	for idx := 0; idx < len(queryWords); idx++ {
+		if !strings.Contains(haystack, queryWords[idx]) {
+			continue
+		}
+		hits++
+	}
+	return hits
+}
+
+// extractSnippet extracts a short snippet of content around the first match
+// of any query word.
+//
+// Behaviour by case:
+//   - Empty content yields "".
+//   - Without query words, the first non-empty line that is not a markdown
+//     heading is returned, cut to 150 characters plus "..." if longer.
+//   - When no query word occurs, the first 150 characters are returned, with
+//     "..." appended if content is longer.
+//   - Otherwise a window of up to 150 characters is taken, starting up to 50
+//     characters before the earliest match, and trimmed to word boundaries
+//     with "..." marking the cut ends.
+//
+// Query words are expected in lowercase; content is lowercased for matching.
+// All slicing is done on runes so multi-byte UTF-8 characters are never
+// split. Word-boundary trimming never removes the matched word itself: a
+// boundary is only used when it lies outside the match, which matters for
+// text with long runs without spaces.
+func extractSnippet(content string, queryWords []string) string {
+	if content == "" {
+		return ""
+	}
+
+	const snippetLen = 150
+
+	// If no query words, return start of content
+	if len(queryWords) == 0 {
+		for _, line := range strings.Split(content, "\n") {
+			line = strings.TrimSpace(line)
+			if line != "" && !strings.HasPrefix(line, "#") {
+				runes := []rune(line)
+				if len(runes) > snippetLen {
+					return string(runes[:snippetLen]) + "..."
+				}
+				return line
+			}
+		}
+		return ""
+	}
+
+	// Find the earliest match (byte positions, as reported by strings.Index)
+	// and remember how long the matched word is.
+	contentLower := strings.ToLower(content)
+	matchPos, matchLen := -1, 0
+	for _, word := range queryWords {
+		pos := strings.Index(contentLower, word)
+		if pos != -1 && (matchPos == -1 || pos < matchPos) {
+			matchPos, matchLen = pos, len(word)
+		}
+	}
+
+	// Convert to runes for safe slicing
+	runes := []rune(content)
+	runeLen := len(runes)
+
+	if matchPos == -1 {
+		// No match found, return start of content
+		if runeLen > snippetLen {
+			return string(runes[:snippetLen]) + "..."
+		}
+		return content
+	}
+
+	// Convert byte positions to rune positions. Lowercasing can change byte
+	// lengths but not the number of runes, so positions must be counted in
+	// the same string that strings.Index searched.
+	matchRunePos := len([]rune(contentLower[:matchPos]))
+	matchRuneEnd := matchRunePos + len([]rune(contentLower[matchPos:matchPos+matchLen]))
+
+	// Extract snippet around match (rune-based)
+	start := matchRunePos - 50
+	if start < 0 {
+		start = 0
+	}
+
+	end := start + snippetLen
+	if end > runeLen {
+		end = runeLen
+	}
+	if matchRuneEnd > end {
+		matchRuneEnd = end
+	}
+
+	snippet := string(runes[start:end])
+
+	// Byte offsets of the match inside snippet, used to protect it below.
+	matchStart := len(string(runes[start:matchRunePos]))
+	matchEnd := len(string(runes[start:matchRuneEnd]))
+
+	// Trim to word boundaries. The tail goes first so that the offsets
+	// computed above still apply when the head is trimmed afterwards.
+	if end < runeLen {
+		if idx := strings.LastIndex(snippet, " "); idx != -1 && idx >= matchEnd {
+			snippet = snippet[:idx] + "..."
+		}
+	}
+	if start > 0 {
+		if idx := strings.Index(snippet, " "); idx != -1 && idx < matchStart {
+			snippet = "..." + snippet[idx+1:]
+		}
+	}
+
+	return strings.TrimSpace(snippet)
+}
--- a/pkg/help/search_test.go
+++ b/pkg/help/search_test.go
@@ -0,0 +1,299 @@
+package help
+
+import (
+	"strings"
+	"testing"
+	"unicode/utf8"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestTokenize_Good is a table-driven check of tokenize covering
+// lowercasing, punctuation, short-word filtering and alphanumeric words.
+func TestTokenize_Good(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected []string
+	}{
+		{
+			name:     "simple words",
+			input:    "hello world",
+			expected: []string{"hello", "world"},
+		},
+		{
+			name:     "mixed case",
+			input:    "Hello World",
+			expected: []string{"hello", "world"},
+		},
+		{
+			name:     "with punctuation",
+			input:    "Hello, world! How are you?",
+			expected: []string{"hello", "world", "how", "are", "you"},
+		},
+		{
+			name:     "single characters filtered",
+			input:    "a b c hello d",
+			expected: []string{"hello"},
+		},
+		{
+			// A lone digit is a single-character word, so it is dropped
+			// like any other one-character token.
+			name:     "numbers included",
+			input:    "version 2 release",
+			expected: []string{"version", "release"},
+		},
+		{
+			name:     "alphanumeric",
+			input:    "v2.0 and config123",
+			expected: []string{"v2", "and", "config123"},
+		},
+		{
+			name:     "empty string",
+			input:    "",
+			expected: nil,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := tokenize(tt.input)
+			assert.Equal(t, tt.expected, result)
+		})
+	}
+}
+
+// TestSearchIndex_Add_Good checks that Add stores the topic and indexes words
+// from its title, content, tags, section titles and section content.
+func TestSearchIndex_Add_Good(t *testing.T) {
+	idx := newSearchIndex()
+
+	topic := &Topic{
+		ID:      "getting-started",
+		Title:   "Getting Started",
+		Content: "Welcome to the guide.",
+		Tags:    []string{"intro", "setup"},
+		Sections: []Section{
+			{ID: "installation", Title: "Installation", Content: "Install the CLI."},
+		},
+	}
+
+	idx.Add(topic)
+
+	// Verify topic is stored
+	assert.NotNil(t, idx.topics["getting-started"])
+
+	// Verify words are indexed
+	assert.Contains(t, idx.index["getting"], "getting-started")
+	assert.Contains(t, idx.index["started"], "getting-started")
+	assert.Contains(t, idx.index["welcome"], "getting-started")
+	assert.Contains(t, idx.index["guide"], "getting-started")
+	assert.Contains(t, idx.index["intro"], "getting-started")
+	assert.Contains(t, idx.index["setup"], "getting-started")
+	assert.Contains(t, idx.index["installation"], "getting-started")
+	assert.Contains(t, idx.index["cli"], "getting-started")
+}
+
+// TestSearchIndex_Search_Good exercises Search against three topics:
+// single-word and multi-word queries, the title boost, prefix matching, a
+// query without hits and an empty query.
+func TestSearchIndex_Search_Good(t *testing.T) {
+	idx := newSearchIndex()
+
+	// Add test topics
+	idx.Add(&Topic{
+		ID:      "getting-started",
+		Title:   "Getting Started",
+		Content: "Welcome to the CLI guide. This covers installation and setup.",
+		Tags:    []string{"intro"},
+	})
+
+	idx.Add(&Topic{
+		ID:      "configuration",
+		Title:   "Configuration",
+		Content: "Configure the CLI using environment variables.",
+	})
+
+	idx.Add(&Topic{
+		ID:      "commands",
+		Title:   "Commands Reference",
+		Content: "List of all available commands.",
+	})
+
+	t.Run("single word query", func(t *testing.T) {
+		results := idx.Search("configuration")
+		assert.NotEmpty(t, results)
+		assert.Equal(t, "configuration", results[0].Topic.ID)
+	})
+
+	t.Run("multi-word query", func(t *testing.T) {
+		results := idx.Search("cli guide")
+		assert.NotEmpty(t, results)
+		// Should match getting-started (has both "cli" and "guide")
+		assert.Equal(t, "getting-started", results[0].Topic.ID)
+	})
+
+	t.Run("title boost", func(t *testing.T) {
+		results := idx.Search("commands")
+		assert.NotEmpty(t, results)
+		// "commands" appears in title of commands topic
+		assert.Equal(t, "commands", results[0].Topic.ID)
+	})
+
+	t.Run("partial word matching", func(t *testing.T) {
+		results := idx.Search("config")
+		assert.NotEmpty(t, results)
+		// Should match "configuration" and "configure"
+		foundConfig := false
+		for _, r := range results {
+			if r.Topic.ID == "configuration" {
+				foundConfig = true
+				break
+			}
+		}
+		assert.True(t, foundConfig, "Should find configuration topic with prefix match")
+	})
+
+	t.Run("no results", func(t *testing.T) {
+		results := idx.Search("nonexistent")
+		assert.Empty(t, results)
+	})
+
+	t.Run("empty query", func(t *testing.T) {
+		results := idx.Search("")
+		assert.Nil(t, results)
+	})
+}
+
+// TestSearchIndex_Search_Good_WithSections checks section-level matching: a
+// query that hits one section's content or title must return the topic with
+// that section identified as the best match.
+func TestSearchIndex_Search_Good_WithSections(t *testing.T) {
+	idx := newSearchIndex()
+
+	idx.Add(&Topic{
+		ID:      "installation",
+		Title:   "Installation Guide",
+		Content: "Overview of installation process.",
+		Sections: []Section{
+			{
+				ID:      "linux",
+				Title:   "Linux Installation",
+				Content: "Run apt-get install core on Debian.",
+			},
+			{
+				ID:      "macos",
+				Title:   "macOS Installation",
+				Content: "Use brew install core on macOS.",
+			},
+			{
+				ID:      "windows",
+				Title:   "Windows Installation",
+				Content: "Download the installer from the website.",
+			},
+		},
+	})
+
+	t.Run("matches section content", func(t *testing.T) {
+		results := idx.Search("debian")
+		assert.NotEmpty(t, results)
+		assert.Equal(t, "installation", results[0].Topic.ID)
+		// Should identify the Linux section as best match. A missing section
+		// is a failure, not a reason to skip the check.
+		if assert.NotNil(t, results[0].Section) {
+			assert.Equal(t, "linux", results[0].Section.ID)
+		}
+	})
+
+	t.Run("matches section title", func(t *testing.T) {
+		results := idx.Search("windows")
+		assert.NotEmpty(t, results)
+		assert.Equal(t, "installation", results[0].Topic.ID)
+		// Only the Windows section mentions the query word, in its title.
+		if assert.NotNil(t, results[0].Section) {
+			assert.Equal(t, "windows", results[0].Section.ID)
+		}
+	})
+}
+
+// TestExtractSnippet_Good checks extractSnippet for a matching query word,
+// for a nil query (start of content) and for empty content.
+func TestExtractSnippet_Good(t *testing.T) {
+	content := `This is the first paragraph with some introduction text.
+
+Here is more content that talks about installation and setup.
+The installation process is straightforward.
+
+Finally, some closing remarks about the configuration.`
+
+	t.Run("finds match and extracts context", func(t *testing.T) {
+		snippet := extractSnippet(content, []string{"installation"})
+		assert.Contains(t, snippet, "installation")
+		assert.True(t, len(snippet) <= 200, "Snippet should be reasonably short")
+	})
+
+	t.Run("no query words returns start", func(t *testing.T) {
+		snippet := extractSnippet(content, nil)
+		assert.Contains(t, snippet, "first paragraph")
+	})
+
+	t.Run("empty content", func(t *testing.T) {
+		snippet := extractSnippet("", []string{"test"})
+		assert.Empty(t, snippet)
+	})
+}
+
+// TestExtractSnippet_Good_UTF8 checks that snippets cut from content with
+// multi-byte characters remain valid UTF-8, both around a match and when
+// long content is truncated.
+func TestExtractSnippet_Good_UTF8(t *testing.T) {
+	// Content with multi-byte UTF-8 characters
+	content := "日本語のテキストです。This contains Japanese text. 検索機能をテストします。"
+
+	t.Run("handles multi-byte characters without corruption", func(t *testing.T) {
+		snippet := extractSnippet(content, []string{"japanese"})
+		// Should not panic or produce invalid UTF-8
+		assert.True(t, len(snippet) > 0)
+		// Verify the result is valid UTF-8
+		assert.True(t, isValidUTF8(snippet), "Snippet should be valid UTF-8")
+	})
+
+	t.Run("truncates multi-byte content safely", func(t *testing.T) {
+		// Long content that will be truncated
+		longContent := strings.Repeat("日本語", 100) // 300 characters
+		snippet := extractSnippet(longContent, nil)
+		assert.True(t, isValidUTF8(snippet), "Truncated snippet should be valid UTF-8")
+	})
+}
+
+// isValidUTF8 reports whether s consists entirely of valid UTF-8 sequences.
+// The empty string is valid.
+func isValidUTF8(s string) bool {
+	return utf8.ValidString(s)
+}
+
+// TestCountMatches_Good is a table-driven check that countMatches counts
+// each query word found in the text, ignoring the text's case.
+func TestCountMatches_Good(t *testing.T) {
+	tests := []struct {
+		text     string
+		words    []string
+		expected int
+	}{
+		{"Hello world", []string{"hello"}, 1},
+		{"Hello world", []string{"hello", "world"}, 2},
+		{"Hello world", []string{"foo", "bar"}, 0},
+		{"The quick brown fox", []string{"quick", "fox", "dog"}, 2},
+	}
+
+	for _, tt := range tests {
+		result := countMatches(tt.text, tt.words)
+		assert.Equal(t, tt.expected, result)
+	}
+}
+
+// TestSearchResult_Score_Good checks the title boost: of two topics matching
+// the same word, the one with the word in its title must rank first with a
+// strictly higher score.
+func TestSearchResult_Score_Good(t *testing.T) {
+	idx := newSearchIndex()
+
+	// Topic with query word in title should score higher
+	idx.Add(&Topic{
+		ID:      "topic-in-title",
+		Title:   "Installation Guide",
+		Content: "Some content here.",
+	})
+
+	idx.Add(&Topic{
+		ID:      "topic-in-content",
+		Title:   "Some Other Topic",
+		Content: "This covers installation steps.",
+	})
+
+	results := idx.Search("installation")
+	assert.Len(t, results, 2)
+
+	// Title match should score higher
+	assert.Equal(t, "topic-in-title", results[0].Topic.ID)
+	assert.Greater(t, results[0].Score, results[1].Score)
+}
--- a/pkg/help/topic.go
+++ b/pkg/help/topic.go
@@ -0,0 +1,31 @@
+// Package help provides display-agnostic help content management.
+package help
+
+// Topic represents a help topic/page.
+//
+// ID is a URL-safe identifier produced by GenerateID. Path is the path the
+// topic was parsed from. Content is the markdown body with any frontmatter
+// removed, and Sections lists the headings found in it. Tags, Related and
+// Order carry the corresponding frontmatter values.
+type Topic struct {
+	ID       string    `json:"id"`
+	Title    string    `json:"title"`
+	Path     string    `json:"path"`
+	Content  string    `json:"content"`
+	Sections []Section `json:"sections"`
+	Tags     []string  `json:"tags"`
+	Related  []string  `json:"related"`
+	Order    int       `json:"order"` // For sorting
+}
+
+// Section represents a heading within a topic, together with the text that
+// follows it.
+//
+// ID is the URL-safe anchor generated from Title, and Level is the heading
+// level (1 for "#" up to 6 for "######").
+type Section struct {
+	ID      string `json:"id"`
+	Title   string `json:"title"`
+	Level   int    `json:"level"`
+	Line    int    `json:"line"`    // Start line in content (1-indexed)
+	Content string `json:"content"` // Content under heading
+}
+
+// Frontmatter represents the YAML frontmatter metadata at the top of a help
+// file. Every field is optional; a missing key leaves the field at its zero
+// value.
+type Frontmatter struct {
+	Title   string   `yaml:"title"`
+	Tags    []string `yaml:"tags"`
+	Related []string `yaml:"related"`
+	Order   int      `yaml:"order"`
+}