From 7fede9d563bb58c37d940f2b7fc96eafbfffa997 Mon Sep 17 00:00:00 2001 From: Snider Date: Mon, 2 Feb 2026 00:07:32 +0000 Subject: [PATCH] feat(help): add markdown parsing and section extraction (#174) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(help): add markdown parsing and section extraction Implements #137: markdown parsing and section extraction for help system. - Add Topic and Section types for help content structure - Add Frontmatter type for YAML metadata parsing - Add ParseTopic() to parse markdown files into Topic structs - Add ExtractFrontmatter() to extract YAML frontmatter - Add ExtractSections() to extract headings and content - Add GenerateID() to create URL-safe anchor IDs - Add comprehensive tests following _Good/_Bad naming convention This is the foundation for the display-agnostic help system (#133). Co-Authored-By: Claude Opus 4.5 * fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir(). This is the same fix applied to TestDevOps_Boot_Good_Success in 8effbda. Co-Authored-By: Claude Opus 4.5 * fix(help): address CodeRabbit review feedback - Add CRLF line ending support to frontmatter regex - Add empty frontmatter block support - Use filepath.Base/Ext for cross-platform path handling - Add tests for CRLF and empty frontmatter cases Co-Authored-By: Claude Opus 4.5 * feat(help): add full-text search functionality (#175) * fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir(). This is the same fix applied to TestDevOps_Boot_Good_Success in 8effbda. Co-Authored-By: Claude Opus 4.5 * feat(help): add full-text search functionality Implements #139: full-text search for help topics. 
- Add searchIndex with inverted index for fast lookups - Add tokenize() for case-insensitive word extraction - Add Search() with relevance ranking: - Exact word matches score 1.0 - Prefix matches score 0.5 - Title matches get 2.0 boost - Add snippet extraction for search result context - Add section-level matching for precise results - Add comprehensive tests following _Good/_Bad naming Search features: - Case-insensitive matching - Partial word matching (prefix) - Title boost (matches in title rank higher) - Section-level results - Snippet extraction with context Co-Authored-By: Claude Opus 4.5 * fix(help): address CodeRabbit review feedback - Add CRLF line ending support to frontmatter regex - Add empty frontmatter block support - Use filepath.Base/Ext for cross-platform path handling - Add tests for CRLF and empty frontmatter cases Co-Authored-By: Claude Opus 4.5 --------- Co-authored-by: Claude Opus 4.5 * fix(help): use rune-based slicing for UTF-8 safe snippets Address CodeRabbit feedback: byte-based slicing can corrupt multi-byte UTF-8 characters. Now uses rune-based indexing for snippet extraction. - Convert content to []rune before slicing - Convert byte position to rune position for match location - Add UTF-8 validation tests with Japanese text Co-Authored-By: Claude Opus 4.5 * fix(help): use correct string for byte-to-rune conversion in extractSnippet strings.ToLower can change byte lengths for certain Unicode characters (e.g., K U+212A 3 bytes → k 1 byte). Since matchPos is a byte index from strings.Index(contentLower, word), the rune conversion must also use contentLower to maintain correct index alignment. Fixes CodeRabbit review feedback. 
Co-Authored-By: Claude Opus 4.5 --------- Co-authored-by: Claude Opus 4.5 --- pkg/devops/devops_test.go | 6 +- pkg/help/parser.go | 174 +++++++++++++++++++ pkg/help/parser_test.go | 339 ++++++++++++++++++++++++++++++++++++++ pkg/help/search.go | 286 ++++++++++++++++++++++++++++++++ pkg/help/search_test.go | 299 +++++++++++++++++++++++++++++++++ pkg/help/topic.go | 31 ++++ 6 files changed, 1133 insertions(+), 2 deletions(-) create mode 100644 pkg/help/parser.go create mode 100644 pkg/help/parser_test.go create mode 100644 pkg/help/search.go create mode 100644 pkg/help/search_test.go create mode 100644 pkg/help/topic.go diff --git a/pkg/devops/devops_test.go b/pkg/devops/devops_test.go index edb5742..65f45c9 100644 --- a/pkg/devops/devops_test.go +++ b/pkg/devops/devops_test.go @@ -699,12 +699,14 @@ func TestDevOps_Stop_Bad_ContainerNotRunning(t *testing.T) { } func TestDevOps_Boot_Good_FreshWithNoExisting(t *testing.T) { - tempDir := t.TempDir() + tempDir, err := os.MkdirTemp("", "devops-boot-fresh-*") + require.NoError(t, err) + t.Cleanup(func() { os.RemoveAll(tempDir) }) t.Setenv("CORE_IMAGES_DIR", tempDir) // Create fake image imagePath := filepath.Join(tempDir, ImageName()) - err := os.WriteFile(imagePath, []byte("fake"), 0644) + err = os.WriteFile(imagePath, []byte("fake"), 0644) require.NoError(t, err) cfg := DefaultConfig() diff --git a/pkg/help/parser.go b/pkg/help/parser.go new file mode 100644 index 0000000..a92b490 --- /dev/null +++ b/pkg/help/parser.go @@ -0,0 +1,174 @@ +package help + +import ( + "path/filepath" + "regexp" + "strings" + "unicode" + + "gopkg.in/yaml.v3" +) + +var ( + // frontmatterRegex matches YAML frontmatter delimited by --- + // Supports both LF and CRLF line endings, and empty frontmatter blocks + frontmatterRegex = regexp.MustCompile(`(?s)^---\r?\n(.*?)(?:\r?\n)?---\r?\n?`) + + // headingRegex matches markdown headings (# to ######) + headingRegex = regexp.MustCompile(`^(#{1,6})\s+(.+)$`) +) + +// ParseTopic parses a markdown file 
into a Topic. +func ParseTopic(path string, content []byte) (*Topic, error) { + contentStr := string(content) + + topic := &Topic{ + Path: path, + ID: GenerateID(pathToTitle(path)), + Sections: []Section{}, + Tags: []string{}, + Related: []string{}, + } + + // Extract YAML frontmatter if present + fm, body := ExtractFrontmatter(contentStr) + if fm != nil { + topic.Title = fm.Title + topic.Tags = fm.Tags + topic.Related = fm.Related + topic.Order = fm.Order + if topic.Title != "" { + topic.ID = GenerateID(topic.Title) + } + } + + topic.Content = body + + // Extract sections from headings + topic.Sections = ExtractSections(body) + + // If no title from frontmatter, try first H1 + if topic.Title == "" && len(topic.Sections) > 0 { + for _, s := range topic.Sections { + if s.Level == 1 { + topic.Title = s.Title + topic.ID = GenerateID(s.Title) + break + } + } + } + + return topic, nil +} + +// ExtractFrontmatter extracts YAML frontmatter from markdown content. +// Returns the parsed frontmatter and the remaining content. +func ExtractFrontmatter(content string) (*Frontmatter, string) { + match := frontmatterRegex.FindStringSubmatch(content) + if match == nil { + return nil, content + } + + var fm Frontmatter + if err := yaml.Unmarshal([]byte(match[1]), &fm); err != nil { + // Invalid YAML, return content as-is + return nil, content + } + + // Return content without frontmatter + body := content[len(match[0]):] + return &fm, body +} + +// ExtractSections parses markdown and returns sections. 
+func ExtractSections(content string) []Section { + lines := strings.Split(content, "\n") + sections := []Section{} + + var currentSection *Section + var contentLines []string + + for i, line := range lines { + lineNum := i + 1 // 1-indexed + + match := headingRegex.FindStringSubmatch(line) + if match != nil { + // Save previous section's content + if currentSection != nil { + currentSection.Content = strings.TrimSpace(strings.Join(contentLines, "\n")) + } + + // Start new section + level := len(match[1]) + title := strings.TrimSpace(match[2]) + + section := Section{ + ID: GenerateID(title), + Title: title, + Level: level, + Line: lineNum, + } + sections = append(sections, section) + currentSection = &sections[len(sections)-1] + contentLines = []string{} + } else if currentSection != nil { + contentLines = append(contentLines, line) + } + } + + // Save last section's content + if currentSection != nil { + currentSection.Content = strings.TrimSpace(strings.Join(contentLines, "\n")) + } + + return sections +} + +// GenerateID creates a URL-safe ID from a title. +// "Getting Started" -> "getting-started" +func GenerateID(title string) string { + var result strings.Builder + + for _, r := range strings.ToLower(title) { + if unicode.IsLetter(r) || unicode.IsDigit(r) { + result.WriteRune(r) + } else if unicode.IsSpace(r) || r == '-' || r == '_' { + // Only add hyphen if last char isn't already a hyphen + str := result.String() + if len(str) > 0 && str[len(str)-1] != '-' { + result.WriteRune('-') + } + } + // Skip other characters + } + + // Trim trailing hyphens + str := result.String() + return strings.Trim(str, "-") +} + +// pathToTitle converts a file path to a title. 
+// "getting-started.md" -> "Getting Started" +func pathToTitle(path string) string { + // Get filename without directory (cross-platform) + filename := filepath.Base(path) + + // Remove extension + if ext := filepath.Ext(filename); ext != "" { + filename = strings.TrimSuffix(filename, ext) + } + + // Replace hyphens/underscores with spaces + filename = strings.ReplaceAll(filename, "-", " ") + filename = strings.ReplaceAll(filename, "_", " ") + + // Title case + words := strings.Fields(filename) + for i, word := range words { + if len(word) > 0 { + words[i] = strings.ToUpper(string(word[0])) + strings.ToLower(word[1:]) + } + } + + return strings.Join(words, " ") +} diff --git a/pkg/help/parser_test.go b/pkg/help/parser_test.go new file mode 100644 index 0000000..b95cadc --- /dev/null +++ b/pkg/help/parser_test.go @@ -0,0 +1,339 @@ +package help + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGenerateID_Good(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "simple title", + input: "Getting Started", + expected: "getting-started", + }, + { + name: "already lowercase", + input: "installation", + expected: "installation", + }, + { + name: "multiple spaces", + input: "Quick Start Guide", + expected: "quick-start-guide", + }, + { + name: "with numbers", + input: "Chapter 1 Introduction", + expected: "chapter-1-introduction", + }, + { + name: "special characters", + input: "What's New? 
(v2.0)", + expected: "whats-new-v20", + }, + { + name: "underscores", + input: "config_file_reference", + expected: "config-file-reference", + }, + { + name: "hyphens preserved", + input: "pre-commit hooks", + expected: "pre-commit-hooks", + }, + { + name: "leading trailing spaces", + input: " Trimmed Title ", + expected: "trimmed-title", + }, + { + name: "unicode letters", + input: "Configuración Básica", + expected: "configuración-básica", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := GenerateID(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestExtractFrontmatter_Good(t *testing.T) { + content := `--- +title: Getting Started +tags: [intro, setup] +order: 1 +related: + - installation + - configuration +--- + +# Welcome + +This is the content. +` + + fm, body := ExtractFrontmatter(content) + + assert.NotNil(t, fm) + assert.Equal(t, "Getting Started", fm.Title) + assert.Equal(t, []string{"intro", "setup"}, fm.Tags) + assert.Equal(t, 1, fm.Order) + assert.Equal(t, []string{"installation", "configuration"}, fm.Related) + assert.Contains(t, body, "# Welcome") + assert.Contains(t, body, "This is the content.") +} + +func TestExtractFrontmatter_Good_NoFrontmatter(t *testing.T) { + content := `# Just a Heading + +Some content here. 
+` + + fm, body := ExtractFrontmatter(content) + + assert.Nil(t, fm) + assert.Equal(t, content, body) +} + +func TestExtractFrontmatter_Good_CRLF(t *testing.T) { + // Content with CRLF line endings (Windows-style) + content := "---\r\ntitle: CRLF Test\r\n---\r\n\r\n# Content" + + fm, body := ExtractFrontmatter(content) + + assert.NotNil(t, fm) + assert.Equal(t, "CRLF Test", fm.Title) + assert.Contains(t, body, "# Content") +} + +func TestExtractFrontmatter_Good_Empty(t *testing.T) { + // Empty frontmatter block + content := "---\n---\n# Content" + + fm, body := ExtractFrontmatter(content) + + // Empty frontmatter should parse successfully + assert.NotNil(t, fm) + assert.Equal(t, "", fm.Title) + assert.Contains(t, body, "# Content") +} + +func TestExtractFrontmatter_Bad_InvalidYAML(t *testing.T) { + content := `--- +title: [invalid yaml +--- + +# Content +` + + fm, body := ExtractFrontmatter(content) + + // Invalid YAML should return nil frontmatter and original content + assert.Nil(t, fm) + assert.Equal(t, content, body) +} + +func TestExtractSections_Good(t *testing.T) { + content := `# Main Title + +Introduction paragraph. + +## Installation + +Install instructions here. +More details. + +### Prerequisites + +You need these things. + +## Configuration + +Config info here. 
+` + + sections := ExtractSections(content) + + assert.Len(t, sections, 4) + + // Main Title (H1) + assert.Equal(t, "main-title", sections[0].ID) + assert.Equal(t, "Main Title", sections[0].Title) + assert.Equal(t, 1, sections[0].Level) + assert.Equal(t, 1, sections[0].Line) + assert.Contains(t, sections[0].Content, "Introduction paragraph.") + + // Installation (H2) + assert.Equal(t, "installation", sections[1].ID) + assert.Equal(t, "Installation", sections[1].Title) + assert.Equal(t, 2, sections[1].Level) + assert.Contains(t, sections[1].Content, "Install instructions here.") + assert.Contains(t, sections[1].Content, "More details.") + + // Prerequisites (H3) + assert.Equal(t, "prerequisites", sections[2].ID) + assert.Equal(t, "Prerequisites", sections[2].Title) + assert.Equal(t, 3, sections[2].Level) + assert.Contains(t, sections[2].Content, "You need these things.") + + // Configuration (H2) + assert.Equal(t, "configuration", sections[3].ID) + assert.Equal(t, "Configuration", sections[3].Title) + assert.Equal(t, 2, sections[3].Level) +} + +func TestExtractSections_Good_AllHeadingLevels(t *testing.T) { + content := `# H1 +## H2 +### H3 +#### H4 +##### H5 +###### H6 +` + + sections := ExtractSections(content) + + assert.Len(t, sections, 6) + for i, level := range []int{1, 2, 3, 4, 5, 6} { + assert.Equal(t, level, sections[i].Level) + } +} + +func TestExtractSections_Good_Empty(t *testing.T) { + content := `Just plain text. +No headings here. +` + + sections := ExtractSections(content) + + assert.Empty(t, sections) +} + +func TestParseTopic_Good(t *testing.T) { + content := []byte(`--- +title: Quick Start Guide +tags: [intro, quickstart] +order: 5 +related: + - installation +--- + +# Quick Start Guide + +Welcome to the guide. + +## First Steps + +Do this first. + +## Next Steps + +Then do this. 
+`) + + topic, err := ParseTopic("docs/quick-start.md", content) + + assert.NoError(t, err) + assert.NotNil(t, topic) + + // Check metadata from frontmatter + assert.Equal(t, "quick-start-guide", topic.ID) + assert.Equal(t, "Quick Start Guide", topic.Title) + assert.Equal(t, "docs/quick-start.md", topic.Path) + assert.Equal(t, []string{"intro", "quickstart"}, topic.Tags) + assert.Equal(t, []string{"installation"}, topic.Related) + assert.Equal(t, 5, topic.Order) + + // Check sections + assert.Len(t, topic.Sections, 3) + assert.Equal(t, "quick-start-guide", topic.Sections[0].ID) + assert.Equal(t, "first-steps", topic.Sections[1].ID) + assert.Equal(t, "next-steps", topic.Sections[2].ID) + + // Content should not include frontmatter + assert.NotContains(t, topic.Content, "---") + assert.Contains(t, topic.Content, "# Quick Start Guide") +} + +func TestParseTopic_Good_NoFrontmatter(t *testing.T) { + content := []byte(`# Getting Started + +This is a simple doc. + +## Installation + +Install it here. +`) + + topic, err := ParseTopic("getting-started.md", content) + + assert.NoError(t, err) + assert.NotNil(t, topic) + + // Title should come from first H1 + assert.Equal(t, "Getting Started", topic.Title) + assert.Equal(t, "getting-started", topic.ID) + + // Sections extracted + assert.Len(t, topic.Sections, 2) +} + +func TestParseTopic_Good_NoHeadings(t *testing.T) { + content := []byte(`--- +title: Plain Content +--- + +Just some text without any headings. 
+`) + + topic, err := ParseTopic("plain.md", content) + + assert.NoError(t, err) + assert.NotNil(t, topic) + assert.Equal(t, "Plain Content", topic.Title) + assert.Equal(t, "plain-content", topic.ID) + assert.Empty(t, topic.Sections) +} + +func TestParseTopic_Good_IDFromPath(t *testing.T) { + content := []byte(`Just content, no frontmatter or headings.`) + + topic, err := ParseTopic("commands/dev-workflow.md", content) + + assert.NoError(t, err) + assert.NotNil(t, topic) + + // ID and title should be derived from path + assert.Equal(t, "dev-workflow", topic.ID) + assert.Equal(t, "", topic.Title) // No title available +} + +func TestPathToTitle_Good(t *testing.T) { + tests := []struct { + path string + expected string + }{ + {"getting-started.md", "Getting Started"}, + {"commands/dev.md", "Dev"}, + {"path/to/file_name.md", "File Name"}, + {"UPPERCASE.md", "Uppercase"}, + {"no-extension", "No Extension"}, + } + + for _, tt := range tests { + t.Run(tt.path, func(t *testing.T) { + result := pathToTitle(tt.path) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/pkg/help/search.go b/pkg/help/search.go new file mode 100644 index 0000000..19914cf --- /dev/null +++ b/pkg/help/search.go @@ -0,0 +1,286 @@ +package help + +import ( + "sort" + "strings" + "unicode" +) + +// SearchResult represents a search match. +type SearchResult struct { + Topic *Topic + Section *Section // nil if topic-level match + Score float64 + Snippet string // Context around match +} + +// searchIndex provides full-text search. +type searchIndex struct { + topics map[string]*Topic // topicID -> Topic + index map[string]map[string]bool // word -> set of topicIDs +} + +// newSearchIndex creates a new empty search index. +func newSearchIndex() *searchIndex { + return &searchIndex{ + topics: make(map[string]*Topic), + index: make(map[string]map[string]bool), + } +} + +// Add indexes a topic for searching. 
+func (i *searchIndex) Add(topic *Topic) { + i.topics[topic.ID] = topic + + // Index title words with boost + for _, word := range tokenize(topic.Title) { + i.addToIndex(word, topic.ID) + } + + // Index content words + for _, word := range tokenize(topic.Content) { + i.addToIndex(word, topic.ID) + } + + // Index section titles and content + for _, section := range topic.Sections { + for _, word := range tokenize(section.Title) { + i.addToIndex(word, topic.ID) + } + for _, word := range tokenize(section.Content) { + i.addToIndex(word, topic.ID) + } + } + + // Index tags + for _, tag := range topic.Tags { + for _, word := range tokenize(tag) { + i.addToIndex(word, topic.ID) + } + } +} + +// addToIndex adds a word-to-topic mapping. +func (i *searchIndex) addToIndex(word, topicID string) { + if i.index[word] == nil { + i.index[word] = make(map[string]bool) + } + i.index[word][topicID] = true +} + +// Search finds topics matching the query. +func (i *searchIndex) Search(query string) []*SearchResult { + queryWords := tokenize(query) + if len(queryWords) == 0 { + return nil + } + + // Track scores per topic + scores := make(map[string]float64) + + for _, word := range queryWords { + // Exact matches + if topicIDs, ok := i.index[word]; ok { + for topicID := range topicIDs { + scores[topicID] += 1.0 + } + } + + // Prefix matches (partial word matching) + for indexWord, topicIDs := range i.index { + if strings.HasPrefix(indexWord, word) && indexWord != word { + for topicID := range topicIDs { + scores[topicID] += 0.5 // Lower score for partial matches + } + } + } + } + + // Build results with title boost and snippet extraction + var results []*SearchResult + for topicID, score := range scores { + topic := i.topics[topicID] + if topic == nil { + continue + } + + // Title boost: if query words appear in title + titleLower := strings.ToLower(topic.Title) + for _, word := range queryWords { + if strings.Contains(titleLower, word) { + score += 2.0 // Title matches are worth more 
+ } + } + + // Find matching section and extract snippet + section, snippet := i.findBestMatch(topic, queryWords) + + results = append(results, &SearchResult{ + Topic: topic, + Section: section, + Score: score, + Snippet: snippet, + }) + } + + // Sort by score (highest first) + sort.Slice(results, func(a, b int) bool { + return results[a].Score > results[b].Score + }) + + return results +} + +// findBestMatch finds the section with the best match and extracts a snippet. +func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section, string) { + var bestSection *Section + var bestSnippet string + bestScore := 0 + + // Check topic title + titleScore := countMatches(topic.Title, queryWords) + if titleScore > 0 { + bestSnippet = extractSnippet(topic.Content, queryWords) + } + + // Check sections + for idx := range topic.Sections { + section := &topic.Sections[idx] + sectionScore := countMatches(section.Title, queryWords) + contentScore := countMatches(section.Content, queryWords) + totalScore := sectionScore*2 + contentScore // Title matches worth more + + if totalScore > bestScore { + bestScore = totalScore + bestSection = section + if contentScore > 0 { + bestSnippet = extractSnippet(section.Content, queryWords) + } else { + bestSnippet = extractSnippet(section.Content, nil) + } + } + } + + // If no section matched, use topic content + if bestSnippet == "" && topic.Content != "" { + bestSnippet = extractSnippet(topic.Content, queryWords) + } + + return bestSection, bestSnippet +} + +// tokenize splits text into lowercase words for indexing/searching. 
+func tokenize(text string) []string { + text = strings.ToLower(text) + var words []string + var word strings.Builder + + for _, r := range text { + if unicode.IsLetter(r) || unicode.IsDigit(r) { + word.WriteRune(r) + } else if word.Len() > 0 { + w := word.String() + if len(w) >= 2 { // Skip single-character words + words = append(words, w) + } + word.Reset() + } + } + + // Don't forget the last word + if word.Len() >= 2 { + words = append(words, word.String()) + } + + return words +} + +// countMatches counts how many query words appear in the text. +func countMatches(text string, queryWords []string) int { + textLower := strings.ToLower(text) + count := 0 + for _, word := range queryWords { + if strings.Contains(textLower, word) { + count++ + } + } + return count +} + +// extractSnippet extracts a short snippet around the first match. +// Uses rune-based indexing to properly handle multi-byte UTF-8 characters. +func extractSnippet(content string, queryWords []string) string { + if content == "" { + return "" + } + + const snippetLen = 150 + + // If no query words, return start of content + if len(queryWords) == 0 { + lines := strings.Split(content, "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if line != "" && !strings.HasPrefix(line, "#") { + runes := []rune(line) + if len(runes) > snippetLen { + return string(runes[:snippetLen]) + "..." + } + return line + } + } + return "" + } + + // Find first match position (byte-based for strings.Index) + contentLower := strings.ToLower(content) + matchPos := -1 + for _, word := range queryWords { + pos := strings.Index(contentLower, word) + if pos != -1 && (matchPos == -1 || pos < matchPos) { + matchPos = pos + } + } + + // Convert to runes for safe slicing + runes := []rune(content) + runeLen := len(runes) + + if matchPos == -1 { + // No match found, return start of content + if runeLen > snippetLen { + return string(runes[:snippetLen]) + "..." 
+ } + return content + } + + // Convert byte position to rune position (use same string as Index) + matchRunePos := len([]rune(contentLower[:matchPos])) + + // Extract snippet around match (rune-based) + start := matchRunePos - 50 + if start < 0 { + start = 0 + } + + end := start + snippetLen + if end > runeLen { + end = runeLen + } + + snippet := string(runes[start:end]) + + // Trim to word boundaries + if start > 0 { + if idx := strings.Index(snippet, " "); idx != -1 { + snippet = "..." + snippet[idx+1:] + } + } + if end < runeLen { + if idx := strings.LastIndex(snippet, " "); idx != -1 { + snippet = snippet[:idx] + "..." + } + } + + return strings.TrimSpace(snippet) +} diff --git a/pkg/help/search_test.go b/pkg/help/search_test.go new file mode 100644 index 0000000..94e6542 --- /dev/null +++ b/pkg/help/search_test.go @@ -0,0 +1,299 @@ +package help + +import ( + "strings" + "testing" + "unicode/utf8" + + "github.com/stretchr/testify/assert" +) + +func TestTokenize_Good(t *testing.T) { + tests := []struct { + name string + input string + expected []string + }{ + { + name: "simple words", + input: "hello world", + expected: []string{"hello", "world"}, + }, + { + name: "mixed case", + input: "Hello World", + expected: []string{"hello", "world"}, + }, + { + name: "with punctuation", + input: "Hello, world! 
How are you?", + expected: []string{"hello", "world", "how", "are", "you"}, + }, + { + name: "single characters filtered", + input: "a b c hello d", + expected: []string{"hello"}, + }, + { + name: "numbers included", + input: "version 2 release", + expected: []string{"version", "release"}, + }, + { + name: "alphanumeric", + input: "v2.0 and config123", + expected: []string{"v2", "and", "config123"}, + }, + { + name: "empty string", + input: "", + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tokenize(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestSearchIndex_Add_Good(t *testing.T) { + idx := newSearchIndex() + + topic := &Topic{ + ID: "getting-started", + Title: "Getting Started", + Content: "Welcome to the guide.", + Tags: []string{"intro", "setup"}, + Sections: []Section{ + {ID: "installation", Title: "Installation", Content: "Install the CLI."}, + }, + } + + idx.Add(topic) + + // Verify topic is stored + assert.NotNil(t, idx.topics["getting-started"]) + + // Verify words are indexed + assert.Contains(t, idx.index["getting"], "getting-started") + assert.Contains(t, idx.index["started"], "getting-started") + assert.Contains(t, idx.index["welcome"], "getting-started") + assert.Contains(t, idx.index["guide"], "getting-started") + assert.Contains(t, idx.index["intro"], "getting-started") + assert.Contains(t, idx.index["setup"], "getting-started") + assert.Contains(t, idx.index["installation"], "getting-started") + assert.Contains(t, idx.index["cli"], "getting-started") +} + +func TestSearchIndex_Search_Good(t *testing.T) { + idx := newSearchIndex() + + // Add test topics + idx.Add(&Topic{ + ID: "getting-started", + Title: "Getting Started", + Content: "Welcome to the CLI guide. 
This covers installation and setup.", + Tags: []string{"intro"}, + }) + + idx.Add(&Topic{ + ID: "configuration", + Title: "Configuration", + Content: "Configure the CLI using environment variables.", + }) + + idx.Add(&Topic{ + ID: "commands", + Title: "Commands Reference", + Content: "List of all available commands.", + }) + + t.Run("single word query", func(t *testing.T) { + results := idx.Search("configuration") + assert.NotEmpty(t, results) + assert.Equal(t, "configuration", results[0].Topic.ID) + }) + + t.Run("multi-word query", func(t *testing.T) { + results := idx.Search("cli guide") + assert.NotEmpty(t, results) + // Should match getting-started (has both "cli" and "guide") + assert.Equal(t, "getting-started", results[0].Topic.ID) + }) + + t.Run("title boost", func(t *testing.T) { + results := idx.Search("commands") + assert.NotEmpty(t, results) + // "commands" appears in title of commands topic + assert.Equal(t, "commands", results[0].Topic.ID) + }) + + t.Run("partial word matching", func(t *testing.T) { + results := idx.Search("config") + assert.NotEmpty(t, results) + // Should match "configuration" and "configure" + foundConfig := false + for _, r := range results { + if r.Topic.ID == "configuration" { + foundConfig = true + break + } + } + assert.True(t, foundConfig, "Should find configuration topic with prefix match") + }) + + t.Run("no results", func(t *testing.T) { + results := idx.Search("nonexistent") + assert.Empty(t, results) + }) + + t.Run("empty query", func(t *testing.T) { + results := idx.Search("") + assert.Nil(t, results) + }) +} + +func TestSearchIndex_Search_Good_WithSections(t *testing.T) { + idx := newSearchIndex() + + idx.Add(&Topic{ + ID: "installation", + Title: "Installation Guide", + Content: "Overview of installation process.", + Sections: []Section{ + { + ID: "linux", + Title: "Linux Installation", + Content: "Run apt-get install core on Debian.", + }, + { + ID: "macos", + Title: "macOS Installation", + Content: "Use brew install 
core on macOS.", + }, + { + ID: "windows", + Title: "Windows Installation", + Content: "Download the installer from the website.", + }, + }, + }) + + t.Run("matches section content", func(t *testing.T) { + results := idx.Search("debian") + assert.NotEmpty(t, results) + assert.Equal(t, "installation", results[0].Topic.ID) + // Should identify the Linux section as best match + if results[0].Section != nil { + assert.Equal(t, "linux", results[0].Section.ID) + } + }) + + t.Run("matches section title", func(t *testing.T) { + results := idx.Search("windows") + assert.NotEmpty(t, results) + assert.Equal(t, "installation", results[0].Topic.ID) + }) +} + +func TestExtractSnippet_Good(t *testing.T) { + content := `This is the first paragraph with some introduction text. + +Here is more content that talks about installation and setup. +The installation process is straightforward. + +Finally, some closing remarks about the configuration.` + + t.Run("finds match and extracts context", func(t *testing.T) { + snippet := extractSnippet(content, []string{"installation"}) + assert.Contains(t, snippet, "installation") + assert.True(t, len(snippet) <= 200, "Snippet should be reasonably short") + }) + + t.Run("no query words returns start", func(t *testing.T) { + snippet := extractSnippet(content, nil) + assert.Contains(t, snippet, "first paragraph") + }) + + t.Run("empty content", func(t *testing.T) { + snippet := extractSnippet("", []string{"test"}) + assert.Empty(t, snippet) + }) +} + +func TestExtractSnippet_Good_UTF8(t *testing.T) { + // Content with multi-byte UTF-8 characters + content := "日本語のテキストです。This contains Japanese text. 
検索機能をテストします。" + + t.Run("handles multi-byte characters without corruption", func(t *testing.T) { + snippet := extractSnippet(content, []string{"japanese"}) + // Should not panic or produce invalid UTF-8 + assert.True(t, len(snippet) > 0) + // Verify the result is valid UTF-8 + assert.True(t, isValidUTF8(snippet), "Snippet should be valid UTF-8") + }) + + t.Run("truncates multi-byte content safely", func(t *testing.T) { + // Long content that will be truncated + longContent := strings.Repeat("日本語", 100) // 300 characters + snippet := extractSnippet(longContent, nil) + assert.True(t, isValidUTF8(snippet), "Truncated snippet should be valid UTF-8") + }) +} + +// isValidUTF8 checks if a string is valid UTF-8 +func isValidUTF8(s string) bool { + for i := 0; i < len(s); { + r, size := utf8.DecodeRuneInString(s[i:]) + if r == utf8.RuneError && size == 1 { + return false + } + i += size + } + return true +} + +func TestCountMatches_Good(t *testing.T) { + tests := []struct { + text string + words []string + expected int + }{ + {"Hello world", []string{"hello"}, 1}, + {"Hello world", []string{"hello", "world"}, 2}, + {"Hello world", []string{"foo", "bar"}, 0}, + {"The quick brown fox", []string{"quick", "fox", "dog"}, 2}, + } + + for _, tt := range tests { + result := countMatches(tt.text, tt.words) + assert.Equal(t, tt.expected, result) + } +} + +func TestSearchResult_Score_Good(t *testing.T) { + idx := newSearchIndex() + + // Topic with query word in title should score higher + idx.Add(&Topic{ + ID: "topic-in-title", + Title: "Installation Guide", + Content: "Some content here.", + }) + + idx.Add(&Topic{ + ID: "topic-in-content", + Title: "Some Other Topic", + Content: "This covers installation steps.", + }) + + results := idx.Search("installation") + assert.Len(t, results, 2) + + // Title match should score higher + assert.Equal(t, "topic-in-title", results[0].Topic.ID) + assert.Greater(t, results[0].Score, results[1].Score) +} diff --git a/pkg/help/topic.go 
b/pkg/help/topic.go new file mode 100644 index 0000000..b934e98 --- /dev/null +++ b/pkg/help/topic.go @@ -0,0 +1,31 @@ +// Package help provides display-agnostic help content management. +package help + +// Topic represents a help topic/page. +type Topic struct { + ID string `json:"id"` + Title string `json:"title"` + Path string `json:"path"` + Content string `json:"content"` + Sections []Section `json:"sections"` + Tags []string `json:"tags"` + Related []string `json:"related"` + Order int `json:"order"` // For sorting +} + +// Section represents a heading within a topic. +type Section struct { + ID string `json:"id"` + Title string `json:"title"` + Level int `json:"level"` + Line int `json:"line"` // Start line in content (1-indexed) + Content string `json:"content"` // Content under heading +} + +// Frontmatter represents YAML frontmatter metadata. +type Frontmatter struct { + Title string `yaml:"title"` + Tags []string `yaml:"tags"` + Related []string `yaml:"related"` + Order int `yaml:"order"` +}