cli/pkg/help/parser.go
Snider 12779ef67c feat(help): add markdown parsing and section extraction (#174)
* feat(help): add markdown parsing and section extraction

Implements #137: markdown parsing and section extraction for help system.

- Add Topic and Section types for help content structure
- Add Frontmatter type for YAML metadata parsing
- Add ParseTopic() to parse markdown files into Topic structs
- Add ExtractFrontmatter() to extract YAML frontmatter
- Add ExtractSections() to extract headings and content
- Add GenerateID() to create URL-safe anchor IDs
- Add comprehensive tests following _Good/_Bad naming convention

This is the foundation for the display-agnostic help system (#133).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting

Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory
not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir().

This is the same fix applied to TestDevOps_Boot_Good_Success in 3423e48.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(help): address CodeRabbit review feedback

- Add CRLF line ending support to frontmatter regex
- Add empty frontmatter block support
- Use filepath.Base/Ext for cross-platform path handling
- Add tests for CRLF and empty frontmatter cases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat(help): add full-text search functionality (#175)

* fix(test): use manual cleanup for TestDevOps_Boot_Good_FreshWithNoExisting

Fixes flaky test that fails with "TempDir RemoveAll cleanup: directory
not empty" by using os.MkdirTemp with t.Cleanup instead of t.TempDir().

This is the same fix applied to TestDevOps_Boot_Good_Success in 3423e48.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat(help): add full-text search functionality

Implements #139: full-text search for help topics.

- Add searchIndex with inverted index for fast lookups
- Add tokenize() for case-insensitive word extraction
- Add Search() with relevance ranking:
  - Exact word matches score 1.0
  - Prefix matches score 0.5
  - Title matches get 2.0 boost
- Add snippet extraction for search result context
- Add section-level matching for precise results
- Add comprehensive tests following _Good/_Bad naming

Search features:
- Case-insensitive matching
- Partial word matching (prefix)
- Title boost (matches in title rank higher)
- Section-level results
- Snippet extraction with context

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(help): address CodeRabbit review feedback

- Add CRLF line ending support to frontmatter regex
- Add empty frontmatter block support
- Use filepath.Base/Ext for cross-platform path handling
- Add tests for CRLF and empty frontmatter cases

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>

* fix(help): use rune-based slicing for UTF-8 safe snippets

Address CodeRabbit feedback: byte-based slicing can corrupt multi-byte
UTF-8 characters. Now uses rune-based indexing for snippet extraction.

- Convert content to []rune before slicing
- Convert byte position to rune position for match location
- Add UTF-8 validation tests with Japanese text

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(help): use correct string for byte-to-rune conversion in extractSnippet

strings.ToLower can change the byte length of certain Unicode characters
(e.g., the Kelvin sign U+212A is 3 bytes in UTF-8, but its lowercase form
"k" is 1 byte). Since matchPos is a byte index obtained from
strings.Index(contentLower, word), the byte-to-rune conversion must also be
computed on contentLower so that the index stays aligned.

Fixes CodeRabbit review feedback.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 00:07:32 +00:00

174 lines
4.2 KiB
Go

package help
import (
"path/filepath"
"regexp"
"strings"
"unicode"
"gopkg.in/yaml.v3"
)
var (
	// frontmatterRegex matches a YAML frontmatter block at the very start of
	// the content, delimited by lines beginning with "---".
	//
	// Capture group 1 is the YAML between the delimiters (empty for an empty
	// block); the newline that precedes the closing delimiter is not part of
	// the capture. Both LF and CRLF line endings are accepted.
	//
	// Multi-line mode is enabled so that "^" can require the closing "---" to
	// start a line; without it a "---" inside a YAML value (for example
	// "title: a --- b") would terminate the block early. Because "^" then
	// matches at every line start, the opening delimiter is pinned to the
	// beginning of the text with \A instead.
	frontmatterRegex = regexp.MustCompile(`(?ms)\A---\r?\n(.*?)(?:\r?\n)?^---\r?\n?`)

	// headingRegex matches a single markdown ATX heading line ("#" to
	// "######" followed by whitespace and text). Group 1 is the run of "#"
	// characters, group 2 is the heading text.
	headingRegex = regexp.MustCompile(`^(#{1,6})\s+(.+)$`)
)
// ParseTopic parses a markdown file into a Topic.
//
// path is recorded on the topic as given and is also used to derive a
// fallback ID from the file name. content is the raw markdown, optionally
// starting with a YAML frontmatter block.
//
// The title and ID are resolved in this order of precedence:
//  1. the frontmatter title, if present and non-empty;
//  2. the first level-1 heading in the body;
//  3. otherwise the title stays empty and the ID is derived from the file name.
//
// Sections, Tags and Related on the returned topic are never nil. The
// returned error is currently always nil.
func ParseTopic(path string, content []byte) (*Topic, error) {
	topic := &Topic{
		Path:     path,
		ID:       GenerateID(pathToTitle(path)),
		Sections: []Section{},
		Tags:     []string{},
		Related:  []string{},
	}

	// Extract YAML frontmatter if present.
	fm, body := ExtractFrontmatter(string(content))
	if fm != nil {
		topic.Title = fm.Title
		topic.Order = fm.Order
		// Frontmatter that omits a list decodes it as nil; keep the empty
		// non-nil defaults in that case so callers always see a slice.
		if fm.Tags != nil {
			topic.Tags = fm.Tags
		}
		if fm.Related != nil {
			topic.Related = fm.Related
		}
		// A title made only of punctuation yields an empty ID; keep the
		// file-name-derived ID rather than overwriting it with "".
		if id := GenerateID(topic.Title); id != "" {
			topic.ID = id
		}
	}

	topic.Content = body

	// Extract sections from headings.
	topic.Sections = ExtractSections(body)

	// If the frontmatter supplied no title, fall back to the first H1.
	if topic.Title == "" {
		for _, s := range topic.Sections {
			if s.Level != 1 {
				continue
			}
			topic.Title = s.Title
			if id := GenerateID(s.Title); id != "" {
				topic.ID = id
			}
			break
		}
	}

	return topic, nil
}
// ExtractFrontmatter splits a leading YAML frontmatter block off markdown
// content.
//
// On success it returns the decoded frontmatter together with the text that
// follows the closing delimiter. When content has no frontmatter block, or
// the block is not valid YAML, it returns nil and the content unchanged.
func ExtractFrontmatter(content string) (*Frontmatter, string) {
	// loc[0:2] bounds the whole match, loc[2:4] bounds capture group 1 (the
	// YAML text). The pattern is anchored to the start of the content, so
	// the match begins at offset 0 and loc[1] is where the body starts.
	loc := frontmatterRegex.FindStringSubmatchIndex(content)
	if loc == nil {
		return nil, content
	}

	rawYAML := content[loc[2]:loc[3]]
	parsed := new(Frontmatter)
	if yaml.Unmarshal([]byte(rawYAML), parsed) != nil {
		// Invalid YAML: treat the whole input as plain content.
		return nil, content
	}

	return parsed, content[loc[1]:]
}
// ExtractSections splits markdown content into the sections introduced by
// its headings ("#" through "######").
//
// For every heading it returns a Section whose Title is the trimmed heading
// text, ID is GenerateID(Title), Level is the number of "#" characters and
// Line is the 1-indexed line number of the heading within content. Content
// is the whitespace-trimmed text between that heading and the next one (or
// the end of the input). Text that precedes the first heading does not
// belong to any section.
//
// Lines inside fenced code blocks (``` or ~~~) are never treated as
// headings, so a shell comment such as "# install deps" in an example stays
// part of the surrounding section's content. An unterminated fence extends
// to the end of the input.
//
// The returned slice is never nil.
func ExtractSections(content string) []Section {
	sections := []Section{}

	var (
		body        []string // lines collected for the section being built
		fenceMarker byte     // '`' or '~' while inside a fenced block
		fenceLen    int      // length of the opening fence; 0 when outside a fence
	)

	// closeSection stores the collected body on the most recent section.
	closeSection := func() {
		if n := len(sections); n > 0 {
			sections[n-1].Content = strings.TrimSpace(strings.Join(body, "\n"))
		}
	}

	for i, line := range strings.Split(content, "\n") {
		if marker, run, rest := markdownFenceRun(line); run >= 3 {
			switch {
			case fenceLen == 0:
				// Opening fence. The info string of a backtick fence may
				// not itself contain backticks (that would be inline code).
				if marker != '`' || !strings.Contains(rest, "`") {
					fenceMarker, fenceLen = marker, run
				}
			case marker == fenceMarker && run >= fenceLen && strings.TrimSpace(rest) == "":
				// Closing fence: same character, at least as long as the
				// opener, and nothing but whitespace after it.
				fenceMarker, fenceLen = 0, 0
			}
		}

		// Only look for headings outside fenced code. Fence lines themselves
		// can never match because they do not start with '#'.
		if fenceLen == 0 {
			if m := headingRegex.FindStringSubmatch(line); m != nil {
				closeSection()

				title := strings.TrimSpace(m[2])
				sections = append(sections, Section{
					ID:    GenerateID(title),
					Title: title,
					Level: len(m[1]),
					Line:  i + 1, // 1-indexed
				})
				body = nil
				continue
			}
		}

		if len(sections) > 0 {
			body = append(body, line)
		}
	}

	closeSection()
	return sections
}

// markdownFenceRun reports the leading run of code-fence characters on line.
//
// It returns the fence character ('`' or '~'), the length of the run and the
// text following the run. A line indented by more than three spaces, or not
// starting with a fence character, yields a run length of 0.
func markdownFenceRun(line string) (marker byte, run int, rest string) {
	trimmed := strings.TrimLeft(line, " ")
	if trimmed == "" || len(line)-len(trimmed) > 3 {
		return 0, 0, ""
	}

	marker = trimmed[0]
	if marker != '`' && marker != '~' {
		return 0, 0, ""
	}

	for run < len(trimmed) && trimmed[run] == marker {
		run++
	}
	return marker, run, trimmed[run:]
}
// GenerateID derives a URL-safe anchor ID from a title.
//
// The title is lower-cased; letters and digits are kept, any run of
// whitespace, hyphens and underscores between kept characters becomes a
// single hyphen, and every other character is dropped. The result never
// starts or ends with a hyphen.
//
//	"Getting Started" -> "getting-started"
func GenerateID(title string) string {
	var id strings.Builder

	// pendingSep records that a separator was seen after the last kept
	// character; the hyphen is only emitted once another kept character
	// follows, so leading and trailing separators never reach the output.
	pendingSep := false

	for _, r := range strings.ToLower(title) {
		switch {
		case unicode.IsLetter(r), unicode.IsDigit(r):
			if pendingSep {
				id.WriteByte('-')
				pendingSep = false
			}
			id.WriteRune(r)
		case unicode.IsSpace(r), r == '-', r == '_':
			pendingSep = id.Len() > 0
		}
		// Anything else is skipped without affecting separator state.
	}

	return id.String()
}
// pathToTitle converts a file path to a human-readable title.
//
// The directory and the file extension are removed, hyphens and underscores
// become spaces, and each word is capitalised (first character upper-case,
// the rest lower-case).
//
//	"getting-started.md" -> "Getting Started"
func pathToTitle(path string) string {
	// Get filename without directory (cross-platform).
	name := filepath.Base(path)

	// Remove extension; TrimSuffix is a no-op when there is none.
	name = strings.TrimSuffix(name, filepath.Ext(name))

	// Replace hyphens/underscores with spaces.
	name = strings.ReplaceAll(name, "-", " ")
	name = strings.ReplaceAll(name, "_", " ")

	// Title-case each word. Split on runes rather than bytes so that a word
	// beginning with a multi-byte UTF-8 character (e.g. "élan") is not cut
	// in the middle of that character.
	words := strings.Fields(name)
	for i, word := range words {
		runes := []rune(word) // Fields never yields empty strings
		words[i] = strings.ToUpper(string(runes[:1])) + strings.ToLower(string(runes[1:]))
	}

	return strings.Join(words, " ")
}