Add Catalog.All(), Catalog.SearchResults(), AllSections(), Tokens() iterators. Use slices.SortFunc, slices.Clone, maps.Values, built-in min replacing min3, strings.SplitSeq for streaming, range-over-int in levenshtein and benchmarks. Co-Authored-By: Gemini <noreply@google.com> Co-Authored-By: Virgil <virgil@lethean.io>
696 lines
16 KiB
Go
696 lines
16 KiB
Go
// SPDX-Licence-Identifier: EUPL-1.2
|
|
package help
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
func TestGenerateID_Good(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
input string
|
|
expected string
|
|
}{
|
|
{
|
|
name: "simple title",
|
|
input: "Getting Started",
|
|
expected: "getting-started",
|
|
},
|
|
{
|
|
name: "already lowercase",
|
|
input: "installation",
|
|
expected: "installation",
|
|
},
|
|
{
|
|
name: "multiple spaces",
|
|
input: "Quick Start Guide",
|
|
expected: "quick-start-guide",
|
|
},
|
|
{
|
|
name: "with numbers",
|
|
input: "Chapter 1 Introduction",
|
|
expected: "chapter-1-introduction",
|
|
},
|
|
{
|
|
name: "special characters",
|
|
input: "What's New? (v2.0)",
|
|
expected: "whats-new-v20",
|
|
},
|
|
{
|
|
name: "underscores",
|
|
input: "config_file_reference",
|
|
expected: "config-file-reference",
|
|
},
|
|
{
|
|
name: "hyphens preserved",
|
|
input: "pre-commit hooks",
|
|
expected: "pre-commit-hooks",
|
|
},
|
|
{
|
|
name: "leading trailing spaces",
|
|
input: " Trimmed Title ",
|
|
expected: "trimmed-title",
|
|
},
|
|
{
|
|
name: "unicode letters",
|
|
input: "Configuración Básica",
|
|
expected: "configuración-básica",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
result := GenerateID(tt.input)
|
|
assert.Equal(t, tt.expected, result)
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestExtractFrontmatter_Good(t *testing.T) {
|
|
content := `---
|
|
title: Getting Started
|
|
tags: [intro, setup]
|
|
order: 1
|
|
related:
|
|
- installation
|
|
- configuration
|
|
---
|
|
|
|
# Welcome
|
|
|
|
This is the content.
|
|
`
|
|
|
|
fm, body := ExtractFrontmatter(content)
|
|
|
|
assert.NotNil(t, fm)
|
|
assert.Equal(t, "Getting Started", fm.Title)
|
|
assert.Equal(t, []string{"intro", "setup"}, fm.Tags)
|
|
assert.Equal(t, 1, fm.Order)
|
|
assert.Equal(t, []string{"installation", "configuration"}, fm.Related)
|
|
assert.Contains(t, body, "# Welcome")
|
|
assert.Contains(t, body, "This is the content.")
|
|
}
|
|
|
|
func TestExtractFrontmatter_Good_NoFrontmatter(t *testing.T) {
|
|
content := `# Just a Heading
|
|
|
|
Some content here.
|
|
`
|
|
|
|
fm, body := ExtractFrontmatter(content)
|
|
|
|
assert.Nil(t, fm)
|
|
assert.Equal(t, content, body)
|
|
}
|
|
|
|
func TestExtractFrontmatter_Good_CRLF(t *testing.T) {
|
|
// Content with CRLF line endings (Windows-style)
|
|
content := "---\r\ntitle: CRLF Test\r\n---\r\n\r\n# Content"
|
|
|
|
fm, body := ExtractFrontmatter(content)
|
|
|
|
assert.NotNil(t, fm)
|
|
assert.Equal(t, "CRLF Test", fm.Title)
|
|
assert.Contains(t, body, "# Content")
|
|
}
|
|
|
|
func TestExtractFrontmatter_Good_Empty(t *testing.T) {
|
|
// Empty frontmatter block
|
|
content := "---\n---\n# Content"
|
|
|
|
fm, body := ExtractFrontmatter(content)
|
|
|
|
// Empty frontmatter should parse successfully
|
|
assert.NotNil(t, fm)
|
|
assert.Equal(t, "", fm.Title)
|
|
assert.Contains(t, body, "# Content")
|
|
}
|
|
|
|
func TestExtractFrontmatter_Bad_InvalidYAML(t *testing.T) {
|
|
content := `---
|
|
title: [invalid yaml
|
|
---
|
|
|
|
# Content
|
|
`
|
|
|
|
fm, body := ExtractFrontmatter(content)
|
|
|
|
// Invalid YAML should return nil frontmatter and original content
|
|
assert.Nil(t, fm)
|
|
assert.Equal(t, content, body)
|
|
}
|
|
|
|
func TestExtractSections_Good(t *testing.T) {
|
|
content := `# Main Title
|
|
|
|
Introduction paragraph.
|
|
|
|
## Installation
|
|
|
|
Install instructions here.
|
|
More details.
|
|
|
|
### Prerequisites
|
|
|
|
You need these things.
|
|
|
|
## Configuration
|
|
|
|
Config info here.
|
|
`
|
|
|
|
sections := ExtractSections(content)
|
|
|
|
assert.Len(t, sections, 4)
|
|
|
|
// Main Title (H1)
|
|
assert.Equal(t, "main-title", sections[0].ID)
|
|
assert.Equal(t, "Main Title", sections[0].Title)
|
|
assert.Equal(t, 1, sections[0].Level)
|
|
assert.Equal(t, 1, sections[0].Line)
|
|
assert.Contains(t, sections[0].Content, "Introduction paragraph.")
|
|
|
|
// Installation (H2)
|
|
assert.Equal(t, "installation", sections[1].ID)
|
|
assert.Equal(t, "Installation", sections[1].Title)
|
|
assert.Equal(t, 2, sections[1].Level)
|
|
assert.Contains(t, sections[1].Content, "Install instructions here.")
|
|
assert.Contains(t, sections[1].Content, "More details.")
|
|
|
|
// Prerequisites (H3)
|
|
assert.Equal(t, "prerequisites", sections[2].ID)
|
|
assert.Equal(t, "Prerequisites", sections[2].Title)
|
|
assert.Equal(t, 3, sections[2].Level)
|
|
assert.Contains(t, sections[2].Content, "You need these things.")
|
|
|
|
// Configuration (H2)
|
|
assert.Equal(t, "configuration", sections[3].ID)
|
|
assert.Equal(t, "Configuration", sections[3].Title)
|
|
assert.Equal(t, 2, sections[3].Level)
|
|
}
|
|
|
|
func TestExtractSections_Good_AllHeadingLevels(t *testing.T) {
|
|
content := `# H1
|
|
## H2
|
|
### H3
|
|
#### H4
|
|
##### H5
|
|
###### H6
|
|
`
|
|
|
|
sections := ExtractSections(content)
|
|
|
|
assert.Len(t, sections, 6)
|
|
for i, level := range []int{1, 2, 3, 4, 5, 6} {
|
|
assert.Equal(t, level, sections[i].Level)
|
|
}
|
|
}
|
|
|
|
func TestExtractSections_Good_Empty(t *testing.T) {
|
|
content := `Just plain text.
|
|
No headings here.
|
|
`
|
|
|
|
sections := ExtractSections(content)
|
|
|
|
assert.Empty(t, sections)
|
|
}
|
|
|
|
func TestParseTopic_Good(t *testing.T) {
|
|
content := []byte(`---
|
|
title: Quick Start Guide
|
|
tags: [intro, quickstart]
|
|
order: 5
|
|
related:
|
|
- installation
|
|
---
|
|
|
|
# Quick Start Guide
|
|
|
|
Welcome to the guide.
|
|
|
|
## First Steps
|
|
|
|
Do this first.
|
|
|
|
## Next Steps
|
|
|
|
Then do this.
|
|
`)
|
|
|
|
topic, err := ParseTopic("docs/quick-start.md", content)
|
|
|
|
assert.NoError(t, err)
|
|
assert.NotNil(t, topic)
|
|
|
|
// Check metadata from frontmatter
|
|
assert.Equal(t, "quick-start-guide", topic.ID)
|
|
assert.Equal(t, "Quick Start Guide", topic.Title)
|
|
assert.Equal(t, "docs/quick-start.md", topic.Path)
|
|
assert.Equal(t, []string{"intro", "quickstart"}, topic.Tags)
|
|
assert.Equal(t, []string{"installation"}, topic.Related)
|
|
assert.Equal(t, 5, topic.Order)
|
|
|
|
// Check sections
|
|
assert.Len(t, topic.Sections, 3)
|
|
assert.Equal(t, "quick-start-guide", topic.Sections[0].ID)
|
|
assert.Equal(t, "first-steps", topic.Sections[1].ID)
|
|
assert.Equal(t, "next-steps", topic.Sections[2].ID)
|
|
|
|
// Content should not include frontmatter
|
|
assert.NotContains(t, topic.Content, "---")
|
|
assert.Contains(t, topic.Content, "# Quick Start Guide")
|
|
}
|
|
|
|
func TestParseTopic_Good_NoFrontmatter(t *testing.T) {
|
|
content := []byte(`# Getting Started
|
|
|
|
This is a simple doc.
|
|
|
|
## Installation
|
|
|
|
Install it here.
|
|
`)
|
|
|
|
topic, err := ParseTopic("getting-started.md", content)
|
|
|
|
assert.NoError(t, err)
|
|
assert.NotNil(t, topic)
|
|
|
|
// Title should come from first H1
|
|
assert.Equal(t, "Getting Started", topic.Title)
|
|
assert.Equal(t, "getting-started", topic.ID)
|
|
|
|
// Sections extracted
|
|
assert.Len(t, topic.Sections, 2)
|
|
}
|
|
|
|
func TestParseTopic_Good_NoHeadings(t *testing.T) {
|
|
content := []byte(`---
|
|
title: Plain Content
|
|
---
|
|
|
|
Just some text without any headings.
|
|
`)
|
|
|
|
topic, err := ParseTopic("plain.md", content)
|
|
|
|
assert.NoError(t, err)
|
|
assert.NotNil(t, topic)
|
|
assert.Equal(t, "Plain Content", topic.Title)
|
|
assert.Equal(t, "plain-content", topic.ID)
|
|
assert.Empty(t, topic.Sections)
|
|
}
|
|
|
|
func TestParseTopic_Good_IDFromPath(t *testing.T) {
|
|
content := []byte(`Just content, no frontmatter or headings.`)
|
|
|
|
topic, err := ParseTopic("commands/dev-workflow.md", content)
|
|
|
|
assert.NoError(t, err)
|
|
assert.NotNil(t, topic)
|
|
|
|
// ID and title should be derived from path
|
|
assert.Equal(t, "dev-workflow", topic.ID)
|
|
assert.Equal(t, "", topic.Title) // No title available
|
|
}
|
|
|
|
func TestPathToTitle_Good(t *testing.T) {
|
|
tests := []struct {
|
|
path string
|
|
expected string
|
|
}{
|
|
{"getting-started.md", "Getting Started"},
|
|
{"commands/dev.md", "Dev"},
|
|
{"path/to/file_name.md", "File Name"},
|
|
{"UPPERCASE.md", "Uppercase"},
|
|
{"no-extension", "No Extension"},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.path, func(t *testing.T) {
|
|
result := pathToTitle(tt.path)
|
|
assert.Equal(t, tt.expected, result)
|
|
})
|
|
}
|
|
}
|
|
|
|
// --- Phase 0: Expanded parser tests ---
|
|
|
|
func TestParseTopic_Good_EmptyInput(t *testing.T) {
|
|
// Empty byte slice should produce a valid topic with no content
|
|
topic, err := ParseTopic("empty.md", []byte(""))
|
|
|
|
require.NoError(t, err)
|
|
assert.NotNil(t, topic)
|
|
assert.Equal(t, "empty", topic.ID)
|
|
assert.Equal(t, "", topic.Title)
|
|
assert.Equal(t, "", topic.Content)
|
|
assert.Empty(t, topic.Sections)
|
|
assert.Empty(t, topic.Tags)
|
|
assert.Empty(t, topic.Related)
|
|
}
|
|
|
|
func TestParseTopic_Good_FrontmatterOnly(t *testing.T) {
|
|
// Frontmatter with no body or sections
|
|
content := []byte(`---
|
|
title: Metadata Only
|
|
tags: [meta]
|
|
order: 99
|
|
---
|
|
`)
|
|
|
|
topic, err := ParseTopic("meta.md", content)
|
|
|
|
require.NoError(t, err)
|
|
assert.Equal(t, "metadata-only", topic.ID)
|
|
assert.Equal(t, "Metadata Only", topic.Title)
|
|
assert.Equal(t, []string{"meta"}, topic.Tags)
|
|
assert.Equal(t, 99, topic.Order)
|
|
assert.Empty(t, topic.Sections)
|
|
// Body after frontmatter is just a newline
|
|
assert.Equal(t, "", strings.TrimSpace(topic.Content))
|
|
}
|
|
|
|
func TestExtractFrontmatter_Bad_MalformedYAML(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
content string
|
|
}{
|
|
{
|
|
name: "unclosed bracket",
|
|
content: `---
|
|
title: [broken
|
|
tags: [also broken
|
|
---
|
|
|
|
# Content`,
|
|
},
|
|
{
|
|
name: "tab indentation error",
|
|
content: "---\ntitle: Good\n\t- bad indent\n---\n\n# Content",
|
|
},
|
|
{
|
|
name: "duplicate keys with conflicting types",
|
|
// YAML spec allows duplicate keys but implementations may vary;
|
|
// this tests that the parser does not panic regardless.
|
|
content: `---
|
|
title: First
|
|
title:
|
|
nested: value
|
|
---
|
|
|
|
# Content`,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
fm, body := ExtractFrontmatter(tt.content)
|
|
// Malformed YAML should return nil frontmatter without panic
|
|
if fm == nil {
|
|
// Body should be original content when YAML fails
|
|
assert.Equal(t, tt.content, body)
|
|
}
|
|
// No panic is the key assertion — test reaching here is success
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestExtractFrontmatter_Bad_NotAtStart(t *testing.T) {
|
|
// Frontmatter delimiters that do not start at the beginning of the file
|
|
content := `Some preamble text.
|
|
|
|
---
|
|
title: Should Not Parse
|
|
---
|
|
|
|
# Content`
|
|
|
|
fm, body := ExtractFrontmatter(content)
|
|
|
|
assert.Nil(t, fm)
|
|
assert.Equal(t, content, body)
|
|
}
|
|
|
|
func TestExtractSections_Good_DeeplyNested(t *testing.T) {
|
|
content := `# Level 1
|
|
|
|
Top-level content.
|
|
|
|
## Level 2
|
|
|
|
Second level.
|
|
|
|
### Level 3
|
|
|
|
Third level.
|
|
|
|
#### Level 4
|
|
|
|
Fourth level details.
|
|
|
|
##### Level 5
|
|
|
|
Fifth level fine print.
|
|
|
|
###### Level 6
|
|
|
|
Deepest heading level.
|
|
`
|
|
|
|
sections := ExtractSections(content)
|
|
|
|
require.Len(t, sections, 6)
|
|
|
|
for i, expected := range []struct {
|
|
level int
|
|
title string
|
|
}{
|
|
{1, "Level 1"},
|
|
{2, "Level 2"},
|
|
{3, "Level 3"},
|
|
{4, "Level 4"},
|
|
{5, "Level 5"},
|
|
{6, "Level 6"},
|
|
} {
|
|
assert.Equal(t, expected.level, sections[i].Level, "section %d level", i)
|
|
assert.Equal(t, expected.title, sections[i].Title, "section %d title", i)
|
|
}
|
|
|
|
// Verify content is associated with correct sections
|
|
assert.Contains(t, sections[0].Content, "Top-level content.")
|
|
assert.Contains(t, sections[3].Content, "Fourth level details.")
|
|
assert.Contains(t, sections[5].Content, "Deepest heading level.")
|
|
}
|
|
|
|
func TestExtractSections_Good_DeeplyNestedWithContent(t *testing.T) {
|
|
// H4, H5, H6 with meaningful content under each
|
|
content := `#### Configuration Options
|
|
|
|
Set these in your config file.
|
|
|
|
##### Advanced Options
|
|
|
|
Only for power users.
|
|
|
|
###### Experimental Flags
|
|
|
|
These may change without notice.
|
|
`
|
|
|
|
sections := ExtractSections(content)
|
|
|
|
require.Len(t, sections, 3)
|
|
assert.Equal(t, 4, sections[0].Level)
|
|
assert.Equal(t, "Configuration Options", sections[0].Title)
|
|
assert.Contains(t, sections[0].Content, "Set these in your config file.")
|
|
|
|
assert.Equal(t, 5, sections[1].Level)
|
|
assert.Equal(t, "Advanced Options", sections[1].Title)
|
|
assert.Contains(t, sections[1].Content, "Only for power users.")
|
|
|
|
assert.Equal(t, 6, sections[2].Level)
|
|
assert.Equal(t, "Experimental Flags", sections[2].Title)
|
|
assert.Contains(t, sections[2].Content, "These may change without notice.")
|
|
}
|
|
|
|
func TestParseTopic_Good_Unicode(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
content string
|
|
title string
|
|
}{
|
|
{
|
|
name: "CJK characters",
|
|
content: `---
|
|
title: 日本語ドキュメント
|
|
tags: [日本語, ドキュメント]
|
|
---
|
|
|
|
# 日本語ドキュメント
|
|
|
|
はじめにの内容です。
|
|
|
|
## インストール
|
|
|
|
インストール手順はこちら。
|
|
`,
|
|
title: "日本語ドキュメント",
|
|
},
|
|
{
|
|
name: "emoji in title and content",
|
|
content: `---
|
|
title: Rocket Launch 🚀
|
|
tags: [emoji, fun]
|
|
---
|
|
|
|
# Rocket Launch 🚀
|
|
|
|
This topic has emoji 🎉 in the content.
|
|
|
|
## Features ✨
|
|
|
|
- Fast ⚡
|
|
- Reliable 🔒
|
|
`,
|
|
title: "Rocket Launch 🚀",
|
|
},
|
|
{
|
|
name: "diacritics and accented characters",
|
|
content: `---
|
|
title: Présentation Générale
|
|
tags: [français]
|
|
---
|
|
|
|
# Présentation Générale
|
|
|
|
Bienvenue à la documentation. Les données sont protégées.
|
|
|
|
## Résumé
|
|
|
|
Aperçu des fonctionnalités clés.
|
|
`,
|
|
title: "Présentation Générale",
|
|
},
|
|
{
|
|
name: "mixed scripts",
|
|
content: `---
|
|
title: Mixed Скрипты 混合
|
|
---
|
|
|
|
# Mixed Скрипты 混合
|
|
|
|
Content with Кириллица, 中文, العربية, and हिन्दी.
|
|
`,
|
|
title: "Mixed Скрипты 混合",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
topic, err := ParseTopic("unicode.md", []byte(tt.content))
|
|
|
|
require.NoError(t, err)
|
|
assert.Equal(t, tt.title, topic.Title)
|
|
assert.NotEmpty(t, topic.ID)
|
|
assert.True(t, len(topic.Sections) > 0, "should extract sections from unicode content")
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestParseTopic_Good_VeryLongDocument(t *testing.T) {
|
|
// Build a document with 10,000+ lines
|
|
var b strings.Builder
|
|
|
|
b.WriteString("---\ntitle: Massive Document\ntags: [large, stress]\n---\n\n")
|
|
|
|
// Generate 100 sections, each with ~100 lines of content
|
|
for i := range 100 {
|
|
b.WriteString(fmt.Sprintf("## Section %d\n\n", i+1))
|
|
for j := range 100 {
|
|
b.WriteString(fmt.Sprintf("Line %d of section %d: Lorem ipsum dolor sit amet.\n", j+1, i+1))
|
|
}
|
|
b.WriteString("\n")
|
|
}
|
|
|
|
content := b.String()
|
|
lineCount := strings.Count(content, "\n")
|
|
assert.Greater(t, lineCount, 10000, "document should exceed 10K lines")
|
|
|
|
topic, err := ParseTopic("massive.md", []byte(content))
|
|
|
|
require.NoError(t, err)
|
|
assert.Equal(t, "Massive Document", topic.Title)
|
|
assert.Equal(t, "massive-document", topic.ID)
|
|
assert.Len(t, topic.Sections, 100)
|
|
|
|
// Verify first and last sections have correct titles
|
|
assert.Equal(t, "Section 1", topic.Sections[0].Title)
|
|
assert.Equal(t, "Section 100", topic.Sections[99].Title)
|
|
|
|
// Verify content is captured in sections
|
|
assert.Contains(t, topic.Sections[0].Content, "Line 1 of section 1")
|
|
assert.Contains(t, topic.Sections[99].Content, "Line 100 of section 100")
|
|
}
|
|
|
|
func TestExtractSections_Bad_EmptyString(t *testing.T) {
|
|
sections := ExtractSections("")
|
|
assert.Empty(t, sections)
|
|
}
|
|
|
|
func TestExtractSections_Bad_HeadingWithoutSpace(t *testing.T) {
|
|
// "#NoSpace" is not a valid markdown heading (needs space after #)
|
|
content := `#NoSpace
|
|
##AlsoNoSpace
|
|
Some text.
|
|
`
|
|
|
|
sections := ExtractSections(content)
|
|
assert.Empty(t, sections, "headings without space after # should not be parsed")
|
|
}
|
|
|
|
func TestExtractSections_Good_ConsecutiveHeadings(t *testing.T) {
|
|
// Headings with no content between them
|
|
content := `# Title
|
|
## Subtitle
|
|
### Sub-subtitle
|
|
`
|
|
|
|
sections := ExtractSections(content)
|
|
|
|
require.Len(t, sections, 3)
|
|
// First two sections should have empty content
|
|
assert.Equal(t, "", sections[0].Content)
|
|
assert.Equal(t, "", sections[1].Content)
|
|
assert.Equal(t, "", sections[2].Content)
|
|
}
|
|
|
|
func TestGenerateID_Ugly_EmptyString(t *testing.T) {
|
|
result := GenerateID("")
|
|
assert.Equal(t, "", result)
|
|
}
|
|
|
|
func TestGenerateID_Good_OnlySpecialChars(t *testing.T) {
|
|
result := GenerateID("!@#$%^&*()")
|
|
assert.Equal(t, "", result)
|
|
}
|
|
|
|
func TestGenerateID_Good_CJK(t *testing.T) {
|
|
result := GenerateID("日本語テスト")
|
|
assert.NotEmpty(t, result)
|
|
assert.NotContains(t, result, " ")
|
|
}
|
|
|
|
func TestGenerateID_Good_Emoji(t *testing.T) {
|
|
result := GenerateID("Hello 🌍 World")
|
|
// Emoji are not letters or digits, so they are dropped
|
|
assert.Equal(t, "hello-world", result)
|
|
}
|