diff --git a/TODO.md b/TODO.md index 11bf617..e8f88c9 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,4 @@ -# TODO.md — go-help +# TODO.md -- go-help Dispatched from core/go orchestration. Pick up tasks in order. @@ -6,22 +6,22 @@ Dispatched from core/go orchestration. Pick up tasks in order. ## Phase 0: Hardening & Test Coverage -- [x] **Expand parser tests** — Parser already at 100%. Existing tests cover: empty input, frontmatter-only, malformed YAML, all heading levels (H1-H6), Unicode content, path-derived IDs. -- [x] **Expand search tests** — Added tests for: empty query, no results, case sensitivity, multi-word queries, special characters, overlapping matches, scoring boundary cases, nil-topic guard, snippet edge cases (headings-only, whitespace-only). -- [x] **Add catalog tests** — Created `catalog_test.go` covering: DefaultCatalog, Add, List, Search, Get (found/not-found), score tie-breaking. -- [x] **Benchmark search** — `BenchmarkSearch` with 150 topics. Baseline: ~745us/op, ~392KB/op, 4114 allocs/op (Ryzen 9 9950X). -- [x] **`go vet ./...` clean** — No warnings. -- [x] **Coverage: 100%** — Up from 92.1%. +- [x] **Expand parser tests** -- Parser at 100%. Tests cover: empty input, frontmatter-only, malformed YAML (3 variants), frontmatter not at start, deeply nested headings (H4-H6 with content), Unicode (CJK, emoji, diacritics, mixed scripts), 10K+ line document, empty sections, headings without space, consecutive headings, GenerateID edge cases, path-derived IDs. +- [x] **Expand search tests** -- Added tests for: empty query (4 variants), no results (3 variants), case sensitivity (4 variants), multi-word queries (4 variants), special characters (@, dots, underscores), overlapping matches, scoring boundary cases, nil-topic guard, snippet edge cases (headings-only, whitespace-only), duplicate topic IDs, catalog integration. 
+- [x] **Add catalog tests** -- Created `catalog_test.go` covering: DefaultCatalog, Add, List, Search, Get (found/not-found), score tie-breaking. +- [x] **Benchmark search** -- `search_bench_test.go` with 8 benchmarks: single word, multi-word, no results, partial match, 500-topic catalog, 1000-topic catalog, Add indexing, tokenize. Uses `b.Loop()` (Go 1.24+) and `b.ReportAllocs()`. +- [x] **`go vet ./...` clean** -- No warnings. +- [x] **Coverage: 100%** -- Up from 92.1%. ## Phase 1: Search Improvements -- [x] **Fuzzy matching** — Levenshtein distance with max edit distance of 2. Words under 3 chars skip fuzzy. Score: +0.3 per fuzzy match (below prefix +0.5 and exact +1.0). +- [x] **Fuzzy matching** -- Levenshtein distance with max edit distance of 2. Words under 3 chars skip fuzzy. Score: +0.3 per fuzzy match (below prefix +0.5 and exact +1.0). - [ ] Add stemming support for English search terms -- [x] **Phrase search** — Quoted multi-word queries via `extractPhrases()`. Phrase boost: +8.0. Searches title, content, and section content. -- [x] **Improved scoring weights** — Named constants: title +10, section +5, tag +3, phrase +8, all-words bonus +2, exact +1.0, prefix +0.5, fuzzy +0.3. -- [x] **Tag boost** — Query words matching tags add +3.0 per matching tag. +- [x] **Phrase search** -- Quoted multi-word queries via `extractPhrases()`. Phrase boost: +8.0. Searches title, content, and section content. +- [x] **Improved scoring weights** -- Named constants: title +10, section +5, tag +3, phrase +8, all-words bonus +2, exact +1.0, prefix +0.5, fuzzy +0.3. +- [x] **Tag boost** -- Query words matching tags add +3.0 per matching tag. 
+- [x] **Multi-word bonus** -- All query words present in topic adds +2.0. +- [x] **Tests for all new features** -- Levenshtein, min3, extractPhrases, fuzzy search, phrase search, tag boost, multi-word bonus, scoring constants, phrase highlighting, section phrase matching. ## Phase 2: core.help Integration diff --git a/parser_test.go b/parser_test.go index b95cadc..1ca0b46 100644 --- a/parser_test.go +++ b/parser_test.go @@ -1,9 +1,13 @@ +// SPDX-Licence-Identifier: EUPL-1.2 package help import ( + "fmt" + "strings" "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestGenerateID_Good(t *testing.T) { @@ -337,3 +341,356 @@ func TestPathToTitle_Good(t *testing.T) { }) } } + +// --- Phase 0: Expanded parser tests --- + +func TestParseTopic_Good_EmptyInput(t *testing.T) { + // Empty byte slice should produce a valid topic with no content + topic, err := ParseTopic("empty.md", []byte("")) + + require.NoError(t, err) + assert.NotNil(t, topic) + assert.Equal(t, "empty", topic.ID) + assert.Equal(t, "", topic.Title) + assert.Equal(t, "", topic.Content) + assert.Empty(t, topic.Sections) + assert.Empty(t, topic.Tags) + assert.Empty(t, topic.Related) +} + +func TestParseTopic_Good_FrontmatterOnly(t *testing.T) { + // Frontmatter with no body or sections + content := []byte(`--- +title: Metadata Only +tags: [meta] +order: 99 +--- +`) + + topic, err := ParseTopic("meta.md", content) + + require.NoError(t, err) + assert.Equal(t, "metadata-only", topic.ID) + assert.Equal(t, "Metadata Only", topic.Title) + assert.Equal(t, []string{"meta"}, topic.Tags) + assert.Equal(t, 99, topic.Order) + assert.Empty(t, topic.Sections) + // Body after frontmatter is just a newline + assert.Equal(t, "", strings.TrimSpace(topic.Content)) +} + +func TestExtractFrontmatter_Bad_MalformedYAML(t *testing.T) { + tests := []struct { + name string + content string + }{ + { + name: "unclosed bracket", + content: `--- +title: [broken +tags: [also broken +--- + +# 
Content`, + }, + { + name: "tab indentation error", + content: "---\ntitle: Good\n\t- bad indent\n---\n\n# Content", + }, + { + name: "duplicate keys with conflicting types", + // YAML spec allows duplicate keys but implementations may vary; + // this tests that the parser does not panic regardless. + content: `--- +title: First +title: + nested: value +--- + +# Content`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fm, body := ExtractFrontmatter(tt.content) + // Malformed YAML should return nil frontmatter without panic + if fm == nil { + // Body should be original content when YAML fails + assert.Equal(t, tt.content, body) + } + // No panic is the key assertion — test reaching here is success + }) + } +} + +func TestExtractFrontmatter_Bad_NotAtStart(t *testing.T) { + // Frontmatter delimiters that do not start at the beginning of the file + content := `Some preamble text. + +--- +title: Should Not Parse +--- + +# Content` + + fm, body := ExtractFrontmatter(content) + + assert.Nil(t, fm) + assert.Equal(t, content, body) +} + +func TestExtractSections_Good_DeeplyNested(t *testing.T) { + content := `# Level 1 + +Top-level content. + +## Level 2 + +Second level. + +### Level 3 + +Third level. + +#### Level 4 + +Fourth level details. + +##### Level 5 + +Fifth level fine print. + +###### Level 6 + +Deepest heading level. 
+` + + sections := ExtractSections(content) + + require.Len(t, sections, 6) + + for i, expected := range []struct { + level int + title string + }{ + {1, "Level 1"}, + {2, "Level 2"}, + {3, "Level 3"}, + {4, "Level 4"}, + {5, "Level 5"}, + {6, "Level 6"}, + } { + assert.Equal(t, expected.level, sections[i].Level, "section %d level", i) + assert.Equal(t, expected.title, sections[i].Title, "section %d title", i) + } + + // Verify content is associated with correct sections + assert.Contains(t, sections[0].Content, "Top-level content.") + assert.Contains(t, sections[3].Content, "Fourth level details.") + assert.Contains(t, sections[5].Content, "Deepest heading level.") +} + +func TestExtractSections_Good_DeeplyNestedWithContent(t *testing.T) { + // H4, H5, H6 with meaningful content under each + content := `#### Configuration Options + +Set these in your config file. + +##### Advanced Options + +Only for power users. + +###### Experimental Flags + +These may change without notice. +` + + sections := ExtractSections(content) + + require.Len(t, sections, 3) + assert.Equal(t, 4, sections[0].Level) + assert.Equal(t, "Configuration Options", sections[0].Title) + assert.Contains(t, sections[0].Content, "Set these in your config file.") + + assert.Equal(t, 5, sections[1].Level) + assert.Equal(t, "Advanced Options", sections[1].Title) + assert.Contains(t, sections[1].Content, "Only for power users.") + + assert.Equal(t, 6, sections[2].Level) + assert.Equal(t, "Experimental Flags", sections[2].Title) + assert.Contains(t, sections[2].Content, "These may change without notice.") +} + +func TestParseTopic_Good_Unicode(t *testing.T) { + tests := []struct { + name string + content string + title string + }{ + { + name: "CJK characters", + content: `--- +title: 日本語ドキュメント +tags: [日本語, ドキュメント] +--- + +# 日本語ドキュメント + +はじめにの内容です。 + +## インストール + +インストール手順はこちら。 +`, + title: "日本語ドキュメント", + }, + { + name: "emoji in title and content", + content: `--- +title: Rocket Launch 🚀 +tags: [emoji, 
fun] +--- + +# Rocket Launch 🚀 + +This topic has emoji 🎉 in the content. + +## Features ✨ + +- Fast ⚡ +- Reliable 🔒 +`, + title: "Rocket Launch 🚀", + }, + { + name: "diacritics and accented characters", + content: `--- +title: Présentation Générale +tags: [français] +--- + +# Présentation Générale + +Bienvenue à la documentation. Les données sont protégées. + +## Résumé + +Aperçu des fonctionnalités clés. +`, + title: "Présentation Générale", + }, + { + name: "mixed scripts", + content: `--- +title: Mixed Скрипты 混合 +--- + +# Mixed Скрипты 混合 + +Content with Кириллица, 中文, العربية, and हिन्दी. +`, + title: "Mixed Скрипты 混合", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + topic, err := ParseTopic("unicode.md", []byte(tt.content)) + + require.NoError(t, err) + assert.Equal(t, tt.title, topic.Title) + assert.NotEmpty(t, topic.ID) + assert.True(t, len(topic.Sections) > 0, "should extract sections from unicode content") + }) + } +} + +func TestParseTopic_Good_VeryLongDocument(t *testing.T) { + // Build a document with 10,000+ lines + var b strings.Builder + + b.WriteString("---\ntitle: Massive Document\ntags: [large, stress]\n---\n\n") + + // Generate 100 sections, each with ~100 lines of content + for i := range 100 { + b.WriteString(fmt.Sprintf("## Section %d\n\n", i+1)) + for j := range 100 { + b.WriteString(fmt.Sprintf("Line %d of section %d: Lorem ipsum dolor sit amet.\n", j+1, i+1)) + } + b.WriteString("\n") + } + + content := b.String() + lineCount := strings.Count(content, "\n") + assert.Greater(t, lineCount, 10000, "document should exceed 10K lines") + + topic, err := ParseTopic("massive.md", []byte(content)) + + require.NoError(t, err) + assert.Equal(t, "Massive Document", topic.Title) + assert.Equal(t, "massive-document", topic.ID) + assert.Len(t, topic.Sections, 100) + + // Verify first and last sections have correct titles + assert.Equal(t, "Section 1", topic.Sections[0].Title) + assert.Equal(t, "Section 100", 
topic.Sections[99].Title) + + // Verify content is captured in sections + assert.Contains(t, topic.Sections[0].Content, "Line 1 of section 1") + assert.Contains(t, topic.Sections[99].Content, "Line 100 of section 100") +} + +func TestExtractSections_Bad_EmptyString(t *testing.T) { + sections := ExtractSections("") + assert.Empty(t, sections) +} + +func TestExtractSections_Bad_HeadingWithoutSpace(t *testing.T) { + // "#NoSpace" is not a valid markdown heading (needs space after #) + content := `#NoSpace +##AlsoNoSpace +Some text. +` + + sections := ExtractSections(content) + assert.Empty(t, sections, "headings without space after # should not be parsed") +} + +func TestExtractSections_Good_ConsecutiveHeadings(t *testing.T) { + // Headings with no content between them + content := `# Title +## Subtitle +### Sub-subtitle +` + + sections := ExtractSections(content) + + require.Len(t, sections, 3) + // First two sections should have empty content + assert.Equal(t, "", sections[0].Content) + assert.Equal(t, "", sections[1].Content) + assert.Equal(t, "", sections[2].Content) +} + +func TestGenerateID_Ugly_EmptyString(t *testing.T) { + result := GenerateID("") + assert.Equal(t, "", result) +} + +func TestGenerateID_Good_OnlySpecialChars(t *testing.T) { + result := GenerateID("!@#$%^&*()") + assert.Equal(t, "", result) +} + +func TestGenerateID_Good_CJK(t *testing.T) { + result := GenerateID("日本語テスト") + assert.NotEmpty(t, result) + assert.NotContains(t, result, " ") +} + +func TestGenerateID_Good_Emoji(t *testing.T) { + result := GenerateID("Hello 🌍 World") + // Emoji are not letters or digits, so they are dropped + assert.Equal(t, "hello-world", result) +} diff --git a/search_bench_test.go b/search_bench_test.go new file mode 100644 index 0000000..8d6fe3c --- /dev/null +++ b/search_bench_test.go @@ -0,0 +1,176 @@ +// SPDX-Licence-Identifier: EUPL-1.2 +package help + +import ( + "fmt" + "strings" + "testing" +) + +// titleCase capitalises the first letter of a string. 
+// Used in benchmarks to avoid deprecated strings.Title. +func titleCase(s string) string { + if len(s) == 0 { + return s + } + return strings.ToUpper(s[:1]) + s[1:] +} + +// buildLargeCatalog creates a search index with n topics for benchmarking. +// Each topic has a title, content with multiple paragraphs, sections, and tags. +func buildLargeCatalog(n int) *searchIndex { + idx := newSearchIndex() + + // Word pools for generating varied content + subjects := []string{ + "configuration", "deployment", "monitoring", "testing", "debugging", + "authentication", "authorisation", "networking", "storage", "logging", + "caching", "scheduling", "routing", "migration", "backup", + "encryption", "compression", "validation", "serialisation", "templating", + } + verbs := []string{ + "install", "configure", "deploy", "monitor", "debug", + "authenticate", "authorise", "connect", "store", "analyse", + "cache", "schedule", "route", "migrate", "restore", + } + adjectives := []string{ + "advanced", "basic", "custom", "distributed", "encrypted", + "federated", "graceful", "hybrid", "incremental", "just-in-time", + } + + for i := range n { + subj := subjects[i%len(subjects)] + verb := verbs[i%len(verbs)] + adj := adjectives[i%len(adjectives)] + + title := fmt.Sprintf("%s %s Guide %d", titleCase(adj), titleCase(subj), i) + content := fmt.Sprintf( + "This guide covers how to %s %s %s systems. "+ + "It includes step-by-step instructions for setting up %s "+ + "in both development and production environments. "+ + "The %s process requires careful planning and %s tools. 
"+ + "Make sure to review the prerequisites before starting.", + verb, adj, subj, subj, subj, adj, + ) + + sections := []Section{ + { + ID: fmt.Sprintf("overview-%d", i), + Title: "Overview", + Content: fmt.Sprintf("An overview of %s %s patterns and best practices.", adj, subj), + }, + { + ID: fmt.Sprintf("setup-%d", i), + Title: fmt.Sprintf("%s Setup", titleCase(subj)), + Content: fmt.Sprintf("Detailed setup instructions for %s. Run the %s command to begin.", subj, verb), + }, + { + ID: fmt.Sprintf("troubleshooting-%d", i), + Title: "Troubleshooting", + Content: fmt.Sprintf("Common issues when working with %s and how to resolve them.", subj), + }, + } + + idx.Add(&Topic{ + ID: fmt.Sprintf("%s-%s-%d", adj, subj, i), + Title: title, + Content: content, + Sections: sections, + Tags: []string{subj, adj, verb, "guide"}, + }) + } + + return idx +} + +func BenchmarkSearch_SingleWord(b *testing.B) { + idx := buildLargeCatalog(200) + b.ReportAllocs() + b.ResetTimer() + + for b.Loop() { + idx.Search("configuration") + } +} + +func BenchmarkSearch_MultiWord(b *testing.B) { + idx := buildLargeCatalog(200) + b.ReportAllocs() + b.ResetTimer() + + for b.Loop() { + idx.Search("advanced deployment guide") + } +} + +func BenchmarkSearch_NoResults(b *testing.B) { + idx := buildLargeCatalog(200) + b.ReportAllocs() + b.ResetTimer() + + for b.Loop() { + idx.Search("xylophone") + } +} + +func BenchmarkSearch_PartialMatch(b *testing.B) { + idx := buildLargeCatalog(200) + b.ReportAllocs() + b.ResetTimer() + + for b.Loop() { + idx.Search("config") + } +} + +func BenchmarkSearch_LargeCatalog500(b *testing.B) { + idx := buildLargeCatalog(500) + b.ReportAllocs() + b.ResetTimer() + + for b.Loop() { + idx.Search("deployment monitoring") + } +} + +func BenchmarkSearch_LargeCatalog1000(b *testing.B) { + idx := buildLargeCatalog(1000) + b.ReportAllocs() + b.ResetTimer() + + for b.Loop() { + idx.Search("testing guide") + } +} + +func BenchmarkSearchIndex_Add(b *testing.B) { + // Benchmark the 
indexing/add path + topic := &Topic{ + ID: "bench-topic", + Title: "Benchmark Topic Title", + Content: "This is benchmark content with several words for indexing purposes.", + Tags: []string{"bench", "performance"}, + Sections: []Section{ + {ID: "s1", Title: "First Section", Content: "Section content for benchmarking."}, + {ID: "s2", Title: "Second Section", Content: "More section content here."}, + }, + } + + b.ReportAllocs() + b.ResetTimer() + + for b.Loop() { + idx := newSearchIndex() + idx.Add(topic) + } +} + +func BenchmarkTokenize(b *testing.B) { + text := "The quick brown fox jumps over the lazy dog. Configuration and deployment are covered in detail." + b.ReportAllocs() + b.ResetTimer() + + for b.Loop() { + tokenize(text) + } +} diff --git a/search_test.go b/search_test.go index f6fd074..f122e4b 100644 --- a/search_test.go +++ b/search_test.go @@ -1,3 +1,4 @@ +// SPDX-Licence-Identifier: EUPL-1.2 package help import ( @@ -7,6 +8,7 @@ import ( "unicode/utf8" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestTokenize_Good(t *testing.T) { @@ -339,6 +341,8 @@ func TestSearchResult_Score_Good(t *testing.T) { assert.Greater(t, results[0].Score, results[1].Score) } +// --- Upstream Phase 0 tests (100% coverage) --- + func TestExtractSnippet_Good_HeadingsOnly(t *testing.T) { // Content with only headings and no body text should return empty snippet // when no regexes are provided. Covers the empty-return branch. @@ -545,7 +549,7 @@ func TestSearchIndex_Search_Good_FuzzyMatching(t *testing.T) { }) t.Run("two-edit typo still matches", func(t *testing.T) { - // "deplymnt" is within 2 edits of "deployment" — but first check + // "deplymnt" is within 2 edits of "deployment" -- but first check // that "deploymnt" (1 edit) works. 
results := idx.Search("deploymnt") assert.NotEmpty(t, results, "fuzzy match should find results for 1-edit typo") @@ -792,3 +796,374 @@ func TestSearchIndex_Search_Good_PhraseHighlighting(t *testing.T) { "phrase should be highlighted in snippet") } } + +// --- Phase 0 additional tests: expanded edge cases --- + +func TestSearchIndex_Search_Bad_EmptyQuery(t *testing.T) { + idx := newSearchIndex() + idx.Add(&Topic{ID: "test", Title: "Test Topic", Content: "Some content."}) + + t.Run("empty string", func(t *testing.T) { + results := idx.Search("") + assert.Nil(t, results) + }) + + t.Run("whitespace only", func(t *testing.T) { + results := idx.Search(" ") + assert.Nil(t, results) + }) + + t.Run("single character", func(t *testing.T) { + // Single chars are filtered by tokenize (min 2 chars) + results := idx.Search("a") + assert.Nil(t, results) + }) + + t.Run("punctuation only", func(t *testing.T) { + results := idx.Search("!@#$%") + assert.Nil(t, results) + }) +} + +func TestSearchIndex_Search_Bad_NoResults(t *testing.T) { + idx := newSearchIndex() + + idx.Add(&Topic{ + ID: "golang", + Title: "Golang Programming", + Content: "Building applications with Go and goroutines.", + }) + + t.Run("completely unrelated query", func(t *testing.T) { + results := idx.Search("quantum physics") + assert.Empty(t, results) + }) + + t.Run("empty index", func(t *testing.T) { + emptyIdx := newSearchIndex() + results := emptyIdx.Search("anything") + assert.Empty(t, results) + }) +} + +func TestSearchIndex_Search_Good_CaseSensitivity(t *testing.T) { + idx := newSearchIndex() + + idx.Add(&Topic{ + ID: "case-test", + Title: "PostgreSQL Configuration", + Content: "Configure POSTGRESQL settings. 
The postgresql.conf file controls everything.", + }) + + t.Run("lowercase query matches uppercase content", func(t *testing.T) { + results := idx.Search("postgresql") + require.NotEmpty(t, results) + assert.Equal(t, "case-test", results[0].Topic.ID) + }) + + t.Run("uppercase query matches lowercase content", func(t *testing.T) { + results := idx.Search("POSTGRESQL") + require.NotEmpty(t, results) + assert.Equal(t, "case-test", results[0].Topic.ID) + }) + + t.Run("mixed case query matches", func(t *testing.T) { + results := idx.Search("PostgreSQL") + require.NotEmpty(t, results) + assert.Equal(t, "case-test", results[0].Topic.ID) + }) + + t.Run("title case sensitivity", func(t *testing.T) { + results := idx.Search("configuration") + require.NotEmpty(t, results) + assert.Equal(t, "case-test", results[0].Topic.ID) + }) +} + +func TestSearchIndex_Search_Good_MultiWord(t *testing.T) { + idx := newSearchIndex() + + idx.Add(&Topic{ + ID: "docker-compose", + Title: "Docker Compose Setup", + Content: "Learn how to use Docker Compose for container orchestration.", + }) + idx.Add(&Topic{ + ID: "docker-basics", + Title: "Docker Basics", + Content: "Introduction to Docker containers and images.", + }) + idx.Add(&Topic{ + ID: "kubernetes", + Title: "Kubernetes Setup", + Content: "Setting up a Kubernetes cluster for production.", + }) + + t.Run("both words match same topic", func(t *testing.T) { + results := idx.Search("docker compose") + require.NotEmpty(t, results) + // docker-compose should rank highest (both words in title + content) + assert.Equal(t, "docker-compose", results[0].Topic.ID) + }) + + t.Run("one word matches multiple topics", func(t *testing.T) { + results := idx.Search("docker") + require.Len(t, results, 2) + // Both docker topics should appear + ids := []string{results[0].Topic.ID, results[1].Topic.ID} + assert.Contains(t, ids, "docker-compose") + assert.Contains(t, ids, "docker-basics") + }) + + t.Run("words from different topics", func(t *testing.T) { + 
results := idx.Search("docker kubernetes") + require.NotEmpty(t, results) + // All three topics should match (docker matches 2, kubernetes matches 1) + assert.GreaterOrEqual(t, len(results), 3) + }) + + t.Run("three word query narrows results", func(t *testing.T) { + results := idx.Search("docker compose setup") + require.NotEmpty(t, results) + // docker-compose has all three words, should rank first + assert.Equal(t, "docker-compose", results[0].Topic.ID) + }) +} + +func TestSearchIndex_Search_Good_SpecialCharsExpanded(t *testing.T) { + idx := newSearchIndex() + + idx.Add(&Topic{ + ID: "email-config", + Title: "Email Configuration", + Content: "Set SMTP_HOST to smtp.example.com and PORT to 587.", + }) + idx.Add(&Topic{ + ID: "dotfiles", + Title: "Dotfile Management", + Content: "Manage your .bashrc and .zshrc files across machines.", + }) + idx.Add(&Topic{ + ID: "at-mentions", + Title: "User Mentions", + Content: "Use @username to mention users in comments.", + }) + + t.Run("query with at symbol", func(t *testing.T) { + // "@username" tokenises to "username" (@ is stripped) + results := idx.Search("@username") + require.NotEmpty(t, results) + assert.Equal(t, "at-mentions", results[0].Topic.ID) + }) + + t.Run("query with dots", func(t *testing.T) { + // "smtp.example.com" tokenises to "smtp", "example", "com" + results := idx.Search("smtp.example.com") + require.NotEmpty(t, results) + assert.Equal(t, "email-config", results[0].Topic.ID) + }) + + t.Run("query with underscores", func(t *testing.T) { + // "SMTP_HOST" tokenises to "smtp", "host" + results := idx.Search("SMTP_HOST") + require.NotEmpty(t, results) + assert.Equal(t, "email-config", results[0].Topic.ID) + }) +} + +func TestSearchIndex_Search_Good_OverlappingMatches(t *testing.T) { + idx := newSearchIndex() + + idx.Add(&Topic{ + ID: "search-guide", + Title: "Searching and Search Results", + Content: "The search function searches through searchable content to find search results.", + }) + + // "search" 
should match: "searching", "search", "searches", "searchable" + results := idx.Search("search") + require.NotEmpty(t, results) + assert.Equal(t, "search-guide", results[0].Topic.ID) + // Score should be boosted since "search" appears in the title + assert.Greater(t, results[0].Score, 10.0) +} + +func TestSearchIndex_Search_Good_ScoringBoundary(t *testing.T) { + idx := newSearchIndex() + + // Topic A: exact title match + idx.Add(&Topic{ + ID: "exact-title", + Title: "Installation", + Content: "Basic content without the query word repeated.", + }) + + // Topic B: no title match but heavy body usage + idx.Add(&Topic{ + ID: "heavy-body", + Title: "Getting Started Guide", + Content: "Installation steps: First install the package. Then install dependencies. The installation is straightforward. Install everything.", + Sections: []Section{ + { + ID: "install-section", + Title: "Install Steps", + Content: "Detailed installation instructions for every platform.", + }, + }, + }) + + results := idx.Search("installation") + require.Len(t, results, 2) + + // Title match gets +10 boost, so "exact-title" should rank first + assert.Equal(t, "exact-title", results[0].Topic.ID, "exact title match should rank above body-heavy match") + assert.Greater(t, results[0].Score, results[1].Score) +} + +func TestSearchIndex_Search_Good_TagMatching(t *testing.T) { + idx := newSearchIndex() + + idx.Add(&Topic{ + ID: "tagged-topic", + Title: "Workflow Automation", + Content: "Automate your CI/CD pipeline.", + Tags: []string{"devops", "cicd", "automation"}, + }) + + // Search for a tag that does not appear in title or content + results := idx.Search("devops") + require.NotEmpty(t, results) + assert.Equal(t, "tagged-topic", results[0].Topic.ID) +} + +func TestSearchIndex_Search_Good_SectionTitleBoost(t *testing.T) { + idx := newSearchIndex() + + idx.Add(&Topic{ + ID: "section-match", + Title: "Complete Reference", + Content: "Overview of all features.", + Sections: []Section{ + {ID: "deployment", 
Title: "Deployment", Content: "How to deploy your application."}, + {ID: "monitoring", Title: "Monitoring", Content: "Set up health checks."}, + }, + }) + + idx.Add(&Topic{ + ID: "body-match", + Title: "Quick Tips", + Content: "Deployment can be tricky, here are some tips.", + }) + + results := idx.Search("deployment") + require.Len(t, results, 2) + + // Section title match gives +5 boost (in addition to other scoring) + sectionResult := results[0] + assert.Equal(t, "section-match", sectionResult.Topic.ID) + if sectionResult.Section != nil { + assert.Equal(t, "deployment", sectionResult.Section.ID) + } +} + +func TestTokenize_Good_SpecialCases(t *testing.T) { + t.Run("only special characters", func(t *testing.T) { + result := tokenize("!@#$%^&*()") + assert.Nil(t, result) + }) + + t.Run("unicode tokens", func(t *testing.T) { + result := tokenize("日本語 テスト") + assert.NotEmpty(t, result, "CJK characters should tokenise as words") + }) + + t.Run("mixed unicode and ascii", func(t *testing.T) { + result := tokenize("hello 世界 world") + assert.Contains(t, result, "hello") + assert.Contains(t, result, "world") + }) + + t.Run("numbers only", func(t *testing.T) { + result := tokenize("12345 67890") + assert.Equal(t, []string{"12345", "67890"}, result) + }) + + t.Run("hyphenated words split", func(t *testing.T) { + result := tokenize("pre-commit") + assert.Equal(t, []string{"pre", "commit"}, result) + }) +} + +func TestHighlight_Good_NoMatches(t *testing.T) { + result := highlight("no matches here", compileRegexes([]string{"xyz"})) + assert.Equal(t, "no matches here", result) +} + +func TestHighlight_Good_AdjacentMatches(t *testing.T) { + // Two words right next to each other + result := highlight("foobar", compileRegexes([]string{"foo", "bar"})) + // "foo" and "bar" are adjacent, should be merged into one highlight + assert.Equal(t, "**foobar**", result) +} + +func TestExtractSnippet_Good_HeadingsSkipped(t *testing.T) { + // When no regex is given, snippet should skip heading 
lines + content := "# Heading\n\nActual content here." + snippet := extractSnippet(content, nil) + assert.Contains(t, snippet, "Actual content here.") + assert.NotContains(t, snippet, "# Heading") +} + +func TestSearchIndex_Search_Good_DuplicateTopicIDs(t *testing.T) { + idx := newSearchIndex() + + // Adding the same topic twice should not cause duplicate results + topic := &Topic{ + ID: "deduplicated", + Title: "Unique Topic", + Content: "Unique content about testing.", + } + idx.Add(topic) + idx.Add(topic) + + results := idx.Search("unique") + assert.Len(t, results, 1) +} + +func TestCatalog_Search_Good_Integration(t *testing.T) { + // Test the full Catalog.Search path (integration through catalog -> index) + cat := &Catalog{ + topics: make(map[string]*Topic), + index: newSearchIndex(), + } + + cat.Add(&Topic{ + ID: "alpha", + Title: "Alpha Feature", + Content: "This is the alpha version of the feature.", + Tags: []string{"experimental"}, + }) + cat.Add(&Topic{ + ID: "beta", + Title: "Beta Release Notes", + Content: "Improvements and bug fixes in the beta.", + Tags: []string{"release"}, + }) + + t.Run("search via catalog", func(t *testing.T) { + results := cat.Search("alpha") + require.NotEmpty(t, results) + assert.Equal(t, "alpha", results[0].Topic.ID) + }) + + t.Run("search by tag via catalog", func(t *testing.T) { + results := cat.Search("experimental") + require.NotEmpty(t, results) + assert.Equal(t, "alpha", results[0].Topic.ID) + }) + + t.Run("empty query via catalog", func(t *testing.T) { + results := cat.Search("") + assert.Nil(t, results) + }) +}