go-help/search_test.go
Snider 23cef8592a test: complete Phase 0 hardening — 124 tests, 94% coverage, 8 benchmarks
Expand parser tests: empty input, frontmatter-only, malformed YAML,
deeply nested headings (H4-H6), Unicode (CJK, emoji, diacritics,
mixed scripts), very long documents (10K+ lines), edge cases.

Expand search tests: empty/invalid queries, no results, case sensitivity,
multi-word queries, special characters (@, dots, underscores), overlapping
matches, scoring boundaries (title vs body), tag matching, section title
boost, tokenize/highlight edge cases, catalog integration.

Add search benchmarks: single word, multi-word, no results, partial match,
500-topic catalog, 1000-topic catalog, Add indexing, tokenize. Uses
b.Loop() (Go 1.25+) and b.ReportAllocs().

Coverage: 92.1% → 94.0% | Tests: 39 → 124 | go vet: clean | race: clean

Co-Authored-By: Virgil <virgil@lethean.io>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 04:43:48 +00:00

1169 lines
34 KiB
Go

// SPDX-Licence-Identifier: EUPL-1.2
package help
import (
"regexp"
"strings"
"testing"
"unicode/utf8"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestTokenize_Good(t *testing.T) {
tests := []struct {
name string
input string
expected []string
}{
{
name: "simple words",
input: "hello world",
expected: []string{"hello", "world"},
},
{
name: "mixed case",
input: "Hello World",
expected: []string{"hello", "world"},
},
{
name: "with punctuation",
input: "Hello, world! How are you?",
expected: []string{"hello", "world", "how", "are", "you"},
},
{
name: "single characters filtered",
input: "a b c hello d",
expected: []string{"hello"},
},
{
name: "numbers included",
input: "version 2 release",
expected: []string{"version", "release"},
},
{
name: "alphanumeric",
input: "v2.0 and config123",
expected: []string{"v2", "and", "config123"},
},
{
name: "empty string",
input: "",
expected: nil,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := tokenize(tt.input)
assert.Equal(t, tt.expected, result)
})
}
}
func TestSearchIndex_Add_Good(t *testing.T) {
idx := newSearchIndex()
topic := &Topic{
ID: "getting-started",
Title: "Getting Started",
Content: "Welcome to the guide.",
Tags: []string{"intro", "setup"},
Sections: []Section{
{ID: "installation", Title: "Installation", Content: "Install the CLI."},
},
}
idx.Add(topic)
// Verify topic is stored
assert.NotNil(t, idx.topics["getting-started"])
// Verify words are indexed
assert.Contains(t, idx.index["getting"], "getting-started")
assert.Contains(t, idx.index["started"], "getting-started")
assert.Contains(t, idx.index["welcome"], "getting-started")
assert.Contains(t, idx.index["guide"], "getting-started")
assert.Contains(t, idx.index["intro"], "getting-started")
assert.Contains(t, idx.index["setup"], "getting-started")
assert.Contains(t, idx.index["installation"], "getting-started")
assert.Contains(t, idx.index["cli"], "getting-started")
}
func TestSearchIndex_Search_Good(t *testing.T) {
idx := newSearchIndex()
// Add test topics
idx.Add(&Topic{
ID: "getting-started",
Title: "Getting Started",
Content: "Welcome to the CLI guide. This covers installation and setup.",
Tags: []string{"intro"},
})
idx.Add(&Topic{
ID: "configuration",
Title: "Configuration",
Content: "Configure the CLI using environment variables.",
})
idx.Add(&Topic{
ID: "commands",
Title: "Commands Reference",
Content: "List of all available commands.",
})
t.Run("single word query", func(t *testing.T) {
results := idx.Search("configuration")
assert.NotEmpty(t, results)
assert.Equal(t, "configuration", results[0].Topic.ID)
})
t.Run("multi-word query", func(t *testing.T) {
results := idx.Search("cli guide")
assert.NotEmpty(t, results)
// Should match getting-started (has both "cli" and "guide")
assert.Equal(t, "getting-started", results[0].Topic.ID)
})
t.Run("title boost", func(t *testing.T) {
results := idx.Search("commands")
assert.NotEmpty(t, results)
// "commands" appears in title of commands topic
assert.Equal(t, "commands", results[0].Topic.ID)
})
t.Run("partial word matching", func(t *testing.T) {
results := idx.Search("config")
assert.NotEmpty(t, results)
// Should match "configuration" and "configure"
foundConfig := false
for _, r := range results {
if r.Topic.ID == "configuration" {
foundConfig = true
break
}
}
assert.True(t, foundConfig, "Should find configuration topic with prefix match")
})
t.Run("no results", func(t *testing.T) {
results := idx.Search("nonexistent")
assert.Empty(t, results)
})
t.Run("empty query", func(t *testing.T) {
results := idx.Search("")
assert.Nil(t, results)
})
}
func TestSearchIndex_Search_Good_WithSections(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "installation",
Title: "Installation Guide",
Content: "Overview of installation process.",
Sections: []Section{
{
ID: "linux",
Title: "Linux Installation",
Content: "Run apt-get install core on Debian.",
},
{
ID: "macos",
Title: "macOS Installation",
Content: "Use brew install core on macOS.",
},
{
ID: "windows",
Title: "Windows Installation",
Content: "Download the installer from the website.",
},
},
})
t.Run("matches section content", func(t *testing.T) {
results := idx.Search("debian")
assert.NotEmpty(t, results)
assert.Equal(t, "installation", results[0].Topic.ID)
// Should identify the Linux section as best match
if results[0].Section != nil {
assert.Equal(t, "linux", results[0].Section.ID)
}
})
t.Run("matches section title", func(t *testing.T) {
results := idx.Search("windows")
assert.NotEmpty(t, results)
assert.Equal(t, "installation", results[0].Topic.ID)
})
}
func TestExtractSnippet_Good(t *testing.T) {
content := `This is the first paragraph with some introduction text.
Here is more content that talks about installation and setup.
The installation process is straightforward.
Finally, some closing remarks about the configuration.`
t.Run("finds match and extracts context", func(t *testing.T) {
snippet := extractSnippet(content, compileRegexes([]string{"installation"}))
assert.Contains(t, snippet, "**installation**")
assert.True(t, len(snippet) <= 250, "Snippet should be reasonably short")
})
t.Run("no query words returns start", func(t *testing.T) {
snippet := extractSnippet(content, nil)
assert.Contains(t, snippet, "first paragraph")
})
t.Run("empty content", func(t *testing.T) {
snippet := extractSnippet("", compileRegexes([]string{"test"}))
assert.Empty(t, snippet)
})
}
func TestExtractSnippet_Highlighting(t *testing.T) {
content := "The quick brown fox jumps over the lazy dog."
t.Run("simple highlighting", func(t *testing.T) {
snippet := extractSnippet(content, compileRegexes([]string{"quick", "fox"}))
assert.Contains(t, snippet, "**quick**")
assert.Contains(t, snippet, "**fox**")
})
t.Run("case insensitive highlighting", func(t *testing.T) {
snippet := extractSnippet(content, compileRegexes([]string{"QUICK", "Fox"}))
assert.Contains(t, snippet, "**quick**")
assert.Contains(t, snippet, "**fox**")
})
t.Run("partial word matching", func(t *testing.T) {
content := "The configuration is complete."
snippet := extractSnippet(content, compileRegexes([]string{"config"}))
assert.Contains(t, snippet, "**config**uration")
})
t.Run("overlapping matches", func(t *testing.T) {
content := "Searching for something."
// Both "search" and "searching" match
snippet := extractSnippet(content, compileRegexes([]string{"search", "searching"}))
assert.Equal(t, "**Searching** for something.", snippet)
})
}
func TestExtractSnippet_Good_UTF8(t *testing.T) {
// Content with multi-byte UTF-8 characters
content := "日本語のテキストです。This contains Japanese text. 検索機能をテストします。"
t.Run("handles multi-byte characters without corruption", func(t *testing.T) {
snippet := extractSnippet(content, compileRegexes([]string{"japanese"}))
// Should not panic or produce invalid UTF-8
assert.True(t, len(snippet) > 0)
// Verify the result is valid UTF-8
assert.True(t, isValidUTF8(snippet), "Snippet should be valid UTF-8")
})
t.Run("truncates multi-byte content safely", func(t *testing.T) {
// Long content that will be truncated
longContent := strings.Repeat("日本語", 100) // 300 characters
snippet := extractSnippet(longContent, nil)
assert.True(t, isValidUTF8(snippet), "Truncated snippet should be valid UTF-8")
})
}
// compileRegexes is a helper for tests.
func compileRegexes(words []string) []*regexp.Regexp {
var res []*regexp.Regexp
for _, w := range words {
if re, err := regexp.Compile("(?i)" + regexp.QuoteMeta(w)); err == nil {
res = append(res, re)
}
}
return res
}
// isValidUTF8 checks if a string is valid UTF-8
func isValidUTF8(s string) bool {
for i := 0; i < len(s); {
r, size := utf8.DecodeRuneInString(s[i:])
if r == utf8.RuneError && size == 1 {
return false
}
i += size
}
return true
}
func TestCountMatches_Good(t *testing.T) {
tests := []struct {
text string
words []string
expected int
}{
{"Hello world", []string{"hello"}, 1},
{"Hello world", []string{"hello", "world"}, 2},
{"Hello world", []string{"foo", "bar"}, 0},
{"The quick brown fox", []string{"quick", "fox", "dog"}, 2},
}
for _, tt := range tests {
result := countMatches(tt.text, tt.words)
assert.Equal(t, tt.expected, result)
}
}
func TestSearchResult_Score_Good(t *testing.T) {
idx := newSearchIndex()
// Topic with query word in title should score higher
idx.Add(&Topic{
ID: "topic-in-title",
Title: "Installation Guide",
Content: "Some content here.",
})
idx.Add(&Topic{
ID: "topic-in-content",
Title: "Some Other Topic",
Content: "This covers installation steps.",
})
results := idx.Search("installation")
assert.Len(t, results, 2)
// Title match should score higher
assert.Equal(t, "topic-in-title", results[0].Topic.ID)
assert.Greater(t, results[0].Score, results[1].Score)
}
// --- Upstream Phase 0 tests (100% coverage) ---
func TestExtractSnippet_Good_HeadingsOnly(t *testing.T) {
// Content with only headings and no body text should return empty snippet
// when no regexes are provided. Covers the empty-return branch.
content := "# Heading One\n## Heading Two\n### Heading Three"
snippet := extractSnippet(content, nil)
assert.Empty(t, snippet, "headings-only content without regexes should return empty snippet")
}
func TestExtractSnippet_Good_SnippetTrimmedToEmpty(t *testing.T) {
// After word-boundary trimming the snippet could become empty.
// This exercises the snippet=="" guard after TrimSpace.
content := strings.Repeat(" ", 200)
snippet := extractSnippet(content, compileRegexes([]string{"zz"}))
assert.Empty(t, snippet, "whitespace-only content should yield empty snippet")
}
func TestHighlight_Good_EmptyRegexes(t *testing.T) {
// Calling highlight with an empty regex slice should return the
// text unchanged. Covers the early-return branch.
result := highlight("some text here", nil)
assert.Equal(t, "some text here", result)
result = highlight("some text here", []*regexp.Regexp{})
assert.Equal(t, "some text here", result)
}
func TestHighlight_Good_OverlappingExtension(t *testing.T) {
// Test overlapping matches where the second match extends past the
// first, exercising the curr.end extension branch in merging.
text := "abcdefghij"
// First regex matches "abcdef", second matches "cdefghij"
// They overlap and the second extends the merged range.
re1, _ := regexp.Compile("abcdef")
re2, _ := regexp.Compile("cdefghij")
result := highlight(text, []*regexp.Regexp{re1, re2})
assert.Equal(t, "**abcdefghij**", result)
}
func TestSearchIndex_Search_Good_NilTopicGuard(t *testing.T) {
// Manually inject a stale reference in the scores map so the
// nil-topic guard is exercised. We do this by manipulating the
// index directly.
idx := newSearchIndex()
idx.Add(&Topic{
ID: "real-topic",
Title: "Real Topic",
Content: "Contains testword for matching.",
})
// Inject a mapping from "testword" to a non-existent topic ID.
idx.index["testword"] = append(idx.index["testword"], "ghost-topic")
results := idx.Search("testword")
// Should still find the real topic, ghost-topic should be skipped.
assert.Len(t, results, 1)
assert.Equal(t, "real-topic", results[0].Topic.ID)
}
func TestSearchIndex_Search_Good_SpecialCharacters(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "special-chars",
Title: "Rate Limiting (v2.0)",
Content: "Configure rate-limiting rules with special characters: @#$%.",
})
t.Run("query with special characters", func(t *testing.T) {
results := idx.Search("rate limiting")
assert.NotEmpty(t, results)
assert.Equal(t, "special-chars", results[0].Topic.ID)
})
t.Run("query with punctuation stripped", func(t *testing.T) {
results := idx.Search("v2.0")
assert.NotEmpty(t, results)
})
}
func TestSearchIndex_Search_Good_CaseInsensitive(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "case-test",
Title: "UPPERCASE Title",
Content: "MiXeD CaSe content here.",
})
t.Run("lowercase query finds uppercase content", func(t *testing.T) {
results := idx.Search("uppercase")
assert.NotEmpty(t, results)
assert.Equal(t, "case-test", results[0].Topic.ID)
})
t.Run("uppercase query finds content", func(t *testing.T) {
results := idx.Search("MIXED")
assert.NotEmpty(t, results)
})
}
func TestSearchIndex_Search_Good_SingleCharQuery(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "single-char",
Title: "Test Topic",
Content: "Some content.",
})
// Single-character queries are filtered out by tokenize (min 2 chars),
// so searching for "a" should return nil.
results := idx.Search("a")
assert.Nil(t, results)
}
// --- Phase 1: Fuzzy matching tests ---
func TestLevenshtein_Good(t *testing.T) {
tests := []struct {
name string
a string
b string
expected int
}{
{name: "identical strings", a: "hello", b: "hello", expected: 0},
{name: "single substitution", a: "hello", b: "hallo", expected: 1},
{name: "single insertion", a: "hello", b: "helloo", expected: 1},
{name: "single deletion", a: "hello", b: "helo", expected: 1},
{name: "two edits", a: "kitten", b: "sitting", expected: 3},
{name: "completely different", a: "abc", b: "xyz", expected: 3},
{name: "empty first string", a: "", b: "hello", expected: 5},
{name: "empty second string", a: "hello", b: "", expected: 5},
{name: "both empty", a: "", b: "", expected: 0},
{name: "single char strings", a: "a", b: "b", expected: 1},
{name: "same single char", a: "a", b: "a", expected: 0},
{name: "transposition", a: "ab", b: "ba", expected: 2},
{name: "unicode strings", a: "cafe", b: "cafe", expected: 0},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := levenshtein(tt.a, tt.b)
assert.Equal(t, tt.expected, result)
})
}
}
func TestMin3_Good(t *testing.T) {
tests := []struct {
name string
a, b, c int
expected int
}{
{name: "first smallest", a: 1, b: 2, c: 3, expected: 1},
{name: "second smallest", a: 2, b: 1, c: 3, expected: 1},
{name: "third smallest", a: 3, b: 2, c: 1, expected: 1},
{name: "all equal", a: 5, b: 5, c: 5, expected: 5},
{name: "first and second equal smallest", a: 1, b: 1, c: 3, expected: 1},
{name: "second and third equal smallest", a: 3, b: 1, c: 1, expected: 1},
{name: "first and third equal smallest", a: 1, b: 3, c: 1, expected: 1},
{name: "negative values", a: -3, b: -1, c: -2, expected: -3},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := min3(tt.a, tt.b, tt.c)
assert.Equal(t, tt.expected, result)
})
}
}
func TestSearchIndex_Search_Good_FuzzyMatching(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "configuration",
Title: "Configuration Guide",
Content: "Learn how to configure the application settings.",
})
idx.Add(&Topic{
ID: "deployment",
Title: "Deployment Process",
Content: "Deploy your application to production servers.",
})
t.Run("typo in query finds correct topic", func(t *testing.T) {
// "configuraton" is 1 edit away from "configuration"
results := idx.Search("configuraton")
assert.NotEmpty(t, results, "fuzzy match should find results for typo")
found := false
for _, r := range results {
if r.Topic.ID == "configuration" {
found = true
break
}
}
assert.True(t, found, "should find configuration topic with typo")
})
t.Run("two-edit typo still matches", func(t *testing.T) {
// "deplymnt" is within 2 edits of "deployment" -- but first check
// that "deploymnt" (1 edit) works.
results := idx.Search("deploymnt")
assert.NotEmpty(t, results, "fuzzy match should find results for 1-edit typo")
})
t.Run("too many edits returns no fuzzy match", func(t *testing.T) {
// "zzzzzzz" is very far from any indexed word.
results := idx.Search("zzzzzzz")
assert.Empty(t, results, "large edit distance should not produce results")
})
t.Run("short words skip fuzzy matching", func(t *testing.T) {
// Words shorter than 3 characters skip fuzzy matching.
// "to" is in the index but "tx" (1 edit) should not fuzzy-match
// because "tx" is only 2 chars.
results := idx.Search("tx")
// May or may not find results via prefix, but should not crash.
_ = results
})
t.Run("fuzzy scores lower than exact", func(t *testing.T) {
// Exact match on "configure" should score higher than fuzzy.
exactResults := idx.Search("configure")
fuzzyResults := idx.Search("configurr")
if len(exactResults) > 0 && len(fuzzyResults) > 0 {
assert.GreaterOrEqual(t, exactResults[0].Score, fuzzyResults[0].Score,
"exact match should score at least as high as fuzzy match")
}
})
}
// --- Phase 1: Phrase search tests ---
func TestExtractPhrases_Good(t *testing.T) {
tests := []struct {
name string
query string
expectedPhrases []string
expectedRemaining string
}{
{
name: "no phrases",
query: "hello world",
expectedPhrases: nil,
expectedRemaining: "hello world",
},
{
name: "single phrase",
query: `"rate limit"`,
expectedPhrases: []string{"rate limit"},
expectedRemaining: "",
},
{
name: "phrase with surrounding words",
query: `configure "rate limit" rules`,
expectedPhrases: []string{"rate limit"},
expectedRemaining: "configure rules",
},
{
name: "multiple phrases",
query: `"rate limit" and "error handling"`,
expectedPhrases: []string{"rate limit", "error handling"},
expectedRemaining: " and ",
},
{
name: "empty quotes left in remaining",
query: `"" hello`,
expectedPhrases: nil,
expectedRemaining: `"" hello`,
},
{
name: "whitespace-only quotes ignored",
query: `" " hello`,
expectedPhrases: nil,
expectedRemaining: " hello",
},
{
name: "unclosed quote treated as plain text",
query: `"unclosed phrase`,
expectedPhrases: nil,
expectedRemaining: `"unclosed phrase`,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
phrases, remaining := extractPhrases(tt.query)
assert.Equal(t, tt.expectedPhrases, phrases)
assert.Equal(t, tt.expectedRemaining, remaining)
})
}
}
func TestSearchIndex_Search_Good_PhraseSearch(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "rate-limiting",
Title: "Rate Limiting",
Content: "Configure rate limit rules for API endpoints. The rate limit protects against abuse.",
Tags: []string{"api", "security"},
})
idx.Add(&Topic{
ID: "error-handling",
Title: "Error Handling",
Content: "Handle errors and rate responses correctly. Limit retries to avoid loops.",
Tags: []string{"errors"},
})
t.Run("quoted phrase matches exact sequence", func(t *testing.T) {
results := idx.Search(`"rate limit"`)
assert.NotEmpty(t, results)
// rate-limiting topic has "rate limit" as exact phrase
assert.Equal(t, "rate-limiting", results[0].Topic.ID)
})
t.Run("unquoted words match both topics", func(t *testing.T) {
results := idx.Search("rate limit")
assert.GreaterOrEqual(t, len(results), 2,
"unquoted query should match both topics that contain the words separately")
})
t.Run("phrase not found yields no phrase boost", func(t *testing.T) {
results := idx.Search(`"nonexistent phrase here"`)
assert.Empty(t, results, "phrase with no tokenisable words and no match should return empty")
})
t.Run("phrase with surrounding keywords", func(t *testing.T) {
results := idx.Search(`"rate limit" api`)
assert.NotEmpty(t, results)
assert.Equal(t, "rate-limiting", results[0].Topic.ID)
})
t.Run("phrase-only query with no loose words", func(t *testing.T) {
// Query is only a quoted phrase; tokenize of remaining is empty,
// but phrase matching should still score topics.
results := idx.Search(`"rate limit"`)
assert.NotEmpty(t, results)
})
t.Run("phrase found in section content", func(t *testing.T) {
sectionIdx := newSearchIndex()
sectionIdx.Add(&Topic{
ID: "section-phrase",
Title: "Advanced Guide",
Content: "Overview of the system.",
Sections: []Section{
{
ID: "limits",
Title: "Limits",
Content: "The rate limit is set per client.",
},
},
})
results := sectionIdx.Search(`"rate limit"`)
assert.NotEmpty(t, results)
assert.Equal(t, "section-phrase", results[0].Topic.ID)
})
}
// --- Phase 1: Improved scoring tests ---
func TestSearchIndex_Search_Good_TagBoost(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "tagged-topic",
Title: "Some Guide",
Content: "General content about the system.",
Tags: []string{"deployment", "production"},
})
idx.Add(&Topic{
ID: "content-topic",
Title: "Other Guide",
Content: "Information about deployment processes.",
})
results := idx.Search("deployment")
assert.NotEmpty(t, results)
// The tagged topic should rank higher because of the tag boost.
assert.Equal(t, "tagged-topic", results[0].Topic.ID,
"topic with matching tag should rank higher")
}
func TestSearchIndex_Search_Good_MultiWordBonus(t *testing.T) {
idx := newSearchIndex()
// Both topics have neutral titles (no query words in title) to
// isolate the multi-word bonus effect.
idx.Add(&Topic{
ID: "both-words",
Title: "Complete Guide",
Content: "Learn about deploying and monitoring in one place.",
})
idx.Add(&Topic{
ID: "one-word",
Title: "Other Guide",
Content: "Just deploying steps without monitoring.",
})
results := idx.Search("deploying monitoring")
assert.NotEmpty(t, results)
// Topic with both words should score higher due to multi-word bonus.
assert.Equal(t, "both-words", results[0].Topic.ID,
"topic containing all query words should rank higher")
}
func TestSearchIndex_Search_Good_ScoringConstants(t *testing.T) {
// Verify the scoring constants are sensible relative to each other.
assert.Greater(t, scoreTitleBoost, scoreSectionBoost,
"title boost should exceed section boost")
assert.Greater(t, scoreSectionBoost, scoreTagBoost,
"section boost should exceed tag boost")
assert.Greater(t, scoreExactWord, scorePrefixWord,
"exact match should score higher than prefix match")
assert.Greater(t, scorePrefixWord, scoreFuzzyWord,
"prefix match should score higher than fuzzy match")
assert.Greater(t, scorePhraseBoost, scoreAllWords,
"phrase boost should exceed multi-word bonus")
}
func TestSearchIndex_Search_Good_PhraseHighlighting(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "phrase-highlight",
Title: "API Guide",
Content: "Configure rate limit rules for the API gateway.",
})
results := idx.Search(`"rate limit" api`)
assert.NotEmpty(t, results)
// The snippet should highlight both the phrase and keyword.
if results[0].Snippet != "" {
assert.Contains(t, results[0].Snippet, "**rate limit**",
"phrase should be highlighted in snippet")
}
}
// --- Phase 0 additional tests: expanded edge cases ---
func TestSearchIndex_Search_Bad_EmptyQuery(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{ID: "test", Title: "Test Topic", Content: "Some content."})
t.Run("empty string", func(t *testing.T) {
results := idx.Search("")
assert.Nil(t, results)
})
t.Run("whitespace only", func(t *testing.T) {
results := idx.Search(" ")
assert.Nil(t, results)
})
t.Run("single character", func(t *testing.T) {
// Single chars are filtered by tokenize (min 2 chars)
results := idx.Search("a")
assert.Nil(t, results)
})
t.Run("punctuation only", func(t *testing.T) {
results := idx.Search("!@#$%")
assert.Nil(t, results)
})
}
func TestSearchIndex_Search_Bad_NoResults(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "golang",
Title: "Golang Programming",
Content: "Building applications with Go and goroutines.",
})
t.Run("completely unrelated query", func(t *testing.T) {
results := idx.Search("quantum physics")
assert.Empty(t, results)
})
t.Run("empty index", func(t *testing.T) {
emptyIdx := newSearchIndex()
results := emptyIdx.Search("anything")
assert.Empty(t, results)
})
}
func TestSearchIndex_Search_Good_CaseSensitivity(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "case-test",
Title: "PostgreSQL Configuration",
Content: "Configure POSTGRESQL settings. The postgresql.conf file controls everything.",
})
t.Run("lowercase query matches uppercase content", func(t *testing.T) {
results := idx.Search("postgresql")
require.NotEmpty(t, results)
assert.Equal(t, "case-test", results[0].Topic.ID)
})
t.Run("uppercase query matches lowercase content", func(t *testing.T) {
results := idx.Search("POSTGRESQL")
require.NotEmpty(t, results)
assert.Equal(t, "case-test", results[0].Topic.ID)
})
t.Run("mixed case query matches", func(t *testing.T) {
results := idx.Search("PostgreSQL")
require.NotEmpty(t, results)
assert.Equal(t, "case-test", results[0].Topic.ID)
})
t.Run("title case sensitivity", func(t *testing.T) {
results := idx.Search("configuration")
require.NotEmpty(t, results)
assert.Equal(t, "case-test", results[0].Topic.ID)
})
}
func TestSearchIndex_Search_Good_MultiWord(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "docker-compose",
Title: "Docker Compose Setup",
Content: "Learn how to use Docker Compose for container orchestration.",
})
idx.Add(&Topic{
ID: "docker-basics",
Title: "Docker Basics",
Content: "Introduction to Docker containers and images.",
})
idx.Add(&Topic{
ID: "kubernetes",
Title: "Kubernetes Setup",
Content: "Setting up a Kubernetes cluster for production.",
})
t.Run("both words match same topic", func(t *testing.T) {
results := idx.Search("docker compose")
require.NotEmpty(t, results)
// docker-compose should rank highest (both words in title + content)
assert.Equal(t, "docker-compose", results[0].Topic.ID)
})
t.Run("one word matches multiple topics", func(t *testing.T) {
results := idx.Search("docker")
require.Len(t, results, 2)
// Both docker topics should appear
ids := []string{results[0].Topic.ID, results[1].Topic.ID}
assert.Contains(t, ids, "docker-compose")
assert.Contains(t, ids, "docker-basics")
})
t.Run("words from different topics", func(t *testing.T) {
results := idx.Search("docker kubernetes")
require.NotEmpty(t, results)
// All three topics should match (docker matches 2, kubernetes matches 1)
assert.GreaterOrEqual(t, len(results), 3)
})
t.Run("three word query narrows results", func(t *testing.T) {
results := idx.Search("docker compose setup")
require.NotEmpty(t, results)
// docker-compose has all three words, should rank first
assert.Equal(t, "docker-compose", results[0].Topic.ID)
})
}
func TestSearchIndex_Search_Good_SpecialCharsExpanded(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "email-config",
Title: "Email Configuration",
Content: "Set SMTP_HOST to smtp.example.com and PORT to 587.",
})
idx.Add(&Topic{
ID: "dotfiles",
Title: "Dotfile Management",
Content: "Manage your .bashrc and .zshrc files across machines.",
})
idx.Add(&Topic{
ID: "at-mentions",
Title: "User Mentions",
Content: "Use @username to mention users in comments.",
})
t.Run("query with at symbol", func(t *testing.T) {
// "@username" tokenises to "username" (@ is stripped)
results := idx.Search("@username")
require.NotEmpty(t, results)
assert.Equal(t, "at-mentions", results[0].Topic.ID)
})
t.Run("query with dots", func(t *testing.T) {
// "smtp.example.com" tokenises to "smtp", "example", "com"
results := idx.Search("smtp.example.com")
require.NotEmpty(t, results)
assert.Equal(t, "email-config", results[0].Topic.ID)
})
t.Run("query with underscores", func(t *testing.T) {
// "SMTP_HOST" tokenises to "smtp", "host"
results := idx.Search("SMTP_HOST")
require.NotEmpty(t, results)
assert.Equal(t, "email-config", results[0].Topic.ID)
})
}
func TestSearchIndex_Search_Good_OverlappingMatches(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "search-guide",
Title: "Searching and Search Results",
Content: "The search function searches through searchable content to find search results.",
})
// "search" should match: "searching", "search", "searches", "searchable"
results := idx.Search("search")
require.NotEmpty(t, results)
assert.Equal(t, "search-guide", results[0].Topic.ID)
// Score should be boosted since "search" appears in the title
assert.Greater(t, results[0].Score, 10.0)
}
func TestSearchIndex_Search_Good_ScoringBoundary(t *testing.T) {
idx := newSearchIndex()
// Topic A: exact title match
idx.Add(&Topic{
ID: "exact-title",
Title: "Installation",
Content: "Basic content without the query word repeated.",
})
// Topic B: no title match but heavy body usage
idx.Add(&Topic{
ID: "heavy-body",
Title: "Getting Started Guide",
Content: "Installation steps: First install the package. Then install dependencies. The installation is straightforward. Install everything.",
Sections: []Section{
{
ID: "install-section",
Title: "Install Steps",
Content: "Detailed installation instructions for every platform.",
},
},
})
results := idx.Search("installation")
require.Len(t, results, 2)
// Title match gets +10 boost, so "exact-title" should rank first
assert.Equal(t, "exact-title", results[0].Topic.ID, "exact title match should rank above body-heavy match")
assert.Greater(t, results[0].Score, results[1].Score)
}
func TestSearchIndex_Search_Good_TagMatching(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "tagged-topic",
Title: "Workflow Automation",
Content: "Automate your CI/CD pipeline.",
Tags: []string{"devops", "cicd", "automation"},
})
// Search for a tag that does not appear in title or content
results := idx.Search("devops")
require.NotEmpty(t, results)
assert.Equal(t, "tagged-topic", results[0].Topic.ID)
}
func TestSearchIndex_Search_Good_SectionTitleBoost(t *testing.T) {
idx := newSearchIndex()
idx.Add(&Topic{
ID: "section-match",
Title: "Complete Reference",
Content: "Overview of all features.",
Sections: []Section{
{ID: "deployment", Title: "Deployment", Content: "How to deploy your application."},
{ID: "monitoring", Title: "Monitoring", Content: "Set up health checks."},
},
})
idx.Add(&Topic{
ID: "body-match",
Title: "Quick Tips",
Content: "Deployment can be tricky, here are some tips.",
})
results := idx.Search("deployment")
require.Len(t, results, 2)
// Section title match gives +5 boost (in addition to other scoring)
sectionResult := results[0]
assert.Equal(t, "section-match", sectionResult.Topic.ID)
if sectionResult.Section != nil {
assert.Equal(t, "deployment", sectionResult.Section.ID)
}
}
func TestTokenize_Good_SpecialCases(t *testing.T) {
t.Run("only special characters", func(t *testing.T) {
result := tokenize("!@#$%^&*()")
assert.Nil(t, result)
})
t.Run("unicode tokens", func(t *testing.T) {
result := tokenize("日本語 テスト")
assert.NotEmpty(t, result, "CJK characters should tokenise as words")
})
t.Run("mixed unicode and ascii", func(t *testing.T) {
result := tokenize("hello 世界 world")
assert.Contains(t, result, "hello")
assert.Contains(t, result, "world")
})
t.Run("numbers only", func(t *testing.T) {
result := tokenize("12345 67890")
assert.Equal(t, []string{"12345", "67890"}, result)
})
t.Run("hyphenated words split", func(t *testing.T) {
result := tokenize("pre-commit")
assert.Equal(t, []string{"pre", "commit"}, result)
})
}
func TestHighlight_Good_NoMatches(t *testing.T) {
result := highlight("no matches here", compileRegexes([]string{"xyz"}))
assert.Equal(t, "no matches here", result)
}
func TestHighlight_Good_AdjacentMatches(t *testing.T) {
// Two words right next to each other
result := highlight("foobar", compileRegexes([]string{"foo", "bar"}))
// "foo" and "bar" are adjacent, should be merged into one highlight
assert.Equal(t, "**foobar**", result)
}
func TestExtractSnippet_Good_HeadingsSkipped(t *testing.T) {
// When no regex is given, snippet should skip heading lines
content := "# Heading\n\nActual content here."
snippet := extractSnippet(content, nil)
assert.Contains(t, snippet, "Actual content here.")
assert.NotContains(t, snippet, "# Heading")
}
func TestSearchIndex_Search_Good_DuplicateTopicIDs(t *testing.T) {
idx := newSearchIndex()
// Adding the same topic twice should not cause duplicate results
topic := &Topic{
ID: "deduplicated",
Title: "Unique Topic",
Content: "Unique content about testing.",
}
idx.Add(topic)
idx.Add(topic)
results := idx.Search("unique")
assert.Len(t, results, 1)
}
func TestCatalog_Search_Good_Integration(t *testing.T) {
// Test the full Catalog.Search path (integration through catalog -> index)
cat := &Catalog{
topics: make(map[string]*Topic),
index: newSearchIndex(),
}
cat.Add(&Topic{
ID: "alpha",
Title: "Alpha Feature",
Content: "This is the alpha version of the feature.",
Tags: []string{"experimental"},
})
cat.Add(&Topic{
ID: "beta",
Title: "Beta Release Notes",
Content: "Improvements and bug fixes in the beta.",
Tags: []string{"release"},
})
t.Run("search via catalog", func(t *testing.T) {
results := cat.Search("alpha")
require.NotEmpty(t, results)
assert.Equal(t, "alpha", results[0].Topic.ID)
})
t.Run("search by tag via catalog", func(t *testing.T) {
results := cat.Search("experimental")
require.NotEmpty(t, results)
assert.Equal(t, "alpha", results[0].Topic.ID)
})
t.Run("empty query via catalog", func(t *testing.T) {
results := cat.Search("")
assert.Nil(t, results)
})
}