diff --git a/TODO.md b/TODO.md index 5412a00..081b08f 100644 --- a/TODO.md +++ b/TODO.md @@ -16,7 +16,7 @@ Dispatched from core/go orchestration. Pick up tasks in order. ## Phase 1: Search Improvements - [x] **Fuzzy matching** -- Levenshtein distance with max edit distance of 2. Words under 3 chars skip fuzzy. Score: +0.3 per fuzzy match (below prefix +0.5 and exact +1.0). -- [ ] **English stemming** — Add a lightweight Porter-style stemmer for English search terms. Pure Go, no external deps. +- [x] **English stemming** — Add a lightweight Porter-style stemmer for English search terms. Pure Go, no external deps. - **Create `stemmer.go`** — Implement `stem(word string) string` covering the most impactful English suffix rules: - Step 1: Plurals and -ed/-ing forms (`-sses` → `-ss`, `-ies` → `-i`, `-s` → `""`, `-eed` → `-ee`, `-ed` → `""`, `-ing` → `""`) - Step 2: Derivational suffixes (`-ational` → `-ate`, `-tional` → `-tion`, `-fulness` → `-ful`, `-ness` → `""`, `-ment` → `""`, `-ation` → `-ate`, `-ously` → `-ous`, `-ively` → `-ive`, `-ably` → `-able`, `-ally` → `-al`, `-izer` → `-ize`, `-ingly` → `-ing`) diff --git a/search.go b/search.go index b11b98f..3c89684 100644 --- a/search.go +++ b/search.go @@ -12,6 +12,7 @@ const ( scoreExactWord = 1.0 // Exact word match in the index scorePrefixWord = 0.5 // Prefix/partial word match scoreFuzzyWord = 0.3 // Fuzzy (Levenshtein) match + scoreStemWord = 0.7 // Stemmed word match (between exact and prefix) scoreTitleBoost = 10.0 // Query word appears in topic title scoreSectionBoost = 5.0 // Query word appears in section title scoreTagBoost = 3.0 // Query word appears in topic tags @@ -102,11 +103,25 @@ func (i *searchIndex) Search(query string) []*SearchResult { // Track scores per topic scores := make(map[string]float64) + // Build set of stemmed query variants for stem-aware scoring. 
+ stemmedWords := make(map[string]bool) for _, word := range queryWords { - // Exact matches + if s := stem(word); s != word { + stemmedWords[s] = true + } + } + + for _, word := range queryWords { + isStem := stemmedWords[word] + + // Exact matches — score stems lower than raw words. if topicIDs, ok := i.index[word]; ok { + sc := scoreExactWord + if isStem { + sc = scoreStemWord + } for _, topicID := range topicIDs { - scores[topicID] += scoreExactWord + scores[topicID] += sc } } @@ -363,26 +378,35 @@ func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string, res []*re } // tokenize splits text into lowercase words for indexing/searching. +// For each word, it also emits the stemmed variant (if different from the +// original) so the index contains both raw and stemmed forms. func tokenize(text string) []string { text = strings.ToLower(text) var words []string var word strings.Builder + emit := func(w string) { + if len(w) < 2 { + return + } + words = append(words, w) + if s := stem(w); s != w { + words = append(words, s) + } + } + for _, r := range text { if unicode.IsLetter(r) || unicode.IsDigit(r) { word.WriteRune(r) } else if word.Len() > 0 { - w := word.String() - if len(w) >= 2 { // Skip single-character words - words = append(words, w) - } + emit(word.String()) word.Reset() } } // Don't forget the last word - if word.Len() >= 2 { - words = append(words, word.String()) + if word.Len() > 0 { + emit(word.String()) } return words diff --git a/stemmer.go b/stemmer.go new file mode 100644 index 0000000..c24dc70 --- /dev/null +++ b/stemmer.go @@ -0,0 +1,93 @@ +// SPDX-Licence-Identifier: EUPL-1.2 +package help + +import "strings" + +// stem performs lightweight Porter-style suffix stripping on an English word. +// Words shorter than 4 characters are returned unchanged. The result is +// guaranteed to be at least 2 characters long. 
+// +// This is intentionally NOT the full Porter algorithm — it covers only the +// most impactful suffix rules for a help-catalog search context. +func stem(word string) string { + if len(word) < 4 { + return word + } + + s := word + + // Step 1: plurals and verb inflections. + s = stemInflectional(s) + + // Step 2: derivational suffixes (longest match first). + s = stemDerivational(s) + + // Guard: result must be at least 2 characters. + if len(s) < 2 { + return word + } + + return s +} + +// stemInflectional handles plurals and -ed/-ing verb forms. +func stemInflectional(s string) string { + switch { + case strings.HasSuffix(s, "sses"): + return s[:len(s)-2] // -sses → -ss + case strings.HasSuffix(s, "ies"): + return s[:len(s)-2] // -ies → -i + case strings.HasSuffix(s, "eed"): + return s[:len(s)-1] // -eed → -ee + case strings.HasSuffix(s, "ing"): + r := s[:len(s)-3] + if len(r) >= 2 { + return r + } + case strings.HasSuffix(s, "ed"): + r := s[:len(s)-2] + if len(r) >= 2 { + return r + } + case strings.HasSuffix(s, "s") && !strings.HasSuffix(s, "ss"): + return s[:len(s)-1] // -s → "" (but not -ss) + } + return s +} + +// stemDerivational strips common derivational suffixes. +// Ordered longest-first so we match the most specific rule. +func stemDerivational(s string) string { + // Longest suffixes first (7 chars down to 4). 
+ type rule struct { + suffix string + replacement string + } + + rules := []rule{ + {"fulness", "ful"}, // -fulness → -ful + {"ational", "ate"}, // -ational → -ate + {"tional", "tion"}, // -tional → -tion + {"ously", "ous"}, // -ously → -ous + {"ively", "ive"}, // -ively → -ive + {"ingly", "ing"}, // -ingly → -ing + {"ation", "ate"}, // -ation → -ate + {"ness", ""}, // -ness → "" + {"ment", ""}, // -ment → "" + {"ably", "able"}, // -ably → -able + {"ally", "al"}, // -ally → -al + {"izer", "ize"}, // -izer → -ize + } + + for _, r := range rules { + if strings.HasSuffix(s, r.suffix) { + result := s[:len(s)-len(r.suffix)] + r.replacement + if len(result) >= 2 { + return result + } + return s // Guard: don't over-strip + } + } + + return s +} diff --git a/stemmer_test.go b/stemmer_test.go new file mode 100644 index 0000000..43399cc --- /dev/null +++ b/stemmer_test.go @@ -0,0 +1,222 @@ +// SPDX-Licence-Identifier: EUPL-1.2 +package help + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// --------------------------------------------------------------------------- +// stem() unit tests +// --------------------------------------------------------------------------- + +func TestStem_Good(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + // Step 1: plurals and verb inflections + {name: "sses to ss", input: "addresses", expected: "address"}, + {name: "ies to i", input: "eries", expected: "eri"}, + {name: "ies to i (ponies)", input: "ponies", expected: "poni"}, + {name: "eed to ee", input: "agreed", expected: "agree"}, + {name: "ed removed", input: "configured", expected: "configur"}, + {name: "ing removed", input: "running", expected: "runn"}, + {name: "ing removed (testing)", input: "testing", expected: "test"}, + {name: "s removed (servers)", input: "servers", expected: "server"}, + {name: "s removed then derivational (configurations)", input: "configurations", expected: 
"configurate"}, + {name: "ss unchanged", input: "boss", expected: "boss"}, + + // Step 2: derivational suffixes + {name: "ational to ate", input: "configurational", expected: "configurate"}, + {name: "tional to tion", input: "nutritional", expected: "nutrition"}, + {name: "fulness to ful", input: "cheerfulness", expected: "cheerful"}, + {name: "ness removed", input: "darkness", expected: "dark"}, + {name: "ment removed", input: "deployment", expected: "deploy"}, + {name: "ation to ate", input: "configuration", expected: "configurate"}, + {name: "ously to ous", input: "dangerously", expected: "dangerous"}, + {name: "ively to ive", input: "effectively", expected: "effective"}, + {name: "ably to able", input: "comfortably", expected: "comfortable"}, + {name: "ally to al", input: "manually", expected: "manual"}, + {name: "izer to ize", input: "organizer", expected: "organize"}, + {name: "ingly to ing", input: "surprisingly", expected: "surprising"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := stem(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestStem_ShortWordsUnchanged(t *testing.T) { + tests := []struct { + name string + input string + }{ + {name: "single char", input: "a"}, + {name: "two chars", input: "go"}, + {name: "three chars", input: "run"}, + {name: "three chars (the)", input: "the"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.input, stem(tt.input), "words under 4 chars should be unchanged") + }) + } +} + +func TestStem_GuardMinLength(t *testing.T) { + // The stem function must never reduce a word below 2 characters. + // "ed" removal from a 4-char word like "abed" would leave "ab" (ok). + // We test that it doesn't return a single-char result. 
+ result := stem("abed") + assert.GreaterOrEqual(t, len(result), 2, "result must be at least 2 chars") +} + +// --------------------------------------------------------------------------- +// Search integration tests — stemming recall +// --------------------------------------------------------------------------- + +func TestSearch_StemRunningMatchesRun(t *testing.T) { + idx := newSearchIndex() + idx.Add(&Topic{ + ID: "topic-run", + Title: "How to Run Commands", + Content: "You can run any command from the terminal.", + }) + + results := idx.Search("running") + require.NotEmpty(t, results, "searching 'running' should match topic containing 'run'") + assert.Equal(t, "topic-run", results[0].Topic.ID) +} + +func TestSearch_StemConfigurationsMatchesConfigure(t *testing.T) { + idx := newSearchIndex() + idx.Add(&Topic{ + ID: "topic-configure", + Title: "Configure Your Application", + Content: "Learn how to configure settings for your application.", + }) + + results := idx.Search("configurations") + require.NotEmpty(t, results, "searching 'configurations' should match topic containing 'configure'") + assert.Equal(t, "topic-configure", results[0].Topic.ID) +} + +func TestSearch_StemPluralServersMatchesServer(t *testing.T) { + idx := newSearchIndex() + idx.Add(&Topic{ + ID: "topic-server", + Title: "Server Management", + Content: "Manage your server with these tools.", + }) + + results := idx.Search("servers") + require.NotEmpty(t, results, "searching 'servers' should match topic containing 'server'") + assert.Equal(t, "topic-server", results[0].Topic.ID) +} + +func TestSearch_StemScoringLowerThanExact(t *testing.T) { + idx := newSearchIndex() + idx.Add(&Topic{ + ID: "exact-match", + Title: "Running Guide", + Content: "Guide to running applications.", + }) + idx.Add(&Topic{ + ID: "stem-match", + Title: "How to Run", + Content: "Run your application.", + }) + + results := idx.Search("running") + require.Len(t, results, 2, "should match both topics") + + // The topic 
containing the exact word "running" should score higher + // than the one that only contains "run" (all else being equal, + // scoreExactWord > scoreStemWord). Note: stem("running") is "runn", not "run". + var exactScore, stemScore float64 + for _, r := range results { + if r.Topic.ID == "exact-match" { + exactScore = r.Score + } + if r.Topic.ID == "stem-match" { + stemScore = r.Score + } + } + assert.Greater(t, exactScore, stemScore, + "exact word match should score higher than stem-only match") +} + +func TestSearch_ExistingExactMatchUnaffected(t *testing.T) { + // Ensure stemming doesn't break exact-match searches. + idx := newSearchIndex() + idx.Add(&Topic{ + ID: "topic-deploy", + Title: "Deploy Guide", + Content: "How to deploy your application step by step.", + }) + + results := idx.Search("deploy") + require.NotEmpty(t, results) + assert.Equal(t, "topic-deploy", results[0].Topic.ID) +} + +func TestTokenize_IncludesStemmedVariants(t *testing.T) { + words := tokenize("running configurations servers") + + // Should contain originals + assert.Contains(t, words, "running") + assert.Contains(t, words, "configurations") + assert.Contains(t, words, "servers") + + // Should also contain stems + assert.Contains(t, words, "runn") // stem of running (ing removed) + assert.Contains(t, words, "configurate") // stem of configurations (s->configuration->ation->ate) + assert.Contains(t, words, "server") // stem of servers (s removed) +} + +// --------------------------------------------------------------------------- +// Benchmark +// --------------------------------------------------------------------------- + +func BenchmarkStem(b *testing.B) { + words := []string{ + "running", "configurations", "servers", "deployment", "testing", + "addresses", "agreed", "configured", "operational", "cheerfulness", + "darkness", "dangerously", "effectively", "comfortably", "manually", + "organizer", "surprisingly", "configuration", "authentication", + "authorisation", "networking", "monitoring", "scheduling", "routing", + 
"migration", "encryption", "compression", "validation", "serialisation", + "templating", "distributed", "federated", "graceful", "hybrid", + "incremental", "advanced", "basic", "custom", "encrypted", "install", + "configure", "deploy", "monitor", "debug", "authenticate", "authorise", + "connect", "store", "analyse", "cache", "schedule", "route", "migrate", + "restore", "help", "guide", "overview", "setup", "troubleshooting", + "performance", "benchmark", "analysis", "documentation", "reference", + "tutorial", "quickstart", "installation", "requirements", "dependencies", + "modules", "packages", "services", "workers", "processes", "threads", + "connections", "sessions", "transactions", "queries", "responses", + "requests", "handlers", "middleware", "controllers", "models", + "views", "templates", "layouts", "components", "widgets", "plugins", + "extensions", "integrations", "providers", "factories", "builders", + "adapters", "decorators", "observers", "listeners", "subscribers", + "publishers", "dispatchers", "resolvers", "transformers", "formatters", + "validators", "sanitizers", "parsers", "compilers", "interpreters", + } + + b.ReportAllocs() + b.ResetTimer() + + for b.Loop() { + for _, w := range words { + stem(w) + } + } +}