feat(search): add English stemmer for improved search recall
Implement lightweight Porter-style suffix stripping in stemmer.go covering plurals (-sses, -ies, -s), verb forms (-ed, -ing, -eed), and derivational suffixes (-ational, -tional, -fulness, -ness, -ment, -ation, -ously, -ively, -ably, -ally, -izer, -ingly). Words under 4 chars are unchanged and results are guaranteed at least 2 chars. tokenize() now emits both raw and stemmed forms so the index contains both. Search() distinguishes stem-only matches (scoreStemWord=0.7) from exact matches (1.0), keeping stemmed results slightly below raw hits. 22 stem unit tests, 5 search integration tests, and BenchmarkStem with 100 words. All existing tests pass with no regressions. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2cca8d5656
commit
fc758a832b
4 changed files with 348 additions and 9 deletions
2
TODO.md
2
TODO.md
|
|
@ -16,7 +16,7 @@ Dispatched from core/go orchestration. Pick up tasks in order.
|
|||
## Phase 1: Search Improvements
|
||||
|
||||
- [x] **Fuzzy matching** -- Levenshtein distance with max edit distance of 2. Words under 3 chars skip fuzzy. Score: +0.3 per fuzzy match (below prefix +0.5 and exact +1.0).
|
||||
- [ ] **English stemming** — Add a lightweight Porter-style stemmer for English search terms. Pure Go, no external deps.
|
||||
- [x] **English stemming** — Add a lightweight Porter-style stemmer for English search terms. Pure Go, no external deps.
|
||||
- **Create `stemmer.go`** — Implement `stem(word string) string` covering the most impactful English suffix rules:
|
||||
- Step 1: Plurals and -ed/-ing forms (`-sses` → `-ss`, `-ies` → `-i`, `-s` → `""`, `-eed` → `-ee`, `-ed` → `""`, `-ing` → `""`)
|
||||
- Step 2: Derivational suffixes (`-ational` → `-ate`, `-tional` → `-tion`, `-fulness` → `-ful`, `-ness` → `""`, `-ment` → `""`, `-ation` → `-ate`, `-ously` → `-ous`, `-ively` → `-ive`, `-ably` → `-able`, `-ally` → `-al`, `-izer` → `-ize`, `-ingly` → `-ing`)
|
||||
|
|
|
|||
40
search.go
40
search.go
|
|
@ -12,6 +12,7 @@ const (
|
|||
scoreExactWord = 1.0 // Exact word match in the index
|
||||
scorePrefixWord = 0.5 // Prefix/partial word match
|
||||
scoreFuzzyWord = 0.3 // Fuzzy (Levenshtein) match
|
||||
scoreStemWord = 0.7 // Stemmed word match (between exact and prefix)
|
||||
scoreTitleBoost = 10.0 // Query word appears in topic title
|
||||
scoreSectionBoost = 5.0 // Query word appears in section title
|
||||
scoreTagBoost = 3.0 // Query word appears in topic tags
|
||||
|
|
@ -102,11 +103,25 @@ func (i *searchIndex) Search(query string) []*SearchResult {
|
|||
// Track scores per topic
|
||||
scores := make(map[string]float64)
|
||||
|
||||
// Build set of stemmed query variants for stem-aware scoring.
|
||||
stemmedWords := make(map[string]bool)
|
||||
for _, word := range queryWords {
|
||||
// Collect stemmed variants that differ from the raw query word.
|
||||
if s := stem(word); s != word {
|
||||
stemmedWords[s] = true
|
||||
}
|
||||
}
|
||||
|
||||
for _, word := range queryWords {
|
||||
isStem := stemmedWords[word]
|
||||
|
||||
// Exact matches — score stems lower than raw words.
|
||||
if topicIDs, ok := i.index[word]; ok {
|
||||
sc := scoreExactWord
|
||||
if isStem {
|
||||
sc = scoreStemWord
|
||||
}
|
||||
for _, topicID := range topicIDs {
|
||||
scores[topicID] += scoreExactWord
|
||||
scores[topicID] += sc
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -363,26 +378,35 @@ func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string, res []*re
|
|||
}
|
||||
|
||||
// tokenize splits text into lowercase words for indexing/searching.
|
||||
// For each word, it also emits the stemmed variant (if different from the
|
||||
// original) so the index contains both raw and stemmed forms.
|
||||
func tokenize(text string) []string {
|
||||
text = strings.ToLower(text)
|
||||
var words []string
|
||||
var word strings.Builder
|
||||
|
||||
emit := func(w string) {
|
||||
if len(w) < 2 {
|
||||
return
|
||||
}
|
||||
words = append(words, w)
|
||||
if s := stem(w); s != w {
|
||||
words = append(words, s)
|
||||
}
|
||||
}
|
||||
|
||||
for _, r := range text {
|
||||
if unicode.IsLetter(r) || unicode.IsDigit(r) {
|
||||
word.WriteRune(r)
|
||||
} else if word.Len() > 0 {
|
||||
w := word.String()
|
||||
if len(w) >= 2 { // Skip single-character words
|
||||
words = append(words, w)
|
||||
}
|
||||
emit(word.String())
|
||||
word.Reset()
|
||||
}
|
||||
}
|
||||
|
||||
// Don't forget the last word
|
||||
if word.Len() >= 2 {
|
||||
words = append(words, word.String())
|
||||
if word.Len() > 0 {
|
||||
emit(word.String())
|
||||
}
|
||||
|
||||
return words
|
||||
|
|
|
|||
93
stemmer.go
Normal file
93
stemmer.go
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
// SPDX-License-Identifier: EUPL-1.2
|
||||
package help
|
||||
|
||||
import "strings"
|
||||
|
||||
// stem performs lightweight Porter-style suffix stripping on an English word.
|
||||
// Words shorter than 4 characters are returned unchanged. The result is
|
||||
// guaranteed to be at least 2 characters long.
|
||||
//
|
||||
// This is intentionally NOT the full Porter algorithm — it covers only the
|
||||
// most impactful suffix rules for a help-catalog search context.
|
||||
func stem(word string) string {
|
||||
if len(word) < 4 {
|
||||
return word
|
||||
}
|
||||
|
||||
s := word
|
||||
|
||||
// Step 1: plurals and verb inflections.
|
||||
s = stemInflectional(s)
|
||||
|
||||
// Step 2: derivational suffixes (longest match first).
|
||||
s = stemDerivational(s)
|
||||
|
||||
// Guard: result must be at least 2 characters.
|
||||
if len(s) < 2 {
|
||||
return word
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// stemInflectional handles plurals and -ed/-ing verb forms (step 1).
// Rules that would leave fewer than 2 characters are skipped.
func stemInflectional(s string) string {
	if strings.HasSuffix(s, "sses") {
		return s[:len(s)-2] // -sses → -ss
	}
	if strings.HasSuffix(s, "ies") {
		return s[:len(s)-2] // -ies → -i
	}
	if strings.HasSuffix(s, "eed") {
		return s[:len(s)-1] // -eed → -ee
	}
	if strings.HasSuffix(s, "ing") {
		// Strip -ing only when enough of the word remains.
		if rest := s[:len(s)-3]; len(rest) >= 2 {
			return rest
		}
		return s
	}
	if strings.HasSuffix(s, "ed") {
		// Strip -ed only when enough of the word remains.
		if rest := s[:len(s)-2]; len(rest) >= 2 {
			return rest
		}
		return s
	}
	if strings.HasSuffix(s, "s") && !strings.HasSuffix(s, "ss") {
		return s[:len(s)-1] // bare plural -s, but never -ss
	}
	return s
}
|
||||
|
||||
// stemDerivational strips common derivational suffixes (step 2).
// Rules are ordered longest suffix first so the most specific one wins;
// only the first matching rule is applied.
func stemDerivational(s string) string {
	// {suffix, replacement} pairs, longest suffix (7 chars) first.
	rules := [][2]string{
		{"fulness", "ful"},
		{"ational", "ate"},
		{"tional", "tion"},
		{"ously", "ous"},
		{"ively", "ive"},
		{"ingly", "ing"},
		{"ation", "ate"},
		{"ness", ""},
		{"ment", ""},
		{"ably", "able"},
		{"ally", "al"},
		{"izer", "ize"},
	}

	for _, rule := range rules {
		suffix, replacement := rule[0], rule[1]
		if !strings.HasSuffix(s, suffix) {
			continue
		}
		// Refuse to over-strip: the result must keep at least 2 chars.
		if candidate := strings.TrimSuffix(s, suffix) + replacement; len(candidate) >= 2 {
			return candidate
		}
		return s
	}

	return s
}
|
||||
222
stemmer_test.go
Normal file
222
stemmer_test.go
Normal file
|
|
@ -0,0 +1,222 @@
|
|||
// SPDX-License-Identifier: EUPL-1.2
|
||||
package help
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// stem() unit tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestStem_Good(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
// Step 1: plurals and verb inflections
|
||||
{name: "sses to ss", input: "addresses", expected: "address"},
|
||||
{name: "ies to i", input: "eries", expected: "eri"},
|
||||
{name: "ies to i (ponies)", input: "ponies", expected: "poni"},
|
||||
{name: "eed to ee", input: "agreed", expected: "agree"},
|
||||
{name: "ed removed", input: "configured", expected: "configur"},
|
||||
{name: "ing removed", input: "running", expected: "runn"},
|
||||
{name: "ing removed (testing)", input: "testing", expected: "test"},
|
||||
{name: "s removed (servers)", input: "servers", expected: "server"},
|
||||
{name: "s removed then derivational (configurations)", input: "configurations", expected: "configurate"},
|
||||
{name: "ss unchanged", input: "boss", expected: "boss"},
|
||||
|
||||
// Step 2: derivational suffixes
|
||||
{name: "ational to ate", input: "configurational", expected: "configurate"},
|
||||
{name: "tional to tion", input: "nutritional", expected: "nutrition"},
|
||||
{name: "fulness to ful", input: "cheerfulness", expected: "cheerful"},
|
||||
{name: "ness removed", input: "darkness", expected: "dark"},
|
||||
{name: "ment removed", input: "deployment", expected: "deploy"},
|
||||
{name: "ation to ate", input: "configuration", expected: "configurate"},
|
||||
{name: "ously to ous", input: "dangerously", expected: "dangerous"},
|
||||
{name: "ively to ive", input: "effectively", expected: "effective"},
|
||||
{name: "ably to able", input: "comfortably", expected: "comfortable"},
|
||||
{name: "ally to al", input: "manually", expected: "manual"},
|
||||
{name: "izer to ize", input: "organizer", expected: "organize"},
|
||||
{name: "ingly to ing", input: "surprisingly", expected: "surprising"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := stem(tt.input)
|
||||
assert.Equal(t, tt.expected, result)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestStem_ShortWordsUnchanged(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
}{
|
||||
{name: "single char", input: "a"},
|
||||
{name: "two chars", input: "go"},
|
||||
{name: "three chars", input: "run"},
|
||||
{name: "three chars (the)", input: "the"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.Equal(t, tt.input, stem(tt.input), "words under 4 chars should be unchanged")
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestStem_GuardMinLength(t *testing.T) {
|
||||
// The stem function must never reduce a word below 2 characters.
|
||||
// "ed" removal from a 4-char word like "abed" would leave "ab" (ok).
|
||||
// We test that it doesn't return a single-char result.
|
||||
result := stem("abed")
|
||||
assert.GreaterOrEqual(t, len(result), 2, "result must be at least 2 chars")
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Search integration tests — stemming recall
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestSearch_StemRunningMatchesRun(t *testing.T) {
|
||||
idx := newSearchIndex()
|
||||
idx.Add(&Topic{
|
||||
ID: "topic-run",
|
||||
Title: "How to Run Commands",
|
||||
Content: "You can run any command from the terminal.",
|
||||
})
|
||||
|
||||
results := idx.Search("running")
|
||||
require.NotEmpty(t, results, "searching 'running' should match topic containing 'run'")
|
||||
assert.Equal(t, "topic-run", results[0].Topic.ID)
|
||||
}
|
||||
|
||||
func TestSearch_StemConfigurationsMatchesConfigure(t *testing.T) {
|
||||
idx := newSearchIndex()
|
||||
idx.Add(&Topic{
|
||||
ID: "topic-configure",
|
||||
Title: "Configure Your Application",
|
||||
Content: "Learn how to configure settings for your application.",
|
||||
})
|
||||
|
||||
results := idx.Search("configurations")
|
||||
require.NotEmpty(t, results, "searching 'configurations' should match topic containing 'configure'")
|
||||
assert.Equal(t, "topic-configure", results[0].Topic.ID)
|
||||
}
|
||||
|
||||
func TestSearch_StemPluralServersMatchesServer(t *testing.T) {
|
||||
idx := newSearchIndex()
|
||||
idx.Add(&Topic{
|
||||
ID: "topic-server",
|
||||
Title: "Server Management",
|
||||
Content: "Manage your server with these tools.",
|
||||
})
|
||||
|
||||
results := idx.Search("servers")
|
||||
require.NotEmpty(t, results, "searching 'servers' should match topic containing 'server'")
|
||||
assert.Equal(t, "topic-server", results[0].Topic.ID)
|
||||
}
|
||||
|
||||
func TestSearch_StemScoringLowerThanExact(t *testing.T) {
|
||||
idx := newSearchIndex()
|
||||
idx.Add(&Topic{
|
||||
ID: "exact-match",
|
||||
Title: "Running Guide",
|
||||
Content: "Guide to running applications.",
|
||||
})
|
||||
idx.Add(&Topic{
|
||||
ID: "stem-match",
|
||||
Title: "How to Run",
|
||||
Content: "Run your application.",
|
||||
})
|
||||
|
||||
results := idx.Search("running")
|
||||
require.Len(t, results, 2, "should match both topics")
|
||||
|
||||
// The topic containing the exact word "running" should score higher
|
||||
// than the one matched only via the stem "run" (all else being equal,
|
||||
// scoreExactWord > scoreStemWord).
|
||||
var exactScore, stemScore float64
|
||||
for _, r := range results {
|
||||
if r.Topic.ID == "exact-match" {
|
||||
exactScore = r.Score
|
||||
}
|
||||
if r.Topic.ID == "stem-match" {
|
||||
stemScore = r.Score
|
||||
}
|
||||
}
|
||||
assert.Greater(t, exactScore, stemScore,
|
||||
"exact word match should score higher than stem-only match")
|
||||
}
|
||||
|
||||
func TestSearch_ExistingExactMatchUnaffected(t *testing.T) {
|
||||
// Ensure stemming doesn't break exact-match searches.
|
||||
idx := newSearchIndex()
|
||||
idx.Add(&Topic{
|
||||
ID: "topic-deploy",
|
||||
Title: "Deploy Guide",
|
||||
Content: "How to deploy your application step by step.",
|
||||
})
|
||||
|
||||
results := idx.Search("deploy")
|
||||
require.NotEmpty(t, results)
|
||||
assert.Equal(t, "topic-deploy", results[0].Topic.ID)
|
||||
}
|
||||
|
||||
func TestTokenize_IncludesStemmedVariants(t *testing.T) {
|
||||
words := tokenize("running configurations servers")
|
||||
|
||||
// Should contain originals
|
||||
assert.Contains(t, words, "running")
|
||||
assert.Contains(t, words, "configurations")
|
||||
assert.Contains(t, words, "servers")
|
||||
|
||||
// Should also contain stems
|
||||
assert.Contains(t, words, "runn") // stem of running (ing removed)
|
||||
assert.Contains(t, words, "configurate") // stem of configurations (s->configuration->ation->ate)
|
||||
assert.Contains(t, words, "server") // stem of servers (s removed)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Benchmark
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func BenchmarkStem(b *testing.B) {
|
||||
words := []string{
|
||||
"running", "configurations", "servers", "deployment", "testing",
|
||||
"addresses", "agreed", "configured", "operational", "cheerfulness",
|
||||
"darkness", "dangerously", "effectively", "comfortably", "manually",
|
||||
"organizer", "surprisingly", "configuration", "authentication",
|
||||
"authorisation", "networking", "monitoring", "scheduling", "routing",
|
||||
"migration", "encryption", "compression", "validation", "serialisation",
|
||||
"templating", "distributed", "federated", "graceful", "hybrid",
|
||||
"incremental", "advanced", "basic", "custom", "encrypted", "install",
|
||||
"configure", "deploy", "monitor", "debug", "authenticate", "authorise",
|
||||
"connect", "store", "analyse", "cache", "schedule", "route", "migrate",
|
||||
"restore", "help", "guide", "overview", "setup", "troubleshooting",
|
||||
"performance", "benchmark", "analysis", "documentation", "reference",
|
||||
"tutorial", "quickstart", "installation", "requirements", "dependencies",
|
||||
"modules", "packages", "services", "workers", "processes", "threads",
|
||||
"connections", "sessions", "transactions", "queries", "responses",
|
||||
"requests", "handlers", "middleware", "controllers", "models",
|
||||
"views", "templates", "layouts", "components", "widgets", "plugins",
|
||||
"extensions", "integrations", "providers", "factories", "builders",
|
||||
"adapters", "decorators", "observers", "listeners", "subscribers",
|
||||
"publishers", "dispatchers", "resolvers", "transformers", "formatters",
|
||||
"validators", "sanitizers", "parsers", "compilers", "interpreters",
|
||||
}
|
||||
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
|
||||
for b.Loop() {
|
||||
for _, w := range words {
|
||||
stem(w)
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue