go-help/stemmer.go
Snider fc758a832b feat(search): add English stemmer for improved search recall
Implement lightweight Porter-style suffix stripping in stemmer.go covering
plurals (-sses, -ies, -s), verb forms (-ed, -ing, -eed), and derivational
suffixes (-ational, -tional, -fulness, -ness, -ment, -ation, -ously,
-ively, -ably, -ally, -izer, -ingly). Words under 4 chars are unchanged
and results are guaranteed at least 2 chars.

tokenize() now emits both raw and stemmed forms so the index contains both.
Search() distinguishes stem-only matches (scoreStemWord=0.7) from exact
matches (1.0), keeping stemmed results slightly below raw hits.

22 stem unit tests, 5 search integration tests, and BenchmarkStem with
100 words. All existing tests pass with no regressions.

Co-Authored-By: Virgil <virgil@lethean.io>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 08:07:50 +00:00

93 lines
2.4 KiB
Go

// SPDX-License-Identifier: EUPL-1.2
package help
import "strings"
// stem performs lightweight Porter-style suffix stripping on an English word.
// Words shorter than 4 characters are returned unchanged. The result is
// guaranteed to be at least 2 characters long.
//
// This is intentionally NOT the full Porter algorithm — it covers only the
// most impactful suffix rules for a help-catalog search context.
func stem(word string) string {
if len(word) < 4 {
return word
}
s := word
// Step 1: plurals and verb inflections.
s = stemInflectional(s)
// Step 2: derivational suffixes (longest match first).
s = stemDerivational(s)
// Guard: result must be at least 2 characters.
if len(s) < 2 {
return word
}
return s
}
// stemInflectional strips at most one inflectional ending from s: a plural
// (-sses, -ies, -s) or a verb form (-eed, -ing, -ed). The first matching rule
// in the switch below wins; if none applies, s is returned unchanged.
//
// The -ing and -ed rules only fire when the remainder is at least 2 bytes long
// AND contains a vowel. This mirrors Porter's "stem contains a vowel"
// condition and stops words in which those letters are not really a suffix
// from collapsing into vowel-less fragments ("string" → "str", "thing" → "th",
// and unrelated "bring"/"bred" both → "br").
//
// The remaining rules do not enforce a minimum result length themselves.
func stemInflectional(s string) string {
	switch {
	case strings.HasSuffix(s, "sses"):
		return s[:len(s)-2] // -sses → -ss
	case strings.HasSuffix(s, "ies"):
		return s[:len(s)-2] // -ies → -i
	case strings.HasSuffix(s, "eed"):
		return s[:len(s)-1] // -eed → -ee
	case strings.HasSuffix(s, "ing"):
		if r := s[:len(s)-3]; len(r) >= 2 && stemHasVowel(r) {
			return r
		}
	case strings.HasSuffix(s, "ed"):
		if r := s[:len(s)-2]; len(r) >= 2 && stemHasVowel(r) {
			return r
		}
	case strings.HasSuffix(s, "s") && !strings.HasSuffix(s, "ss"):
		return s[:len(s)-1] // -s → "" (but not -ss)
	}
	return s
}

// stemHasVowel reports whether s contains a, e, i, o or u (either case), or a
// y/Y anywhere except the first byte. A non-initial y either follows a
// consonant (and so acts as a vowel, as in "try") or follows a vowel, in which
// case a vowel is present anyway.
func stemHasVowel(s string) bool {
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case 'a', 'e', 'i', 'o', 'u', 'A', 'E', 'I', 'O', 'U':
			return true
		case 'y', 'Y':
			if i > 0 {
				return true
			}
		}
	}
	return false
}
// stemDerivational rewrites a single derivational suffix of s.
//
// The table below is scanned top to bottom and the first suffix that s ends
// with is the only one considered. Entries are listed longest suffix first, so
// a more specific ending (e.g. -ational) is tried before a shorter one it
// contains (e.g. -tional). If applying the matching entry would leave fewer
// than 2 bytes, s is returned unchanged; no further entries are tried. When
// nothing matches, s is returned as is.
func stemDerivational(s string) string {
	// Each pair is {suffix to remove, text appended in its place}.
	table := [...][2]string{
		{"fulness", "ful"},
		{"ational", "ate"},
		{"tional", "tion"},
		{"ously", "ous"},
		{"ively", "ive"},
		{"ingly", "ing"},
		{"ation", "ate"},
		{"ness", ""},
		{"ment", ""},
		{"ably", "able"},
		{"ally", "al"},
		{"izer", "ize"},
	}

	for i := range table {
		from, to := table[i][0], table[i][1]
		if !strings.HasSuffix(s, from) {
			continue
		}

		stemmed := strings.TrimSuffix(s, from) + to
		if len(stemmed) < 2 {
			// Refuse to over-strip; keep the input untouched.
			return s
		}
		return stemmed
	}
	return s
}