Implement lightweight Porter-style suffix stripping in stemmer.go covering plurals (-sses, -ies, -s), verb forms (-ed, -ing, -eed), and derivational suffixes (-ational, -tional, -fulness, -ness, -ment, -ation, -ously, -ively, -ably, -ally, -izer, -ingly). Words under 4 chars are unchanged and results are guaranteed at least 2 chars. tokenize() now emits both raw and stemmed forms so the index contains both. Search() distinguishes stem-only matches (scoreStemWord=0.7) from exact matches (1.0), keeping stemmed results slightly below raw hits. 22 stem unit tests, 5 search integration tests, and BenchmarkStem with 100 words. All existing tests pass with no regressions. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
93 lines
2.4 KiB
Go
93 lines
2.4 KiB
Go
// SPDX-License-Identifier: EUPL-1.2
|
|
package help
|
|
|
|
import (
	"strings"
	"unicode/utf8"
)
|
|
|
|
// stem performs lightweight Porter-style suffix stripping on an English word.
|
|
// Words shorter than 4 characters are returned unchanged. The result is
|
|
// guaranteed to be at least 2 characters long.
|
|
//
|
|
// This is intentionally NOT the full Porter algorithm — it covers only the
|
|
// most impactful suffix rules for a help-catalog search context.
|
|
func stem(word string) string {
|
|
if len(word) < 4 {
|
|
return word
|
|
}
|
|
|
|
s := word
|
|
|
|
// Step 1: plurals and verb inflections.
|
|
s = stemInflectional(s)
|
|
|
|
// Step 2: derivational suffixes (longest match first).
|
|
s = stemDerivational(s)
|
|
|
|
// Guard: result must be at least 2 characters.
|
|
if len(s) < 2 {
|
|
return word
|
|
}
|
|
|
|
return s
|
|
}
|
|
|
|
// stemInflectional strips one inflectional ending from s: a plural
// (-sses, -ies, -s) or a verb form (-eed, -ing, -ed).
//
// Only the first matching rule is applied. The -ing and -ed rules are
// skipped when fewer than 2 bytes would remain, in which case s is returned
// as is. A word ending in -ss keeps its final s.
func stemInflectional(s string) string {
	n := len(s)

	// -sses → -ss and -ies → -i both drop the last two bytes.
	if strings.HasSuffix(s, "sses") || strings.HasSuffix(s, "ies") {
		return s[:n-2]
	}

	// -eed → -ee; checked before -ed so "agreed" keeps its double e.
	if strings.HasSuffix(s, "eed") {
		return s[:n-1]
	}

	// -ing → "", unless the remainder would be shorter than 2 bytes.
	if strings.HasSuffix(s, "ing") {
		if n-3 < 2 {
			return s
		}
		return s[:n-3]
	}

	// -ed → "", with the same minimum-remainder rule.
	if strings.HasSuffix(s, "ed") {
		if n-2 < 2 {
			return s
		}
		return s[:n-2]
	}

	// Plain plural -s → "", leaving -ss endings alone.
	if strings.HasSuffix(s, "s") && !strings.HasSuffix(s, "ss") {
		return s[:n-1]
	}

	return s
}
|
|
|
|
// stemDerivational rewrites the first derivational suffix that matches s.
//
// Candidates are listed in order of non-increasing length, so a longer
// suffix such as "ational" is tried before the shorter "tional" it ends
// with. Only the first match is applied. When the rewrite would leave fewer
// than 2 bytes, s is returned unchanged.
func stemDerivational(s string) string {
	// Each entry is {suffix, replacement}.
	table := [...][2]string{
		{"fulness", "ful"},
		{"ational", "ate"},
		{"tional", "tion"},
		{"ously", "ous"},
		{"ively", "ive"},
		{"ingly", "ing"},
		{"ation", "ate"},
		{"ness", ""},
		{"ment", ""},
		{"ably", "able"},
		{"ally", "al"},
		{"izer", "ize"},
	}

	for i := range table {
		suffix, replacement := table[i][0], table[i][1]

		// Every suffix is non-empty, so an unchanged length means no match.
		base := strings.TrimSuffix(s, suffix)
		if len(base) == len(s) {
			continue
		}

		if stemmed := base + replacement; len(stemmed) >= 2 {
			return stemmed
		}
		// Guard: don't over-strip; the first match ends the search.
		return s
	}

	return s
}
|