feat(reversal): add noun matching to Tokeniser
Inverse noun lookup: JSON grammar data → irregular nouns → regular morphology rules. Round-trip verified via forward PluralForm(). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f1aa4adbc4
commit
786909c193
2 changed files with 179 additions and 0 deletions
|
|
@ -65,6 +65,7 @@ func NewTokeniserForLang(lang string) *Tokeniser {
|
|||
lang: lang,
|
||||
}
|
||||
t.buildVerbIndex()
|
||||
t.buildNounIndex()
|
||||
return t
|
||||
}
|
||||
|
||||
|
|
@ -102,6 +103,107 @@ func (t *Tokeniser) buildVerbIndex() {
|
|||
}
|
||||
}
|
||||
|
||||
// buildNounIndex reads grammar tables and irregular noun maps to build
|
||||
// inverse lookup maps: plural form → base form.
|
||||
func (t *Tokeniser) buildNounIndex() {
|
||||
// Tier 1: Read from JSON grammar data (via GetGrammarData).
|
||||
data := i18n.GetGrammarData(t.lang)
|
||||
if data != nil && data.Nouns != nil {
|
||||
for base, forms := range data.Nouns {
|
||||
t.baseNouns[base] = true
|
||||
if forms.Other != "" && forms.Other != base {
|
||||
t.pluralToBase[forms.Other] = base
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tier 2: Read from the exported irregularNouns map.
|
||||
for base, plural := range i18n.IrregularNouns() {
|
||||
t.baseNouns[base] = true
|
||||
if plural != base {
|
||||
if _, exists := t.pluralToBase[plural]; !exists {
|
||||
t.pluralToBase[plural] = base
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MatchNoun performs a 3-tier reverse lookup for a noun form.
|
||||
//
|
||||
// Tier 1: Check if the word is a known base noun.
|
||||
// Tier 2: Check the pluralToBase inverse map.
|
||||
// Tier 3: Try reverse morphology rules and round-trip verify via
|
||||
// the forward function PluralForm().
|
||||
func (t *Tokeniser) MatchNoun(word string) (NounMatch, bool) {
|
||||
word = strings.ToLower(strings.TrimSpace(word))
|
||||
if word == "" {
|
||||
return NounMatch{}, false
|
||||
}
|
||||
|
||||
// Tier 1: Is it a base noun?
|
||||
if t.baseNouns[word] {
|
||||
return NounMatch{Base: word, Plural: false, Form: word}, true
|
||||
}
|
||||
|
||||
// Tier 2: Check inverse map from grammar tables + irregular nouns.
|
||||
if base, ok := t.pluralToBase[word]; ok {
|
||||
return NounMatch{Base: base, Plural: true, Form: word}, true
|
||||
}
|
||||
|
||||
// Tier 3: Reverse morphology with round-trip verification.
|
||||
candidates := t.reverseRegularPlural(word)
|
||||
for _, c := range candidates {
|
||||
if i18n.PluralForm(c) == word {
|
||||
return NounMatch{Base: c, Plural: true, Form: word}, true
|
||||
}
|
||||
}
|
||||
|
||||
return NounMatch{}, false
|
||||
}
|
||||
|
||||
// reverseRegularPlural generates candidate base forms by reversing regular
|
||||
// plural suffixes. Returns multiple candidates ordered by likelihood.
|
||||
//
|
||||
// The forward engine applies rules in this order:
|
||||
// 1. ends in s/ss/sh/ch/x/z → +es
|
||||
// 2. ends in consonant+y → ies
|
||||
// 3. ends in f → ves, fe → ves
|
||||
// 4. default → +s
|
||||
//
|
||||
// We generate candidates for each possible reverse rule. Round-trip
|
||||
// verification ensures only correct candidates pass.
|
||||
func (t *Tokeniser) reverseRegularPlural(word string) []string {
|
||||
var candidates []string
|
||||
|
||||
// Rule: consonant + "ies" → consonant + "y" (e.g., "entries" → "entry")
|
||||
if strings.HasSuffix(word, "ies") && len(word) > 3 {
|
||||
base := word[:len(word)-3] + "y"
|
||||
candidates = append(candidates, base)
|
||||
}
|
||||
|
||||
// Rule: "ves" → "f" or "fe" (e.g., "wolves" → "wolf", "knives" → "knife")
|
||||
if strings.HasSuffix(word, "ves") && len(word) > 3 {
|
||||
candidates = append(candidates, word[:len(word)-3]+"f")
|
||||
candidates = append(candidates, word[:len(word)-3]+"fe")
|
||||
}
|
||||
|
||||
// Rule: sibilant + "es" (e.g., "processes" → "process", "branches" → "branch")
|
||||
if strings.HasSuffix(word, "ses") || strings.HasSuffix(word, "xes") ||
|
||||
strings.HasSuffix(word, "zes") || strings.HasSuffix(word, "ches") ||
|
||||
strings.HasSuffix(word, "shes") {
|
||||
base := word[:len(word)-2] // strip "es"
|
||||
candidates = append(candidates, base)
|
||||
}
|
||||
|
||||
// Rule: drop "s" (e.g., "servers" → "server")
|
||||
if strings.HasSuffix(word, "s") && len(word) > 1 {
|
||||
base := word[:len(word)-1]
|
||||
candidates = append(candidates, base)
|
||||
}
|
||||
|
||||
return candidates
|
||||
}
|
||||
|
||||
// MatchVerb performs a 3-tier reverse lookup for a verb form.
|
||||
//
|
||||
// Tier 1: Check if the word is a known base verb.
|
||||
|
|
|
|||
|
|
@ -69,6 +69,83 @@ func TestTokeniser_MatchVerb_Irregular(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_MatchNoun_Irregular(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
|
||||
tests := []struct {
|
||||
word string
|
||||
wantOK bool
|
||||
wantBase string
|
||||
wantPlural bool
|
||||
}{
|
||||
{"files", true, "file", true},
|
||||
{"file", true, "file", false},
|
||||
{"people", true, "person", true},
|
||||
{"person", true, "person", false},
|
||||
{"children", true, "child", true},
|
||||
{"child", true, "child", false},
|
||||
{"repositories", true, "repository", true},
|
||||
{"repository", true, "repository", false},
|
||||
{"branches", true, "branch", true},
|
||||
{"branch", true, "branch", false},
|
||||
{"xyzzy", false, "", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.word, func(t *testing.T) {
|
||||
match, ok := tok.MatchNoun(tt.word)
|
||||
if ok != tt.wantOK {
|
||||
t.Fatalf("MatchNoun(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
|
||||
}
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if match.Base != tt.wantBase {
|
||||
t.Errorf("MatchNoun(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
|
||||
}
|
||||
if match.Plural != tt.wantPlural {
|
||||
t.Errorf("MatchNoun(%q).Plural = %v, want %v", tt.word, match.Plural, tt.wantPlural)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_MatchNoun_Regular(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
|
||||
tests := []struct {
|
||||
word string
|
||||
wantOK bool
|
||||
wantBase string
|
||||
wantPlural bool
|
||||
}{
|
||||
// Regular nouns NOT in grammar tables — detected by reverse morphology + round-trip
|
||||
{"servers", true, "server", true},
|
||||
{"processes", true, "process", true},
|
||||
{"entries", true, "entry", true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.word, func(t *testing.T) {
|
||||
match, ok := tok.MatchNoun(tt.word)
|
||||
if ok != tt.wantOK {
|
||||
t.Fatalf("MatchNoun(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
|
||||
}
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if match.Base != tt.wantBase {
|
||||
t.Errorf("MatchNoun(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
|
||||
}
|
||||
if match.Plural != tt.wantPlural {
|
||||
t.Errorf("MatchNoun(%q).Plural = %v, want %v", tt.word, match.Plural, tt.wantPlural)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTokeniser_MatchVerb_Regular(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue