feat(reversal): add noun matching to Tokeniser

Inverse noun lookup: JSON grammar data → irregular nouns → regular morphology rules. Round-trip verified via forward PluralForm().

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 23:18:08 +00:00 · 2026-02-16 23:18:08 +00:00 · 786909c193
commit 786909c193
parent f1aa4adbc4
2 changed files with 179 additions and 0 deletions
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@ -65,6 +65,7 @@ func NewTokeniserForLang(lang string) *Tokeniser {
 		lang:         lang,
 	}
 	t.buildVerbIndex()
+	t.buildNounIndex()
 	return t
 }

@ -102,6 +103,107 @@ func (t *Tokeniser) buildVerbIndex() {
 	}
 }

+// buildNounIndex reads grammar tables and irregular noun maps to build the
+// noun lookups on t: baseNouns (known base forms) and pluralToBase (plural form → base form).
+func (t *Tokeniser) buildNounIndex() {
+	// Tier 1: Read from JSON grammar data (via GetGrammarData) for t.lang; skipped when absent.
+	data := i18n.GetGrammarData(t.lang)
+	if data != nil && data.Nouns != nil {
+		for base, forms := range data.Nouns {
+			t.baseNouns[base] = true
+			if forms.Other != "" && forms.Other != base { // ignore empty plurals and plurals identical to the base
+				t.pluralToBase[forms.Other] = base
+			}
+		}
+	}
+
+	// Tier 2: Read from the exported irregularNouns map; a plural already mapped above keeps its mapping.
+	for base, plural := range i18n.IrregularNouns() {
+		t.baseNouns[base] = true
+		if plural != base {
+			if _, exists := t.pluralToBase[plural]; !exists {
+				t.pluralToBase[plural] = base
+			}
+		}
+	}
+}
+
+// MatchNoun performs a 3-tier reverse lookup for a noun form, after trimming and lower-casing word.
+//
+// Tier 1: Check if the word is a known base noun (reported with Plural false).
+// Tier 2: Check the pluralToBase inverse map (reported with Plural true).
+// Tier 3: Try reverse morphology rules and round-trip verify via
+// the forward function PluralForm(); the first candidate that verifies wins.
+func (t *Tokeniser) MatchNoun(word string) (NounMatch, bool) {
+	word = strings.ToLower(strings.TrimSpace(word)) // Form in every result is this normalised text
+	if word == "" {
+		return NounMatch{}, false
+	}
+
+	// Tier 1: Is it a base noun?
+	if t.baseNouns[word] {
+		return NounMatch{Base: word, Plural: false, Form: word}, true
+	}
+
+	// Tier 2: Check inverse map from grammar tables + irregular nouns.
+	if base, ok := t.pluralToBase[word]; ok {
+		return NounMatch{Base: base, Plural: true, Form: word}, true
+	}
+
+	// Tier 3: Reverse morphology with round-trip verification.
+	candidates := t.reverseRegularPlural(word)
+	for _, c := range candidates {
+		if i18n.PluralForm(c) == word { // accept only a candidate whose forward plural reproduces the input
+			return NounMatch{Base: c, Plural: true, Form: word}, true
+		}
+	}
+
+	return NounMatch{}, false // no tier recognised the word
+}
+
+// reverseRegularPlural generates candidate base forms by reversing regular
+// plural suffixes. Returns every candidate whose rule matches, in the order the rules appear below.
+//
+// The forward engine applies rules in this order:
+//  1. ends in s/ss/sh/ch/x/z → +es
+//  2. ends in consonant+y → ies
+//  3. ends in f → ves, fe → ves
+//  4. default → +s
+//
+// We generate candidates for each possible reverse rule. Nothing is validated
+// here; the caller must round-trip each candidate (MatchNoun does so via PluralForm).
+func (t *Tokeniser) reverseRegularPlural(word string) []string {
+	var candidates []string // stays nil when no rule applies
+
+	// Rule: "ies" → "y" (e.g., "entries" → "entry"); the letter before "ies" is not checked here.
+	if strings.HasSuffix(word, "ies") && len(word) > 3 {
+		base := word[:len(word)-3] + "y"
+		candidates = append(candidates, base)
+	}
+
+	// Rule: "ves" → "f" or "fe" (e.g., "wolves" → "wolf", "knives" → "knife"); both spellings are offered.
+	if strings.HasSuffix(word, "ves") && len(word) > 3 {
+		candidates = append(candidates, word[:len(word)-3]+"f")
+		candidates = append(candidates, word[:len(word)-3]+"fe")
+	}
+
+	// Rule: sibilant + "es" (e.g., "processes" → "process"). NOTE(review): no length guard, so a bare "ses" yields "s".
+	if strings.HasSuffix(word, "ses") || strings.HasSuffix(word, "xes") ||
+		strings.HasSuffix(word, "zes") || strings.HasSuffix(word, "ches") ||
+		strings.HasSuffix(word, "shes") {
+		base := word[:len(word)-2] // strip "es"
+		candidates = append(candidates, base)
+	}
+
+	// Rule: drop "s" (e.g., "servers" → "server"); also fires for words already matched by the rules above.
+	if strings.HasSuffix(word, "s") && len(word) > 1 {
+		base := word[:len(word)-1]
+		candidates = append(candidates, base)
+	}
+
+	return candidates
+}
+
 // MatchVerb performs a 3-tier reverse lookup for a verb form.
 //
 // Tier 1: Check if the word is a known base verb.
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@ -69,6 +69,83 @@ func TestTokeniser_MatchVerb_Irregular(t *testing.T) {
 	}
 }

+func TestTokeniser_MatchNoun_Irregular(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tests := []struct {
+		word       string // input passed to MatchNoun
+		wantOK     bool   // expected second return value
+		wantBase   string // expected NounMatch.Base (checked only on a match)
+		wantPlural bool   // expected NounMatch.Plural (checked only on a match)
+	}{
+		{"files", true, "file", true},
+		{"file", true, "file", false},
+		{"people", true, "person", true},
+		{"person", true, "person", false},
+		{"children", true, "child", true},
+		{"child", true, "child", false},
+		{"repositories", true, "repository", true},
+		{"repository", true, "repository", false},
+		{"branches", true, "branch", true},
+		{"branch", true, "branch", false},
+		{"xyzzy", false, "", false}, // nonsense word: expect no match
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.word, func(t *testing.T) { // one subtest per input word
+			match, ok := tok.MatchNoun(tt.word)
+			if ok != tt.wantOK {
+				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
+			}
+			if !ok { // expected miss: there are no match fields to compare
+				return
+			}
+			if match.Base != tt.wantBase {
+				t.Errorf("MatchNoun(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
+			}
+			if match.Plural != tt.wantPlural {
+				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tt.word, match.Plural, tt.wantPlural)
+			}
+		})
+	}
+}
+
+func TestTokeniser_MatchNoun_Regular(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tests := []struct {
+		word       string // input passed to MatchNoun
+		wantOK     bool   // expected second return value
+		wantBase   string // expected NounMatch.Base (checked only on a match)
+		wantPlural bool   // expected NounMatch.Plural (checked only on a match)
+	}{
+		// Regular nouns NOT in grammar tables — detected by reverse morphology + round-trip.
+		{"servers", true, "server", true},
+		{"processes", true, "process", true},
+		{"entries", true, "entry", true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.word, func(t *testing.T) { // one subtest per input word
+			match, ok := tok.MatchNoun(tt.word)
+			if ok != tt.wantOK {
+				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
+			}
+			if !ok { // expected miss: there are no match fields to compare
+				return
+			}
+			if match.Base != tt.wantBase {
+				t.Errorf("MatchNoun(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
+			}
+			if match.Plural != tt.wantPlural {
+				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tt.word, match.Plural, tt.wantPlural)
+			}
+		})
+	}
+}
+
 func TestTokeniser_MatchVerb_Regular(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser()