feat(reversal): add Tokeniser with verb matching

Reverse grammar tables into pattern matchers. 3-tier lookup: JSON grammar data → irregular verb maps → regular morphology rules. Verified by round-tripping through forward functions. Export IrregularVerbs() and IrregularNouns() so the reversal engine reads from the authoritative source instead of a duplicate list. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 23:10:20 +00:00 · 2026-02-16 23:10:20 +00:00 · f1aa4adbc4
commit f1aa4adbc4
parent 20ab172f5b
4 changed files with 435 additions and 1 deletions
--- a/grammar.go
+++ b/grammar.go
@ -20,6 +20,24 @@ func SetGrammarData(lang string, data *GrammarData) {
 	grammarCache[lang] = data
 }

+// IrregularVerbs returns a newly allocated map containing every entry of
+// the package's irregular verb table. The map itself is fresh, so adding
+// or removing keys on the result does not affect the package table; each
+// VerbForms value is copied by plain assignment.
+func IrregularVerbs() map[string]VerbForms {
+	snapshot := make(map[string]VerbForms, len(irregularVerbs))
+	for base := range irregularVerbs {
+		snapshot[base] = irregularVerbs[base]
+	}
+	return snapshot
+}
+
+// IrregularNouns returns a newly allocated map containing every entry of
+// the package's irregular noun table. Adding or removing keys on the
+// result does not affect the package table.
+func IrregularNouns() map[string]string {
+	snapshot := make(map[string]string, len(irregularNouns))
+	for key := range irregularNouns {
+		snapshot[key] = irregularNouns[key]
+	}
+	return snapshot
+}
+
 func getVerbForm(lang, verb, form string) string {
 	data := GetGrammarData(lang)
 	if data == nil || data.Verbs == nil {
--- a/locales/en.json
+++ b/locales/en.json
@ -39,7 +39,10 @@
 			"hit": { "base": "hit", "past": "hit", "gerund": "hitting" },
 			"sit": { "base": "sit", "past": "sat", "gerund": "sitting" },
 			"split": { "base": "split", "past": "split", "gerund": "splitting" },
-			"shut": { "base": "shut", "past": "shut", "gerund": "shutting" }
+			"shut": { "base": "shut", "past": "shut", "gerund": "shutting" },
+			"delete": { "base": "delete", "past": "deleted", "gerund": "deleting" },
+			"update": { "base": "update", "past": "updated", "gerund": "updating" },
+			"push": { "base": "push", "past": "pushed", "gerund": "pushing" }
 		},
 		"noun": {
 			"file": { "one": "file", "other": "files" },
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@ -0,0 +1,303 @@
+// Package reversal provides reverse grammar lookups.
+//
+// The forward engine (go-i18n) maps base forms to inflected forms:
+//
+//	PastTense("delete") → "deleted"
+//	Gerund("run")       → "running"
+//
+// The reversal engine reads those same tables backwards, turning
+// inflected forms back into base forms with tense metadata:
+//
+//	MatchVerb("deleted")  → {Base: "delete", Tense: "past"}
+//	MatchVerb("running")  → {Base: "run",    Tense: "gerund"}
+//
+// 3-tier lookup: JSON grammar data → irregular verb maps → regular
+// morphology rules (verified by round-tripping through forward functions).
+package reversal
+
+import (
+	"strings"
+
+	i18n "forge.lthn.ai/core/go-i18n"
+)
+
+// VerbMatch describes a successful reverse verb lookup: which base verb
+// was recognised, in which tense, and from which word.
+type VerbMatch struct {
+	// Base is the base form of the verb, e.g. "delete" or "run".
+	Base string
+	// Tense is one of "past", "gerund", or "base".
+	Tense string
+	// Form is the inflected form that was matched.
+	Form string
+}
+
+// NounMatch describes the result of a reverse noun lookup.
+type NounMatch struct {
+	// Base is the base (singular) form of the noun.
+	Base string
+	// Plural reports whether the matched form was plural.
+	Plural bool
+	// Form is the form that was matched.
+	Form string
+}
+
+// Tokeniser answers reverse grammar questions ("which base verb does this
+// inflected word come from?") using inverse indexes derived from the
+// forward grammar tables.
+type Tokeniser struct {
+	// pastToBase maps a past-tense form to its base verb,
+	// e.g. "deleted" → "delete".
+	pastToBase map[string]string
+	// gerundToBase maps a gerund to its base verb,
+	// e.g. "deleting" → "delete".
+	gerundToBase map[string]string
+	// baseVerbs is the set of known base verbs, e.g. "delete" → true.
+	baseVerbs map[string]bool
+	// pluralToBase maps a plural noun to its singular,
+	// e.g. "files" → "file".
+	pluralToBase map[string]string
+	// baseNouns is the set of known singular nouns, e.g. "file" → true.
+	baseNouns map[string]bool
+	// words holds word translations.
+	words map[string]string
+	// lang is the language code the indexes were built for.
+	lang string
+}
+
+// NewTokeniser creates a Tokeniser for English. It is shorthand for
+// NewTokeniserForLang("en").
+func NewTokeniser() *Tokeniser {
+	return NewTokeniserForLang("en")
+}
+
+// NewTokeniserForLang returns a Tokeniser bound to lang. Every index map
+// is allocated up front (so later writes never hit a nil map) and the
+// verb indexes are then populated by buildVerbIndex.
+func NewTokeniserForLang(lang string) *Tokeniser {
+	tok := new(Tokeniser)
+	tok.lang = lang
+
+	tok.pastToBase = map[string]string{}
+	tok.gerundToBase = map[string]string{}
+	tok.baseVerbs = map[string]bool{}
+	tok.pluralToBase = map[string]string{}
+	tok.baseNouns = map[string]bool{}
+	tok.words = map[string]string{}
+
+	tok.buildVerbIndex()
+	return tok
+}
+
+// buildVerbIndex fills baseVerbs, pastToBase and gerundToBase from two
+// sources, in order of precedence:
+//
+//  1. the grammar data returned by i18n.GetGrammarData for t.lang;
+//  2. the irregular verb table returned by i18n.IrregularVerbs.
+//
+// An inflected form already indexed from source 1 is never overridden by
+// source 2. Within a single source, if several base verbs share the same
+// inflected form, the lexicographically smallest base is kept. Go
+// randomises map iteration order, so without this tie-break the indexed
+// base for a shared form could differ from one run to the next.
+func (t *Tokeniser) buildVerbIndex() {
+	// record stores form → base unless form is empty or a smaller base is
+	// already recorded for it. The outcome is independent of visit order.
+	record := func(index map[string]string, form, base string) {
+		if form == "" {
+			return
+		}
+		if prev, seen := index[form]; !seen || base < prev {
+			index[form] = base
+		}
+	}
+
+	// Tier 1: grammar data loaded for this language.
+	if data := i18n.GetGrammarData(t.lang); data != nil && data.Verbs != nil {
+		for base, forms := range data.Verbs {
+			t.baseVerbs[base] = true
+			record(t.pastToBase, forms.Past, base)
+			record(t.gerundToBase, forms.Gerund, base)
+		}
+	}
+
+	// Tier 2: irregular verbs. Collect them in scratch maps first so the
+	// tie-break only compares irregular entries with each other and can
+	// never displace a tier-1 entry.
+	irregularPast := make(map[string]string)
+	irregularGerund := make(map[string]string)
+	for base, forms := range i18n.IrregularVerbs() {
+		t.baseVerbs[base] = true
+		record(irregularPast, forms.Past, base)
+		record(irregularGerund, forms.Gerund, base)
+	}
+
+	// fillMissing copies entries from src into dst without overwriting.
+	fillMissing := func(dst, src map[string]string) {
+		for form, base := range src {
+			if _, exists := dst[form]; !exists {
+				dst[form] = base
+			}
+		}
+	}
+	fillMissing(t.pastToBase, irregularPast)
+	fillMissing(t.gerundToBase, irregularGerund)
+}
+
+// MatchVerb tries to recognise word as a verb form and reports the base
+// verb and tense. The input is trimmed and lower-cased first; the
+// returned VerbMatch.Form holds that normalised word. An empty input
+// (after trimming) never matches.
+//
+// Lookup proceeds in three tiers, stopping at the first hit:
+//
+//  1. word is itself a known base verb → Tense "base";
+//  2. word is found in the pastToBase or gerundToBase index;
+//  3. word can be reduced to a base by reversing regular morphology, and
+//     that base regenerates word when passed through the forward
+//     functions i18n.PastTense or i18n.Gerund.
+//
+// Past tense is checked before gerund within tiers 2 and 3.
+func (t *Tokeniser) MatchVerb(word string) (VerbMatch, bool) {
+	form := strings.ToLower(strings.TrimSpace(word))
+	if form == "" {
+		return VerbMatch{}, false
+	}
+
+	// hit builds the success result for the normalised form.
+	hit := func(base, tense string) (VerbMatch, bool) {
+		return VerbMatch{Base: base, Tense: tense, Form: form}, true
+	}
+
+	// Tier 1: already a base verb.
+	if t.baseVerbs[form] {
+		return hit(form, "base")
+	}
+
+	// Tier 2: inverse indexes built from grammar data and irregular verbs.
+	if base, known := t.pastToBase[form]; known {
+		return hit(base, "past")
+	}
+	if base, known := t.gerundToBase[form]; known {
+		return hit(base, "gerund")
+	}
+
+	// Tier 3: reverse morphology, verified by a forward round trip.
+	pastBase := t.bestRoundTrip(form, t.reverseRegularPast(form), i18n.PastTense)
+	if pastBase != "" {
+		return hit(pastBase, "past")
+	}
+	gerundBase := t.bestRoundTrip(form, t.reverseRegularGerund(form), i18n.Gerund)
+	if gerundBase != "" {
+		return hit(gerundBase, "gerund")
+	}
+
+	return VerbMatch{}, false
+}
+
+// bestRoundTrip keeps the candidates for which forward(candidate) equals
+// target and returns the most plausible one, or "" if none survive.
+//
+// With a single survivor that survivor is returned. With several, the
+// first survivor satisfying the highest-ranked preference wins:
+//
+//  1. it is a known base verb (present in t.baseVerbs);
+//  2. it ends in vowel-consonant-"e" (the "magic e" shape of verbs such
+//     as "delete", "create", "use"), which filters out phantom bases
+//     like "walke" or "processe" that end in consonant-consonant-"e";
+//  3. it does not end in "e" at all.
+//
+// If no preference applies, the first survivor in candidate order is
+// returned.
+func (t *Tokeniser) bestRoundTrip(target string, candidates []string, forward func(string) string) string {
+	var survivors []string
+	for _, cand := range candidates {
+		if forward(cand) != target {
+			continue
+		}
+		survivors = append(survivors, cand)
+	}
+
+	switch len(survivors) {
+	case 0:
+		return ""
+	case 1:
+		return survivors[0]
+	}
+
+	// Preferences in descending priority; each is tried against every
+	// survivor before moving on to the next.
+	preferences := []func(string) bool{
+		func(s string) bool { return t.baseVerbs[s] },
+		hasVCeEnding,
+		func(s string) bool { return !strings.HasSuffix(s, "e") },
+	}
+	for _, preferred := range preferences {
+		for _, s := range survivors {
+			if preferred(s) {
+				return s
+			}
+		}
+	}
+
+	return survivors[0]
+}
+
+// hasVCeEnding reports whether word's last three bytes are a vowel, a
+// non-vowel, and the letter 'e', in that order: the "magic e" shape seen
+// in verbs such as "delete" (-ete), "create" (-ate), "use" (-use) and
+// "close" (-ose). Bases produced by naive suffix stripping, such as
+// "walke" (-lke) or "processe" (-sse), end in two non-vowels plus 'e' and
+// yield false. Words shorter than three bytes always yield false.
+func hasVCeEnding(word string) bool {
+	n := len(word)
+	if n < 3 {
+		return false
+	}
+	tail := word[n-3:]
+	return isVowelByte(tail[0]) && !isVowelByte(tail[1]) && tail[2] == 'e'
+}
+
+// isVowelByte reports whether b is one of the lower-case ASCII vowels
+// 'a', 'e', 'i', 'o' or 'u'. Upper-case letters and 'y' are not vowels
+// for this purpose.
+func isVowelByte(b byte) bool {
+	return b == 'a' || b == 'e' || b == 'i' || b == 'o' || b == 'u'
+}
+
+// reverseRegularPast proposes base forms that a regular past-tense word
+// could have come from. It returns nil unless word ends in "ed".
+//
+// The forward rules being undone are, in the forward engine's order:
+//  1. base ends in "e"              → +d         (create → created)
+//  2. base ends in consonant + "y"  → y→ied      (copy → copied)
+//  3. final consonant is doubled    → double+ed  (stop → stopped)
+//  4. otherwise                     → +ed        (walk → walked)
+//
+// Candidates are emitted in this fixed order, each only when its shape
+// test passes: "ied"→"y", undoubled consonant, strip "d", strip "ed".
+// The shape tests are deliberately loose; callers are expected to verify
+// each candidate by running it through the forward function (see
+// bestRoundTrip).
+func (t *Tokeniser) reverseRegularPast(word string) []string {
+	if !strings.HasSuffix(word, "ed") {
+		return nil
+	}
+
+	n := len(word)
+	stem := word[:n-2] // word without the trailing "ed"
+	var out []string
+
+	// "copied" → "copy"
+	if n > 3 && strings.HasSuffix(word, "ied") {
+		out = append(out, word[:n-3]+"y")
+	}
+
+	// "stopped" → "stop": the stem ends in the same byte twice.
+	// n > 4 guarantees the stem has at least three bytes.
+	if n > 4 && stem[len(stem)-1] == stem[len(stem)-2] {
+		out = append(out, stem[:len(stem)-1])
+	}
+
+	// "created" → "create" (strip only "d"), then "walked" → "walk"
+	// (strip "ed"). Skipped when word is exactly "ed".
+	if n > 2 {
+		out = append(out, word[:n-1], stem)
+	}
+
+	return out
+}
+
+// reverseRegularGerund proposes base forms that a regular gerund could
+// have come from. It returns nil unless word ends in "ing" and is at
+// least four bytes long.
+//
+// Candidates are emitted in this fixed order:
+//   - "ying" → "ie"            (e.g., "dying" → "die"), words over 4 bytes
+//   - undoubled consonant      (e.g., "stopping" → "stop")
+//   - plain strip of "ing"     (e.g., "walking" → "walk")
+//   - strip "ing", restore "e" (e.g., "creating" → "create")
+//
+// The plain strip is listed before the restored-"e" form so that "walk"
+// precedes the phantom "walke", which also regenerates "walking".
+func (t *Tokeniser) reverseRegularGerund(word string) []string {
+	if len(word) < 4 || !strings.HasSuffix(word, "ing") {
+		return nil
+	}
+
+	root := strings.TrimSuffix(word, "ing")
+	var out []string
+
+	// "dying" → "die"
+	if len(word) > 4 && strings.HasSuffix(word, "ying") {
+		out = append(out, word[:len(word)-4]+"ie")
+	}
+
+	// "stopping" → "stop": the root ends in the same byte twice.
+	if k := len(root); k >= 2 && root[k-1] == root[k-2] {
+		out = append(out, root[:k-1])
+	}
+
+	// "walking" → "walk", then "creating" → "create".
+	return append(out, root, root+"e")
+}
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@ -0,0 +1,110 @@
+package reversal
+
+import (
+	"testing"
+
+	i18n "forge.lthn.ai/core/go-i18n"
+)
+
+// setup constructs a new i18n service and installs it as the package
+// default before a test runs. The test is aborted if construction fails.
+func setup(t *testing.T) {
+	t.Helper()
+	svc, err := i18n.New()
+	if err != nil {
+		t.Fatalf("i18n.New() failed: %v", err)
+	}
+	i18n.SetDefault(svc)
+}
+
+// TestTokeniser_MatchVerb_Irregular checks verb forms that are expected
+// to resolve through the table-driven tiers of MatchVerb (grammar data
+// and irregular verb tables), plus base forms and unknown words.
+func TestTokeniser_MatchVerb_Irregular(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tests := []struct {
+		word      string
+		wantOK    bool
+		wantBase  string
+		wantTense string
+	}{
+		// Past and gerund forms expected to come from the tables.
+		// "delete" is regular but is listed in locales/en.json.
+		{"deleted", true, "delete", "past"},
+		{"deleting", true, "delete", "gerund"},
+		{"went", true, "go", "past"},
+		{"going", true, "go", "gerund"},
+		{"was", true, "be", "past"},
+		{"being", true, "be", "gerund"},
+		{"ran", true, "run", "past"},
+		{"running", true, "run", "gerund"},
+		{"wrote", true, "write", "past"},
+		{"writing", true, "write", "gerund"},
+		{"built", true, "build", "past"},
+		{"building", true, "build", "gerund"},
+		{"committed", true, "commit", "past"},
+		{"committing", true, "commit", "gerund"},
+
+		// Base forms
+		{"delete", true, "delete", "base"},
+		{"go", true, "go", "base"},
+
+		// Unknown words return false
+		{"xyzzy", false, "", ""},
+		{"flurble", false, "", ""},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.word, func(t *testing.T) {
+			match, ok := tok.MatchVerb(tt.word)
+			if ok != tt.wantOK {
+				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
+			}
+			if !ok {
+				return
+			}
+			if match.Base != tt.wantBase {
+				t.Errorf("MatchVerb(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
+			}
+			if match.Tense != tt.wantTense {
+				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tt.word, match.Tense, tt.wantTense)
+			}
+			// Inputs here are already lower-case and trimmed, so the
+			// normalised Form must equal the input word.
+			if match.Form != tt.word {
+				t.Errorf("MatchVerb(%q).Form = %q, want %q", tt.word, match.Form, tt.word)
+			}
+		})
+	}
+}
+
+func TestTokeniser_MatchVerb_Regular(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tests := []struct {
+		word      string
+		wantOK    bool
+		wantBase  string
+		wantTense string
+	}{
+		// Regular verbs NOT in grammar tables — detected by reverse morphology + round-trip
+		{"walked", true, "walk", "past"},
+		{"walking", true, "walk", "gerund"},
+		{"processed", true, "process", "past"},
+		{"processing", true, "process", "gerund"},
+		{"copied", true, "copy", "past"},
+		{"copying", true, "copy", "gerund"},
+		{"stopped", true, "stop", "past"},
+		{"stopping", true, "stop", "gerund"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.word, func(t *testing.T) {
+			match, ok := tok.MatchVerb(tt.word)
+			if ok != tt.wantOK {
+				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
+			}
+			if !ok {
+				return
+			}
+			if match.Base != tt.wantBase {
+				t.Errorf("MatchVerb(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
+			}
+			if match.Tense != tt.wantTense {
+				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tt.word, match.Tense, tt.wantTense)
+			}
+		})
+	}
+}