diff --git a/grammar.go b/grammar.go
index 9968a63..aaa2fa0 100644
--- a/grammar.go
+++ b/grammar.go
@@ -20,6 +20,24 @@ func SetGrammarData(lang string, data *GrammarData) {
 	grammarCache[lang] = data
 }
 
+// IrregularVerbs returns a freshly allocated map holding every entry of the package-level irregularVerbs table, so adding or removing keys in the result does not change that table.
+func IrregularVerbs() map[string]VerbForms {
+	result := make(map[string]VerbForms, len(irregularVerbs))
+	for k, v := range irregularVerbs {
+		result[k] = v
+	}
+	return result
+}
+
+// IrregularNouns returns a freshly allocated map holding every entry of the package-level irregularNouns table, so adding or removing keys in the result does not change that table.
+func IrregularNouns() map[string]string {
+	result := make(map[string]string, len(irregularNouns))
+	for k, v := range irregularNouns {
+		result[k] = v
+	}
+	return result
+}
+
 func getVerbForm(lang, verb, form string) string {
 	data := GetGrammarData(lang)
 	if data == nil || data.Verbs == nil {
diff --git a/locales/en.json b/locales/en.json
index 7739beb..92feb45 100644
--- a/locales/en.json
+++ b/locales/en.json
@@ -39,7 +39,10 @@
       "hit": { "base": "hit", "past": "hit", "gerund": "hitting" },
       "sit": { "base": "sit", "past": "sat", "gerund": "sitting" },
       "split": { "base": "split", "past": "split", "gerund": "splitting" },
-      "shut": { "base": "shut", "past": "shut", "gerund": "shutting" }
+      "shut": { "base": "shut", "past": "shut", "gerund": "shutting" },
+      "delete": { "base": "delete", "past": "deleted", "gerund": "deleting" },
+      "update": { "base": "update", "past": "updated", "gerund": "updating" },
+      "push": { "base": "push", "past": "pushed", "gerund": "pushing" }
     },
     "noun": {
       "file": { "one": "file", "other": "files" },
diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
new file mode 100644
index 0000000..f6c9dd0
--- /dev/null
+++ b/reversal/tokeniser.go
@@ -0,0 +1,303 @@
+// Package reversal provides reverse grammar lookups.
//
// The forward engine (go-i18n) maps base forms to inflected forms:
//
//	PastTense("delete") → "deleted"
//	Gerund("run")       → "running"
//
// The reversal engine reads those same tables backwards, turning
// inflected forms back into base forms with tense metadata:
//
//	MatchVerb("deleted") → {Base: "delete", Tense: "past"}
//	MatchVerb("running") → {Base: "run", Tense: "gerund"}
//
// 3-tier lookup: JSON grammar data → irregular verb maps → regular
// morphology rules (verified by round-tripping through forward functions).
package reversal

import (
	"strings"

	i18n "forge.lthn.ai/core/go-i18n"
)

// VerbMatch holds the result of a reverse verb lookup.
type VerbMatch struct {
	Base  string // Base form of the verb ("delete", "run")
	Tense string // "past", "gerund", or "base"
	Form  string // The matched word; MatchVerb stores it trimmed and lower-cased
}

// NounMatch holds the result of a reverse noun lookup.
//
// NOTE(review): nothing in the code visible here produces a NounMatch yet.
type NounMatch struct {
	Base   string // Base/singular form of the noun
	Plural bool   // Whether the matched form was plural
	Form   string // The original form
}

// Tokeniser provides reverse grammar lookups by maintaining inverse
// indexes built from the forward grammar tables.
//
// NOTE(review): in the code visible here only the three verb maps are
// populated (by buildVerbIndex); pluralToBase, baseNouns and words are
// allocated by the constructor but nothing shown fills them.
type Tokeniser struct {
	pastToBase   map[string]string // "deleted" → "delete"
	gerundToBase map[string]string // "deleting" → "delete"
	baseVerbs    map[string]bool   // "delete" → true
	pluralToBase map[string]string // "files" → "file"
	baseNouns    map[string]bool   // "file" → true
	words        map[string]string // word translations
	lang         string            // language code handed to i18n.GetGrammarData
}

// NewTokeniser creates a Tokeniser for English ("en").
// It is shorthand for NewTokeniserForLang("en").
func NewTokeniser() *Tokeniser {
	return NewTokeniserForLang("en")
}

// NewTokeniserForLang creates a Tokeniser for the specified language,
// building inverse indexes from the grammar data. Only the verb indexes
// are built; the noun and word maps are allocated but left empty.
+func NewTokeniserForLang(lang string) *Tokeniser { + t := &Tokeniser{ + pastToBase: make(map[string]string), + gerundToBase: make(map[string]string), + baseVerbs: make(map[string]bool), + pluralToBase: make(map[string]string), + baseNouns: make(map[string]bool), + words: make(map[string]string), + lang: lang, + } + t.buildVerbIndex() + return t +} + +// buildVerbIndex reads grammar tables and irregular verb maps to build +// inverse lookup maps: inflected form → base form. +func (t *Tokeniser) buildVerbIndex() { + // Tier 1: Read from JSON grammar data (via GetGrammarData). + data := i18n.GetGrammarData(t.lang) + if data != nil && data.Verbs != nil { + for base, forms := range data.Verbs { + t.baseVerbs[base] = true + if forms.Past != "" { + t.pastToBase[forms.Past] = base + } + if forms.Gerund != "" { + t.gerundToBase[forms.Gerund] = base + } + } + } + + // Tier 2: Read from the exported irregularVerbs map. + // Build inverse maps directly from the authoritative source. + for base, forms := range i18n.IrregularVerbs() { + t.baseVerbs[base] = true + if forms.Past != "" { + if _, exists := t.pastToBase[forms.Past]; !exists { + t.pastToBase[forms.Past] = base + } + } + if forms.Gerund != "" { + if _, exists := t.gerundToBase[forms.Gerund]; !exists { + t.gerundToBase[forms.Gerund] = base + } + } + } +} + +// MatchVerb performs a 3-tier reverse lookup for a verb form. +// +// Tier 1: Check if the word is a known base verb. +// Tier 2: Check the pastToBase and gerundToBase inverse maps. +// Tier 3: Try reverse morphology rules and round-trip verify via +// the forward functions PastTense() and Gerund(). +func (t *Tokeniser) MatchVerb(word string) (VerbMatch, bool) { + word = strings.ToLower(strings.TrimSpace(word)) + if word == "" { + return VerbMatch{}, false + } + + // Tier 1: Is it a base verb? + if t.baseVerbs[word] { + return VerbMatch{Base: word, Tense: "base", Form: word}, true + } + + // Tier 2: Check inverse maps from grammar tables + irregular verbs. 
+ if base, ok := t.pastToBase[word]; ok { + return VerbMatch{Base: base, Tense: "past", Form: word}, true + } + if base, ok := t.gerundToBase[word]; ok { + return VerbMatch{Base: base, Tense: "gerund", Form: word}, true + } + + // Tier 3: Reverse morphology with round-trip verification. + // Try past tense candidates. + if base := t.bestRoundTrip(word, t.reverseRegularPast(word), i18n.PastTense); base != "" { + return VerbMatch{Base: base, Tense: "past", Form: word}, true + } + + // Try gerund candidates. + if base := t.bestRoundTrip(word, t.reverseRegularGerund(word), i18n.Gerund); base != "" { + return VerbMatch{Base: base, Tense: "gerund", Form: word}, true + } + + return VerbMatch{}, false +} + +// bestRoundTrip selects the best candidate from a list by round-tripping +// each through a forward function. When multiple candidates round-trip +// successfully (ambiguity), it uses the following priority: +// 1. Candidates that are known base verbs (in grammar tables / irregular maps) +// 2. Candidates ending in a VCe pattern (vowel-consonant-e, the "magic e" +// pattern common in real English verbs like "delete", "create", "use"). +// This avoids phantom verbs like "walke" or "processe" which have a +// CCe pattern (consonant-consonant-e) that doesn't occur naturally. +// 3. Candidates NOT ending in "e" (the default morphology path) +// 4. 
First match in candidate order as final tiebreaker +func (t *Tokeniser) bestRoundTrip(target string, candidates []string, forward func(string) string) string { + var matches []string + for _, c := range candidates { + if forward(c) == target { + matches = append(matches, c) + } + } + if len(matches) == 0 { + return "" + } + if len(matches) == 1 { + return matches[0] + } + + // Priority 1: known base verb + for _, m := range matches { + if t.baseVerbs[m] { + return m + } + } + + // Priority 2: prefer VCe-ending candidate (real English verb pattern) + for _, m := range matches { + if hasVCeEnding(m) { + return m + } + } + + // Priority 3: prefer candidate not ending in "e" (avoids phantom verbs + // with CCe endings like "walke", "processe") + for _, m := range matches { + if !strings.HasSuffix(m, "e") { + return m + } + } + + return matches[0] +} + +// hasVCeEnding returns true if the word ends in a vowel-consonant-e pattern +// (the "magic e" pattern). This is characteristic of real English verbs like +// "delete" (-ete), "create" (-ate), "use" (-use), "close" (-ose). +// Phantom verbs produced by naive suffix stripping like "walke" (-lke) or +// "processe" (-sse) end in consonant-consonant-e and return false. +func hasVCeEnding(word string) bool { + if len(word) < 3 || word[len(word)-1] != 'e' { + return false + } + lastConsonant := word[len(word)-2] + vowelBefore := word[len(word)-3] + return !isVowelByte(lastConsonant) && isVowelByte(vowelBefore) +} + +func isVowelByte(b byte) bool { + switch b { + case 'a', 'e', 'i', 'o', 'u': + return true + } + return false +} + +// reverseRegularPast generates candidate base forms by reversing regular +// past tense suffixes. Returns multiple candidates ordered by likelihood. +// +// The forward engine applies rules in this order: +// 1. ends in "e" → +d (create → created) +// 2. ends in "y" + consonant → ied (copy → copied) +// 3. shouldDoubleConsonant → double+ed (stop → stopped) +// 4. 
default → +ed (walk → walked) +// +// We generate candidates for each possible reverse rule. Round-trip +// verification (in bestRoundTrip) ensures only correct candidates pass. +func (t *Tokeniser) reverseRegularPast(word string) []string { + var candidates []string + + if !strings.HasSuffix(word, "ed") { + return candidates + } + + // Rule: consonant + "ied" → consonant + "y" (e.g., "copied" → "copy") + if strings.HasSuffix(word, "ied") && len(word) > 3 { + base := word[:len(word)-3] + "y" + candidates = append(candidates, base) + } + + // Rule: doubled consonant + "ed" → single consonant (e.g., "stopped" → "stop") + if len(word) > 4 { + beforeEd := word[:len(word)-2] + lastChar := beforeEd[len(beforeEd)-1] + if len(beforeEd) >= 2 && beforeEd[len(beforeEd)-2] == lastChar { + base := beforeEd[:len(beforeEd)-1] + candidates = append(candidates, base) + } + } + + // Rule: stem + "d" where stem ends in "e" (e.g., "created" → "create") + if len(word) > 2 { + stemPlusE := word[:len(word)-1] // strip "d", leaving stem + "e" + candidates = append(candidates, stemPlusE) + } + + // Rule: stem + "ed" (e.g., "walked" → "walk") + if len(word) > 2 { + stem := word[:len(word)-2] + candidates = append(candidates, stem) + } + + return candidates +} + +// reverseRegularGerund generates candidate base forms by reversing regular +// gerund suffixes. Returns multiple candidates ordered by likelihood. 
+// +// Rules reversed: +// - verb + "ing" (e.g., "walking" → "walk") +// - verb[:-1] + "ing" (e.g., "creating" → "create", drop e) +// - doubled consonant (e.g., "stopping" → "stop") +// - verb[:-2] + "ying" (e.g., "dying" → "die") +func (t *Tokeniser) reverseRegularGerund(word string) []string { + var candidates []string + + if !strings.HasSuffix(word, "ing") || len(word) < 4 { + return candidates + } + + stem := word[:len(word)-3] // strip "ing" + + // Rule: "ying" → "ie" (e.g., "dying" → "die") + if strings.HasSuffix(word, "ying") && len(word) > 4 { + base := word[:len(word)-4] + "ie" + candidates = append(candidates, base) + } + + // Rule: doubled consonant + "ing" → single consonant (e.g., "stopping" → "stop") + if len(stem) >= 2 && stem[len(stem)-1] == stem[len(stem)-2] { + base := stem[:len(stem)-1] + candidates = append(candidates, base) + } + + // Rule: direct strip "ing" (e.g., "walking" → "walk") + // This must come before the stem+"e" rule to avoid false positives + // like "walke" round-tripping through Gerund("walke") = "walking". + candidates = append(candidates, stem) + + // Rule: stem + "e" was dropped before "ing" (e.g., "creating" → "create") + // Try adding "e" back. 
+ candidates = append(candidates, stem+"e") + + return candidates +} diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go new file mode 100644 index 0000000..39a46b3 --- /dev/null +++ b/reversal/tokeniser_test.go @@ -0,0 +1,110 @@ +package reversal + +import ( + "testing" + + i18n "forge.lthn.ai/core/go-i18n" +) + +func setup(t *testing.T) { + t.Helper() + svc, err := i18n.New() + if err != nil { + t.Fatalf("i18n.New() failed: %v", err) + } + i18n.SetDefault(svc) +} + +func TestTokeniser_MatchVerb_Irregular(t *testing.T) { + setup(t) + tok := NewTokeniser() + + tests := []struct { + word string + wantOK bool + wantBase string + wantTense string + }{ + // Irregular past tense + {"deleted", true, "delete", "past"}, + {"deleting", true, "delete", "gerund"}, + {"went", true, "go", "past"}, + {"going", true, "go", "gerund"}, + {"was", true, "be", "past"}, + {"being", true, "be", "gerund"}, + {"ran", true, "run", "past"}, + {"running", true, "run", "gerund"}, + {"wrote", true, "write", "past"}, + {"writing", true, "write", "gerund"}, + {"built", true, "build", "past"}, + {"building", true, "build", "gerund"}, + {"committed", true, "commit", "past"}, + {"committing", true, "commit", "gerund"}, + + // Base forms + {"delete", true, "delete", "base"}, + {"go", true, "go", "base"}, + + // Unknown words return false + {"xyzzy", false, "", ""}, + {"flurble", false, "", ""}, + } + + for _, tt := range tests { + t.Run(tt.word, func(t *testing.T) { + match, ok := tok.MatchVerb(tt.word) + if ok != tt.wantOK { + t.Fatalf("MatchVerb(%q) ok = %v, want %v", tt.word, ok, tt.wantOK) + } + if !ok { + return + } + if match.Base != tt.wantBase { + t.Errorf("MatchVerb(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase) + } + if match.Tense != tt.wantTense { + t.Errorf("MatchVerb(%q).Tense = %q, want %q", tt.word, match.Tense, tt.wantTense) + } + }) + } +} + +func TestTokeniser_MatchVerb_Regular(t *testing.T) { + setup(t) + tok := NewTokeniser() + + tests := []struct { 
+ word string + wantOK bool + wantBase string + wantTense string + }{ + // Regular verbs NOT in grammar tables — detected by reverse morphology + round-trip + {"walked", true, "walk", "past"}, + {"walking", true, "walk", "gerund"}, + {"processed", true, "process", "past"}, + {"processing", true, "process", "gerund"}, + {"copied", true, "copy", "past"}, + {"copying", true, "copy", "gerund"}, + {"stopped", true, "stop", "past"}, + {"stopping", true, "stop", "gerund"}, + } + + for _, tt := range tests { + t.Run(tt.word, func(t *testing.T) { + match, ok := tok.MatchVerb(tt.word) + if ok != tt.wantOK { + t.Fatalf("MatchVerb(%q) ok = %v, want %v", tt.word, ok, tt.wantOK) + } + if !ok { + return + } + if match.Base != tt.wantBase { + t.Errorf("MatchVerb(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase) + } + if match.Tense != tt.wantTense { + t.Errorf("MatchVerb(%q).Tense = %q, want %q", tt.word, match.Tense, tt.wantTense) + } + }) + } +}