diff --git a/grammar.go b/grammar.go index 95c2f96..efc4948 100644 --- a/grammar.go +++ b/grammar.go @@ -163,6 +163,22 @@ func IrregularNouns() map[string]string { return result } +// DualClassVerbs returns a freshly allocated copy of dualClassVerbs, the regular verbs that also act +// as common nouns in dev/ops text; adding or deleting keys in the result leaves the package table intact. +func DualClassVerbs() map[string]VerbForms { + result := make(map[string]VerbForms, len(dualClassVerbs)) + maps.Copy(result, dualClassVerbs) + return result +} + +// DualClassNouns returns a freshly allocated copy of dualClassNouns, the regular nouns that also act +// as common verbs in dev/ops text; adding or deleting keys in the result leaves the package table intact. +func DualClassNouns() map[string]string { + result := make(map[string]string, len(dualClassNouns)) + maps.Copy(result, dualClassNouns) + return result +} + // Lower returns the lowercase form of s. func Lower(s string) string { return core.Lower(s) diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index 592384f..c58bf02 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -193,6 +193,19 @@ func (t *Tokeniser) buildVerbIndex() { } } } + + // Tier 2b: Seed additional regular dual-class bases that are common in dev/ops text. + // They are regular forms, but must be registered as known bases so the dual-class resolver can disambiguate them. + // A past or gerund form is mapped only when no base is recorded for it yet, so existing mappings are kept. + for base, forms := range i18n.DualClassVerbs() { + t.baseVerbs[base] = true + if forms.Past != "" && t.pastToBase[forms.Past] == "" { + t.pastToBase[forms.Past] = base + } + if forms.Gerund != "" && t.gerundToBase[forms.Gerund] == "" { + t.gerundToBase[forms.Gerund] = base + } + } } // buildNounIndex reads grammar tables and irregular noun maps to build @@ -221,6 +234,18 @@ func (t *Tokeniser) buildNounIndex() { } } } + + // Tier 2b: Seed additional regular dual-class bases that are common in dev/ops text. + // The plural forms are regular, but each base must be in the base noun set so the ambiguous-token pass can see it. + // A plural is indexed only when it differs from its base and has no pluralToBase entry yet. 
+ for base, plural := range i18n.DualClassNouns() { + t.baseNouns[base] = true + if plural != base { + if _, exists := t.pluralToBase[plural]; !exists { + t.pluralToBase[plural] = base + } + } + } } // MatchNoun performs a 3-tier reverse lookup for a noun form. diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index 3023255..1611fcf 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -671,6 +671,12 @@ func TestTokeniser_DualClassDetection(t *testing.T) { } } + for _, word := range []string{"change", "export", "function", "handle", "host", "import", "link", "log", "merge", "patch", "process", "queue", "release", "stream", "tag", "trigger", "watch"} { + if !tok.IsDualClass(word) { + t.Errorf("%q should be dual-class after expansion", word) + } + } + notDual := []string{"delete", "go", "push", "branch", "repo"} for _, word := range notDual { if tok.IsDualClass(word) { @@ -720,6 +726,37 @@ func TestTokeniser_IgnoresDeprecatedGrammarEntries(t *testing.T) { } } +func TestTokeniser_DualClassExpansion_ClassifiesCommonDevOpsWords(t *testing.T) { + setup(t) + tok := NewTokeniser() + + tests := []struct { + text string + wantType TokenType + wantLower string + }{ + {"the merge", TokenNoun, "merge"}, + {"please merge the file", TokenVerb, "merge"}, + {"the process", TokenNoun, "process"}, + {"please process the log", TokenVerb, "process"}, + } + + for _, tt := range tests { + t.Run(tt.text, func(t *testing.T) { + tokens := tok.Tokenise(tt.text) + if len(tokens) < 2 { + t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", tt.text, len(tokens)) + } + if tokens[1].Lower != tt.wantLower { + t.Fatalf("Tokenise(%q)[1].Lower = %q, want %q", tt.text, tokens[1].Lower, tt.wantLower) + } + if tokens[1].Type != tt.wantType { + t.Fatalf("Tokenise(%q)[1].Type = %v, want %v", tt.text, tokens[1].Type, tt.wantType) + } + }) + } +} + func TestToken_ConfidenceField(t *testing.T) { setup(t) tok := NewTokeniser() diff --git a/types.go b/types.go 
index fd627e7..3218c3b 100644 --- a/types.go +++ b/types.go @@ -450,6 +450,51 @@ var irregularNouns = map[string]string{ "calf": "calves", "loaf": "loaves", "thief": "thieves", } +// dualClassVerbs seeds additional regular verbs that are also common nouns in dev/ops text. +// Each key is a base form and each value spells out its Past and Gerund forms. The forms are regular, but listing them here makes the +// reversal tokeniser treat them as known bases for dual-class disambiguation. +var dualClassVerbs = map[string]VerbForms{ + "change": {Past: "changed", Gerund: "changing"}, + "export": {Past: "exported", Gerund: "exporting"}, + "function": {Past: "functioned", Gerund: "functioning"}, + "handle": {Past: "handled", Gerund: "handling"}, + "host": {Past: "hosted", Gerund: "hosting"}, + "import": {Past: "imported", Gerund: "importing"}, + "link": {Past: "linked", Gerund: "linking"}, + "log": {Past: "logged", Gerund: "logging"}, + "merge": {Past: "merged", Gerund: "merging"}, + "patch": {Past: "patched", Gerund: "patching"}, + "process": {Past: "processed", Gerund: "processing"}, + "queue": {Past: "queued", Gerund: "queuing"}, + "release": {Past: "released", Gerund: "releasing"}, + "stream": {Past: "streamed", Gerund: "streaming"}, + "tag": {Past: "tagged", Gerund: "tagging"}, + "trigger": {Past: "triggered", Gerund: "triggering"}, + "watch": {Past: "watched", Gerund: "watching"}, +} + +// dualClassNouns mirrors the dualClassVerbs vocabulary as nouns, mapping each base form to its regular plural, so the tokeniser can +// classify the base forms as ambiguous when they appear without inflection; keep its keys in step with dualClassVerbs. 
+var dualClassNouns = map[string]string{ + "change": "changes", + "export": "exports", + "function": "functions", + "handle": "handles", + "host": "hosts", + "import": "imports", + "link": "links", + "log": "logs", + "merge": "merges", + "patch": "patches", + "process": "processes", + "queue": "queues", + "release": "releases", + "stream": "streams", + "tag": "tags", + "trigger": "triggers", + "watch": "watches", +} + var vowelSounds = map[string]bool{ "hour": true, "honest": true, "honour": true, "honor": true, "heir": true, "herb": true, }