2026-04-02 03:42:39 +00:00
4 changed files with 123 additions and 0 deletions
--- a/grammar.go
+++ b/grammar.go
@ -163,6 +163,22 @@ func IrregularNouns() map[string]string {
 	return result
 }

+// DualClassVerbs returns an independent copy of the dualClassVerbs table: the
+// additional regular verbs that double as common nouns in dev/ops text.
+// The caller owns the returned map, so adding or removing keys in it leaves
+// the package-level table untouched.
+func DualClassVerbs() map[string]VerbForms {
+	return maps.Clone(dualClassVerbs)
+}
+
+// DualClassNouns returns an independent copy of the dualClassNouns table: the
+// additional regular nouns (base form -> plural) that double as common verbs
+// in dev/ops text. The caller owns the returned map, so adding or removing
+// keys in it leaves the package-level table untouched.
+func DualClassNouns() map[string]string {
+	return maps.Clone(dualClassNouns)
+}
+
 // Lower returns the lowercase form of s.
 func Lower(s string) string {
 	return core.Lower(s)
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@ -193,6 +193,19 @@ func (t *Tokeniser) buildVerbIndex() {
 			}
 		}
 	}
+
+	// Tier 2b: Seed additional regular dual-class bases that are common in
+	// dev/ops text. These are regular forms, but they need to behave like
+	// known bases so the dual-class resolver can disambiguate them.
+	for base, forms := range i18n.DualClassVerbs() {
+		t.baseVerbs[base] = true
+		if forms.Past != "" && t.pastToBase[forms.Past] == "" {
+			t.pastToBase[forms.Past] = base
+		}
+		if forms.Gerund != "" && t.gerundToBase[forms.Gerund] == "" {
+			t.gerundToBase[forms.Gerund] = base
+		}
+	}
 }

 // buildNounIndex reads grammar tables and irregular noun maps to build
@ -221,6 +234,18 @@ func (t *Tokeniser) buildNounIndex() {
 			}
 		}
 	}
+
+	// Tier 2b: Seed additional regular dual-class bases that are common in
+	// dev/ops text. The plural forms are regular, but the entries need to
+	// appear in the base noun set so the ambiguous-token pass can see them.
+	for base, plural := range i18n.DualClassNouns() {
+		t.baseNouns[base] = true
+		if plural != base {
+			if _, exists := t.pluralToBase[plural]; !exists {
+				t.pluralToBase[plural] = base
+			}
+		}
+	}
 }

 // MatchNoun performs a 3-tier reverse lookup for a noun form.
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@ -671,6 +671,12 @@ func TestTokeniser_DualClassDetection(t *testing.T) {
 		}
 	}

+	for _, word := range []string{"change", "export", "function", "handle", "host", "import", "link", "log", "merge", "patch", "process", "queue", "release", "stream", "tag", "trigger", "watch"} {
+		if !tok.IsDualClass(word) {
+			t.Errorf("%q should be dual-class after expansion", word)
+		}
+	}
+
 	notDual := []string{"delete", "go", "push", "branch", "repo"}
 	for _, word := range notDual {
 		if tok.IsDualClass(word) {
@ -720,6 +726,37 @@ func TestTokeniser_IgnoresDeprecatedGrammarEntries(t *testing.T) {
 	}
 }

+// TestTokeniser_DualClassExpansion_ClassifiesCommonDevOpsWords tokenises short
+// phrases and checks the Lower and Type fields of the token at index 1.
+func TestTokeniser_DualClassExpansion_ClassifiesCommonDevOpsWords(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+	// The assertions below inspect the token at index 1 of each phrase.
+	cases := []struct {
+		phrase, lower string
+		kind          TokenType
+	}{
+		{phrase: "the merge", lower: "merge", kind: TokenNoun},
+		{phrase: "please merge the file", lower: "merge", kind: TokenVerb},
+		{phrase: "the process", lower: "process", kind: TokenNoun},
+		{phrase: "please process the log", lower: "process", kind: TokenVerb},
+	}
+	for _, tc := range cases {
+		t.Run(tc.phrase, func(t *testing.T) {
+			got := tok.Tokenise(tc.phrase)
+			if len(got) < 2 {
+				t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", tc.phrase, len(got))
+			}
+			switch second := got[1]; {
+			case second.Lower != tc.lower:
+				t.Fatalf("Tokenise(%q)[1].Lower = %q, want %q", tc.phrase, second.Lower, tc.lower)
+			case second.Type != tc.kind:
+				t.Fatalf("Tokenise(%q)[1].Type = %v, want %v", tc.phrase, second.Type, tc.kind)
+			}
+		})
+	}
+}
+
 func TestToken_ConfidenceField(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser()
--- a/types.go
+++ b/types.go
@ -450,6 +450,51 @@ var irregularNouns = map[string]string{
 	"calf": "calves", "loaf": "loaves", "thief": "thieves",
 }

+// dualClassVerbs maps base form -> Past/Gerund for additional regular verbs
+// that are also common nouns in dev/ops text. The forms are regular; listing
+// them lets the reversal tokeniser treat them as known dual-class bases.
+var dualClassVerbs = map[string]VerbForms{
+	"change":   {Past: "changed", Gerund: "changing"},
+	"export":   {Past: "exported", Gerund: "exporting"},
+	"function": {Past: "functioned", Gerund: "functioning"},
+	"handle":   {Past: "handled", Gerund: "handling"},
+	"host":     {Past: "hosted", Gerund: "hosting"},
+	"import":   {Past: "imported", Gerund: "importing"},
+	"link":     {Past: "linked", Gerund: "linking"},
+	"log":      {Past: "logged", Gerund: "logging"},
+	"merge":    {Past: "merged", Gerund: "merging"},
+	"patch":    {Past: "patched", Gerund: "patching"},
+	"process":  {Past: "processed", Gerund: "processing"},
+	"queue":    {Past: "queued", Gerund: "queuing"}, // only this spelling is listed; "queueing" is not
+	"release":  {Past: "released", Gerund: "releasing"},
+	"stream":   {Past: "streamed", Gerund: "streaming"},
+	"tag":      {Past: "tagged", Gerund: "tagging"},
+	"trigger":  {Past: "triggered", Gerund: "triggering"},
+	"watch":    {Past: "watched", Gerund: "watching"},
+}
+
+// dualClassNouns maps base form -> plural for the same vocabulary as
+// dualClassVerbs, so the tokeniser can treat uninflected bases as ambiguous.
+var dualClassNouns = map[string]string{
+	"change":   "changes",
+	"export":   "exports",
+	"function": "functions",
+	"handle":   "handles",
+	"host":     "hosts",
+	"import":   "imports",
+	"link":     "links",
+	"log":      "logs",
+	"merge":    "merges",
+	"patch":    "patches",
+	"process":  "processes",
+	"queue":    "queues",
+	"release":  "releases",
+	"stream":   "streams",
+	"tag":      "tags",
+	"trigger":  "triggers",
+	"watch":    "watches",
+}
+
 var vowelSounds = map[string]bool{
 	"hour": true, "honest": true, "honour": true, "honor": true, "heir": true, "herb": true,
 }