go-i18n/reversal/tokeniser_test.go
Virgil d74550c605 fix(reversal): expose low-information confidence constants
Co-Authored-By: Virgil <virgil@lethean.io>
2026-04-02 09:23:13 +00:00

1438 lines
40 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package reversal
import (
"testing"
i18n "dappco.re/go/core/i18n"
)
// setup installs a freshly constructed i18n service as the package default
// so each test starts from a clean translation state.
func setup(t *testing.T) {
	t.Helper()
	service, err := i18n.New()
	if err != nil {
		t.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(service)
}
// TestTokeniser_MatchVerb_Irregular checks that irregular and base verb forms
// resolve to the right base and tense, and that unknown words do not match.
func TestTokeniser_MatchVerb_Irregular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in    string
		ok    bool
		base  string
		tense string
	}{
		// Irregular past tense and gerund forms.
		{"deleted", true, "delete", "past"},
		{"deleting", true, "delete", "gerund"},
		{"went", true, "go", "past"},
		{"going", true, "go", "gerund"},
		{"was", true, "be", "past"},
		{"being", true, "be", "gerund"},
		{"ran", true, "run", "past"},
		{"running", true, "run", "gerund"},
		{"wrote", true, "write", "past"},
		{"writing", true, "write", "gerund"},
		{"built", true, "build", "past"},
		{"building", true, "build", "gerund"},
		{"committed", true, "commit", "past"},
		{"committing", true, "commit", "gerund"},
		// Base forms match as-is.
		{"delete", true, "delete", "base"},
		{"go", true, "go", "base"},
		// Unknown words must not match.
		{"xyzzy", false, "", ""},
		{"flurble", false, "", ""},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			match, ok := tok.MatchVerb(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tc.in, ok, tc.ok)
			}
			if !ok {
				return
			}
			if match.Base != tc.base {
				t.Errorf("MatchVerb(%q).Base = %q, want %q", tc.in, match.Base, tc.base)
			}
			if match.Tense != tc.tense {
				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tc.in, match.Tense, tc.tense)
			}
		})
	}
}
// TestTokeniser_MatchNoun_Irregular checks singular/plural resolution for
// irregular nouns plus a few regular grammar-table entries.
func TestTokeniser_MatchNoun_Irregular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in     string
		ok     bool
		base   string
		plural bool
	}{
		{"files", true, "file", true},
		{"file", true, "file", false},
		{"people", true, "person", true},
		{"person", true, "person", false},
		{"children", true, "child", true},
		{"child", true, "child", false},
		{"repositories", true, "repository", true},
		{"repository", true, "repository", false},
		{"branches", true, "branch", true},
		{"branch", true, "branch", false},
		// Unknown words must not match.
		{"xyzzy", false, "", false},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			match, ok := tok.MatchNoun(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tc.in, ok, tc.ok)
			}
			if !ok {
				return
			}
			if match.Base != tc.base {
				t.Errorf("MatchNoun(%q).Base = %q, want %q", tc.in, match.Base, tc.base)
			}
			if match.Plural != tc.plural {
				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tc.in, match.Plural, tc.plural)
			}
		})
	}
}
// TestTokeniser_MatchNoun_Regular covers regular nouns that are absent from
// the grammar tables and must be detected by reverse morphology + round-trip.
func TestTokeniser_MatchNoun_Regular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in     string
		ok     bool
		base   string
		plural bool
	}{
		{"servers", true, "server", true},
		{"processes", true, "process", true},
		{"entries", true, "entry", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			match, ok := tok.MatchNoun(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tc.in, ok, tc.ok)
			}
			if !ok {
				return
			}
			if match.Base != tc.base {
				t.Errorf("MatchNoun(%q).Base = %q, want %q", tc.in, match.Base, tc.base)
			}
			if match.Plural != tc.plural {
				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tc.in, match.Plural, tc.plural)
			}
		})
	}
}
// TestTokeniser_MatchWord checks fixed-word lookup, including case folding,
// multi-word phrases, and rejection of unknown words.
func TestTokeniser_MatchWord(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in  string
		cat string
		ok  bool
	}{
		{"URL", "url", true},
		{"url", "url", true},
		{"ID", "id", true},
		{"SSH", "ssh", true},
		{"up to date", "up_to_date", true},
		{"PHP", "php", true},
		{"xyzzy", "", false},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			cat, ok := tok.MatchWord(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchWord(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && cat != tc.cat {
				t.Errorf("MatchWord(%q) = %q, want %q", tc.in, cat, tc.cat)
			}
		})
	}
}
// TestTokeniser_MatchArticle checks English article detection: definite vs
// indefinite, case-insensitivity, and tolerance of trailing punctuation.
func TestTokeniser_MatchArticle(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"a", "indefinite", true},
		{"an", "indefinite", true},
		{"the", "definite", true},
		{"the.", "definite", true},
		{"A", "indefinite", true},
		{"The", "definite", true},
		{"foo", "", false},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
}
// TestTokeniser_MatchArticle_FrenchGendered checks gendered French articles
// (le/la/les, un/une, partitive de-forms, elision) and that Tokenise emits
// an article token for article-led phrases.
func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"le", "definite", true},
		{"la", "definite", true},
		{"le serveur", "definite", true},
		{"le serveur.", "definite", true},
		{"la branche", "definite", true},
		{"les amis", "definite", true},
		{"Le", "definite", true},
		{"La", "definite", true},
		{"Un enfant", "indefinite", true},
		{"Une amie", "indefinite", true},
		{"de la", "indefinite", true},
		{"de le", "indefinite", true},
		{"de les", "indefinite", true},
		{"de l'", "indefinite", true},
		{"de l", "indefinite", true},
		{"du serveur", "indefinite", true},
		{"des amis", "indefinite", true},
		{"un", "indefinite", true},
		{"une", "indefinite", true},
		{"l'enfant", "definite", true},
		{"lami", "definite", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
	// Full tokenisation should also surface the leading article.
	tokens := tok.Tokenise("la branche")
	if len(tokens) == 0 || tokens[0].Type != TokenArticle {
		t.Fatalf("Tokenise(%q)[0] should be TokenArticle, got %#v", "la branche", tokens)
	}
	tokens = tok.Tokenise("une branche")
	if len(tokens) == 0 || tokens[0].Type != TokenArticle {
		t.Fatalf("Tokenise(%q)[0] should be TokenArticle, got %#v", "une branche", tokens)
	}
	if tokens[0].ArtType != "indefinite" {
		t.Fatalf("Tokenise(%q)[0].ArtType = %q, want %q", "une branche", tokens[0].ArtType, "indefinite")
	}
}
// TestTokeniser_Tokenise_WordPhrase checks that a multi-word fixed phrase is
// tokenised as a single word token with the expected category.
func TestTokeniser_Tokenise_WordPhrase(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	const phrase = "up to date"
	tokens := tok.Tokenise(phrase)
	if len(tokens) != 1 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 1", phrase, len(tokens))
	}
	if tokens[0].Type != TokenWord {
		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", phrase, tokens[0].Type)
	}
	if tokens[0].WordCat != "up_to_date" {
		t.Fatalf("Tokenise(%q)[0].WordCat = %q, want %q", phrase, tokens[0].WordCat, "up_to_date")
	}
}
// TestTokeniser_Tokenise_WordPhraseWithPunctuation checks that trailing
// punctuation is split off a fixed phrase into its own token.
func TestTokeniser_Tokenise_WordPhraseWithPunctuation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	const phrase = "up to date."
	tokens := tok.Tokenise(phrase)
	if len(tokens) != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", phrase, len(tokens))
	}
	if tokens[0].Type != TokenWord {
		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", phrase, tokens[0].Type)
	}
	if tokens[1].Type != TokenPunctuation {
		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenPunctuation", phrase, tokens[1].Type)
	}
}
// TestTokeniser_MatchArticle_FrenchExtended covers elided forms (l', bare l,
// curly apostrophe), contracted forms (au/aux/du/des), and de-prefixed
// elisions.
func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"l'", "definite", true},
		{"l", "definite", true},
		{"lʼ", "definite", true},
		{"L'", "definite", true},
		{"L", "definite", true},
		{"Lʼ", "definite", true},
		{"les", "definite", true},
		{"au", "definite", true},
		{"aux", "definite", true},
		{"du", "indefinite", true},
		{"des", "indefinite", true},
		{"l'enfant", "definite", true},
		{"de l'enfant", "indefinite", true},
		{"de lami", "indefinite", true},
		{"De lenfant", "indefinite", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
}
// TestTokeniser_MatchArticle_FrenchUnderscoreTagFallback checks that a
// region-qualified tag written with an underscore ("fr_CA") still falls back
// to the base French grammar.
func TestTokeniser_MatchArticle_FrenchUnderscoreTagFallback(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr_CA")
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"le", "definite", true},
		{"l'ami", "definite", true},
		{"de l'ami", "indefinite", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
	// Tokenisation must also pick up the elided article under the fallback.
	tokens := tok.Tokenise("l'ami")
	if len(tokens) == 0 || tokens[0].Type != TokenArticle {
		t.Fatalf("Tokenise(%q)[0] should be TokenArticle, got %#v", "l'ami", tokens)
	}
}
// TestTokeniser_MatchArticle_ConfiguredPhrasePrefix checks article detection
// for a synthetic locale configured purely through grammar data, including
// article-prefixed phrases.
func TestTokeniser_MatchArticle_ConfiguredPhrasePrefix(t *testing.T) {
	setup(t)
	const lang = "xx"
	saved := i18n.GetGrammarData(lang)
	t.Cleanup(func() {
		i18n.SetGrammarData(lang, saved)
	})
	i18n.SetGrammarData(lang, &i18n.GrammarData{
		Articles: i18n.ArticleForms{
			IndefiniteDefault: "a",
			IndefiniteVowel:   "an",
			Definite:          "the",
		},
	})
	tok := NewTokeniserForLang(lang)
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"the file", "definite", true},
		{"a file", "indefinite", true},
		{"an error", "indefinite", true},
		{"file", "", false},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
}
// TestTokeniser_MatchArticle_ConfiguredElisionPrefix checks that an elided
// definite article ("l'") configured via grammar data is matched in its
// apostrophe, bare, and curly-apostrophe spellings, and that Tokenise splits
// article and noun.
func TestTokeniser_MatchArticle_ConfiguredElisionPrefix(t *testing.T) {
	setup(t)
	const lang = "xy"
	saved := i18n.GetGrammarData(lang)
	t.Cleanup(func() {
		i18n.SetGrammarData(lang, saved)
	})
	i18n.SetGrammarData(lang, &i18n.GrammarData{
		Articles: i18n.ArticleForms{
			IndefiniteDefault: "a",
			IndefiniteVowel:   "an",
			Definite:          "l'",
			ByGender: map[string]string{
				"m": "le",
				"f": "la",
			},
		},
		Nouns: map[string]i18n.NounForms{
			"ami": {One: "ami", Other: "amis", Gender: "m"},
		},
	})
	tok := NewTokeniserForLang(lang)
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"l'ami", "definite", true},
		{"lami", "definite", true},
		{"lʼami", "definite", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
	// The elided phrase must split into article + noun tokens.
	tokens := tok.Tokenise("l'ami")
	if len(tokens) != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "l'ami", len(tokens))
	}
	if tokens[0].Type != TokenArticle || tokens[0].ArtType != "definite" {
		t.Fatalf("Tokenise(%q)[0] = %#v, want definite article", "l'ami", tokens[0])
	}
	if tokens[1].Type != TokenNoun || tokens[1].Lower != "ami" {
		t.Fatalf("Tokenise(%q)[1] = %#v, want noun ami", "l'ami", tokens[1])
	}
}
// TestTokeniser_Tokenise_FrenchElision checks French elision and contraction
// handling in full tokenisation: l'/lʼ/bare-l elisions, de-prefixed partitive
// phrases, d' elision, and the contracted "au".
//
// Refactored from a long run of copy-pasted assertion blocks into a single
// table-driven loop; the per-token expectations are identical to the original
// checks (fields left zero/empty are intentionally not asserted, matching the
// original's partial checks).
func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	// want describes the assertions for one token; empty string fields are
	// skipped, mirroring the original per-case checks.
	type want struct {
		typ       TokenType
		artType   string
		lower     string
		punctType string
	}
	cases := []struct {
		input   string
		wantLen int
		toks    map[int]want
	}{
		{"l'enfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "definite"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"de l'enfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite", lower: "de l'"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"de l' enfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite", lower: "de l'"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"De lenfant.", 3, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite"},
			1: {typ: TokenNoun, lower: "enfant"},
			2: {typ: TokenPunctuation, punctType: "sentence_end"},
		}},
		{"de le serveur", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite", lower: "de le"},
			1: {typ: TokenNoun, lower: "serveur"},
		}},
		{"de les amis", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite", lower: "de les"},
			1: {typ: TokenNoun, lower: "amis"},
		}},
		// Bare "l" and curly-apostrophe elisions normalise to "de l'".
		{"de l enfant", 2, map[int]want{
			0: {typ: TokenArticle, lower: "de l'"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"de lʼenfant", 2, map[int]want{
			0: {typ: TokenArticle, lower: "de l'"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"d'enfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite"},
			1: {typ: TokenNoun},
		}},
		{"lenfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "definite"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		// The original only asserted the article token for "au serveur".
		{"au serveur", 2, map[int]want{
			0: {typ: TokenArticle, artType: "definite"},
		}},
	}
	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			tokens := tok.Tokenise(tc.input)
			if len(tokens) != tc.wantLen {
				t.Fatalf("Tokenise(%q) returned %d tokens, want %d", tc.input, len(tokens), tc.wantLen)
			}
			for i, w := range tc.toks {
				got := tokens[i]
				if got.Type != w.typ {
					t.Fatalf("tokens[%d].Type = %v, want %v", i, got.Type, w.typ)
				}
				if w.artType != "" && got.ArtType != w.artType {
					t.Fatalf("tokens[%d].ArtType = %q, want %q", i, got.ArtType, w.artType)
				}
				if w.lower != "" && got.Lower != w.lower {
					t.Fatalf("tokens[%d].Lower = %q, want %q", i, got.Lower, w.lower)
				}
				if w.punctType != "" && got.PunctType != w.punctType {
					t.Fatalf("tokens[%d].PunctType = %q, want %q", i, got.PunctType, w.punctType)
				}
			}
		})
	}
}
// TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase checks that partitive
// phrases ("de la", "de les") become a single indefinite article token
// followed by the noun.
func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	cases := []struct {
		input   string
		article string
		noun    string
	}{
		{"de la branche", "de la", "branche"},
		{"de les amis", "de les", "amis"},
	}
	for _, tc := range cases {
		tokens := tok.Tokenise(tc.input)
		if len(tokens) != 2 {
			t.Fatalf("Tokenise(%q) returned %d tokens, want 2", tc.input, len(tokens))
		}
		if tokens[0].Type != TokenArticle {
			t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
		}
		if tokens[0].Lower != tc.article {
			t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, tc.article)
		}
		if tokens[0].ArtType != "indefinite" {
			t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite")
		}
		if tokens[1].Type != TokenNoun {
			t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
		}
		if tokens[1].Lower != tc.noun {
			t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, tc.noun)
		}
	}
}
// TestTokeniser_Tokenise checks end-to-end classification of a simple
// English sentence: verb, article, unknown word, and plural noun.
func TestTokeniser_Tokenise(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Deleted the configuration files")
	if len(tokens) != 4 {
		t.Fatalf("Tokenise() returned %d tokens, want 4", len(tokens))
	}
	// "Deleted" should classify as a past-tense verb.
	if tokens[0].Type != TokenVerb {
		t.Errorf("tokens[0].Type = %v, want TokenVerb", tokens[0].Type)
	}
	if tokens[0].VerbInfo.Tense != "past" {
		t.Errorf("tokens[0].VerbInfo.Tense = %q, want %q", tokens[0].VerbInfo.Tense, "past")
	}
	// "the" is a definite article.
	if tokens[1].Type != TokenArticle {
		t.Errorf("tokens[1].Type = %v, want TokenArticle", tokens[1].Type)
	}
	// "configuration" is not in any table, so it stays unknown.
	if tokens[2].Type != TokenUnknown {
		t.Errorf("tokens[2].Type = %v, want TokenUnknown", tokens[2].Type)
	}
	// "files" is a plural noun.
	if tokens[3].Type != TokenNoun {
		t.Errorf("tokens[3].Type = %v, want TokenNoun", tokens[3].Type)
	}
	if !tokens[3].NounInfo.Plural {
		t.Errorf("tokens[3].NounInfo.Plural = false, want true")
	}
}
// TestTokeniser_Tokenise_Punctuation checks that trailing ellipsis
// punctuation produces at least one punctuation token.
//
// Fix: the original loop variable was named `tok`, shadowing the tokeniser
// variable of the same name — harmless here but confusing and a lint smell.
func TestTokeniser_Tokenise_Punctuation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Building project...")
	hasPunct := false
	for _, token := range tokens {
		if token.Type == TokenPunctuation {
			hasPunct = true
		}
	}
	if !hasPunct {
		t.Error("did not detect punctuation in \"Building project...\"")
	}
}
// TestTokeniser_Tokenise_ClauseBoundarySentence checks that a sentence-end
// period resets clause state so a verb after it is still classified as a verb.
func TestTokeniser_Tokenise_ClauseBoundarySentence(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("run tests. commit")
	sawSentenceEnd := false
	for _, tk := range tokens {
		if tk.Raw == "run" && tk.Type != TokenVerb {
			t.Errorf("'run' should remain TokenVerb, got %v", tk.Type)
		}
		if tk.Type == TokenPunctuation && tk.PunctType == "sentence_end" {
			sawSentenceEnd = true
		}
		if tk.Lower == "commit" {
			// Without sentence-end boundary support, this can be demoted by verb saturation.
			// With boundary detection, it should still classify as a verb.
			if tk.Type != TokenVerb {
				t.Errorf("'commit' after period should be TokenVerb, got %v", tk.Type)
			}
		}
	}
	if !sawSentenceEnd {
		t.Error("did not detect sentence-end punctuation in \"run tests. commit\"")
	}
}
// TestTokeniser_Tokenise_ClauseBoundaryStandalonePunctuation checks that a
// free-standing period also acts as a sentence-end clause boundary.
func TestTokeniser_Tokenise_ClauseBoundaryStandalonePunctuation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("run tests . commit")
	sawSentenceEnd := false
	for _, tk := range tokens {
		if tk.Type == TokenPunctuation && tk.PunctType == "sentence_end" {
			sawSentenceEnd = true
		}
		if tk.Lower == "commit" && tk.Type != TokenVerb {
			t.Errorf("'commit' after standalone period should be TokenVerb, got %v", tk.Type)
		}
	}
	if !sawSentenceEnd {
		t.Error("did not detect standalone sentence-end punctuation in \"run tests . commit\"")
	}
}
// TestTokeniser_Tokenise_Empty checks that the empty string tokenises to an
// empty token slice.
func TestTokeniser_Tokenise_Empty(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	if got := tok.Tokenise(""); len(got) != 0 {
		t.Errorf("Tokenise(\"\") returned %d tokens, want 0", len(got))
	}
}
// TestTokeniser_MatchVerb_Regular covers regular verbs that are absent from
// the grammar tables and must be detected by reverse morphology + round-trip.
func TestTokeniser_MatchVerb_Regular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in    string
		ok    bool
		base  string
		tense string
	}{
		{"walked", true, "walk", "past"},
		{"walking", true, "walk", "gerund"},
		{"processed", true, "process", "past"},
		{"processing", true, "process", "gerund"},
		{"copied", true, "copy", "past"},
		{"copying", true, "copy", "gerund"},
		{"stopped", true, "stop", "past"},
		{"stopping", true, "stop", "gerund"},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			match, ok := tok.MatchVerb(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tc.in, ok, tc.ok)
			}
			if !ok {
				return
			}
			if match.Base != tc.base {
				t.Errorf("MatchVerb(%q).Base = %q, want %q", tc.in, match.Base, tc.base)
			}
			if match.Tense != tc.tense {
				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tc.in, match.Tense, tc.tense)
			}
		})
	}
}
// TestTokeniser_WithSignals verifies the constructor accepts the WithSignals
// option — the test passes if construction compiles and does not panic.
func TestTokeniser_WithSignals(t *testing.T) {
	setup(t)
	_ = NewTokeniser(WithSignals())
}
// TestTokeniser_Tokenise_CorpusPriorBias checks that a corpus prior strongly
// favouring the noun reading of a dual-class word wins disambiguation.
// NOTE(review): unlike sibling tests, this one does not call setup(t) —
// presumably deliberate since it only touches grammar data; confirm.
func TestTokeniser_Tokenise_CorpusPriorBias(t *testing.T) {
	const lang = "zz-prior"
	saved := i18n.GetGrammarData(lang)
	t.Cleanup(func() {
		i18n.SetGrammarData(lang, saved)
	})
	i18n.SetGrammarData(lang, &i18n.GrammarData{
		Verbs: map[string]i18n.VerbForms{
			"commit": {Past: "committed", Gerund: "committing"},
		},
		Nouns: map[string]i18n.NounForms{
			"commit": {One: "commit", Other: "commits"},
		},
		Signals: i18n.SignalData{
			Priors: map[string]map[string]float64{
				"commit": {
					"verb": 0.2,
					"noun": 0.8,
				},
			},
		},
	})
	tok := NewTokeniserForLang(lang)
	tokens := tok.Tokenise("please commit")
	if len(tokens) != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "please commit", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenNoun", "please commit", tokens[1].Type)
	}
	if tokens[1].Confidence <= 0.5 {
		t.Fatalf("Tokenise(%q)[1].Confidence = %f, want > 0.5", "please commit", tokens[1].Confidence)
	}
}
// TestTokeniser_DualClassDetection checks which words are treated as both
// noun and verb (dual-class) and which are single-class.
func TestTokeniser_DualClassDetection(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	coreDual := []string{"commit", "run", "test", "check", "file", "build"}
	for _, w := range coreDual {
		if !tok.IsDualClass(w) {
			t.Errorf("%q should be dual-class", w)
		}
	}
	expandedDual := []string{
		"change", "export", "function", "handle", "host", "import", "link",
		"log", "merge", "patch", "process", "pull", "push", "queue",
		"release", "stream", "tag", "trigger", "update", "watch",
	}
	for _, w := range expandedDual {
		if !tok.IsDualClass(w) {
			t.Errorf("%q should be dual-class after expansion", w)
		}
	}
	for _, w := range []string{"delete", "go", "branch", "repo"} {
		if tok.IsDualClass(w) {
			t.Errorf("%q should not be dual-class", w)
		}
	}
}
// TestTokeniser_IgnoresDeprecatedGrammarEntries checks that identity noun
// and word entries ("passed"→"passed" etc.) are ignored by the tokeniser
// while legitimate entries ("url") continue to match.
func TestTokeniser_IgnoresDeprecatedGrammarEntries(t *testing.T) {
	setup(t)
	const lang = "zz-deprecated"
	saved := i18n.GetGrammarData(lang)
	t.Cleanup(func() {
		i18n.SetGrammarData(lang, saved)
	})
	i18n.SetGrammarData(lang, &i18n.GrammarData{
		Nouns: map[string]i18n.NounForms{
			"passed":  {One: "passed", Other: "passed"},
			"failed":  {One: "failed", Other: "failed"},
			"skipped": {One: "skipped", Other: "skipped"},
			"commit":  {One: "commit", Other: "commits"},
		},
		Words: map[string]string{
			"passed":  "passed",
			"failed":  "failed",
			"skipped": "skipped",
			"url":     "URL",
		},
	})
	tok := NewTokeniserForLang(lang)
	for _, w := range []string{"passed", "failed", "skipped"} {
		if tok.IsDualClass(w) {
			t.Fatalf("%q should not be treated as dual-class", w)
		}
		if cat, ok := tok.MatchWord(w); ok {
			t.Fatalf("MatchWord(%q) = %q, %v; want not found", w, cat, ok)
		}
		if _, ok := tok.MatchNoun(w); ok {
			t.Fatalf("MatchNoun(%q) should be ignored", w)
		}
	}
	if cat, ok := tok.MatchWord("url"); !ok || cat != "url" {
		t.Fatalf("MatchWord(%q) = %q, %v; want %q, true", "url", cat, ok, "url")
	}
}
// TestTokeniser_DualClassExpansion_ClassifiesCommonDevOpsWords checks that
// expanded dual-class dev-ops words resolve to noun after a determiner and
// to verb in an imperative context.
func TestTokeniser_DualClassExpansion_ClassifiesCommonDevOpsWords(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		text      string
		wantType  TokenType
		wantLower string
	}{
		{"the merge", TokenNoun, "merge"},
		{"please merge the file", TokenVerb, "merge"},
		{"the process", TokenNoun, "process"},
		{"please process the log", TokenVerb, "process"},
	}
	for _, tc := range cases {
		t.Run(tc.text, func(t *testing.T) {
			tokens := tok.Tokenise(tc.text)
			if len(tokens) < 2 {
				t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", tc.text, len(tokens))
			}
			second := tokens[1]
			if second.Lower != tc.wantLower {
				t.Fatalf("Tokenise(%q)[1].Lower = %q, want %q", tc.text, second.Lower, tc.wantLower)
			}
			if second.Type != tc.wantType {
				t.Fatalf("Tokenise(%q)[1].Type = %v, want %v", tc.text, second.Type, tc.wantType)
			}
		})
	}
}
// TestToken_ConfidenceField checks that every classified (non-unknown) token
// carries a non-zero confidence score.
func TestToken_ConfidenceField(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	for _, tk := range tok.Tokenise("Deleted the branch") {
		if tk.Type != TokenUnknown && tk.Confidence == 0 {
			t.Errorf("token %q (type %d) has zero Confidence", tk.Raw, tk.Type)
		}
	}
}
// TestTokeniser_Disambiguate_NounAfterDeterminer checks that a dual-class
// word directly after a determiner resolves as a noun with high confidence
// and records the verb reading as the alternative.
//
// Fix: guard the tokens[1] index so a tokenisation regression fails cleanly
// instead of panicking.
func TestTokeniser_Disambiguate_NounAfterDeterminer(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("the commit was approved")
	if len(tokens) < 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", "the commit was approved", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Errorf("'commit' after 'the': Type = %v, want TokenNoun", tokens[1].Type)
	}
	if tokens[1].Confidence < 0.8 {
		t.Errorf("'commit' Confidence = %f, want >= 0.8", tokens[1].Confidence)
	}
	if tokens[1].AltType != TokenVerb {
		t.Errorf("'commit' AltType = %v, want TokenVerb", tokens[1].AltType)
	}
}
// TestTokeniser_Disambiguate_VerbImperative checks that a sentence-initial
// dual-class word in imperative position resolves as a confident verb.
//
// Fix: guard the tokens[0] index so an empty result fails cleanly instead of
// panicking.
func TestTokeniser_Disambiguate_VerbImperative(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Commit the changes")
	if len(tokens) == 0 {
		t.Fatalf("Tokenise(%q) returned no tokens", "Commit the changes")
	}
	if tokens[0].Type != TokenVerb {
		t.Errorf("'Commit' imperative: Type = %v, want TokenVerb", tokens[0].Type)
	}
	if tokens[0].Confidence < 0.8 {
		t.Errorf("'Commit' Confidence = %f, want >= 0.8", tokens[0].Confidence)
	}
}
// TestTokeniser_Disambiguate_NounWithVerbSaturation checks that 'test' reads
// as a noun when the clause already has a verb ("failed").
//
// Fix: guard the tokens[1] index so a tokenisation regression fails cleanly
// instead of panicking.
func TestTokeniser_Disambiguate_NounWithVerbSaturation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("The test failed")
	if len(tokens) < 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", "The test failed", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Errorf("'test' in 'The test failed': Type = %v, want TokenNoun", tokens[1].Type)
	}
}
// TestTokeniser_Disambiguate_VerbBeforeNoun checks that a dual-class word
// immediately preceding a noun resolves as a verb.
//
// Fix: guard the tokens[0] index so an empty result fails cleanly instead of
// panicking.
func TestTokeniser_Disambiguate_VerbBeforeNoun(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Run tests")
	if len(tokens) == 0 {
		t.Fatalf("Tokenise(%q) returned no tokens", "Run tests")
	}
	if tokens[0].Type != TokenVerb {
		t.Errorf("'Run' in 'Run tests': Type = %v, want TokenVerb", tokens[0].Type)
	}
}
// TestTokeniser_Disambiguate_InflectedSelfResolve checks that inflected
// forms ("committed", "commits") are unambiguous and self-resolve with full
// confidence.
//
// Fix: guard the direct indexing so tokenisation regressions fail cleanly
// instead of panicking.
func TestTokeniser_Disambiguate_InflectedSelfResolve(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("committed the branch")
	if len(tokens) == 0 {
		t.Fatalf("Tokenise(%q) returned no tokens", "committed the branch")
	}
	if tokens[0].Type != TokenVerb || tokens[0].Confidence != 1.0 {
		t.Errorf("'committed' should self-resolve as verb with confidence 1.0")
	}
	tokens = tok.Tokenise("the commits were reviewed")
	if len(tokens) < 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", "the commits were reviewed", len(tokens))
	}
	if tokens[1].Type != TokenNoun || tokens[1].Confidence != 1.0 {
		t.Errorf("'commits' should self-resolve as noun with confidence 1.0")
	}
}
// TestTokeniser_Disambiguate_VerbAfterAuxiliary checks that a dual-class word
// after an auxiliary ("will") resolves as a verb.
//
// Fix: guard the tokens[1] index so a tokenisation regression fails cleanly
// instead of panicking.
func TestTokeniser_Disambiguate_VerbAfterAuxiliary(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("will commit the changes")
	if len(tokens) < 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", "will commit the changes", len(tokens))
	}
	if tokens[1].Type != TokenVerb {
		t.Errorf("'commit' after 'will': Type = %v, want TokenVerb", tokens[1].Type)
	}
}
// TestTokeniser_Disambiguate_ProseMultiple checks that multiple dual-class
// words in one prose sentence each resolve as nouns in noun positions.
func TestTokeniser_Disambiguate_ProseMultiple(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	for _, tk := range tok.Tokenise("The test failed because the commit introduced a regression") {
		switch tk.Lower {
		case "test":
			if tk.Type != TokenNoun {
				t.Errorf("'test' in prose: Type = %v, want TokenNoun", tk.Type)
			}
		case "commit":
			if tk.Type != TokenNoun {
				t.Errorf("'commit' in prose: Type = %v, want TokenNoun", tk.Type)
			}
		}
	}
}
// TestTokeniser_Disambiguate_ClauseBoundary checks that verb saturation
// resets at a clause boundary: "passed" saturates clause 1, yet "commit"
// after "should" in clause 2 is still a verb.
func TestTokeniser_Disambiguate_ClauseBoundary(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	for _, tk := range tok.Tokenise("The test passed and we should commit the fix") {
		switch tk.Lower {
		case "test":
			if tk.Type != TokenNoun {
				t.Errorf("'test' should be noun: got %v", tk.Type)
			}
		case "commit":
			if tk.Type != TokenVerb {
				t.Errorf("'commit' after 'should' should be verb: got %v", tk.Type)
			}
		}
	}
}
// TestTokeniser_Disambiguate_ContractionAux checks that a contracted
// auxiliary ("don't") forces the following dual-class word to verb.
func TestTokeniser_Disambiguate_ContractionAux(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	for _, tk := range tok.Tokenise("don't run the tests") {
		if tk.Lower == "run" && tk.Type != TokenVerb {
			t.Errorf("'run' after \"don't\": Type = %v, want TokenVerb", tk.Type)
		}
	}
}
// TestTokeniser_Disambiguate_ContractionAux_FallbackDefaults checks that the
// hardcoded fallback auxiliaries recognise contractions even when no locale
// grammar data is loaded (deliberately no setup(t) call).
func TestTokeniser_Disambiguate_ContractionAux_FallbackDefaults(t *testing.T) {
	tok := NewTokeniserForLang("zz")
	for _, tk := range tok.Tokenise("don't run the tests") {
		if tk.Lower == "run" && tk.Type != TokenVerb {
			t.Errorf("'run' after \"don't\": Type = %v, want TokenVerb", tk.Type)
		}
	}
}
// TestTokeniser_Disambiguate_NegationSignal checks that the verb_negation
// signal fires for a dual-class word after "no longer" and contributes to a
// verb classification.
func TestTokeniser_Disambiguate_NegationSignal(t *testing.T) {
	setup(t)
	tok := NewTokeniser(WithSignals())
	tokens := tok.Tokenise("no longer commit the changes")
	if len(tokens) < 3 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 3", "no longer commit the changes", len(tokens))
	}
	commit := tokens[2]
	if commit.Type != TokenVerb {
		t.Fatalf("'commit' after 'no longer': Type = %v, want TokenVerb", commit.Type)
	}
	if commit.Signals == nil {
		t.Fatal("'commit' after 'no longer' should have signal breakdown")
	}
	fired := false
	for _, comp := range commit.Signals.Components {
		if comp.Name == "verb_negation" {
			fired = true
			break
		}
	}
	if !fired {
		t.Error("verb_negation signal should have fired for 'no longer commit'")
	}
}
// TestTokeniser_WithSignals_Breakdown verifies that WithSignals() attaches a
// per-token SignalBreakdown and that the noun_determiner signal fires with
// its default contribution for "the commit".
func TestTokeniser_WithSignals_Breakdown(t *testing.T) {
	setup(t)
	tok := NewTokeniser(WithSignals())
	tokens := tok.Tokenise("the commit was approved")
	// Guard the index so a tokenisation regression fails with a clear
	// message instead of an index-out-of-range panic.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	// "commit" should have a SignalBreakdown
	commitTok := tokens[1]
	if commitTok.Signals == nil {
		t.Fatal("WithSignals(): commit token has nil Signals")
	}
	if commitTok.Signals.NounScore <= commitTok.Signals.VerbScore {
		t.Errorf("NounScore (%f) should exceed VerbScore (%f) for 'the commit'",
			commitTok.Signals.NounScore, commitTok.Signals.VerbScore)
	}
	if len(commitTok.Signals.Components) == 0 {
		t.Error("Components should not be empty")
	}
	// Verify noun_determiner signal fired
	foundDet := false
	for _, c := range commitTok.Signals.Components {
		if c.Name == "noun_determiner" {
			foundDet = true
			if c.Contrib != 0.35 {
				t.Errorf("noun_determiner Contrib = %f, want 0.35", c.Contrib)
			}
		}
	}
	if !foundDet {
		t.Error("noun_determiner signal should have fired")
	}
}
// TestTokeniser_WithoutSignals_NilBreakdown verifies that tokens carry no
// SignalBreakdown unless WithSignals() is requested at construction.
func TestTokeniser_WithoutSignals_NilBreakdown(t *testing.T) {
	setup(t)
	tok := NewTokeniser() // no WithSignals
	tokens := tok.Tokenise("the commit was approved")
	// Guard the index so a tokenisation regression fails cleanly rather
	// than panicking.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	if tokens[1].Signals != nil {
		t.Error("Without WithSignals(), Signals should be nil")
	}
}
// TestDisambiguationStats_WithAmbiguous checks that dual-class words
// ("commit", "test") are counted as ambiguous and that the token total
// matches the tokenised sentence.
func TestDisambiguationStats_WithAmbiguous(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	toks := tok.Tokenise("The commit passed the test")
	got := tok.DisambiguationStats(toks)
	if got.AmbiguousTokens == 0 {
		t.Error("expected ambiguous tokens for dual-class words")
	}
	if got.TotalTokens != len(toks) {
		t.Errorf("TotalTokens = %d, want %d", got.TotalTokens, len(toks))
	}
}
// TestDisambiguationStats_NoAmbiguous checks that a sentence with no
// dual-class words reports zero ambiguous tokens.
func TestDisambiguationStats_NoAmbiguous(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	got := tok.DisambiguationStats(tok.Tokenise("Deleted the files"))
	if got.AmbiguousTokens != 0 {
		t.Errorf("AmbiguousTokens = %d, want 0", got.AmbiguousTokens)
	}
}
// TestWithWeights_Override verifies that zeroing the noun_determiner weight
// flips the classification of "commit" after "The" from noun to verb.
func TestWithWeights_Override(t *testing.T) {
	setup(t)
	// Override noun_determiner to 0 — "The commit" should no longer resolve as noun
	weights := map[string]float64{
		"noun_determiner":   0.0,
		"verb_auxiliary":    0.25,
		"following_class":   0.15,
		"sentence_position": 0.10,
		"verb_saturation":   0.10,
		"inflection_echo":   0.03,
		"default_prior":     0.02,
	}
	tok := NewTokeniser(WithWeights(weights))
	tokens := tok.Tokenise("The commit")
	// Guard the index so a tokenisation regression fails with a clear
	// message instead of panicking.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	// With noun_determiner zeroed, default_prior (verb) should win
	if tokens[1].Type != TokenVerb {
		t.Errorf("with noun_determiner=0, 'commit' Type = %v, want TokenVerb", tokens[1].Type)
	}
}
// TestWithWeights_CopiesInputMap verifies that WithWeights deep-copies the
// caller's map, so later mutations by the caller have no effect.
func TestWithWeights_CopiesInputMap(t *testing.T) {
	setup(t)
	weights := map[string]float64{
		"noun_determiner":   0.35,
		"verb_auxiliary":    0.25,
		"following_class":   0.15,
		"sentence_position": 0.10,
		"verb_saturation":   0.10,
		"inflection_echo":   0.03,
		"default_prior":     0.02,
	}
	tok := NewTokeniser(WithWeights(weights))
	// Mutate the caller's map after construction; the tokeniser should keep
	// using the original copied values.
	weights["noun_determiner"] = 0
	tokens := tok.Tokenise("The commit")
	// Guard the index so a tokenisation regression fails with a clear
	// message instead of panicking.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Fatalf("with copied weights, 'commit' Type = %v, want TokenNoun", tokens[1].Type)
	}
}
// TestWithWeights_PartialOverrideKeepsDefaults verifies that supplying only
// some weights leaves the remaining signals at their default values, so
// "The commit" still resolves as a noun via noun_determiner.
func TestWithWeights_PartialOverrideKeepsDefaults(t *testing.T) {
	setup(t)
	tok := NewTokeniser(WithWeights(map[string]float64{
		"verb_auxiliary": 0.25,
	}))
	tokens := tok.Tokenise("The commit")
	// Guard the index so a tokenisation regression fails with a clear
	// message instead of panicking.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Fatalf("with partial weights, 'commit' Type = %v, want TokenNoun", tokens[1].Type)
	}
}
// TestDefaultWeights_ReturnsCopy verifies that each DefaultWeights call
// yields an independent map: mutating one copy never leaks into another.
func TestDefaultWeights_ReturnsCopy(t *testing.T) {
	one := DefaultWeights()
	two := DefaultWeights()
	if one["noun_determiner"] != 0.35 {
		t.Fatalf("DefaultWeights()[noun_determiner] = %v, want 0.35", one["noun_determiner"])
	}
	one["noun_determiner"] = 0
	if two["noun_determiner"] != 0.35 {
		t.Fatalf("DefaultWeights() should return a fresh copy, got %v", two["noun_determiner"])
	}
}
// TestTokeniserSignalWeights_ReturnsCopy verifies that SignalWeights
// snapshots the tokeniser's weights: mutating the returned map must not
// affect subsequent calls.
func TestTokeniserSignalWeights_ReturnsCopy(t *testing.T) {
	setup(t)
	tok := NewTokeniser(WithWeights(map[string]float64{
		"noun_determiner": 0.5,
		"default_prior":   0.1,
	}))
	snapshot := tok.SignalWeights()
	if snapshot["noun_determiner"] != 0.5 {
		t.Fatalf("SignalWeights()[noun_determiner] = %v, want 0.5", snapshot["noun_determiner"])
	}
	snapshot["noun_determiner"] = 0
	if got := tok.SignalWeights()["noun_determiner"]; got != 0.5 {
		t.Fatalf("SignalWeights() should return a fresh copy, got %v", got)
	}
}
// TestLowInformationConfidenceConstants pins the exported low-information
// confidence constants so an accidental change fails loudly.
func TestLowInformationConfidenceConstants(t *testing.T) {
	const (
		wantThreshold = 0.10
		wantVerbConf  = 0.55
		wantNounConf  = 0.45
	)
	if got := LowInformationScoreThreshold; got != wantThreshold {
		t.Fatalf("LowInformationScoreThreshold = %v, want 0.10", got)
	}
	if got := LowInformationVerbConfidence; got != wantVerbConf {
		t.Fatalf("LowInformationVerbConfidence = %v, want 0.55", got)
	}
	if got := LowInformationNounConfidence; got != wantNounConf {
		t.Fatalf("LowInformationNounConfidence = %v, want 0.45", got)
	}
}
// TestTokeniser_LowInformationConfidenceFloor checks that a near-signal-free
// phrase resolves "commit" as verb with the fixed 0.55/0.45 confidence split
// and records the noun alternative.
func TestTokeniser_LowInformationConfidenceFloor(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	got := tok.Tokenise("maybe commit")
	if len(got) != 2 {
		t.Fatalf("Tokenise(maybe commit) produced %d tokens, want 2", len(got))
	}
	low := got[1]
	if low.Type != TokenVerb {
		t.Fatalf("Tokenise(maybe commit) Type = %v, want TokenVerb", low.Type)
	}
	if low.Confidence != 0.55 {
		t.Fatalf("Tokenise(maybe commit) Confidence = %v, want 0.55", low.Confidence)
	}
	if low.AltType != TokenNoun {
		t.Fatalf("Tokenise(maybe commit) AltType = %v, want TokenNoun", low.AltType)
	}
	if low.AltConf != 0.45 {
		t.Fatalf("Tokenise(maybe commit) AltConf = %v, want 0.45", low.AltConf)
	}
}
// --- Benchmarks ---
// benchSetup initialises the default i18n service for benchmarks, aborting
// the benchmark if construction fails.
func benchSetup(b *testing.B) {
	b.Helper()
	service, err := i18n.New()
	if err != nil {
		b.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(service)
}
// BenchmarkTokenise_Short measures tokenisation of a three-word command.
func BenchmarkTokenise_Short(b *testing.B) {
	benchSetup(b)
	const input = "Delete the file"
	tokeniser := NewTokeniser()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		tokeniser.Tokenise(input)
	}
}
// BenchmarkTokenise_Medium measures tokenisation of a full sentence.
func BenchmarkTokenise_Medium(b *testing.B) {
	benchSetup(b)
	const input = "The build failed because the test commit was not pushed to the branch"
	tokeniser := NewTokeniser()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		tokeniser.Tokenise(input)
	}
}
// BenchmarkTokenise_DualClass measures tokenisation of a sentence dense with
// dual-class words that require disambiguation.
func BenchmarkTokenise_DualClass(b *testing.B) {
	benchSetup(b)
	const input = "Commit the changes and run the build test"
	tokeniser := NewTokeniser()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		tokeniser.Tokenise(input)
	}
}
// BenchmarkTokenise_WithSignals measures the overhead of collecting a
// per-token signal breakdown during tokenisation.
func BenchmarkTokenise_WithSignals(b *testing.B) {
	benchSetup(b)
	const input = "The commit was rebuilt and the test passed"
	tokeniser := NewTokeniser(WithSignals())
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		tokeniser.Tokenise(input)
	}
}
// BenchmarkNewImprint measures imprint construction from a pre-tokenised
// sentence (tokenisation cost excluded via ResetTimer).
func BenchmarkNewImprint(b *testing.B) {
	benchSetup(b)
	tokeniser := NewTokeniser()
	toks := tokeniser.Tokenise("Delete the configuration file and rebuild the project")
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		NewImprint(toks)
	}
}
// BenchmarkImprint_Similar measures the similarity comparison between two
// pre-built imprints of closely related sentences.
func BenchmarkImprint_Similar(b *testing.B) {
	benchSetup(b)
	tokeniser := NewTokeniser()
	left := NewImprint(tokeniser.Tokenise("Delete the configuration file"))
	right := NewImprint(tokeniser.Tokenise("Delete the old file"))
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		left.Similar(right)
	}
}
// BenchmarkMultiplier_Expand measures phrase expansion on a fixed input.
func BenchmarkMultiplier_Expand(b *testing.B) {
	benchSetup(b)
	const phrase = "Delete the configuration file"
	mult := NewMultiplier()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		mult.Expand(phrase)
	}
}