go-i18n/reversal/tokeniser_test.go
Virgil d74550c605 fix(reversal): expose low-information confidence constants
Co-Authored-By: Virgil <virgil@lethean.io>
2026-04-02 09:23:13 +00:00

1438 lines
40 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package reversal
import (
"testing"
i18n "dappco.re/go/core/i18n"
)
// setup installs a freshly constructed i18n service as the package default
// so each test starts from a clean translation state.
func setup(t *testing.T) {
	t.Helper()
	service, err := i18n.New()
	if err != nil {
		t.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(service)
}
// TestTokeniser_MatchVerb_Irregular checks that irregular and base verb forms
// resolve to the right base and tense, and that unknown words do not match.
func TestTokeniser_MatchVerb_Irregular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in    string
		ok    bool
		base  string
		tense string
	}{
		// Irregular past tense and gerund forms.
		{"deleted", true, "delete", "past"},
		{"deleting", true, "delete", "gerund"},
		{"went", true, "go", "past"},
		{"going", true, "go", "gerund"},
		{"was", true, "be", "past"},
		{"being", true, "be", "gerund"},
		{"ran", true, "run", "past"},
		{"running", true, "run", "gerund"},
		{"wrote", true, "write", "past"},
		{"writing", true, "write", "gerund"},
		{"built", true, "build", "past"},
		{"building", true, "build", "gerund"},
		{"committed", true, "commit", "past"},
		{"committing", true, "commit", "gerund"},
		// Base forms match as-is.
		{"delete", true, "delete", "base"},
		{"go", true, "go", "base"},
		// Unknown words must not match.
		{"xyzzy", false, "", ""},
		{"flurble", false, "", ""},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			match, ok := tok.MatchVerb(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tc.in, ok, tc.ok)
			}
			if !ok {
				return
			}
			if match.Base != tc.base {
				t.Errorf("MatchVerb(%q).Base = %q, want %q", tc.in, match.Base, tc.base)
			}
			if match.Tense != tc.tense {
				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tc.in, match.Tense, tc.tense)
			}
		})
	}
}
// TestTokeniser_MatchNoun_Irregular checks singular/plural resolution for
// irregular nouns plus a few regular grammar-table entries.
func TestTokeniser_MatchNoun_Irregular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in     string
		ok     bool
		base   string
		plural bool
	}{
		{"files", true, "file", true},
		{"file", true, "file", false},
		{"people", true, "person", true},
		{"person", true, "person", false},
		{"children", true, "child", true},
		{"child", true, "child", false},
		{"repositories", true, "repository", true},
		{"repository", true, "repository", false},
		{"branches", true, "branch", true},
		{"branch", true, "branch", false},
		// Unknown words must not match.
		{"xyzzy", false, "", false},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			match, ok := tok.MatchNoun(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tc.in, ok, tc.ok)
			}
			if !ok {
				return
			}
			if match.Base != tc.base {
				t.Errorf("MatchNoun(%q).Base = %q, want %q", tc.in, match.Base, tc.base)
			}
			if match.Plural != tc.plural {
				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tc.in, match.Plural, tc.plural)
			}
		})
	}
}
// TestTokeniser_MatchNoun_Regular covers regular nouns that are absent from
// the grammar tables and must be detected by reverse morphology + round-trip.
func TestTokeniser_MatchNoun_Regular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in     string
		ok     bool
		base   string
		plural bool
	}{
		{"servers", true, "server", true},
		{"processes", true, "process", true},
		{"entries", true, "entry", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			match, ok := tok.MatchNoun(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tc.in, ok, tc.ok)
			}
			if !ok {
				return
			}
			if match.Base != tc.base {
				t.Errorf("MatchNoun(%q).Base = %q, want %q", tc.in, match.Base, tc.base)
			}
			if match.Plural != tc.plural {
				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tc.in, match.Plural, tc.plural)
			}
		})
	}
}
// TestTokeniser_MatchWord checks fixed-word lookup, including case folding,
// multi-word phrases, and rejection of unknown words.
func TestTokeniser_MatchWord(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in  string
		cat string
		ok  bool
	}{
		{"URL", "url", true},
		{"url", "url", true},
		{"ID", "id", true},
		{"SSH", "ssh", true},
		{"up to date", "up_to_date", true},
		{"PHP", "php", true},
		{"xyzzy", "", false},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			cat, ok := tok.MatchWord(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchWord(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && cat != tc.cat {
				t.Errorf("MatchWord(%q) = %q, want %q", tc.in, cat, tc.cat)
			}
		})
	}
}
// TestTokeniser_MatchArticle checks English article detection: definite vs
// indefinite, case-insensitivity, and tolerance of trailing punctuation.
func TestTokeniser_MatchArticle(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"a", "indefinite", true},
		{"an", "indefinite", true},
		{"the", "definite", true},
		{"the.", "definite", true},
		{"A", "indefinite", true},
		{"The", "definite", true},
		{"foo", "", false},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
}
// TestTokeniser_MatchArticle_FrenchGendered checks gendered French articles
// (le/la/les, un/une, partitive de-forms, elision) and that Tokenise emits
// an article token for article-led phrases.
func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"le", "definite", true},
		{"la", "definite", true},
		{"le serveur", "definite", true},
		{"le serveur.", "definite", true},
		{"la branche", "definite", true},
		{"les amis", "definite", true},
		{"Le", "definite", true},
		{"La", "definite", true},
		{"Un enfant", "indefinite", true},
		{"Une amie", "indefinite", true},
		{"de la", "indefinite", true},
		{"de le", "indefinite", true},
		{"de les", "indefinite", true},
		{"de l'", "indefinite", true},
		{"de l", "indefinite", true},
		{"du serveur", "indefinite", true},
		{"des amis", "indefinite", true},
		{"un", "indefinite", true},
		{"une", "indefinite", true},
		{"l'enfant", "definite", true},
		{"lami", "definite", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
	// Full tokenisation should also surface the leading article.
	tokens := tok.Tokenise("la branche")
	if len(tokens) == 0 || tokens[0].Type != TokenArticle {
		t.Fatalf("Tokenise(%q)[0] should be TokenArticle, got %#v", "la branche", tokens)
	}
	tokens = tok.Tokenise("une branche")
	if len(tokens) == 0 || tokens[0].Type != TokenArticle {
		t.Fatalf("Tokenise(%q)[0] should be TokenArticle, got %#v", "une branche", tokens)
	}
	if tokens[0].ArtType != "indefinite" {
		t.Fatalf("Tokenise(%q)[0].ArtType = %q, want %q", "une branche", tokens[0].ArtType, "indefinite")
	}
}
// TestTokeniser_Tokenise_WordPhrase checks that a multi-word fixed phrase is
// tokenised as a single word token with the expected category.
func TestTokeniser_Tokenise_WordPhrase(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	const phrase = "up to date"
	tokens := tok.Tokenise(phrase)
	if len(tokens) != 1 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 1", phrase, len(tokens))
	}
	if tokens[0].Type != TokenWord {
		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", phrase, tokens[0].Type)
	}
	if tokens[0].WordCat != "up_to_date" {
		t.Fatalf("Tokenise(%q)[0].WordCat = %q, want %q", phrase, tokens[0].WordCat, "up_to_date")
	}
}
// TestTokeniser_Tokenise_WordPhraseWithPunctuation checks that trailing
// punctuation is split off a fixed phrase into its own token.
func TestTokeniser_Tokenise_WordPhraseWithPunctuation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	const phrase = "up to date."
	tokens := tok.Tokenise(phrase)
	if len(tokens) != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", phrase, len(tokens))
	}
	if tokens[0].Type != TokenWord {
		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", phrase, tokens[0].Type)
	}
	if tokens[1].Type != TokenPunctuation {
		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenPunctuation", phrase, tokens[1].Type)
	}
}
// TestTokeniser_MatchArticle_FrenchExtended covers elided forms (l', bare l,
// curly apostrophe), contracted forms (au/aux/du/des), and de-prefixed
// elisions.
func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"l'", "definite", true},
		{"l", "definite", true},
		{"lʼ", "definite", true},
		{"L'", "definite", true},
		{"L", "definite", true},
		{"Lʼ", "definite", true},
		{"les", "definite", true},
		{"au", "definite", true},
		{"aux", "definite", true},
		{"du", "indefinite", true},
		{"des", "indefinite", true},
		{"l'enfant", "definite", true},
		{"de l'enfant", "indefinite", true},
		{"de lami", "indefinite", true},
		{"De lenfant", "indefinite", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
}
// TestTokeniser_MatchArticle_FrenchUnderscoreTagFallback checks that a
// region-qualified tag written with an underscore ("fr_CA") still falls back
// to the base French grammar.
func TestTokeniser_MatchArticle_FrenchUnderscoreTagFallback(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr_CA")
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"le", "definite", true},
		{"l'ami", "definite", true},
		{"de l'ami", "indefinite", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
	// Tokenisation must also pick up the elided article under the fallback.
	tokens := tok.Tokenise("l'ami")
	if len(tokens) == 0 || tokens[0].Type != TokenArticle {
		t.Fatalf("Tokenise(%q)[0] should be TokenArticle, got %#v", "l'ami", tokens)
	}
}
// TestTokeniser_MatchArticle_ConfiguredPhrasePrefix checks article detection
// for a synthetic locale configured purely through grammar data, including
// article-prefixed phrases.
func TestTokeniser_MatchArticle_ConfiguredPhrasePrefix(t *testing.T) {
	setup(t)
	const lang = "xx"
	saved := i18n.GetGrammarData(lang)
	t.Cleanup(func() {
		i18n.SetGrammarData(lang, saved)
	})
	i18n.SetGrammarData(lang, &i18n.GrammarData{
		Articles: i18n.ArticleForms{
			IndefiniteDefault: "a",
			IndefiniteVowel:   "an",
			Definite:          "the",
		},
	})
	tok := NewTokeniserForLang(lang)
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"the file", "definite", true},
		{"a file", "indefinite", true},
		{"an error", "indefinite", true},
		{"file", "", false},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
}
// TestTokeniser_MatchArticle_ConfiguredElisionPrefix checks that an elided
// definite article ("l'") configured via grammar data is matched in its
// apostrophe, bare, and curly-apostrophe spellings, and that Tokenise splits
// article and noun.
func TestTokeniser_MatchArticle_ConfiguredElisionPrefix(t *testing.T) {
	setup(t)
	const lang = "xy"
	saved := i18n.GetGrammarData(lang)
	t.Cleanup(func() {
		i18n.SetGrammarData(lang, saved)
	})
	i18n.SetGrammarData(lang, &i18n.GrammarData{
		Articles: i18n.ArticleForms{
			IndefiniteDefault: "a",
			IndefiniteVowel:   "an",
			Definite:          "l'",
			ByGender: map[string]string{
				"m": "le",
				"f": "la",
			},
		},
		Nouns: map[string]i18n.NounForms{
			"ami": {One: "ami", Other: "amis", Gender: "m"},
		},
	})
	tok := NewTokeniserForLang(lang)
	cases := []struct {
		in      string
		artType string
		ok      bool
	}{
		{"l'ami", "definite", true},
		{"lami", "definite", true},
		{"lʼami", "definite", true},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tc.in, ok, tc.ok)
			}
			if ok && artType != tc.artType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tc.in, artType, tc.artType)
			}
		})
	}
	// The elided phrase must split into article + noun tokens.
	tokens := tok.Tokenise("l'ami")
	if len(tokens) != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "l'ami", len(tokens))
	}
	if tokens[0].Type != TokenArticle || tokens[0].ArtType != "definite" {
		t.Fatalf("Tokenise(%q)[0] = %#v, want definite article", "l'ami", tokens[0])
	}
	if tokens[1].Type != TokenNoun || tokens[1].Lower != "ami" {
		t.Fatalf("Tokenise(%q)[1] = %#v, want noun ami", "l'ami", tokens[1])
	}
}
// TestTokeniser_Tokenise_FrenchElision checks French elision and contraction
// handling in full tokenisation: l'/lʼ/bare-l elisions, de-prefixed partitive
// phrases, d' elision, and the contracted "au".
//
// Refactored from a long run of copy-pasted assertion blocks into a single
// table-driven loop; the per-token expectations are identical to the original
// checks (fields left zero/empty are intentionally not asserted, matching the
// original's partial checks).
func TestTokeniser_Tokenise_FrenchElision(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	// want describes the assertions for one token; empty string fields are
	// skipped, mirroring the original per-case checks.
	type want struct {
		typ       TokenType
		artType   string
		lower     string
		punctType string
	}
	cases := []struct {
		input   string
		wantLen int
		toks    map[int]want
	}{
		{"l'enfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "definite"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"de l'enfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite", lower: "de l'"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"de l' enfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite", lower: "de l'"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"De lenfant.", 3, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite"},
			1: {typ: TokenNoun, lower: "enfant"},
			2: {typ: TokenPunctuation, punctType: "sentence_end"},
		}},
		{"de le serveur", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite", lower: "de le"},
			1: {typ: TokenNoun, lower: "serveur"},
		}},
		{"de les amis", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite", lower: "de les"},
			1: {typ: TokenNoun, lower: "amis"},
		}},
		// Bare "l" and curly-apostrophe elisions normalise to "de l'".
		{"de l enfant", 2, map[int]want{
			0: {typ: TokenArticle, lower: "de l'"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"de lʼenfant", 2, map[int]want{
			0: {typ: TokenArticle, lower: "de l'"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		{"d'enfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "indefinite"},
			1: {typ: TokenNoun},
		}},
		{"lenfant", 2, map[int]want{
			0: {typ: TokenArticle, artType: "definite"},
			1: {typ: TokenNoun, lower: "enfant"},
		}},
		// The original only asserted the article token for "au serveur".
		{"au serveur", 2, map[int]want{
			0: {typ: TokenArticle, artType: "definite"},
		}},
	}
	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			tokens := tok.Tokenise(tc.input)
			if len(tokens) != tc.wantLen {
				t.Fatalf("Tokenise(%q) returned %d tokens, want %d", tc.input, len(tokens), tc.wantLen)
			}
			for i, w := range tc.toks {
				got := tokens[i]
				if got.Type != w.typ {
					t.Fatalf("tokens[%d].Type = %v, want %v", i, got.Type, w.typ)
				}
				if w.artType != "" && got.ArtType != w.artType {
					t.Fatalf("tokens[%d].ArtType = %q, want %q", i, got.ArtType, w.artType)
				}
				if w.lower != "" && got.Lower != w.lower {
					t.Fatalf("tokens[%d].Lower = %q, want %q", i, got.Lower, w.lower)
				}
				if w.punctType != "" && got.PunctType != w.punctType {
					t.Fatalf("tokens[%d].PunctType = %q, want %q", i, got.PunctType, w.punctType)
				}
			}
		})
	}
}
// TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase checks that partitive
// phrases ("de la", "de les") become a single indefinite article token
// followed by the noun.
func TestTokeniser_Tokenise_FrenchPartitiveArticlePhrase(t *testing.T) {
	setup(t)
	tok := NewTokeniserForLang("fr")
	cases := []struct {
		input   string
		article string
		noun    string
	}{
		{"de la branche", "de la", "branche"},
		{"de les amis", "de les", "amis"},
	}
	for _, tc := range cases {
		tokens := tok.Tokenise(tc.input)
		if len(tokens) != 2 {
			t.Fatalf("Tokenise(%q) returned %d tokens, want 2", tc.input, len(tokens))
		}
		if tokens[0].Type != TokenArticle {
			t.Fatalf("tokens[0].Type = %v, want TokenArticle", tokens[0].Type)
		}
		if tokens[0].Lower != tc.article {
			t.Fatalf("tokens[0].Lower = %q, want %q", tokens[0].Lower, tc.article)
		}
		if tokens[0].ArtType != "indefinite" {
			t.Fatalf("tokens[0].ArtType = %q, want %q", tokens[0].ArtType, "indefinite")
		}
		if tokens[1].Type != TokenNoun {
			t.Fatalf("tokens[1].Type = %v, want TokenNoun", tokens[1].Type)
		}
		if tokens[1].Lower != tc.noun {
			t.Fatalf("tokens[1].Lower = %q, want %q", tokens[1].Lower, tc.noun)
		}
	}
}
// TestTokeniser_Tokenise checks end-to-end classification of a simple
// English sentence: verb, article, unknown word, and plural noun.
func TestTokeniser_Tokenise(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Deleted the configuration files")
	if len(tokens) != 4 {
		t.Fatalf("Tokenise() returned %d tokens, want 4", len(tokens))
	}
	// "Deleted" should classify as a past-tense verb.
	if tokens[0].Type != TokenVerb {
		t.Errorf("tokens[0].Type = %v, want TokenVerb", tokens[0].Type)
	}
	if tokens[0].VerbInfo.Tense != "past" {
		t.Errorf("tokens[0].VerbInfo.Tense = %q, want %q", tokens[0].VerbInfo.Tense, "past")
	}
	// "the" is a definite article.
	if tokens[1].Type != TokenArticle {
		t.Errorf("tokens[1].Type = %v, want TokenArticle", tokens[1].Type)
	}
	// "configuration" is not in any table, so it stays unknown.
	if tokens[2].Type != TokenUnknown {
		t.Errorf("tokens[2].Type = %v, want TokenUnknown", tokens[2].Type)
	}
	// "files" is a plural noun.
	if tokens[3].Type != TokenNoun {
		t.Errorf("tokens[3].Type = %v, want TokenNoun", tokens[3].Type)
	}
	if !tokens[3].NounInfo.Plural {
		t.Errorf("tokens[3].NounInfo.Plural = false, want true")
	}
}
// TestTokeniser_Tokenise_Punctuation checks that trailing ellipsis
// punctuation produces at least one punctuation token.
//
// Fix: the original loop variable was named `tok`, shadowing the tokeniser
// variable of the same name — harmless here but confusing and a lint smell.
func TestTokeniser_Tokenise_Punctuation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Building project...")
	hasPunct := false
	for _, token := range tokens {
		if token.Type == TokenPunctuation {
			hasPunct = true
		}
	}
	if !hasPunct {
		t.Error("did not detect punctuation in \"Building project...\"")
	}
}
// TestTokeniser_Tokenise_ClauseBoundarySentence checks that a sentence-end
// period resets clause state so a verb after it is still classified as a verb.
func TestTokeniser_Tokenise_ClauseBoundarySentence(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("run tests. commit")
	sawSentenceEnd := false
	for _, tk := range tokens {
		if tk.Raw == "run" && tk.Type != TokenVerb {
			t.Errorf("'run' should remain TokenVerb, got %v", tk.Type)
		}
		if tk.Type == TokenPunctuation && tk.PunctType == "sentence_end" {
			sawSentenceEnd = true
		}
		if tk.Lower == "commit" {
			// Without sentence-end boundary support, this can be demoted by verb saturation.
			// With boundary detection, it should still classify as a verb.
			if tk.Type != TokenVerb {
				t.Errorf("'commit' after period should be TokenVerb, got %v", tk.Type)
			}
		}
	}
	if !sawSentenceEnd {
		t.Error("did not detect sentence-end punctuation in \"run tests. commit\"")
	}
}
// TestTokeniser_Tokenise_ClauseBoundaryStandalonePunctuation checks that a
// free-standing period also acts as a sentence-end clause boundary.
func TestTokeniser_Tokenise_ClauseBoundaryStandalonePunctuation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("run tests . commit")
	sawSentenceEnd := false
	for _, tk := range tokens {
		if tk.Type == TokenPunctuation && tk.PunctType == "sentence_end" {
			sawSentenceEnd = true
		}
		if tk.Lower == "commit" && tk.Type != TokenVerb {
			t.Errorf("'commit' after standalone period should be TokenVerb, got %v", tk.Type)
		}
	}
	if !sawSentenceEnd {
		t.Error("did not detect standalone sentence-end punctuation in \"run tests . commit\"")
	}
}
// TestTokeniser_Tokenise_Empty checks that the empty string tokenises to an
// empty token slice.
func TestTokeniser_Tokenise_Empty(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	if got := tok.Tokenise(""); len(got) != 0 {
		t.Errorf("Tokenise(\"\") returned %d tokens, want 0", len(got))
	}
}
// TestTokeniser_MatchVerb_Regular covers regular verbs that are absent from
// the grammar tables and must be detected by reverse morphology + round-trip.
func TestTokeniser_MatchVerb_Regular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		in    string
		ok    bool
		base  string
		tense string
	}{
		{"walked", true, "walk", "past"},
		{"walking", true, "walk", "gerund"},
		{"processed", true, "process", "past"},
		{"processing", true, "process", "gerund"},
		{"copied", true, "copy", "past"},
		{"copying", true, "copy", "gerund"},
		{"stopped", true, "stop", "past"},
		{"stopping", true, "stop", "gerund"},
	}
	for _, tc := range cases {
		t.Run(tc.in, func(t *testing.T) {
			match, ok := tok.MatchVerb(tc.in)
			if ok != tc.ok {
				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tc.in, ok, tc.ok)
			}
			if !ok {
				return
			}
			if match.Base != tc.base {
				t.Errorf("MatchVerb(%q).Base = %q, want %q", tc.in, match.Base, tc.base)
			}
			if match.Tense != tc.tense {
				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tc.in, match.Tense, tc.tense)
			}
		})
	}
}
// TestTokeniser_WithSignals verifies the constructor accepts the WithSignals
// option — the test passes if construction compiles and does not panic.
func TestTokeniser_WithSignals(t *testing.T) {
	setup(t)
	_ = NewTokeniser(WithSignals())
}
// TestTokeniser_Tokenise_CorpusPriorBias checks that a corpus prior strongly
// favouring the noun reading of a dual-class word wins disambiguation.
// NOTE(review): unlike sibling tests, this one does not call setup(t) —
// presumably deliberate since it only touches grammar data; confirm.
func TestTokeniser_Tokenise_CorpusPriorBias(t *testing.T) {
	const lang = "zz-prior"
	saved := i18n.GetGrammarData(lang)
	t.Cleanup(func() {
		i18n.SetGrammarData(lang, saved)
	})
	i18n.SetGrammarData(lang, &i18n.GrammarData{
		Verbs: map[string]i18n.VerbForms{
			"commit": {Past: "committed", Gerund: "committing"},
		},
		Nouns: map[string]i18n.NounForms{
			"commit": {One: "commit", Other: "commits"},
		},
		Signals: i18n.SignalData{
			Priors: map[string]map[string]float64{
				"commit": {
					"verb": 0.2,
					"noun": 0.8,
				},
			},
		},
	})
	tok := NewTokeniserForLang(lang)
	tokens := tok.Tokenise("please commit")
	if len(tokens) != 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "please commit", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenNoun", "please commit", tokens[1].Type)
	}
	if tokens[1].Confidence <= 0.5 {
		t.Fatalf("Tokenise(%q)[1].Confidence = %f, want > 0.5", "please commit", tokens[1].Confidence)
	}
}
// TestTokeniser_DualClassDetection checks which words are treated as both
// noun and verb (dual-class) and which are single-class.
func TestTokeniser_DualClassDetection(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	coreDual := []string{"commit", "run", "test", "check", "file", "build"}
	for _, w := range coreDual {
		if !tok.IsDualClass(w) {
			t.Errorf("%q should be dual-class", w)
		}
	}
	expandedDual := []string{
		"change", "export", "function", "handle", "host", "import", "link",
		"log", "merge", "patch", "process", "pull", "push", "queue",
		"release", "stream", "tag", "trigger", "update", "watch",
	}
	for _, w := range expandedDual {
		if !tok.IsDualClass(w) {
			t.Errorf("%q should be dual-class after expansion", w)
		}
	}
	for _, w := range []string{"delete", "go", "branch", "repo"} {
		if tok.IsDualClass(w) {
			t.Errorf("%q should not be dual-class", w)
		}
	}
}
// TestTokeniser_IgnoresDeprecatedGrammarEntries checks that identity noun
// and word entries ("passed"→"passed" etc.) are ignored by the tokeniser
// while legitimate entries ("url") continue to match.
func TestTokeniser_IgnoresDeprecatedGrammarEntries(t *testing.T) {
	setup(t)
	const lang = "zz-deprecated"
	saved := i18n.GetGrammarData(lang)
	t.Cleanup(func() {
		i18n.SetGrammarData(lang, saved)
	})
	i18n.SetGrammarData(lang, &i18n.GrammarData{
		Nouns: map[string]i18n.NounForms{
			"passed":  {One: "passed", Other: "passed"},
			"failed":  {One: "failed", Other: "failed"},
			"skipped": {One: "skipped", Other: "skipped"},
			"commit":  {One: "commit", Other: "commits"},
		},
		Words: map[string]string{
			"passed":  "passed",
			"failed":  "failed",
			"skipped": "skipped",
			"url":     "URL",
		},
	})
	tok := NewTokeniserForLang(lang)
	for _, w := range []string{"passed", "failed", "skipped"} {
		if tok.IsDualClass(w) {
			t.Fatalf("%q should not be treated as dual-class", w)
		}
		if cat, ok := tok.MatchWord(w); ok {
			t.Fatalf("MatchWord(%q) = %q, %v; want not found", w, cat, ok)
		}
		if _, ok := tok.MatchNoun(w); ok {
			t.Fatalf("MatchNoun(%q) should be ignored", w)
		}
	}
	if cat, ok := tok.MatchWord("url"); !ok || cat != "url" {
		t.Fatalf("MatchWord(%q) = %q, %v; want %q, true", "url", cat, ok, "url")
	}
}
// TestTokeniser_DualClassExpansion_ClassifiesCommonDevOpsWords checks that
// expanded dual-class dev-ops words resolve to noun after a determiner and
// to verb in an imperative context.
func TestTokeniser_DualClassExpansion_ClassifiesCommonDevOpsWords(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	cases := []struct {
		text      string
		wantType  TokenType
		wantLower string
	}{
		{"the merge", TokenNoun, "merge"},
		{"please merge the file", TokenVerb, "merge"},
		{"the process", TokenNoun, "process"},
		{"please process the log", TokenVerb, "process"},
	}
	for _, tc := range cases {
		t.Run(tc.text, func(t *testing.T) {
			tokens := tok.Tokenise(tc.text)
			if len(tokens) < 2 {
				t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", tc.text, len(tokens))
			}
			second := tokens[1]
			if second.Lower != tc.wantLower {
				t.Fatalf("Tokenise(%q)[1].Lower = %q, want %q", tc.text, second.Lower, tc.wantLower)
			}
			if second.Type != tc.wantType {
				t.Fatalf("Tokenise(%q)[1].Type = %v, want %v", tc.text, second.Type, tc.wantType)
			}
		})
	}
}
// TestToken_ConfidenceField checks that every classified (non-unknown) token
// carries a non-zero confidence score.
func TestToken_ConfidenceField(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	for _, tk := range tok.Tokenise("Deleted the branch") {
		if tk.Type != TokenUnknown && tk.Confidence == 0 {
			t.Errorf("token %q (type %d) has zero Confidence", tk.Raw, tk.Type)
		}
	}
}
// TestTokeniser_Disambiguate_NounAfterDeterminer checks that a dual-class
// word directly after a determiner resolves as a noun with high confidence
// and records the verb reading as the alternative.
//
// Fix: guard the tokens[1] index so a tokenisation regression fails cleanly
// instead of panicking.
func TestTokeniser_Disambiguate_NounAfterDeterminer(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("the commit was approved")
	if len(tokens) < 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", "the commit was approved", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Errorf("'commit' after 'the': Type = %v, want TokenNoun", tokens[1].Type)
	}
	if tokens[1].Confidence < 0.8 {
		t.Errorf("'commit' Confidence = %f, want >= 0.8", tokens[1].Confidence)
	}
	if tokens[1].AltType != TokenVerb {
		t.Errorf("'commit' AltType = %v, want TokenVerb", tokens[1].AltType)
	}
}
// TestTokeniser_Disambiguate_VerbImperative checks that a sentence-initial
// dual-class word in imperative position resolves as a confident verb.
//
// Fix: guard the tokens[0] index so an empty result fails cleanly instead of
// panicking.
func TestTokeniser_Disambiguate_VerbImperative(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Commit the changes")
	if len(tokens) == 0 {
		t.Fatalf("Tokenise(%q) returned no tokens", "Commit the changes")
	}
	if tokens[0].Type != TokenVerb {
		t.Errorf("'Commit' imperative: Type = %v, want TokenVerb", tokens[0].Type)
	}
	if tokens[0].Confidence < 0.8 {
		t.Errorf("'Commit' Confidence = %f, want >= 0.8", tokens[0].Confidence)
	}
}
// TestTokeniser_Disambiguate_NounWithVerbSaturation checks that 'test' reads
// as a noun when the clause already has a verb ("failed").
//
// Fix: guard the tokens[1] index so a tokenisation regression fails cleanly
// instead of panicking.
func TestTokeniser_Disambiguate_NounWithVerbSaturation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("The test failed")
	if len(tokens) < 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", "The test failed", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Errorf("'test' in 'The test failed': Type = %v, want TokenNoun", tokens[1].Type)
	}
}
// TestTokeniser_Disambiguate_VerbBeforeNoun checks that a dual-class word
// immediately preceding a noun resolves as a verb.
//
// Fix: guard the tokens[0] index so an empty result fails cleanly instead of
// panicking.
func TestTokeniser_Disambiguate_VerbBeforeNoun(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Run tests")
	if len(tokens) == 0 {
		t.Fatalf("Tokenise(%q) returned no tokens", "Run tests")
	}
	if tokens[0].Type != TokenVerb {
		t.Errorf("'Run' in 'Run tests': Type = %v, want TokenVerb", tokens[0].Type)
	}
}
// TestTokeniser_Disambiguate_InflectedSelfResolve checks that inflected
// forms ("committed", "commits") are unambiguous and self-resolve with full
// confidence.
//
// Fix: guard the direct indexing so tokenisation regressions fail cleanly
// instead of panicking.
func TestTokeniser_Disambiguate_InflectedSelfResolve(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("committed the branch")
	if len(tokens) == 0 {
		t.Fatalf("Tokenise(%q) returned no tokens", "committed the branch")
	}
	if tokens[0].Type != TokenVerb || tokens[0].Confidence != 1.0 {
		t.Errorf("'committed' should self-resolve as verb with confidence 1.0")
	}
	tokens = tok.Tokenise("the commits were reviewed")
	if len(tokens) < 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", "the commits were reviewed", len(tokens))
	}
	if tokens[1].Type != TokenNoun || tokens[1].Confidence != 1.0 {
		t.Errorf("'commits' should self-resolve as noun with confidence 1.0")
	}
}
// TestTokeniser_Disambiguate_VerbAfterAuxiliary checks that a dual-class word
// after an auxiliary ("will") resolves as a verb.
//
// Fix: guard the tokens[1] index so a tokenisation regression fails cleanly
// instead of panicking.
func TestTokeniser_Disambiguate_VerbAfterAuxiliary(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	tokens := tok.Tokenise("will commit the changes")
	if len(tokens) < 2 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 2", "will commit the changes", len(tokens))
	}
	if tokens[1].Type != TokenVerb {
		t.Errorf("'commit' after 'will': Type = %v, want TokenVerb", tokens[1].Type)
	}
}
// TestTokeniser_Disambiguate_ProseMultiple checks that multiple dual-class
// words in one prose sentence each resolve as nouns in noun positions.
func TestTokeniser_Disambiguate_ProseMultiple(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	for _, tk := range tok.Tokenise("The test failed because the commit introduced a regression") {
		switch tk.Lower {
		case "test":
			if tk.Type != TokenNoun {
				t.Errorf("'test' in prose: Type = %v, want TokenNoun", tk.Type)
			}
		case "commit":
			if tk.Type != TokenNoun {
				t.Errorf("'commit' in prose: Type = %v, want TokenNoun", tk.Type)
			}
		}
	}
}
// TestTokeniser_Disambiguate_ClauseBoundary checks that verb saturation
// resets at a clause boundary: "passed" saturates clause 1, yet "commit"
// after "should" in clause 2 is still a verb.
func TestTokeniser_Disambiguate_ClauseBoundary(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	for _, tk := range tok.Tokenise("The test passed and we should commit the fix") {
		switch tk.Lower {
		case "test":
			if tk.Type != TokenNoun {
				t.Errorf("'test' should be noun: got %v", tk.Type)
			}
		case "commit":
			if tk.Type != TokenVerb {
				t.Errorf("'commit' after 'should' should be verb: got %v", tk.Type)
			}
		}
	}
}
// TestTokeniser_Disambiguate_ContractionAux checks that a contracted
// auxiliary ("don't") forces the following dual-class word to verb.
func TestTokeniser_Disambiguate_ContractionAux(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	for _, tk := range tok.Tokenise("don't run the tests") {
		if tk.Lower == "run" && tk.Type != TokenVerb {
			t.Errorf("'run' after \"don't\": Type = %v, want TokenVerb", tk.Type)
		}
	}
}
// TestTokeniser_Disambiguate_ContractionAux_FallbackDefaults checks that the
// hardcoded fallback auxiliaries recognise contractions even when no locale
// grammar data is loaded (deliberately no setup(t) call).
func TestTokeniser_Disambiguate_ContractionAux_FallbackDefaults(t *testing.T) {
	tok := NewTokeniserForLang("zz")
	for _, tk := range tok.Tokenise("don't run the tests") {
		if tk.Lower == "run" && tk.Type != TokenVerb {
			t.Errorf("'run' after \"don't\": Type = %v, want TokenVerb", tk.Type)
		}
	}
}
// TestTokeniser_Disambiguate_NegationSignal checks that the verb_negation
// signal fires for a dual-class word after "no longer" and contributes to a
// verb classification.
func TestTokeniser_Disambiguate_NegationSignal(t *testing.T) {
	setup(t)
	tok := NewTokeniser(WithSignals())
	tokens := tok.Tokenise("no longer commit the changes")
	if len(tokens) < 3 {
		t.Fatalf("Tokenise(%q) returned %d tokens, want at least 3", "no longer commit the changes", len(tokens))
	}
	commit := tokens[2]
	if commit.Type != TokenVerb {
		t.Fatalf("'commit' after 'no longer': Type = %v, want TokenVerb", commit.Type)
	}
	if commit.Signals == nil {
		t.Fatal("'commit' after 'no longer' should have signal breakdown")
	}
	fired := false
	for _, comp := range commit.Signals.Components {
		if comp.Name == "verb_negation" {
			fired = true
			break
		}
	}
	if !fired {
		t.Error("verb_negation signal should have fired for 'no longer commit'")
	}
}
// TestTokeniser_WithSignals_Breakdown verifies that WithSignals() attaches a
// per-token SignalBreakdown and that the noun_determiner signal fires with
// its default contribution for "the commit".
func TestTokeniser_WithSignals_Breakdown(t *testing.T) {
	setup(t)
	tok := NewTokeniser(WithSignals())
	tokens := tok.Tokenise("the commit was approved")
	// Guard the index so a tokenisation regression fails with a clear
	// message instead of an index-out-of-range panic.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	// "commit" should have a SignalBreakdown
	commitTok := tokens[1]
	if commitTok.Signals == nil {
		t.Fatal("WithSignals(): commit token has nil Signals")
	}
	if commitTok.Signals.NounScore <= commitTok.Signals.VerbScore {
		t.Errorf("NounScore (%f) should exceed VerbScore (%f) for 'the commit'",
			commitTok.Signals.NounScore, commitTok.Signals.VerbScore)
	}
	if len(commitTok.Signals.Components) == 0 {
		t.Error("Components should not be empty")
	}
	// Verify noun_determiner signal fired
	foundDet := false
	for _, c := range commitTok.Signals.Components {
		if c.Name == "noun_determiner" {
			foundDet = true
			if c.Contrib != 0.35 {
				t.Errorf("noun_determiner Contrib = %f, want 0.35", c.Contrib)
			}
		}
	}
	if !foundDet {
		t.Error("noun_determiner signal should have fired")
	}
}
// TestTokeniser_WithoutSignals_NilBreakdown verifies that tokens carry no
// SignalBreakdown unless WithSignals() is requested at construction.
func TestTokeniser_WithoutSignals_NilBreakdown(t *testing.T) {
	setup(t)
	tok := NewTokeniser() // no WithSignals
	tokens := tok.Tokenise("the commit was approved")
	// Guard the index so a tokenisation regression fails cleanly rather
	// than panicking.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	if tokens[1].Signals != nil {
		t.Error("Without WithSignals(), Signals should be nil")
	}
}
// TestDisambiguationStats_WithAmbiguous checks that dual-class words
// ("commit", "test") are counted as ambiguous and that the token total
// matches the tokenised sentence.
func TestDisambiguationStats_WithAmbiguous(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	toks := tok.Tokenise("The commit passed the test")
	got := tok.DisambiguationStats(toks)
	if got.AmbiguousTokens == 0 {
		t.Error("expected ambiguous tokens for dual-class words")
	}
	if got.TotalTokens != len(toks) {
		t.Errorf("TotalTokens = %d, want %d", got.TotalTokens, len(toks))
	}
}
// TestDisambiguationStats_NoAmbiguous checks that a sentence with no
// dual-class words reports zero ambiguous tokens.
func TestDisambiguationStats_NoAmbiguous(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	got := tok.DisambiguationStats(tok.Tokenise("Deleted the files"))
	if got.AmbiguousTokens != 0 {
		t.Errorf("AmbiguousTokens = %d, want 0", got.AmbiguousTokens)
	}
}
// TestWithWeights_Override verifies that zeroing the noun_determiner weight
// flips the classification of "commit" after "The" from noun to verb.
func TestWithWeights_Override(t *testing.T) {
	setup(t)
	// Override noun_determiner to 0 — "The commit" should no longer resolve as noun
	weights := map[string]float64{
		"noun_determiner":   0.0,
		"verb_auxiliary":    0.25,
		"following_class":   0.15,
		"sentence_position": 0.10,
		"verb_saturation":   0.10,
		"inflection_echo":   0.03,
		"default_prior":     0.02,
	}
	tok := NewTokeniser(WithWeights(weights))
	tokens := tok.Tokenise("The commit")
	// Guard the index so a tokenisation regression fails with a clear
	// message instead of panicking.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	// With noun_determiner zeroed, default_prior (verb) should win
	if tokens[1].Type != TokenVerb {
		t.Errorf("with noun_determiner=0, 'commit' Type = %v, want TokenVerb", tokens[1].Type)
	}
}
// TestWithWeights_CopiesInputMap verifies that WithWeights deep-copies the
// caller's map, so later mutations by the caller have no effect.
func TestWithWeights_CopiesInputMap(t *testing.T) {
	setup(t)
	weights := map[string]float64{
		"noun_determiner":   0.35,
		"verb_auxiliary":    0.25,
		"following_class":   0.15,
		"sentence_position": 0.10,
		"verb_saturation":   0.10,
		"inflection_echo":   0.03,
		"default_prior":     0.02,
	}
	tok := NewTokeniser(WithWeights(weights))
	// Mutate the caller's map after construction; the tokeniser should keep
	// using the original copied values.
	weights["noun_determiner"] = 0
	tokens := tok.Tokenise("The commit")
	// Guard the index so a tokenisation regression fails with a clear
	// message instead of panicking.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Fatalf("with copied weights, 'commit' Type = %v, want TokenNoun", tokens[1].Type)
	}
}
// TestWithWeights_PartialOverrideKeepsDefaults verifies that supplying only
// some weights leaves the remaining signals at their default values, so
// "The commit" still resolves as a noun via noun_determiner.
func TestWithWeights_PartialOverrideKeepsDefaults(t *testing.T) {
	setup(t)
	tok := NewTokeniser(WithWeights(map[string]float64{
		"verb_auxiliary": 0.25,
	}))
	tokens := tok.Tokenise("The commit")
	// Guard the index so a tokenisation regression fails with a clear
	// message instead of panicking.
	if len(tokens) < 2 {
		t.Fatalf("Tokenise returned %d tokens, want at least 2", len(tokens))
	}
	if tokens[1].Type != TokenNoun {
		t.Fatalf("with partial weights, 'commit' Type = %v, want TokenNoun", tokens[1].Type)
	}
}
// TestDefaultWeights_ReturnsCopy verifies that each DefaultWeights call
// yields an independent map: mutating one copy never leaks into another.
func TestDefaultWeights_ReturnsCopy(t *testing.T) {
	one := DefaultWeights()
	two := DefaultWeights()
	if one["noun_determiner"] != 0.35 {
		t.Fatalf("DefaultWeights()[noun_determiner] = %v, want 0.35", one["noun_determiner"])
	}
	one["noun_determiner"] = 0
	if two["noun_determiner"] != 0.35 {
		t.Fatalf("DefaultWeights() should return a fresh copy, got %v", two["noun_determiner"])
	}
}
// TestTokeniserSignalWeights_ReturnsCopy verifies that SignalWeights
// snapshots the tokeniser's weights: mutating the returned map must not
// affect subsequent calls.
func TestTokeniserSignalWeights_ReturnsCopy(t *testing.T) {
	setup(t)
	tok := NewTokeniser(WithWeights(map[string]float64{
		"noun_determiner": 0.5,
		"default_prior":   0.1,
	}))
	snapshot := tok.SignalWeights()
	if snapshot["noun_determiner"] != 0.5 {
		t.Fatalf("SignalWeights()[noun_determiner] = %v, want 0.5", snapshot["noun_determiner"])
	}
	snapshot["noun_determiner"] = 0
	if got := tok.SignalWeights()["noun_determiner"]; got != 0.5 {
		t.Fatalf("SignalWeights() should return a fresh copy, got %v", got)
	}
}
// TestLowInformationConfidenceConstants pins the exported low-information
// confidence constants so an accidental change fails loudly.
func TestLowInformationConfidenceConstants(t *testing.T) {
	const (
		wantThreshold = 0.10
		wantVerbConf  = 0.55
		wantNounConf  = 0.45
	)
	if got := LowInformationScoreThreshold; got != wantThreshold {
		t.Fatalf("LowInformationScoreThreshold = %v, want 0.10", got)
	}
	if got := LowInformationVerbConfidence; got != wantVerbConf {
		t.Fatalf("LowInformationVerbConfidence = %v, want 0.55", got)
	}
	if got := LowInformationNounConfidence; got != wantNounConf {
		t.Fatalf("LowInformationNounConfidence = %v, want 0.45", got)
	}
}
// TestTokeniser_LowInformationConfidenceFloor checks that a near-signal-free
// phrase resolves "commit" as verb with the fixed 0.55/0.45 confidence split
// and records the noun alternative.
func TestTokeniser_LowInformationConfidenceFloor(t *testing.T) {
	setup(t)
	tok := NewTokeniser()
	got := tok.Tokenise("maybe commit")
	if len(got) != 2 {
		t.Fatalf("Tokenise(maybe commit) produced %d tokens, want 2", len(got))
	}
	low := got[1]
	if low.Type != TokenVerb {
		t.Fatalf("Tokenise(maybe commit) Type = %v, want TokenVerb", low.Type)
	}
	if low.Confidence != 0.55 {
		t.Fatalf("Tokenise(maybe commit) Confidence = %v, want 0.55", low.Confidence)
	}
	if low.AltType != TokenNoun {
		t.Fatalf("Tokenise(maybe commit) AltType = %v, want TokenNoun", low.AltType)
	}
	if low.AltConf != 0.45 {
		t.Fatalf("Tokenise(maybe commit) AltConf = %v, want 0.45", low.AltConf)
	}
}
// --- Benchmarks ---
// benchSetup initialises the default i18n service for benchmarks, aborting
// the benchmark if construction fails.
func benchSetup(b *testing.B) {
	b.Helper()
	service, err := i18n.New()
	if err != nil {
		b.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(service)
}
// BenchmarkTokenise_Short measures tokenisation of a three-word command.
func BenchmarkTokenise_Short(b *testing.B) {
	benchSetup(b)
	const input = "Delete the file"
	tokeniser := NewTokeniser()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		tokeniser.Tokenise(input)
	}
}
// BenchmarkTokenise_Medium measures tokenisation of a full sentence.
func BenchmarkTokenise_Medium(b *testing.B) {
	benchSetup(b)
	const input = "The build failed because the test commit was not pushed to the branch"
	tokeniser := NewTokeniser()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		tokeniser.Tokenise(input)
	}
}
// BenchmarkTokenise_DualClass measures tokenisation of a sentence dense with
// dual-class words that require disambiguation.
func BenchmarkTokenise_DualClass(b *testing.B) {
	benchSetup(b)
	const input = "Commit the changes and run the build test"
	tokeniser := NewTokeniser()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		tokeniser.Tokenise(input)
	}
}
// BenchmarkTokenise_WithSignals measures the overhead of collecting a
// per-token signal breakdown during tokenisation.
func BenchmarkTokenise_WithSignals(b *testing.B) {
	benchSetup(b)
	const input = "The commit was rebuilt and the test passed"
	tokeniser := NewTokeniser(WithSignals())
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		tokeniser.Tokenise(input)
	}
}
// BenchmarkNewImprint measures imprint construction from a pre-tokenised
// sentence (tokenisation cost excluded via ResetTimer).
func BenchmarkNewImprint(b *testing.B) {
	benchSetup(b)
	tokeniser := NewTokeniser()
	toks := tokeniser.Tokenise("Delete the configuration file and rebuild the project")
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		NewImprint(toks)
	}
}
// BenchmarkImprint_Similar measures the similarity comparison between two
// pre-built imprints of closely related sentences.
func BenchmarkImprint_Similar(b *testing.B) {
	benchSetup(b)
	tokeniser := NewTokeniser()
	left := NewImprint(tokeniser.Tokenise("Delete the configuration file"))
	right := NewImprint(tokeniser.Tokenise("Delete the old file"))
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		left.Similar(right)
	}
}
// BenchmarkMultiplier_Expand measures phrase expansion on a fixed input.
func BenchmarkMultiplier_Expand(b *testing.B) {
	benchSetup(b)
	const phrase = "Delete the configuration file"
	mult := NewMultiplier()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		mult.Expand(phrase)
	}
}