Compare commits

20ab172f5b...9474edde6d (8 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 9474edde6d | |
| | b3f6c817d4 | |
| | a9c6672b12 | |
| | 8b23600632 | |
| | f09cff894f | |
| | 6d72540530 | |
| | 786909c193 | |
| | f1aa4adbc4 | |
9 changed files with 1607 additions and 1 deletion
grammar.go (+18)

@@ -20,6 +20,24 @@ func SetGrammarData(lang string, data *GrammarData) {
 	grammarCache[lang] = data
 }
+
+// IrregularVerbs returns a copy of the irregular verb forms map.
+func IrregularVerbs() map[string]VerbForms {
+	result := make(map[string]VerbForms, len(irregularVerbs))
+	for k, v := range irregularVerbs {
+		result[k] = v
+	}
+	return result
+}
+
+// IrregularNouns returns a copy of the irregular nouns map.
+func IrregularNouns() map[string]string {
+	result := make(map[string]string, len(irregularNouns))
+	for k, v := range irregularNouns {
+		result[k] = v
+	}
+	return result
+}
 
 func getVerbForm(lang, verb, form string) string {
 	data := GetGrammarData(lang)
 	if data == nil || data.Verbs == nil {
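Both accessors return defensive copies, so callers can build derived lookup structures without mutating go-i18n's internal tables. A minimal sketch of the intended consumption pattern, mirroring what buildVerbIndex in reversal/tokeniser.go below does with the returned map:

	// Invert past form → base form. Because IrregularVerbs() returns a
	// copy, building (or even mutating) this index cannot corrupt the
	// package's internal state.
	pastToBase := make(map[string]string)
	for base, forms := range i18n.IrregularVerbs() {
		if forms.Past != "" {
			pastToBase[forms.Past] = base
		}
	}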
@@ -39,7 +39,10 @@
 "hit": { "base": "hit", "past": "hit", "gerund": "hitting" },
 "sit": { "base": "sit", "past": "sat", "gerund": "sitting" },
 "split": { "base": "split", "past": "split", "gerund": "splitting" },
-"shut": { "base": "shut", "past": "shut", "gerund": "shutting" }
+"shut": { "base": "shut", "past": "shut", "gerund": "shutting" },
+"delete": { "base": "delete", "past": "deleted", "gerund": "deleting" },
+"update": { "base": "update", "past": "updated", "gerund": "updating" },
+"push": { "base": "push", "past": "pushed", "gerund": "pushing" }
 },
 "noun": {
 "file": { "one": "file", "other": "files" },
reversal/imprint.go (new file, +180)

@@ -0,0 +1,180 @@
package reversal

import "math"

// GrammarImprint is a low-dimensional grammar feature vector.
type GrammarImprint struct {
	VerbDistribution   map[string]float64 // verb base -> frequency
	TenseDistribution  map[string]float64 // "past"/"gerund"/"base" -> ratio
	NounDistribution   map[string]float64 // noun base -> frequency
	PluralRatio        float64            // proportion of plural nouns (0.0-1.0)
	DomainVocabulary   map[string]int     // gram.word category -> hit count
	ArticleUsage       map[string]float64 // "definite"/"indefinite" -> ratio
	PunctuationPattern map[string]float64 // "label"/"progress"/"question" -> ratio
	TokenCount         int
	UniqueVerbs        int
	UniqueNouns        int
}

// NewImprint calculates a GrammarImprint from classified tokens.
func NewImprint(tokens []Token) GrammarImprint {
	imp := GrammarImprint{
		VerbDistribution:   make(map[string]float64),
		TenseDistribution:  make(map[string]float64),
		NounDistribution:   make(map[string]float64),
		DomainVocabulary:   make(map[string]int),
		ArticleUsage:       make(map[string]float64),
		PunctuationPattern: make(map[string]float64),
	}

	if len(tokens) == 0 {
		return imp
	}

	imp.TokenCount = len(tokens)

	verbBases := make(map[string]bool)
	nounBases := make(map[string]bool)
	var verbCount, nounCount, articleCount, punctCount int
	var pluralNouns, totalNouns int

	for _, tok := range tokens {
		switch tok.Type {
		case TokenVerb:
			verbCount++
			base := tok.VerbInfo.Base
			imp.VerbDistribution[base]++
			imp.TenseDistribution[tok.VerbInfo.Tense]++
			verbBases[base] = true

		case TokenNoun:
			nounCount++
			base := tok.NounInfo.Base
			imp.NounDistribution[base]++
			nounBases[base] = true
			totalNouns++
			if tok.NounInfo.Plural {
				pluralNouns++
			}

		case TokenArticle:
			articleCount++
			imp.ArticleUsage[tok.ArtType]++

		case TokenWord:
			imp.DomainVocabulary[tok.WordCat]++

		case TokenPunctuation:
			punctCount++
			imp.PunctuationPattern[tok.PunctType]++
		}
	}

	imp.UniqueVerbs = len(verbBases)
	imp.UniqueNouns = len(nounBases)

	// Calculate plural ratio
	if totalNouns > 0 {
		imp.PluralRatio = float64(pluralNouns) / float64(totalNouns)
	}

	// Normalise frequency maps to sum to 1.0
	normaliseMap(imp.VerbDistribution)
	normaliseMap(imp.TenseDistribution)
	normaliseMap(imp.NounDistribution)
	normaliseMap(imp.ArticleUsage)
	normaliseMap(imp.PunctuationPattern)

	return imp
}

// normaliseMap scales all values in a map so they sum to 1.0.
// If the map is empty or sums to zero, it is left unchanged.
func normaliseMap(m map[string]float64) {
	var total float64
	for _, v := range m {
		total += v
	}
	if total == 0 {
		return
	}
	for k, v := range m {
		m[k] = v / total
	}
}

// Similar returns weighted cosine similarity between two imprints (0.0-1.0).
// Weights: verb(0.30), tense(0.20), noun(0.25), article(0.15), punct(0.10).
func (a GrammarImprint) Similar(b GrammarImprint) float64 {
	// Two empty imprints are identical.
	if a.TokenCount == 0 && b.TokenCount == 0 {
		return 1.0
	}

	type component struct {
		weight float64
		a, b   map[string]float64
	}

	components := []component{
		{0.30, a.VerbDistribution, b.VerbDistribution},
		{0.20, a.TenseDistribution, b.TenseDistribution},
		{0.25, a.NounDistribution, b.NounDistribution},
		{0.15, a.ArticleUsage, b.ArticleUsage},
		{0.10, a.PunctuationPattern, b.PunctuationPattern},
	}

	var totalWeight float64
	var weightedSum float64

	for _, c := range components {
		// Skip components where both maps are empty (no signal).
		if len(c.a) == 0 && len(c.b) == 0 {
			continue
		}
		totalWeight += c.weight
		weightedSum += c.weight * mapSimilarity(c.a, c.b)
	}

	if totalWeight == 0 {
		return 1.0
	}

	return weightedSum / totalWeight
}

// mapSimilarity computes cosine similarity between two frequency maps.
// Returns 1.0 for identical distributions, 0.0 for completely disjoint.
func mapSimilarity(a, b map[string]float64) float64 {
	if len(a) == 0 && len(b) == 0 {
		return 1.0
	}
	if len(a) == 0 || len(b) == 0 {
		return 0.0
	}

	// Collect the union of keys.
	keys := make(map[string]bool)
	for k := range a {
		keys[k] = true
	}
	for k := range b {
		keys[k] = true
	}

	var dot, magA, magB float64
	for k := range keys {
		va := a[k]
		vb := b[k]
		dot += va * vb
		magA += va * va
		magB += vb * vb
	}

	denom := math.Sqrt(magA) * math.Sqrt(magB)
	if denom == 0 {
		return 0.0
	}

	return dot / denom
}
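Together, NewImprint and Similar give a cheap, deterministic way to compare the grammatical shape of two strings with no embeddings and no API calls. A minimal sketch of the flow, written as an example function that could sit alongside the tests in package reversal (the i18n setup mirrors the test files; assumes "fmt" is imported):

	func ExampleGrammarImprint_Similar() {
		svc, _ := i18n.New()
		i18n.SetDefault(svc)

		tok := NewTokeniser()
		a := NewImprint(tok.Tokenise("Deleted the configuration files"))
		b := NewImprint(tok.Tokenise("Deleting the configuration file"))

		// Same verb base, noun base and article; different tense and number.
		// Verb, noun and article components each score 1.0, tense scores 0.0,
		// punctuation is skipped, so the weighted result is
		// (0.30 + 0 + 0.25 + 0.15) / 0.90 ≈ 0.78.
		fmt.Printf("%.2f\n", a.Similar(b))
	}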
reversal/imprint_test.go (new file, +116)

@@ -0,0 +1,116 @@
package reversal

import (
	"testing"

	i18n "forge.lthn.ai/core/go-i18n"
)

func TestNewImprint(t *testing.T) {
	svc, err := i18n.New()
	if err != nil {
		t.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(svc)

	tok := NewTokeniser()
	tokens := tok.Tokenise("Deleted the configuration files successfully")
	imp := NewImprint(tokens)

	if imp.TokenCount != 5 {
		t.Errorf("TokenCount = %d, want 5", imp.TokenCount)
	}
	if imp.UniqueVerbs == 0 {
		t.Error("UniqueVerbs = 0, want > 0")
	}
	if imp.UniqueNouns == 0 {
		t.Error("UniqueNouns = 0, want > 0")
	}
	if imp.TenseDistribution["past"] == 0 {
		t.Error("TenseDistribution[\"past\"] = 0, want > 0")
	}
	if imp.ArticleUsage["definite"] == 0 {
		t.Error("ArticleUsage[\"definite\"] = 0, want > 0")
	}
}

func TestNewImprint_Empty(t *testing.T) {
	imp := NewImprint(nil)
	if imp.TokenCount != 0 {
		t.Errorf("TokenCount = %d, want 0", imp.TokenCount)
	}
}

func TestNewImprint_PluralRatio(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	// All plural nouns
	tokens := tok.Tokenise("files branches repositories")
	imp := NewImprint(tokens)
	if imp.PluralRatio < 0.5 {
		t.Errorf("PluralRatio = %f for all-plural input, want >= 0.5", imp.PluralRatio)
	}

	// All singular nouns
	tokens = tok.Tokenise("file branch repository")
	imp = NewImprint(tokens)
	if imp.PluralRatio > 0.5 {
		t.Errorf("PluralRatio = %f for all-singular input, want <= 0.5", imp.PluralRatio)
	}
}

func TestImprint_Similar_SameText(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Delete the configuration file")
	imp1 := NewImprint(tokens)
	imp2 := NewImprint(tokens)

	sim := imp1.Similar(imp2)
	if sim != 1.0 {
		t.Errorf("Same text similarity = %f, want 1.0", sim)
	}
}

func TestImprint_Similar_SimilarText(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	imp1 := NewImprint(tok.Tokenise("Delete the configuration file"))
	imp2 := NewImprint(tok.Tokenise("Deleted the configuration files"))

	sim := imp1.Similar(imp2)
	if sim < 0.3 {
		t.Errorf("Similar text similarity = %f, want >= 0.3", sim)
	}
	if sim >= 1.0 {
		t.Errorf("Different text similarity = %f, want < 1.0", sim)
	}
}

func TestImprint_Similar_DifferentText(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	imp1 := NewImprint(tok.Tokenise("Delete the configuration file"))
	imp2 := NewImprint(tok.Tokenise("Building the project successfully"))

	sim := imp1.Similar(imp2)
	if sim > 0.7 {
		t.Errorf("Different text similarity = %f, want <= 0.7", sim)
	}
}

func TestImprint_Similar_Empty(t *testing.T) {
	imp1 := NewImprint(nil)
	imp2 := NewImprint(nil)
	sim := imp1.Similar(imp2)
	if sim != 1.0 {
		t.Errorf("Empty imprint similarity = %f, want 1.0", sim)
	}
}
reversal/multiplier.go (new file, +258)

@@ -0,0 +1,258 @@
package reversal

import (
	"strings"
	"unicode"

	i18n "forge.lthn.ai/core/go-i18n"
)

// Multiplier generates deterministic grammatical variants of text
// for training data augmentation. Zero API calls.
type Multiplier struct {
	tokeniser *Tokeniser
}

// NewMultiplier creates a Multiplier using the default English tokeniser.
func NewMultiplier() *Multiplier {
	return &Multiplier{tokeniser: NewTokeniser()}
}

// NewMultiplierForLang creates a Multiplier for the specified language.
func NewMultiplierForLang(lang string) *Multiplier {
	return &Multiplier{tokeniser: NewTokeniserForLang(lang)}
}

// Expand produces: original + tense flips (past, gerund) + number flips (plural toggle) + combinations.
// All output is deterministic and grammatically correct.
func (m *Multiplier) Expand(text string) []string {
	text = strings.TrimSpace(text)
	if text == "" {
		return nil
	}

	tokens := m.tokeniser.Tokenise(text)
	if len(tokens) == 0 {
		return nil
	}

	// Collect indices of verbs and nouns for targeted replacement.
	var verbIndices []int
	var nounIndices []int
	for i, tok := range tokens {
		switch tok.Type {
		case TokenVerb:
			verbIndices = append(verbIndices, i)
		case TokenNoun:
			nounIndices = append(nounIndices, i)
		}
	}

	// Build the list of variants in deterministic order:
	// 1. Original
	// 2. Single verb transforms (past, gerund) for each verb
	// 3. Single noun transforms (plural toggle) for each noun
	// 4. Combined transforms (verb transform + noun transform)
	seen := make(map[string]bool)
	var results []string

	addVariant := func(s string) {
		if !seen[s] {
			seen[s] = true
			results = append(results, s)
		}
	}

	// 1. Original text
	addVariant(text)

	// 2. Verb transforms: for each verb, produce past and gerund variants
	for _, vi := range verbIndices {
		pastTokens := m.applyVerbTransform(tokens, vi, "past")
		addVariant(reconstruct(pastTokens))

		gerundTokens := m.applyVerbTransform(tokens, vi, "gerund")
		addVariant(reconstruct(gerundTokens))

		baseTokens := m.applyVerbTransform(tokens, vi, "base")
		addVariant(reconstruct(baseTokens))
	}

	// 3. Noun transforms: for each noun, toggle plural/singular
	for _, ni := range nounIndices {
		pluralTokens := m.applyNounTransform(tokens, ni)
		addVariant(reconstruct(pluralTokens))
	}

	// 4. Combinations: each verb transform + each noun transform
	for _, vi := range verbIndices {
		for _, ni := range nounIndices {
			// past + noun toggle
			pastTokens := m.applyVerbTransform(tokens, vi, "past")
			pastPluralTokens := m.applyNounTransformOnTokens(pastTokens, ni)
			addVariant(reconstruct(pastPluralTokens))

			// gerund + noun toggle
			gerundTokens := m.applyVerbTransform(tokens, vi, "gerund")
			gerundPluralTokens := m.applyNounTransformOnTokens(gerundTokens, ni)
			addVariant(reconstruct(gerundPluralTokens))

			// base + noun toggle
			baseTokens := m.applyVerbTransform(tokens, vi, "base")
			basePluralTokens := m.applyNounTransformOnTokens(baseTokens, ni)
			addVariant(reconstruct(basePluralTokens))
		}
	}

	return results
}

// applyVerbTransform returns a copy of tokens with the verb at index vi
// transformed to the specified tense ("past", "gerund", or "base").
func (m *Multiplier) applyVerbTransform(tokens []Token, vi int, targetTense string) []Token {
	result := make([]Token, len(tokens))
	copy(result, tokens)

	tok := tokens[vi]
	base := tok.VerbInfo.Base
	currentTense := tok.VerbInfo.Tense

	if currentTense == targetTense {
		return result
	}

	var newForm string
	switch targetTense {
	case "past":
		newForm = i18n.PastTense(base)
	case "gerund":
		newForm = i18n.Gerund(base)
	case "base":
		newForm = base
	}

	if newForm == "" {
		return result
	}

	// Preserve capitalisation of the original token.
	newForm = preserveCase(tok.Raw, newForm)

	result[vi] = Token{
		Raw:   newForm,
		Lower: strings.ToLower(newForm),
		Type:  TokenVerb,
		VerbInfo: VerbMatch{
			Base:  base,
			Tense: targetTense,
			Form:  newForm,
		},
	}

	return result
}

// applyNounTransform returns a copy of tokens with the noun at index ni
// toggled between singular and plural.
func (m *Multiplier) applyNounTransform(tokens []Token, ni int) []Token {
	return m.applyNounTransformOnTokens(tokens, ni)
}

// applyNounTransformOnTokens returns a copy of the given tokens with the
// noun at index ni toggled between singular and plural.
func (m *Multiplier) applyNounTransformOnTokens(tokens []Token, ni int) []Token {
	result := make([]Token, len(tokens))
	copy(result, tokens)

	tok := tokens[ni]
	base := tok.NounInfo.Base
	isPlural := tok.NounInfo.Plural

	var newForm string
	var newPlural bool

	if isPlural {
		// Already plural, revert to singular (base form).
		newForm = base
		newPlural = false
	} else {
		// Singular, generate plural.
		newForm = i18n.PluralForm(base)
		newPlural = true
	}

	if newForm == "" {
		return result
	}

	// Preserve capitalisation.
	newForm = preserveCase(tok.Raw, newForm)

	result[ni] = Token{
		Raw:   newForm,
		Lower: strings.ToLower(newForm),
		Type:  TokenNoun,
		NounInfo: NounMatch{
			Base:   base,
			Plural: newPlural,
			Form:   newForm,
		},
	}

	return result
}

// reconstruct joins tokens back into a string, preserving spacing.
func reconstruct(tokens []Token) string {
	var b strings.Builder
	for i, tok := range tokens {
		if i > 0 {
			// Punctuation tokens that were split from the previous word
			// should not have a leading space.
			if tok.Type == TokenPunctuation {
				b.WriteString(tok.Raw)
				continue
			}
			b.WriteByte(' ')
		}
		b.WriteString(tok.Raw)
	}
	return b.String()
}

// preserveCase applies the capitalisation pattern of the original word
// to the replacement word. If the original started with an uppercase
// letter, the replacement will too.
func preserveCase(original, replacement string) string {
	if len(original) == 0 || len(replacement) == 0 {
		return replacement
	}

	origRunes := []rune(original)
	repRunes := []rune(replacement)

	// If the original is all uppercase (like "DELETE"), make replacement all uppercase.
	if isAllUpper(original) && len(original) > 1 {
		return strings.ToUpper(replacement)
	}

	// If the first character of the original is uppercase, capitalise the replacement.
	if unicode.IsUpper(origRunes[0]) {
		repRunes[0] = unicode.ToUpper(repRunes[0])
		return string(repRunes)
	}

	// Otherwise, ensure the replacement starts lowercase.
	repRunes[0] = unicode.ToLower(repRunes[0])
	return string(repRunes)
}

// isAllUpper returns true if every letter in the string is uppercase.
func isAllUpper(s string) bool {
	for _, r := range s {
		if unicode.IsLetter(r) && !unicode.IsUpper(r) {
			return false
		}
	}
	return true
}
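For a sentence with one verb and one noun, Expand therefore emits the original, the tense variants, the number toggle, and their cross-products, deduplicated in a stable order. A minimal driving sketch, consistent with the expectations in multiplier_test.go below (assumes "fmt" is imported):

	func ExampleMultiplier_Expand() {
		svc, _ := i18n.New()
		i18n.SetDefault(svc)

		m := NewMultiplier()
		for _, v := range m.Expand("Delete the configuration file") {
			fmt.Println(v)
		}
		// The deterministic output includes, among other variants:
		//   Delete the configuration file    (original)
		//   Deleted the configuration file   (past)
		//   Deleting the configuration file  (gerund)
		//   Delete the configuration files   (plural toggle)
	}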
reversal/multiplier_test.go (new file, +67)

@@ -0,0 +1,67 @@
package reversal

import (
	"testing"

	i18n "forge.lthn.ai/core/go-i18n"
)

func TestMultiplier_Expand(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	m := NewMultiplier()
	variants := m.Expand("Delete the configuration file")

	if len(variants) < 4 {
		t.Errorf("Expand() returned %d variants, want >= 4", len(variants))
	}

	expected := map[string]bool{
		"Delete the configuration file":   true, // original
		"Deleted the configuration file":  true, // past
		"Deleting the configuration file": true, // gerund
		"Delete the configuration files":  true, // plural
	}
	for _, v := range variants {
		delete(expected, v)
	}
	for missing := range expected {
		t.Errorf("Expand() missing expected variant: %q", missing)
	}
}

func TestMultiplier_Expand_NoVerbs(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	m := NewMultiplier()
	variants := m.Expand("the configuration file")
	if len(variants) < 2 {
		t.Errorf("Expand() returned %d variants, want >= 2", len(variants))
	}
}

func TestMultiplier_Expand_Empty(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	m := NewMultiplier()
	variants := m.Expand("")
	if len(variants) != 0 {
		t.Errorf("Expand(\"\") returned %d variants, want 0", len(variants))
	}
}

func TestMultiplier_Expand_Deterministic(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	m := NewMultiplier()
	v1 := m.Expand("Delete the file")
	v2 := m.Expand("Delete the file")
	if len(v1) != len(v2) {
		t.Fatalf("Non-deterministic: %d vs %d variants", len(v1), len(v2))
	}
	for i := range v1 {
		if v1[i] != v2[i] {
			t.Errorf("Non-deterministic at [%d]: %q vs %q", i, v1[i], v2[i])
		}
	}
}
reversal/roundtrip_test.go (new file, +93)

@@ -0,0 +1,93 @@
package reversal

import (
	"testing"

	i18n "forge.lthn.ai/core/go-i18n"
)

// TestRoundTrip_ForwardThenReverse: go-i18n composed output → reversal → verify correct tokens
func TestRoundTrip_ForwardThenReverse(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	tests := []struct {
		name      string
		text      string
		wantVerb  string
		wantTense string
	}{
		{
			name:      "Progress pattern",
			text:      i18n.Progress("build"),
			wantVerb:  "build",
			wantTense: "gerund",
		},
		{
			name:      "ActionResult pattern",
			text:      i18n.ActionResult("delete", "file"),
			wantVerb:  "delete",
			wantTense: "past",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			tokens := tok.Tokenise(tt.text)
			foundVerb := false
			for _, tok := range tokens {
				if tok.Type == TokenVerb && tok.VerbInfo.Base == tt.wantVerb {
					foundVerb = true
					if tok.VerbInfo.Tense != tt.wantTense {
						t.Errorf("verb %q tense = %q, want %q", tt.wantVerb, tok.VerbInfo.Tense, tt.wantTense)
					}
				}
			}
			if !foundVerb {
				t.Errorf("did not find verb %q in tokens from %q", tt.wantVerb, tt.text)
			}
		})
	}
}

// TestRoundTrip_MultiplierImprints: variants should be similar to the original
func TestRoundTrip_MultiplierImprints(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()
	m := NewMultiplier()

	original := "Delete the configuration file"
	variants := m.Expand(original)
	origImprint := NewImprint(tok.Tokenise(original))

	for _, v := range variants {
		if v == original {
			continue
		}
		varImprint := NewImprint(tok.Tokenise(v))
		sim := origImprint.Similar(varImprint)
		if sim < 0.2 {
			t.Errorf("Variant %q similarity to original = %f, want >= 0.2", v, sim)
		}
	}
}

// TestRoundTrip_SimilarDocuments: similar docs should score higher than different docs
func TestRoundTrip_SimilarDocuments(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	imp1 := NewImprint(tok.Tokenise("Delete the configuration file"))
	imp2 := NewImprint(tok.Tokenise("Delete the old file"))
	imp3 := NewImprint(tok.Tokenise("Building the project successfully"))

	simSame := imp1.Similar(imp2)
	simDiff := imp1.Similar(imp3)

	if simSame <= simDiff {
		t.Errorf("Similar documents (%f) should score higher than different (%f)", simSame, simDiff)
	}
}
reversal/tokeniser.go (new file, +561)

@@ -0,0 +1,561 @@
// Package reversal provides reverse grammar lookups.
//
// The forward engine (go-i18n) maps base forms to inflected forms:
//
//	PastTense("delete") → "deleted"
//	Gerund("run") → "running"
//
// The reversal engine reads those same tables backwards, turning
// inflected forms back into base forms with tense metadata:
//
//	MatchVerb("deleted") → {Base: "delete", Tense: "past"}
//	MatchVerb("running") → {Base: "run", Tense: "gerund"}
//
// 3-tier lookup: JSON grammar data → irregular verb maps → regular
// morphology rules (verified by round-tripping through forward functions).
package reversal

import (
	"strings"

	i18n "forge.lthn.ai/core/go-i18n"
)

// VerbMatch holds the result of a reverse verb lookup.
type VerbMatch struct {
	Base  string // Base form of the verb ("delete", "run")
	Tense string // "past", "gerund", or "base"
	Form  string // The original inflected form
}

// NounMatch holds the result of a reverse noun lookup.
type NounMatch struct {
	Base   string // Base/singular form of the noun
	Plural bool   // Whether the matched form was plural
	Form   string // The original form
}

// TokenType classifies a token identified during tokenisation.
type TokenType int

const (
	TokenUnknown     TokenType = iota // Unrecognised word
	TokenVerb                         // Matched verb (see VerbInfo)
	TokenNoun                         // Matched noun (see NounInfo)
	TokenArticle                      // Matched article ("a", "an", "the")
	TokenWord                         // Matched word from grammar word map
	TokenPunctuation                  // Punctuation ("...", "?")
)

// Token represents a single classified token from a text string.
type Token struct {
	Raw       string    // Original text as it appeared in input
	Lower     string    // Lowercased form
	Type      TokenType // Classification
	VerbInfo  VerbMatch // Set when Type == TokenVerb
	NounInfo  NounMatch // Set when Type == TokenNoun
	WordCat   string    // Set when Type == TokenWord
	ArtType   string    // Set when Type == TokenArticle
	PunctType string    // Set when Type == TokenPunctuation
}

// Tokeniser provides reverse grammar lookups by maintaining inverse
// indexes built from the forward grammar tables.
type Tokeniser struct {
	pastToBase   map[string]string // "deleted" → "delete"
	gerundToBase map[string]string // "deleting" → "delete"
	baseVerbs    map[string]bool   // "delete" → true
	pluralToBase map[string]string // "files" → "file"
	baseNouns    map[string]bool   // "file" → true
	words        map[string]string // word translations
	lang         string
}

// NewTokeniser creates a Tokeniser for English ("en").
func NewTokeniser() *Tokeniser {
	return NewTokeniserForLang("en")
}

// NewTokeniserForLang creates a Tokeniser for the specified language,
// building inverse indexes from the grammar data.
func NewTokeniserForLang(lang string) *Tokeniser {
	t := &Tokeniser{
		pastToBase:   make(map[string]string),
		gerundToBase: make(map[string]string),
		baseVerbs:    make(map[string]bool),
		pluralToBase: make(map[string]string),
		baseNouns:    make(map[string]bool),
		words:        make(map[string]string),
		lang:         lang,
	}
	t.buildVerbIndex()
	t.buildNounIndex()
	t.buildWordIndex()
	return t
}

// buildVerbIndex reads grammar tables and irregular verb maps to build
// inverse lookup maps: inflected form → base form.
func (t *Tokeniser) buildVerbIndex() {
	// Tier 1: Read from JSON grammar data (via GetGrammarData).
	data := i18n.GetGrammarData(t.lang)
	if data != nil && data.Verbs != nil {
		for base, forms := range data.Verbs {
			t.baseVerbs[base] = true
			if forms.Past != "" {
				t.pastToBase[forms.Past] = base
			}
			if forms.Gerund != "" {
				t.gerundToBase[forms.Gerund] = base
			}
		}
	}

	// Tier 2: Read from the exported irregularVerbs map.
	// Build inverse maps directly from the authoritative source.
	for base, forms := range i18n.IrregularVerbs() {
		t.baseVerbs[base] = true
		if forms.Past != "" {
			if _, exists := t.pastToBase[forms.Past]; !exists {
				t.pastToBase[forms.Past] = base
			}
		}
		if forms.Gerund != "" {
			if _, exists := t.gerundToBase[forms.Gerund]; !exists {
				t.gerundToBase[forms.Gerund] = base
			}
		}
	}
}

// buildNounIndex reads grammar tables and irregular noun maps to build
// inverse lookup maps: plural form → base form.
func (t *Tokeniser) buildNounIndex() {
	// Tier 1: Read from JSON grammar data (via GetGrammarData).
	data := i18n.GetGrammarData(t.lang)
	if data != nil && data.Nouns != nil {
		for base, forms := range data.Nouns {
			t.baseNouns[base] = true
			if forms.Other != "" && forms.Other != base {
				t.pluralToBase[forms.Other] = base
			}
		}
	}

	// Tier 2: Read from the exported irregularNouns map.
	for base, plural := range i18n.IrregularNouns() {
		t.baseNouns[base] = true
		if plural != base {
			if _, exists := t.pluralToBase[plural]; !exists {
				t.pluralToBase[plural] = base
			}
		}
	}
}

// MatchNoun performs a 3-tier reverse lookup for a noun form.
//
// Tier 1: Check if the word is a known base noun.
// Tier 2: Check the pluralToBase inverse map.
// Tier 3: Try reverse morphology rules and round-trip verify via
// the forward function PluralForm().
func (t *Tokeniser) MatchNoun(word string) (NounMatch, bool) {
	word = strings.ToLower(strings.TrimSpace(word))
	if word == "" {
		return NounMatch{}, false
	}

	// Tier 1: Is it a base noun?
	if t.baseNouns[word] {
		return NounMatch{Base: word, Plural: false, Form: word}, true
	}

	// Tier 2: Check inverse map from grammar tables + irregular nouns.
	if base, ok := t.pluralToBase[word]; ok {
		return NounMatch{Base: base, Plural: true, Form: word}, true
	}

	// Tier 3: Reverse morphology with round-trip verification.
	candidates := t.reverseRegularPlural(word)
	for _, c := range candidates {
		if i18n.PluralForm(c) == word {
			return NounMatch{Base: c, Plural: true, Form: word}, true
		}
	}

	return NounMatch{}, false
}

// reverseRegularPlural generates candidate base forms by reversing regular
// plural suffixes. Returns multiple candidates ordered by likelihood.
//
// The forward engine applies rules in this order:
//  1. ends in s/ss/sh/ch/x/z → +es
//  2. ends in consonant+y → ies
//  3. ends in f → ves, fe → ves
//  4. default → +s
//
// We generate candidates for each possible reverse rule. Round-trip
// verification ensures only correct candidates pass.
func (t *Tokeniser) reverseRegularPlural(word string) []string {
	var candidates []string

	// Rule: consonant + "ies" → consonant + "y" (e.g., "entries" → "entry")
	if strings.HasSuffix(word, "ies") && len(word) > 3 {
		base := word[:len(word)-3] + "y"
		candidates = append(candidates, base)
	}

	// Rule: "ves" → "f" or "fe" (e.g., "wolves" → "wolf", "knives" → "knife")
	if strings.HasSuffix(word, "ves") && len(word) > 3 {
		candidates = append(candidates, word[:len(word)-3]+"f")
		candidates = append(candidates, word[:len(word)-3]+"fe")
	}

	// Rule: sibilant + "es" (e.g., "processes" → "process", "branches" → "branch")
	if strings.HasSuffix(word, "ses") || strings.HasSuffix(word, "xes") ||
		strings.HasSuffix(word, "zes") || strings.HasSuffix(word, "ches") ||
		strings.HasSuffix(word, "shes") {
		base := word[:len(word)-2] // strip "es"
		candidates = append(candidates, base)
	}

	// Rule: drop "s" (e.g., "servers" → "server")
	if strings.HasSuffix(word, "s") && len(word) > 1 {
		base := word[:len(word)-1]
		candidates = append(candidates, base)
	}

	return candidates
}

// MatchVerb performs a 3-tier reverse lookup for a verb form.
//
// Tier 1: Check if the word is a known base verb.
// Tier 2: Check the pastToBase and gerundToBase inverse maps.
// Tier 3: Try reverse morphology rules and round-trip verify via
// the forward functions PastTense() and Gerund().
func (t *Tokeniser) MatchVerb(word string) (VerbMatch, bool) {
	word = strings.ToLower(strings.TrimSpace(word))
	if word == "" {
		return VerbMatch{}, false
	}

	// Tier 1: Is it a base verb?
	if t.baseVerbs[word] {
		return VerbMatch{Base: word, Tense: "base", Form: word}, true
	}

	// Tier 2: Check inverse maps from grammar tables + irregular verbs.
	if base, ok := t.pastToBase[word]; ok {
		return VerbMatch{Base: base, Tense: "past", Form: word}, true
	}
	if base, ok := t.gerundToBase[word]; ok {
		return VerbMatch{Base: base, Tense: "gerund", Form: word}, true
	}

	// Tier 3: Reverse morphology with round-trip verification.
	// Try past tense candidates.
	if base := t.bestRoundTrip(word, t.reverseRegularPast(word), i18n.PastTense); base != "" {
		return VerbMatch{Base: base, Tense: "past", Form: word}, true
	}

	// Try gerund candidates.
	if base := t.bestRoundTrip(word, t.reverseRegularGerund(word), i18n.Gerund); base != "" {
		return VerbMatch{Base: base, Tense: "gerund", Form: word}, true
	}

	return VerbMatch{}, false
}

// bestRoundTrip selects the best candidate from a list by round-tripping
// each through a forward function. When multiple candidates round-trip
// successfully (ambiguity), it uses the following priority:
//  1. Candidates that are known base verbs (in grammar tables / irregular maps)
//  2. Candidates ending in a VCe pattern (vowel-consonant-e, the "magic e"
//     pattern common in real English verbs like "delete", "create", "use").
//     This avoids phantom verbs like "walke" or "processe" which have a
//     CCe pattern (consonant-consonant-e) that doesn't occur naturally.
//  3. Candidates NOT ending in "e" (the default morphology path)
//  4. First match in candidate order as final tiebreaker
func (t *Tokeniser) bestRoundTrip(target string, candidates []string, forward func(string) string) string {
	var matches []string
	for _, c := range candidates {
		if forward(c) == target {
			matches = append(matches, c)
		}
	}
	if len(matches) == 0 {
		return ""
	}
	if len(matches) == 1 {
		return matches[0]
	}

	// Priority 1: known base verb
	for _, m := range matches {
		if t.baseVerbs[m] {
			return m
		}
	}

	// Priority 2: prefer VCe-ending candidate (real English verb pattern)
	for _, m := range matches {
		if hasVCeEnding(m) {
			return m
		}
	}

	// Priority 3: prefer candidate not ending in "e" (avoids phantom verbs
	// with CCe endings like "walke", "processe")
	for _, m := range matches {
		if !strings.HasSuffix(m, "e") {
			return m
		}
	}

	return matches[0]
}

// hasVCeEnding returns true if the word ends in a vowel-consonant-e pattern
// (the "magic e" pattern). This is characteristic of real English verbs like
// "delete" (-ete), "create" (-ate), "use" (-use), "close" (-ose).
// Phantom verbs produced by naive suffix stripping like "walke" (-lke) or
// "processe" (-sse) end in consonant-consonant-e and return false.
func hasVCeEnding(word string) bool {
	if len(word) < 3 || word[len(word)-1] != 'e' {
		return false
	}
	lastConsonant := word[len(word)-2]
	vowelBefore := word[len(word)-3]
	return !isVowelByte(lastConsonant) && isVowelByte(vowelBefore)
}

func isVowelByte(b byte) bool {
	switch b {
	case 'a', 'e', 'i', 'o', 'u':
		return true
	}
	return false
}

// reverseRegularPast generates candidate base forms by reversing regular
// past tense suffixes. Returns multiple candidates ordered by likelihood.
//
// The forward engine applies rules in this order:
//  1. ends in "e" → +d (create → created)
//  2. ends in "y" + consonant → ied (copy → copied)
//  3. shouldDoubleConsonant → double+ed (stop → stopped)
//  4. default → +ed (walk → walked)
//
// We generate candidates for each possible reverse rule. Round-trip
// verification (in bestRoundTrip) ensures only correct candidates pass.
func (t *Tokeniser) reverseRegularPast(word string) []string {
	var candidates []string

	if !strings.HasSuffix(word, "ed") {
		return candidates
	}

	// Rule: consonant + "ied" → consonant + "y" (e.g., "copied" → "copy")
	if strings.HasSuffix(word, "ied") && len(word) > 3 {
		base := word[:len(word)-3] + "y"
		candidates = append(candidates, base)
	}

	// Rule: doubled consonant + "ed" → single consonant (e.g., "stopped" → "stop")
	if len(word) > 4 {
		beforeEd := word[:len(word)-2]
		lastChar := beforeEd[len(beforeEd)-1]
		if len(beforeEd) >= 2 && beforeEd[len(beforeEd)-2] == lastChar {
			base := beforeEd[:len(beforeEd)-1]
			candidates = append(candidates, base)
		}
	}

	// Rule: stem + "d" where stem ends in "e" (e.g., "created" → "create")
	if len(word) > 2 {
		stemPlusE := word[:len(word)-1] // strip "d", leaving stem + "e"
		candidates = append(candidates, stemPlusE)
	}

	// Rule: stem + "ed" (e.g., "walked" → "walk")
	if len(word) > 2 {
		stem := word[:len(word)-2]
		candidates = append(candidates, stem)
	}

	return candidates
}

// reverseRegularGerund generates candidate base forms by reversing regular
// gerund suffixes. Returns multiple candidates ordered by likelihood.
//
// Rules reversed:
//   - verb + "ing" (e.g., "walking" → "walk")
//   - verb[:-1] + "ing" (e.g., "creating" → "create", drop e)
//   - doubled consonant (e.g., "stopping" → "stop")
//   - verb[:-2] + "ying" (e.g., "dying" → "die")
func (t *Tokeniser) reverseRegularGerund(word string) []string {
	var candidates []string

	if !strings.HasSuffix(word, "ing") || len(word) < 4 {
		return candidates
	}

	stem := word[:len(word)-3] // strip "ing"

	// Rule: "ying" → "ie" (e.g., "dying" → "die")
	if strings.HasSuffix(word, "ying") && len(word) > 4 {
		base := word[:len(word)-4] + "ie"
		candidates = append(candidates, base)
	}

	// Rule: doubled consonant + "ing" → single consonant (e.g., "stopping" → "stop")
	if len(stem) >= 2 && stem[len(stem)-1] == stem[len(stem)-2] {
		base := stem[:len(stem)-1]
		candidates = append(candidates, base)
	}

	// Rule: direct strip "ing" (e.g., "walking" → "walk")
	// This must come before the stem+"e" rule to avoid false positives
	// like "walke" round-tripping through Gerund("walke") = "walking".
	candidates = append(candidates, stem)

	// Rule: stem + "e" was dropped before "ing" (e.g., "creating" → "create")
	// Try adding "e" back.
	candidates = append(candidates, stem+"e")

	return candidates
}

// buildWordIndex reads GrammarData.Words and builds a reverse lookup map.
// Both the key (e.g., "url") and the display form (e.g., "URL") map back
// to the key, enabling case-insensitive lookups.
func (t *Tokeniser) buildWordIndex() {
	data := i18n.GetGrammarData(t.lang)
	if data == nil || data.Words == nil {
		return
	}
	for key, display := range data.Words {
		// Map the key itself (already lowercase)
		t.words[strings.ToLower(key)] = key
		// Map the display form (e.g., "URL" → "url", "SSH" → "ssh")
		t.words[strings.ToLower(display)] = key
	}
}

// MatchWord performs a case-insensitive lookup in the words map.
// Returns the category key and true if found, or ("", false) otherwise.
func (t *Tokeniser) MatchWord(word string) (string, bool) {
	cat, ok := t.words[strings.ToLower(word)]
	return cat, ok
}

// MatchArticle checks whether a word is an article (definite or indefinite).
// Returns the article type ("indefinite" or "definite") and true if matched,
// or ("", false) otherwise.
func (t *Tokeniser) MatchArticle(word string) (string, bool) {
	data := i18n.GetGrammarData(t.lang)
	if data == nil {
		return "", false
	}

	lower := strings.ToLower(word)

	if lower == strings.ToLower(data.Articles.IndefiniteDefault) ||
		lower == strings.ToLower(data.Articles.IndefiniteVowel) {
		return "indefinite", true
	}
	if lower == strings.ToLower(data.Articles.Definite) {
		return "definite", true
	}

	return "", false
}

// Tokenise splits text on whitespace and classifies each word.
// Priority: punctuation → article → verb → noun → word → unknown.
// Trailing punctuation is stripped from words before matching.
func (t *Tokeniser) Tokenise(text string) []Token {
	text = strings.TrimSpace(text)
	if text == "" {
		return nil
	}

	parts := strings.Fields(text)
	var tokens []Token

	for _, raw := range parts {
		// Strip trailing punctuation to get the clean word.
		word, punct := splitTrailingPunct(raw)

		// Classify the word portion (if any).
		if word != "" {
			tok := Token{Raw: raw, Lower: strings.ToLower(word)}

			if artType, ok := t.MatchArticle(word); ok {
				tok.Type = TokenArticle
				tok.ArtType = artType
			} else if vm, ok := t.MatchVerb(word); ok {
				tok.Type = TokenVerb
				tok.VerbInfo = vm
			} else if nm, ok := t.MatchNoun(word); ok {
				tok.Type = TokenNoun
				tok.NounInfo = nm
			} else if cat, ok := t.MatchWord(word); ok {
				tok.Type = TokenWord
				tok.WordCat = cat
			} else {
				tok.Type = TokenUnknown
			}
			tokens = append(tokens, tok)
		}

		// Emit a punctuation token if trailing punctuation was found.
		if punct != "" {
			if punctType, ok := matchPunctuation(punct); ok {
				tokens = append(tokens, Token{
					Raw:       punct,
					Lower:     punct,
					Type:      TokenPunctuation,
					PunctType: punctType,
				})
			}
		}
	}

	return tokens
}

// splitTrailingPunct separates a word from its trailing punctuation.
// Returns the word and the punctuation suffix. Punctuation patterns
// recognised: "..." (progress), "?" (question), ":" (label).
func splitTrailingPunct(s string) (string, string) {
	// Check for "..." suffix first (3-char pattern).
	if strings.HasSuffix(s, "...") {
		return s[:len(s)-3], "..."
	}
	// Check single-char trailing punctuation.
	if len(s) > 1 {
		last := s[len(s)-1]
		if last == '?' || last == ':' {
			return s[:len(s)-1], string(last)
		}
	}
	return s, ""
}

// matchPunctuation detects known punctuation patterns.
// Returns the punctuation type and true if recognised.
func matchPunctuation(punct string) (string, bool) {
	switch punct {
	case "...":
		return "progress", true
	case "?":
		return "question", true
	case ":":
		return "label", true
	}
	return "", false
}
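The practical effect of the three tiers is that words absent from every table can still be classified: a Tier 3 candidate survives only if the forward engine regenerates the observed form. A minimal sketch matching the cases exercised in tokeniser_test.go below (assumes "fmt" is imported):

	func ExampleTokeniser_MatchVerb() {
		svc, _ := i18n.New()
		i18n.SetDefault(svc)

		tok := NewTokeniser()

		// Tier 2: "deleted" is in the inverse map built from the grammar tables.
		if m, ok := tok.MatchVerb("deleted"); ok {
			fmt.Println(m.Base, m.Tense) // delete past
		}

		// Tier 3: "walked" appears in no table; the candidate "walk" is
		// accepted only because i18n.PastTense("walk") == "walked" round-trips.
		if m, ok := tok.MatchVerb("walked"); ok {
			fmt.Println(m.Base, m.Tense) // walk past
		}
	}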
reversal/tokeniser_test.go (new file, +310)

@@ -0,0 +1,310 @@
package reversal

import (
	"testing"

	i18n "forge.lthn.ai/core/go-i18n"
)

func setup(t *testing.T) {
	t.Helper()
	svc, err := i18n.New()
	if err != nil {
		t.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(svc)
}

func TestTokeniser_MatchVerb_Irregular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word      string
		wantOK    bool
		wantBase  string
		wantTense string
	}{
		// Irregular past tense
		{"deleted", true, "delete", "past"},
		{"deleting", true, "delete", "gerund"},
		{"went", true, "go", "past"},
		{"going", true, "go", "gerund"},
		{"was", true, "be", "past"},
		{"being", true, "be", "gerund"},
		{"ran", true, "run", "past"},
		{"running", true, "run", "gerund"},
		{"wrote", true, "write", "past"},
		{"writing", true, "write", "gerund"},
		{"built", true, "build", "past"},
		{"building", true, "build", "gerund"},
		{"committed", true, "commit", "past"},
		{"committing", true, "commit", "gerund"},

		// Base forms
		{"delete", true, "delete", "base"},
		{"go", true, "go", "base"},

		// Unknown words return false
		{"xyzzy", false, "", ""},
		{"flurble", false, "", ""},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			match, ok := tok.MatchVerb(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
			}
			if !ok {
				return
			}
			if match.Base != tt.wantBase {
				t.Errorf("MatchVerb(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
			}
			if match.Tense != tt.wantTense {
				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tt.word, match.Tense, tt.wantTense)
			}
		})
	}
}

func TestTokeniser_MatchNoun_Irregular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word       string
		wantOK     bool
		wantBase   string
		wantPlural bool
	}{
		{"files", true, "file", true},
		{"file", true, "file", false},
		{"people", true, "person", true},
		{"person", true, "person", false},
		{"children", true, "child", true},
		{"child", true, "child", false},
		{"repositories", true, "repository", true},
		{"repository", true, "repository", false},
		{"branches", true, "branch", true},
		{"branch", true, "branch", false},
		{"xyzzy", false, "", false},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			match, ok := tok.MatchNoun(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
			}
			if !ok {
				return
			}
			if match.Base != tt.wantBase {
				t.Errorf("MatchNoun(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
			}
			if match.Plural != tt.wantPlural {
				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tt.word, match.Plural, tt.wantPlural)
			}
		})
	}
}

func TestTokeniser_MatchNoun_Regular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word       string
		wantOK     bool
		wantBase   string
		wantPlural bool
	}{
		// Regular nouns NOT in grammar tables, detected by reverse morphology + round-trip
		{"servers", true, "server", true},
		{"processes", true, "process", true},
		{"entries", true, "entry", true},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			match, ok := tok.MatchNoun(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
			}
			if !ok {
				return
			}
			if match.Base != tt.wantBase {
				t.Errorf("MatchNoun(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
			}
			if match.Plural != tt.wantPlural {
				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tt.word, match.Plural, tt.wantPlural)
			}
		})
	}
}

func TestTokeniser_MatchWord(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word    string
		wantCat string
		wantOK  bool
	}{
		{"URL", "url", true},
		{"url", "url", true},
		{"ID", "id", true},
		{"SSH", "ssh", true},
		{"PHP", "php", true},
		{"xyzzy", "", false},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			cat, ok := tok.MatchWord(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchWord(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
			}
			if ok && cat != tt.wantCat {
				t.Errorf("MatchWord(%q) = %q, want %q", tt.word, cat, tt.wantCat)
			}
		})
	}
}

func TestTokeniser_MatchArticle(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word     string
		wantType string
		wantOK   bool
	}{
		{"a", "indefinite", true},
		{"an", "indefinite", true},
		{"the", "definite", true},
		{"A", "indefinite", true},
		{"The", "definite", true},
		{"foo", "", false},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
			}
			if ok && artType != tt.wantType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType)
			}
		})
	}
}

func TestTokeniser_Tokenise(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tokens := tok.Tokenise("Deleted the configuration files")

	if len(tokens) != 4 {
		t.Fatalf("Tokenise() returned %d tokens, want 4", len(tokens))
	}

	// "Deleted" → verb, past tense
	if tokens[0].Type != TokenVerb {
		t.Errorf("tokens[0].Type = %v, want TokenVerb", tokens[0].Type)
	}
	if tokens[0].VerbInfo.Tense != "past" {
		t.Errorf("tokens[0].VerbInfo.Tense = %q, want %q", tokens[0].VerbInfo.Tense, "past")
	}

	// "the" → article
	if tokens[1].Type != TokenArticle {
		t.Errorf("tokens[1].Type = %v, want TokenArticle", tokens[1].Type)
	}

	// "configuration" → unknown
	if tokens[2].Type != TokenUnknown {
		t.Errorf("tokens[2].Type = %v, want TokenUnknown", tokens[2].Type)
	}

	// "files" → noun, plural
	if tokens[3].Type != TokenNoun {
		t.Errorf("tokens[3].Type = %v, want TokenNoun", tokens[3].Type)
	}
	if !tokens[3].NounInfo.Plural {
		t.Errorf("tokens[3].NounInfo.Plural = false, want true")
	}
}

func TestTokeniser_Tokenise_Punctuation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tokens := tok.Tokenise("Building project...")
	hasPunct := false
	for _, tok := range tokens {
		if tok.Type == TokenPunctuation {
			hasPunct = true
		}
	}
	if !hasPunct {
		t.Error("did not detect punctuation in \"Building project...\"")
	}
}

func TestTokeniser_Tokenise_Empty(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tokens := tok.Tokenise("")
	if len(tokens) != 0 {
		t.Errorf("Tokenise(\"\") returned %d tokens, want 0", len(tokens))
	}
}

func TestTokeniser_MatchVerb_Regular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word      string
		wantOK    bool
		wantBase  string
		wantTense string
	}{
		// Regular verbs NOT in grammar tables, detected by reverse morphology + round-trip
		{"walked", true, "walk", "past"},
		{"walking", true, "walk", "gerund"},
		{"processed", true, "process", "past"},
		{"processing", true, "process", "gerund"},
		{"copied", true, "copy", "past"},
		{"copying", true, "copy", "gerund"},
		{"stopped", true, "stop", "past"},
		{"stopping", true, "stop", "gerund"},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			match, ok := tok.MatchVerb(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
			}
			if !ok {
				return
			}
			if match.Base != tt.wantBase {
				t.Errorf("MatchVerb(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
			}
			if match.Tense != tt.wantTense {
				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tt.word, match.Tense, tt.wantTense)
			}
		})
	}
}