Compare commits

20ab172f5b...9474edde6d (8 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 9474edde6d | |
| | b3f6c817d4 | |
| | a9c6672b12 | |
| | 8b23600632 | |
| | f09cff894f | |
| | 6d72540530 | |
| | 786909c193 | |
| | f1aa4adbc4 | |
9 changed files with 1607 additions and 1 deletion
grammar.go (+18)

@@ -20,6 +20,24 @@ func SetGrammarData(lang string, data *GrammarData) {
 	grammarCache[lang] = data
 }
+
+// IrregularVerbs returns a copy of the irregular verb forms map.
+func IrregularVerbs() map[string]VerbForms {
+	result := make(map[string]VerbForms, len(irregularVerbs))
+	for k, v := range irregularVerbs {
+		result[k] = v
+	}
+	return result
+}
+
+// IrregularNouns returns a copy of the irregular nouns map.
+func IrregularNouns() map[string]string {
+	result := make(map[string]string, len(irregularNouns))
+	for k, v := range irregularNouns {
+		result[k] = v
+	}
+	return result
+}
 
 func getVerbForm(lang, verb, form string) string {
 	data := GetGrammarData(lang)
 	if data == nil || data.Verbs == nil {
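Both accessors return defensive copies, so callers can build derived lookup structures without mutating go-i18n's internal tables. A minimal sketch of the intended consumption pattern, mirroring what buildVerbIndex in reversal/tokeniser.go below does with the returned map:

	// Invert past form → base form. Because IrregularVerbs() returns a
	// copy, building (or even mutating) this index cannot corrupt the
	// package's internal state.
	pastToBase := make(map[string]string)
	for base, forms := range i18n.IrregularVerbs() {
		if forms.Past != "" {
			pastToBase[forms.Past] = base
		}
	}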
@@ -39,7 +39,10 @@
 "hit": { "base": "hit", "past": "hit", "gerund": "hitting" },
 "sit": { "base": "sit", "past": "sat", "gerund": "sitting" },
 "split": { "base": "split", "past": "split", "gerund": "splitting" },
-"shut": { "base": "shut", "past": "shut", "gerund": "shutting" }
+"shut": { "base": "shut", "past": "shut", "gerund": "shutting" },
+"delete": { "base": "delete", "past": "deleted", "gerund": "deleting" },
+"update": { "base": "update", "past": "updated", "gerund": "updating" },
+"push": { "base": "push", "past": "pushed", "gerund": "pushing" }
 },
 "noun": {
 "file": { "one": "file", "other": "files" },
reversal/imprint.go (new file, +180)

@@ -0,0 +1,180 @@
package reversal

import "math"

// GrammarImprint is a low-dimensional grammar feature vector.
type GrammarImprint struct {
	VerbDistribution   map[string]float64 // verb base -> frequency
	TenseDistribution  map[string]float64 // "past"/"gerund"/"base" -> ratio
	NounDistribution   map[string]float64 // noun base -> frequency
	PluralRatio        float64            // proportion of plural nouns (0.0-1.0)
	DomainVocabulary   map[string]int     // gram.word category -> hit count
	ArticleUsage       map[string]float64 // "definite"/"indefinite" -> ratio
	PunctuationPattern map[string]float64 // "label"/"progress"/"question" -> ratio
	TokenCount         int
	UniqueVerbs        int
	UniqueNouns        int
}

// NewImprint calculates a GrammarImprint from classified tokens.
func NewImprint(tokens []Token) GrammarImprint {
	imp := GrammarImprint{
		VerbDistribution:   make(map[string]float64),
		TenseDistribution:  make(map[string]float64),
		NounDistribution:   make(map[string]float64),
		DomainVocabulary:   make(map[string]int),
		ArticleUsage:       make(map[string]float64),
		PunctuationPattern: make(map[string]float64),
	}

	if len(tokens) == 0 {
		return imp
	}

	imp.TokenCount = len(tokens)

	verbBases := make(map[string]bool)
	nounBases := make(map[string]bool)
	var verbCount, nounCount, articleCount, punctCount int
	var pluralNouns, totalNouns int

	for _, tok := range tokens {
		switch tok.Type {
		case TokenVerb:
			verbCount++
			base := tok.VerbInfo.Base
			imp.VerbDistribution[base]++
			imp.TenseDistribution[tok.VerbInfo.Tense]++
			verbBases[base] = true

		case TokenNoun:
			nounCount++
			base := tok.NounInfo.Base
			imp.NounDistribution[base]++
			nounBases[base] = true
			totalNouns++
			if tok.NounInfo.Plural {
				pluralNouns++
			}

		case TokenArticle:
			articleCount++
			imp.ArticleUsage[tok.ArtType]++

		case TokenWord:
			imp.DomainVocabulary[tok.WordCat]++

		case TokenPunctuation:
			punctCount++
			imp.PunctuationPattern[tok.PunctType]++
		}
	}

	imp.UniqueVerbs = len(verbBases)
	imp.UniqueNouns = len(nounBases)

	// Calculate plural ratio
	if totalNouns > 0 {
		imp.PluralRatio = float64(pluralNouns) / float64(totalNouns)
	}

	// Normalise frequency maps to sum to 1.0
	normaliseMap(imp.VerbDistribution)
	normaliseMap(imp.TenseDistribution)
	normaliseMap(imp.NounDistribution)
	normaliseMap(imp.ArticleUsage)
	normaliseMap(imp.PunctuationPattern)

	return imp
}

// normaliseMap scales all values in a map so they sum to 1.0.
// If the map is empty or sums to zero, it is left unchanged.
func normaliseMap(m map[string]float64) {
	var total float64
	for _, v := range m {
		total += v
	}
	if total == 0 {
		return
	}
	for k, v := range m {
		m[k] = v / total
	}
}

// Similar returns weighted cosine similarity between two imprints (0.0-1.0).
// Weights: verb(0.30), tense(0.20), noun(0.25), article(0.15), punct(0.10).
func (a GrammarImprint) Similar(b GrammarImprint) float64 {
	// Two empty imprints are identical.
	if a.TokenCount == 0 && b.TokenCount == 0 {
		return 1.0
	}

	type component struct {
		weight float64
		a, b   map[string]float64
	}

	components := []component{
		{0.30, a.VerbDistribution, b.VerbDistribution},
		{0.20, a.TenseDistribution, b.TenseDistribution},
		{0.25, a.NounDistribution, b.NounDistribution},
		{0.15, a.ArticleUsage, b.ArticleUsage},
		{0.10, a.PunctuationPattern, b.PunctuationPattern},
	}

	var totalWeight float64
	var weightedSum float64

	for _, c := range components {
		// Skip components where both maps are empty (no signal).
		if len(c.a) == 0 && len(c.b) == 0 {
			continue
		}
		totalWeight += c.weight
		weightedSum += c.weight * mapSimilarity(c.a, c.b)
	}

	if totalWeight == 0 {
		return 1.0
	}

	return weightedSum / totalWeight
}

// mapSimilarity computes cosine similarity between two frequency maps.
// Returns 1.0 for identical distributions, 0.0 for completely disjoint.
func mapSimilarity(a, b map[string]float64) float64 {
	if len(a) == 0 && len(b) == 0 {
		return 1.0
	}
	if len(a) == 0 || len(b) == 0 {
		return 0.0
	}

	// Collect the union of keys.
	keys := make(map[string]bool)
	for k := range a {
		keys[k] = true
	}
	for k := range b {
		keys[k] = true
	}

	var dot, magA, magB float64
	for k := range keys {
		va := a[k]
		vb := b[k]
		dot += va * vb
		magA += va * va
		magB += vb * vb
	}

	denom := math.Sqrt(magA) * math.Sqrt(magB)
	if denom == 0 {
		return 0.0
	}

	return dot / denom
}
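Together, NewImprint and Similar give a cheap, deterministic way to compare the grammatical shape of two strings with no embeddings and no API calls. A minimal sketch of the flow, written as an example function that could sit alongside the tests in package reversal (the i18n setup mirrors the test files; assumes "fmt" is imported):

	func ExampleGrammarImprint_Similar() {
		svc, _ := i18n.New()
		i18n.SetDefault(svc)

		tok := NewTokeniser()
		a := NewImprint(tok.Tokenise("Deleted the configuration files"))
		b := NewImprint(tok.Tokenise("Deleting the configuration file"))

		// Same verb base, noun base and article; different tense and number.
		// Verb, noun and article components each score 1.0, tense scores 0.0,
		// punctuation is skipped, so the weighted result is
		// (0.30 + 0 + 0.25 + 0.15) / 0.90 ≈ 0.78.
		fmt.Printf("%.2f\n", a.Similar(b))
	}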
reversal/imprint_test.go (new file, +116)

@@ -0,0 +1,116 @@
package reversal

import (
	"testing"

	i18n "forge.lthn.ai/core/go-i18n"
)

func TestNewImprint(t *testing.T) {
	svc, err := i18n.New()
	if err != nil {
		t.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(svc)

	tok := NewTokeniser()
	tokens := tok.Tokenise("Deleted the configuration files successfully")
	imp := NewImprint(tokens)

	if imp.TokenCount != 5 {
		t.Errorf("TokenCount = %d, want 5", imp.TokenCount)
	}
	if imp.UniqueVerbs == 0 {
		t.Error("UniqueVerbs = 0, want > 0")
	}
	if imp.UniqueNouns == 0 {
		t.Error("UniqueNouns = 0, want > 0")
	}
	if imp.TenseDistribution["past"] == 0 {
		t.Error("TenseDistribution[\"past\"] = 0, want > 0")
	}
	if imp.ArticleUsage["definite"] == 0 {
		t.Error("ArticleUsage[\"definite\"] = 0, want > 0")
	}
}

func TestNewImprint_Empty(t *testing.T) {
	imp := NewImprint(nil)
	if imp.TokenCount != 0 {
		t.Errorf("TokenCount = %d, want 0", imp.TokenCount)
	}
}

func TestNewImprint_PluralRatio(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	// All plural nouns
	tokens := tok.Tokenise("files branches repositories")
	imp := NewImprint(tokens)
	if imp.PluralRatio < 0.5 {
		t.Errorf("PluralRatio = %f for all-plural input, want >= 0.5", imp.PluralRatio)
	}

	// All singular nouns
	tokens = tok.Tokenise("file branch repository")
	imp = NewImprint(tokens)
	if imp.PluralRatio > 0.5 {
		t.Errorf("PluralRatio = %f for all-singular input, want <= 0.5", imp.PluralRatio)
	}
}

func TestImprint_Similar_SameText(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()
	tokens := tok.Tokenise("Delete the configuration file")
	imp1 := NewImprint(tokens)
	imp2 := NewImprint(tokens)

	sim := imp1.Similar(imp2)
	if sim != 1.0 {
		t.Errorf("Same text similarity = %f, want 1.0", sim)
	}
}

func TestImprint_Similar_SimilarText(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	imp1 := NewImprint(tok.Tokenise("Delete the configuration file"))
	imp2 := NewImprint(tok.Tokenise("Deleted the configuration files"))

	sim := imp1.Similar(imp2)
	if sim < 0.3 {
		t.Errorf("Similar text similarity = %f, want >= 0.3", sim)
	}
	if sim >= 1.0 {
		t.Errorf("Different text similarity = %f, want < 1.0", sim)
	}
}

func TestImprint_Similar_DifferentText(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	imp1 := NewImprint(tok.Tokenise("Delete the configuration file"))
	imp2 := NewImprint(tok.Tokenise("Building the project successfully"))

	sim := imp1.Similar(imp2)
	if sim > 0.7 {
		t.Errorf("Different text similarity = %f, want <= 0.7", sim)
	}
}

func TestImprint_Similar_Empty(t *testing.T) {
	imp1 := NewImprint(nil)
	imp2 := NewImprint(nil)
	sim := imp1.Similar(imp2)
	if sim != 1.0 {
		t.Errorf("Empty imprint similarity = %f, want 1.0", sim)
	}
}
reversal/multiplier.go (new file, +258)

@@ -0,0 +1,258 @@
package reversal

import (
	"strings"
	"unicode"

	i18n "forge.lthn.ai/core/go-i18n"
)

// Multiplier generates deterministic grammatical variants of text
// for training data augmentation. Zero API calls.
type Multiplier struct {
	tokeniser *Tokeniser
}

// NewMultiplier creates a Multiplier using the default English tokeniser.
func NewMultiplier() *Multiplier {
	return &Multiplier{tokeniser: NewTokeniser()}
}

// NewMultiplierForLang creates a Multiplier for the specified language.
func NewMultiplierForLang(lang string) *Multiplier {
	return &Multiplier{tokeniser: NewTokeniserForLang(lang)}
}

// Expand produces: original + tense flips (past, gerund) + number flips (plural toggle) + combinations.
// All output is deterministic and grammatically correct.
func (m *Multiplier) Expand(text string) []string {
	text = strings.TrimSpace(text)
	if text == "" {
		return nil
	}

	tokens := m.tokeniser.Tokenise(text)
	if len(tokens) == 0 {
		return nil
	}

	// Collect indices of verbs and nouns for targeted replacement.
	var verbIndices []int
	var nounIndices []int
	for i, tok := range tokens {
		switch tok.Type {
		case TokenVerb:
			verbIndices = append(verbIndices, i)
		case TokenNoun:
			nounIndices = append(nounIndices, i)
		}
	}

	// Build the list of variants in deterministic order:
	// 1. Original
	// 2. Single verb transforms (past, gerund) for each verb
	// 3. Single noun transforms (plural toggle) for each noun
	// 4. Combined transforms (verb transform + noun transform)
	seen := make(map[string]bool)
	var results []string

	addVariant := func(s string) {
		if !seen[s] {
			seen[s] = true
			results = append(results, s)
		}
	}

	// 1. Original text
	addVariant(text)

	// 2. Verb transforms: for each verb, produce past and gerund variants
	for _, vi := range verbIndices {
		pastTokens := m.applyVerbTransform(tokens, vi, "past")
		addVariant(reconstruct(pastTokens))

		gerundTokens := m.applyVerbTransform(tokens, vi, "gerund")
		addVariant(reconstruct(gerundTokens))

		baseTokens := m.applyVerbTransform(tokens, vi, "base")
		addVariant(reconstruct(baseTokens))
	}

	// 3. Noun transforms: for each noun, toggle plural/singular
	for _, ni := range nounIndices {
		pluralTokens := m.applyNounTransform(tokens, ni)
		addVariant(reconstruct(pluralTokens))
	}

	// 4. Combinations: each verb transform + each noun transform
	for _, vi := range verbIndices {
		for _, ni := range nounIndices {
			// past + noun toggle
			pastTokens := m.applyVerbTransform(tokens, vi, "past")
			pastPluralTokens := m.applyNounTransformOnTokens(pastTokens, ni)
			addVariant(reconstruct(pastPluralTokens))

			// gerund + noun toggle
			gerundTokens := m.applyVerbTransform(tokens, vi, "gerund")
			gerundPluralTokens := m.applyNounTransformOnTokens(gerundTokens, ni)
			addVariant(reconstruct(gerundPluralTokens))

			// base + noun toggle
			baseTokens := m.applyVerbTransform(tokens, vi, "base")
			basePluralTokens := m.applyNounTransformOnTokens(baseTokens, ni)
			addVariant(reconstruct(basePluralTokens))
		}
	}

	return results
}

// applyVerbTransform returns a copy of tokens with the verb at index vi
// transformed to the specified tense ("past", "gerund", or "base").
func (m *Multiplier) applyVerbTransform(tokens []Token, vi int, targetTense string) []Token {
	result := make([]Token, len(tokens))
	copy(result, tokens)

	tok := tokens[vi]
	base := tok.VerbInfo.Base
	currentTense := tok.VerbInfo.Tense

	if currentTense == targetTense {
		return result
	}

	var newForm string
	switch targetTense {
	case "past":
		newForm = i18n.PastTense(base)
	case "gerund":
		newForm = i18n.Gerund(base)
	case "base":
		newForm = base
	}

	if newForm == "" {
		return result
	}

	// Preserve capitalisation of the original token.
	newForm = preserveCase(tok.Raw, newForm)

	result[vi] = Token{
		Raw:   newForm,
		Lower: strings.ToLower(newForm),
		Type:  TokenVerb,
		VerbInfo: VerbMatch{
			Base:  base,
			Tense: targetTense,
			Form:  newForm,
		},
	}

	return result
}

// applyNounTransform returns a copy of tokens with the noun at index ni
// toggled between singular and plural.
func (m *Multiplier) applyNounTransform(tokens []Token, ni int) []Token {
	return m.applyNounTransformOnTokens(tokens, ni)
}

// applyNounTransformOnTokens returns a copy of the given tokens with the
// noun at index ni toggled between singular and plural.
func (m *Multiplier) applyNounTransformOnTokens(tokens []Token, ni int) []Token {
	result := make([]Token, len(tokens))
	copy(result, tokens)

	tok := tokens[ni]
	base := tok.NounInfo.Base
	isPlural := tok.NounInfo.Plural

	var newForm string
	var newPlural bool

	if isPlural {
		// Already plural, revert to singular (base form).
		newForm = base
		newPlural = false
	} else {
		// Singular, generate plural.
		newForm = i18n.PluralForm(base)
		newPlural = true
	}

	if newForm == "" {
		return result
	}

	// Preserve capitalisation.
	newForm = preserveCase(tok.Raw, newForm)

	result[ni] = Token{
		Raw:   newForm,
		Lower: strings.ToLower(newForm),
		Type:  TokenNoun,
		NounInfo: NounMatch{
			Base:   base,
			Plural: newPlural,
			Form:   newForm,
		},
	}

	return result
}

// reconstruct joins tokens back into a string, preserving spacing.
func reconstruct(tokens []Token) string {
	var b strings.Builder
	for i, tok := range tokens {
		if i > 0 {
			// Punctuation tokens that were split from the previous word
			// should not have a leading space.
			if tok.Type == TokenPunctuation {
				b.WriteString(tok.Raw)
				continue
			}
			b.WriteByte(' ')
		}
		b.WriteString(tok.Raw)
	}
	return b.String()
}

// preserveCase applies the capitalisation pattern of the original word
// to the replacement word. If the original started with an uppercase
// letter, the replacement will too.
func preserveCase(original, replacement string) string {
	if len(original) == 0 || len(replacement) == 0 {
		return replacement
	}

	origRunes := []rune(original)
	repRunes := []rune(replacement)

	// If the original is all uppercase (like "DELETE"), make replacement all uppercase.
	if isAllUpper(original) && len(original) > 1 {
		return strings.ToUpper(replacement)
	}

	// If the first character of the original is uppercase, capitalise the replacement.
	if unicode.IsUpper(origRunes[0]) {
		repRunes[0] = unicode.ToUpper(repRunes[0])
		return string(repRunes)
	}

	// Otherwise, ensure the replacement starts lowercase.
	repRunes[0] = unicode.ToLower(repRunes[0])
	return string(repRunes)
}

// isAllUpper returns true if every letter in the string is uppercase.
func isAllUpper(s string) bool {
	for _, r := range s {
		if unicode.IsLetter(r) && !unicode.IsUpper(r) {
			return false
		}
	}
	return true
}
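For a sentence with one verb and one noun, Expand therefore emits the original, the tense variants, the number toggle, and their cross-products, deduplicated in a stable order. A minimal driving sketch, consistent with the expectations in multiplier_test.go below (assumes "fmt" is imported):

	func ExampleMultiplier_Expand() {
		svc, _ := i18n.New()
		i18n.SetDefault(svc)

		m := NewMultiplier()
		for _, v := range m.Expand("Delete the configuration file") {
			fmt.Println(v)
		}
		// The deterministic output includes, among other variants:
		//   Delete the configuration file    (original)
		//   Deleted the configuration file   (past)
		//   Deleting the configuration file  (gerund)
		//   Delete the configuration files   (plural toggle)
	}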
reversal/multiplier_test.go (new file, +67)

@@ -0,0 +1,67 @@
package reversal

import (
	"testing"

	i18n "forge.lthn.ai/core/go-i18n"
)

func TestMultiplier_Expand(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	m := NewMultiplier()
	variants := m.Expand("Delete the configuration file")

	if len(variants) < 4 {
		t.Errorf("Expand() returned %d variants, want >= 4", len(variants))
	}

	expected := map[string]bool{
		"Delete the configuration file":   true, // original
		"Deleted the configuration file":  true, // past
		"Deleting the configuration file": true, // gerund
		"Delete the configuration files":  true, // plural
	}
	for _, v := range variants {
		delete(expected, v)
	}
	for missing := range expected {
		t.Errorf("Expand() missing expected variant: %q", missing)
	}
}

func TestMultiplier_Expand_NoVerbs(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	m := NewMultiplier()
	variants := m.Expand("the configuration file")
	if len(variants) < 2 {
		t.Errorf("Expand() returned %d variants, want >= 2", len(variants))
	}
}

func TestMultiplier_Expand_Empty(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	m := NewMultiplier()
	variants := m.Expand("")
	if len(variants) != 0 {
		t.Errorf("Expand(\"\") returned %d variants, want 0", len(variants))
	}
}

func TestMultiplier_Expand_Deterministic(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	m := NewMultiplier()
	v1 := m.Expand("Delete the file")
	v2 := m.Expand("Delete the file")
	if len(v1) != len(v2) {
		t.Fatalf("Non-deterministic: %d vs %d variants", len(v1), len(v2))
	}
	for i := range v1 {
		if v1[i] != v2[i] {
			t.Errorf("Non-deterministic at [%d]: %q vs %q", i, v1[i], v2[i])
		}
	}
}
reversal/roundtrip_test.go (new file, +93)

@@ -0,0 +1,93 @@
package reversal

import (
	"testing"

	i18n "forge.lthn.ai/core/go-i18n"
)

// TestRoundTrip_ForwardThenReverse: go-i18n composed output → reversal → verify correct tokens
func TestRoundTrip_ForwardThenReverse(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	tests := []struct {
		name      string
		text      string
		wantVerb  string
		wantTense string
	}{
		{
			name:      "Progress pattern",
			text:      i18n.Progress("build"),
			wantVerb:  "build",
			wantTense: "gerund",
		},
		{
			name:      "ActionResult pattern",
			text:      i18n.ActionResult("delete", "file"),
			wantVerb:  "delete",
			wantTense: "past",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			tokens := tok.Tokenise(tt.text)
			foundVerb := false
			for _, tok := range tokens {
				if tok.Type == TokenVerb && tok.VerbInfo.Base == tt.wantVerb {
					foundVerb = true
					if tok.VerbInfo.Tense != tt.wantTense {
						t.Errorf("verb %q tense = %q, want %q", tt.wantVerb, tok.VerbInfo.Tense, tt.wantTense)
					}
				}
			}
			if !foundVerb {
				t.Errorf("did not find verb %q in tokens from %q", tt.wantVerb, tt.text)
			}
		})
	}
}

// TestRoundTrip_MultiplierImprints: variants should be similar to the original
func TestRoundTrip_MultiplierImprints(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()
	m := NewMultiplier()

	original := "Delete the configuration file"
	variants := m.Expand(original)
	origImprint := NewImprint(tok.Tokenise(original))

	for _, v := range variants {
		if v == original {
			continue
		}
		varImprint := NewImprint(tok.Tokenise(v))
		sim := origImprint.Similar(varImprint)
		if sim < 0.2 {
			t.Errorf("Variant %q similarity to original = %f, want >= 0.2", v, sim)
		}
	}
}

// TestRoundTrip_SimilarDocuments: similar docs should score higher than different docs
func TestRoundTrip_SimilarDocuments(t *testing.T) {
	svc, _ := i18n.New()
	i18n.SetDefault(svc)
	tok := NewTokeniser()

	imp1 := NewImprint(tok.Tokenise("Delete the configuration file"))
	imp2 := NewImprint(tok.Tokenise("Delete the old file"))
	imp3 := NewImprint(tok.Tokenise("Building the project successfully"))

	simSame := imp1.Similar(imp2)
	simDiff := imp1.Similar(imp3)

	if simSame <= simDiff {
		t.Errorf("Similar documents (%f) should score higher than different (%f)", simSame, simDiff)
	}
}
reversal/tokeniser.go (new file, +561)

@@ -0,0 +1,561 @@
// Package reversal provides reverse grammar lookups.
//
// The forward engine (go-i18n) maps base forms to inflected forms:
//
//	PastTense("delete") → "deleted"
//	Gerund("run") → "running"
//
// The reversal engine reads those same tables backwards, turning
// inflected forms back into base forms with tense metadata:
//
//	MatchVerb("deleted") → {Base: "delete", Tense: "past"}
//	MatchVerb("running") → {Base: "run", Tense: "gerund"}
//
// 3-tier lookup: JSON grammar data → irregular verb maps → regular
// morphology rules (verified by round-tripping through forward functions).
package reversal

import (
	"strings"

	i18n "forge.lthn.ai/core/go-i18n"
)

// VerbMatch holds the result of a reverse verb lookup.
type VerbMatch struct {
	Base  string // Base form of the verb ("delete", "run")
	Tense string // "past", "gerund", or "base"
	Form  string // The original inflected form
}

// NounMatch holds the result of a reverse noun lookup.
type NounMatch struct {
	Base   string // Base/singular form of the noun
	Plural bool   // Whether the matched form was plural
	Form   string // The original form
}

// TokenType classifies a token identified during tokenisation.
type TokenType int

const (
	TokenUnknown     TokenType = iota // Unrecognised word
	TokenVerb                         // Matched verb (see VerbInfo)
	TokenNoun                         // Matched noun (see NounInfo)
	TokenArticle                      // Matched article ("a", "an", "the")
	TokenWord                         // Matched word from grammar word map
	TokenPunctuation                  // Punctuation ("...", "?")
)

// Token represents a single classified token from a text string.
type Token struct {
	Raw       string    // Original text as it appeared in input
	Lower     string    // Lowercased form
	Type      TokenType // Classification
	VerbInfo  VerbMatch // Set when Type == TokenVerb
	NounInfo  NounMatch // Set when Type == TokenNoun
	WordCat   string    // Set when Type == TokenWord
	ArtType   string    // Set when Type == TokenArticle
	PunctType string    // Set when Type == TokenPunctuation
}

// Tokeniser provides reverse grammar lookups by maintaining inverse
// indexes built from the forward grammar tables.
type Tokeniser struct {
	pastToBase   map[string]string // "deleted" → "delete"
	gerundToBase map[string]string // "deleting" → "delete"
	baseVerbs    map[string]bool   // "delete" → true
	pluralToBase map[string]string // "files" → "file"
	baseNouns    map[string]bool   // "file" → true
	words        map[string]string // word translations
	lang         string
}

// NewTokeniser creates a Tokeniser for English ("en").
func NewTokeniser() *Tokeniser {
	return NewTokeniserForLang("en")
}

// NewTokeniserForLang creates a Tokeniser for the specified language,
// building inverse indexes from the grammar data.
func NewTokeniserForLang(lang string) *Tokeniser {
	t := &Tokeniser{
		pastToBase:   make(map[string]string),
		gerundToBase: make(map[string]string),
		baseVerbs:    make(map[string]bool),
		pluralToBase: make(map[string]string),
		baseNouns:    make(map[string]bool),
		words:        make(map[string]string),
		lang:         lang,
	}
	t.buildVerbIndex()
	t.buildNounIndex()
	t.buildWordIndex()
	return t
}

// buildVerbIndex reads grammar tables and irregular verb maps to build
// inverse lookup maps: inflected form → base form.
func (t *Tokeniser) buildVerbIndex() {
	// Tier 1: Read from JSON grammar data (via GetGrammarData).
	data := i18n.GetGrammarData(t.lang)
	if data != nil && data.Verbs != nil {
		for base, forms := range data.Verbs {
			t.baseVerbs[base] = true
			if forms.Past != "" {
				t.pastToBase[forms.Past] = base
			}
			if forms.Gerund != "" {
				t.gerundToBase[forms.Gerund] = base
			}
		}
	}

	// Tier 2: Read from the exported irregularVerbs map.
	// Build inverse maps directly from the authoritative source.
	for base, forms := range i18n.IrregularVerbs() {
		t.baseVerbs[base] = true
		if forms.Past != "" {
			if _, exists := t.pastToBase[forms.Past]; !exists {
				t.pastToBase[forms.Past] = base
			}
		}
		if forms.Gerund != "" {
			if _, exists := t.gerundToBase[forms.Gerund]; !exists {
				t.gerundToBase[forms.Gerund] = base
			}
		}
	}
}

// buildNounIndex reads grammar tables and irregular noun maps to build
// inverse lookup maps: plural form → base form.
func (t *Tokeniser) buildNounIndex() {
	// Tier 1: Read from JSON grammar data (via GetGrammarData).
	data := i18n.GetGrammarData(t.lang)
	if data != nil && data.Nouns != nil {
		for base, forms := range data.Nouns {
			t.baseNouns[base] = true
			if forms.Other != "" && forms.Other != base {
				t.pluralToBase[forms.Other] = base
			}
		}
	}

	// Tier 2: Read from the exported irregularNouns map.
	for base, plural := range i18n.IrregularNouns() {
		t.baseNouns[base] = true
		if plural != base {
			if _, exists := t.pluralToBase[plural]; !exists {
				t.pluralToBase[plural] = base
			}
		}
	}
}

// MatchNoun performs a 3-tier reverse lookup for a noun form.
//
// Tier 1: Check if the word is a known base noun.
// Tier 2: Check the pluralToBase inverse map.
// Tier 3: Try reverse morphology rules and round-trip verify via
// the forward function PluralForm().
func (t *Tokeniser) MatchNoun(word string) (NounMatch, bool) {
	word = strings.ToLower(strings.TrimSpace(word))
	if word == "" {
		return NounMatch{}, false
	}

	// Tier 1: Is it a base noun?
	if t.baseNouns[word] {
		return NounMatch{Base: word, Plural: false, Form: word}, true
	}

	// Tier 2: Check inverse map from grammar tables + irregular nouns.
	if base, ok := t.pluralToBase[word]; ok {
		return NounMatch{Base: base, Plural: true, Form: word}, true
	}

	// Tier 3: Reverse morphology with round-trip verification.
	candidates := t.reverseRegularPlural(word)
	for _, c := range candidates {
		if i18n.PluralForm(c) == word {
			return NounMatch{Base: c, Plural: true, Form: word}, true
		}
	}

	return NounMatch{}, false
}

// reverseRegularPlural generates candidate base forms by reversing regular
// plural suffixes. Returns multiple candidates ordered by likelihood.
//
// The forward engine applies rules in this order:
//  1. ends in s/ss/sh/ch/x/z → +es
//  2. ends in consonant+y → ies
//  3. ends in f → ves, fe → ves
//  4. default → +s
//
// We generate candidates for each possible reverse rule. Round-trip
// verification ensures only correct candidates pass.
func (t *Tokeniser) reverseRegularPlural(word string) []string {
	var candidates []string

	// Rule: consonant + "ies" → consonant + "y" (e.g., "entries" → "entry")
	if strings.HasSuffix(word, "ies") && len(word) > 3 {
		base := word[:len(word)-3] + "y"
		candidates = append(candidates, base)
	}

	// Rule: "ves" → "f" or "fe" (e.g., "wolves" → "wolf", "knives" → "knife")
	if strings.HasSuffix(word, "ves") && len(word) > 3 {
		candidates = append(candidates, word[:len(word)-3]+"f")
		candidates = append(candidates, word[:len(word)-3]+"fe")
	}

	// Rule: sibilant + "es" (e.g., "processes" → "process", "branches" → "branch")
	if strings.HasSuffix(word, "ses") || strings.HasSuffix(word, "xes") ||
		strings.HasSuffix(word, "zes") || strings.HasSuffix(word, "ches") ||
		strings.HasSuffix(word, "shes") {
		base := word[:len(word)-2] // strip "es"
		candidates = append(candidates, base)
	}

	// Rule: drop "s" (e.g., "servers" → "server")
	if strings.HasSuffix(word, "s") && len(word) > 1 {
		base := word[:len(word)-1]
		candidates = append(candidates, base)
	}

	return candidates
}

// MatchVerb performs a 3-tier reverse lookup for a verb form.
//
// Tier 1: Check if the word is a known base verb.
// Tier 2: Check the pastToBase and gerundToBase inverse maps.
// Tier 3: Try reverse morphology rules and round-trip verify via
// the forward functions PastTense() and Gerund().
func (t *Tokeniser) MatchVerb(word string) (VerbMatch, bool) {
	word = strings.ToLower(strings.TrimSpace(word))
	if word == "" {
		return VerbMatch{}, false
	}

	// Tier 1: Is it a base verb?
	if t.baseVerbs[word] {
		return VerbMatch{Base: word, Tense: "base", Form: word}, true
	}

	// Tier 2: Check inverse maps from grammar tables + irregular verbs.
	if base, ok := t.pastToBase[word]; ok {
		return VerbMatch{Base: base, Tense: "past", Form: word}, true
	}
	if base, ok := t.gerundToBase[word]; ok {
		return VerbMatch{Base: base, Tense: "gerund", Form: word}, true
	}

	// Tier 3: Reverse morphology with round-trip verification.
	// Try past tense candidates.
	if base := t.bestRoundTrip(word, t.reverseRegularPast(word), i18n.PastTense); base != "" {
		return VerbMatch{Base: base, Tense: "past", Form: word}, true
	}

	// Try gerund candidates.
	if base := t.bestRoundTrip(word, t.reverseRegularGerund(word), i18n.Gerund); base != "" {
		return VerbMatch{Base: base, Tense: "gerund", Form: word}, true
	}

	return VerbMatch{}, false
}

// bestRoundTrip selects the best candidate from a list by round-tripping
// each through a forward function. When multiple candidates round-trip
// successfully (ambiguity), it uses the following priority:
//  1. Candidates that are known base verbs (in grammar tables / irregular maps)
//  2. Candidates ending in a VCe pattern (vowel-consonant-e, the "magic e"
//     pattern common in real English verbs like "delete", "create", "use").
//     This avoids phantom verbs like "walke" or "processe" which have a
//     CCe pattern (consonant-consonant-e) that doesn't occur naturally.
//  3. Candidates NOT ending in "e" (the default morphology path)
//  4. First match in candidate order as final tiebreaker
func (t *Tokeniser) bestRoundTrip(target string, candidates []string, forward func(string) string) string {
	var matches []string
	for _, c := range candidates {
		if forward(c) == target {
			matches = append(matches, c)
		}
	}
	if len(matches) == 0 {
		return ""
	}
	if len(matches) == 1 {
		return matches[0]
	}

	// Priority 1: known base verb
	for _, m := range matches {
		if t.baseVerbs[m] {
			return m
		}
	}

	// Priority 2: prefer VCe-ending candidate (real English verb pattern)
	for _, m := range matches {
		if hasVCeEnding(m) {
			return m
		}
	}

	// Priority 3: prefer candidate not ending in "e" (avoids phantom verbs
	// with CCe endings like "walke", "processe")
	for _, m := range matches {
		if !strings.HasSuffix(m, "e") {
			return m
		}
	}

	return matches[0]
}

// hasVCeEnding returns true if the word ends in a vowel-consonant-e pattern
// (the "magic e" pattern). This is characteristic of real English verbs like
// "delete" (-ete), "create" (-ate), "use" (-use), "close" (-ose).
// Phantom verbs produced by naive suffix stripping like "walke" (-lke) or
// "processe" (-sse) end in consonant-consonant-e and return false.
func hasVCeEnding(word string) bool {
	if len(word) < 3 || word[len(word)-1] != 'e' {
		return false
	}
	lastConsonant := word[len(word)-2]
	vowelBefore := word[len(word)-3]
	return !isVowelByte(lastConsonant) && isVowelByte(vowelBefore)
}

func isVowelByte(b byte) bool {
	switch b {
	case 'a', 'e', 'i', 'o', 'u':
		return true
	}
	return false
}

// reverseRegularPast generates candidate base forms by reversing regular
// past tense suffixes. Returns multiple candidates ordered by likelihood.
//
// The forward engine applies rules in this order:
//  1. ends in "e" → +d (create → created)
//  2. ends in "y" + consonant → ied (copy → copied)
//  3. shouldDoubleConsonant → double+ed (stop → stopped)
//  4. default → +ed (walk → walked)
//
// We generate candidates for each possible reverse rule. Round-trip
// verification (in bestRoundTrip) ensures only correct candidates pass.
func (t *Tokeniser) reverseRegularPast(word string) []string {
	var candidates []string

	if !strings.HasSuffix(word, "ed") {
		return candidates
	}

	// Rule: consonant + "ied" → consonant + "y" (e.g., "copied" → "copy")
	if strings.HasSuffix(word, "ied") && len(word) > 3 {
		base := word[:len(word)-3] + "y"
		candidates = append(candidates, base)
	}

	// Rule: doubled consonant + "ed" → single consonant (e.g., "stopped" → "stop")
	if len(word) > 4 {
		beforeEd := word[:len(word)-2]
		lastChar := beforeEd[len(beforeEd)-1]
		if len(beforeEd) >= 2 && beforeEd[len(beforeEd)-2] == lastChar {
			base := beforeEd[:len(beforeEd)-1]
			candidates = append(candidates, base)
		}
	}

	// Rule: stem + "d" where stem ends in "e" (e.g., "created" → "create")
	if len(word) > 2 {
		stemPlusE := word[:len(word)-1] // strip "d", leaving stem + "e"
		candidates = append(candidates, stemPlusE)
	}

	// Rule: stem + "ed" (e.g., "walked" → "walk")
	if len(word) > 2 {
		stem := word[:len(word)-2]
		candidates = append(candidates, stem)
	}

	return candidates
}

// reverseRegularGerund generates candidate base forms by reversing regular
// gerund suffixes. Returns multiple candidates ordered by likelihood.
//
// Rules reversed:
//   - verb + "ing" (e.g., "walking" → "walk")
//   - verb[:-1] + "ing" (e.g., "creating" → "create", drop e)
//   - doubled consonant (e.g., "stopping" → "stop")
//   - verb[:-2] + "ying" (e.g., "dying" → "die")
func (t *Tokeniser) reverseRegularGerund(word string) []string {
	var candidates []string

	if !strings.HasSuffix(word, "ing") || len(word) < 4 {
		return candidates
	}

	stem := word[:len(word)-3] // strip "ing"

	// Rule: "ying" → "ie" (e.g., "dying" → "die")
	if strings.HasSuffix(word, "ying") && len(word) > 4 {
		base := word[:len(word)-4] + "ie"
		candidates = append(candidates, base)
	}

	// Rule: doubled consonant + "ing" → single consonant (e.g., "stopping" → "stop")
	if len(stem) >= 2 && stem[len(stem)-1] == stem[len(stem)-2] {
		base := stem[:len(stem)-1]
		candidates = append(candidates, base)
	}

	// Rule: direct strip "ing" (e.g., "walking" → "walk")
	// This must come before the stem+"e" rule to avoid false positives
	// like "walke" round-tripping through Gerund("walke") = "walking".
	candidates = append(candidates, stem)

	// Rule: stem + "e" was dropped before "ing" (e.g., "creating" → "create")
	// Try adding "e" back.
	candidates = append(candidates, stem+"e")

	return candidates
}

// buildWordIndex reads GrammarData.Words and builds a reverse lookup map.
// Both the key (e.g., "url") and the display form (e.g., "URL") map back
// to the key, enabling case-insensitive lookups.
func (t *Tokeniser) buildWordIndex() {
	data := i18n.GetGrammarData(t.lang)
	if data == nil || data.Words == nil {
		return
	}
	for key, display := range data.Words {
		// Map the key itself (already lowercase)
		t.words[strings.ToLower(key)] = key
		// Map the display form (e.g., "URL" → "url", "SSH" → "ssh")
		t.words[strings.ToLower(display)] = key
	}
}

// MatchWord performs a case-insensitive lookup in the words map.
// Returns the category key and true if found, or ("", false) otherwise.
func (t *Tokeniser) MatchWord(word string) (string, bool) {
	cat, ok := t.words[strings.ToLower(word)]
	return cat, ok
}

// MatchArticle checks whether a word is an article (definite or indefinite).
// Returns the article type ("indefinite" or "definite") and true if matched,
// or ("", false) otherwise.
func (t *Tokeniser) MatchArticle(word string) (string, bool) {
	data := i18n.GetGrammarData(t.lang)
	if data == nil {
		return "", false
	}

	lower := strings.ToLower(word)

	if lower == strings.ToLower(data.Articles.IndefiniteDefault) ||
		lower == strings.ToLower(data.Articles.IndefiniteVowel) {
		return "indefinite", true
	}
	if lower == strings.ToLower(data.Articles.Definite) {
		return "definite", true
	}

	return "", false
}

// Tokenise splits text on whitespace and classifies each word.
// Priority: punctuation → article → verb → noun → word → unknown.
// Trailing punctuation is stripped from words before matching.
func (t *Tokeniser) Tokenise(text string) []Token {
	text = strings.TrimSpace(text)
	if text == "" {
		return nil
	}

	parts := strings.Fields(text)
	var tokens []Token

	for _, raw := range parts {
		// Strip trailing punctuation to get the clean word.
		word, punct := splitTrailingPunct(raw)

		// Classify the word portion (if any).
		if word != "" {
			tok := Token{Raw: raw, Lower: strings.ToLower(word)}

			if artType, ok := t.MatchArticle(word); ok {
				tok.Type = TokenArticle
				tok.ArtType = artType
			} else if vm, ok := t.MatchVerb(word); ok {
				tok.Type = TokenVerb
				tok.VerbInfo = vm
			} else if nm, ok := t.MatchNoun(word); ok {
				tok.Type = TokenNoun
				tok.NounInfo = nm
			} else if cat, ok := t.MatchWord(word); ok {
				tok.Type = TokenWord
				tok.WordCat = cat
			} else {
				tok.Type = TokenUnknown
			}
			tokens = append(tokens, tok)
		}

		// Emit a punctuation token if trailing punctuation was found.
		if punct != "" {
			if punctType, ok := matchPunctuation(punct); ok {
				tokens = append(tokens, Token{
					Raw:       punct,
					Lower:     punct,
					Type:      TokenPunctuation,
					PunctType: punctType,
				})
			}
		}
	}

	return tokens
}

// splitTrailingPunct separates a word from its trailing punctuation.
// Returns the word and the punctuation suffix. Punctuation patterns
// recognised: "..." (progress), "?" (question), ":" (label).
func splitTrailingPunct(s string) (string, string) {
	// Check for "..." suffix first (3-char pattern).
	if strings.HasSuffix(s, "...") {
		return s[:len(s)-3], "..."
	}
	// Check single-char trailing punctuation.
	if len(s) > 1 {
		last := s[len(s)-1]
		if last == '?' || last == ':' {
			return s[:len(s)-1], string(last)
		}
	}
	return s, ""
}

// matchPunctuation detects known punctuation patterns.
// Returns the punctuation type and true if recognised.
func matchPunctuation(punct string) (string, bool) {
	switch punct {
	case "...":
		return "progress", true
	case "?":
		return "question", true
	case ":":
		return "label", true
	}
	return "", false
}
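The practical effect of the three tiers is that words absent from every table can still be classified: a Tier 3 candidate survives only if the forward engine regenerates the observed form. A minimal sketch matching the cases exercised in tokeniser_test.go below (assumes "fmt" is imported):

	func ExampleTokeniser_MatchVerb() {
		svc, _ := i18n.New()
		i18n.SetDefault(svc)

		tok := NewTokeniser()

		// Tier 2: "deleted" is in the inverse map built from the grammar tables.
		if m, ok := tok.MatchVerb("deleted"); ok {
			fmt.Println(m.Base, m.Tense) // delete past
		}

		// Tier 3: "walked" appears in no table; the candidate "walk" is
		// accepted only because i18n.PastTense("walk") == "walked" round-trips.
		if m, ok := tok.MatchVerb("walked"); ok {
			fmt.Println(m.Base, m.Tense) // walk past
		}
	}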
reversal/tokeniser_test.go (new file, +310)

@@ -0,0 +1,310 @@
package reversal

import (
	"testing"

	i18n "forge.lthn.ai/core/go-i18n"
)

func setup(t *testing.T) {
	t.Helper()
	svc, err := i18n.New()
	if err != nil {
		t.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(svc)
}

func TestTokeniser_MatchVerb_Irregular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word      string
		wantOK    bool
		wantBase  string
		wantTense string
	}{
		// Irregular past tense
		{"deleted", true, "delete", "past"},
		{"deleting", true, "delete", "gerund"},
		{"went", true, "go", "past"},
		{"going", true, "go", "gerund"},
		{"was", true, "be", "past"},
		{"being", true, "be", "gerund"},
		{"ran", true, "run", "past"},
		{"running", true, "run", "gerund"},
		{"wrote", true, "write", "past"},
		{"writing", true, "write", "gerund"},
		{"built", true, "build", "past"},
		{"building", true, "build", "gerund"},
		{"committed", true, "commit", "past"},
		{"committing", true, "commit", "gerund"},

		// Base forms
		{"delete", true, "delete", "base"},
		{"go", true, "go", "base"},

		// Unknown words return false
		{"xyzzy", false, "", ""},
		{"flurble", false, "", ""},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			match, ok := tok.MatchVerb(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
			}
			if !ok {
				return
			}
			if match.Base != tt.wantBase {
				t.Errorf("MatchVerb(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
			}
			if match.Tense != tt.wantTense {
				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tt.word, match.Tense, tt.wantTense)
			}
		})
	}
}

func TestTokeniser_MatchNoun_Irregular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word       string
		wantOK     bool
		wantBase   string
		wantPlural bool
	}{
		{"files", true, "file", true},
		{"file", true, "file", false},
		{"people", true, "person", true},
		{"person", true, "person", false},
		{"children", true, "child", true},
		{"child", true, "child", false},
		{"repositories", true, "repository", true},
		{"repository", true, "repository", false},
		{"branches", true, "branch", true},
		{"branch", true, "branch", false},
		{"xyzzy", false, "", false},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			match, ok := tok.MatchNoun(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
			}
			if !ok {
				return
			}
			if match.Base != tt.wantBase {
				t.Errorf("MatchNoun(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
			}
			if match.Plural != tt.wantPlural {
				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tt.word, match.Plural, tt.wantPlural)
			}
		})
	}
}

func TestTokeniser_MatchNoun_Regular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word       string
		wantOK     bool
		wantBase   string
		wantPlural bool
	}{
		// Regular nouns NOT in grammar tables, detected by reverse morphology + round-trip
		{"servers", true, "server", true},
		{"processes", true, "process", true},
		{"entries", true, "entry", true},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			match, ok := tok.MatchNoun(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchNoun(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
			}
			if !ok {
				return
			}
			if match.Base != tt.wantBase {
				t.Errorf("MatchNoun(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
			}
			if match.Plural != tt.wantPlural {
				t.Errorf("MatchNoun(%q).Plural = %v, want %v", tt.word, match.Plural, tt.wantPlural)
			}
		})
	}
}

func TestTokeniser_MatchWord(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word    string
		wantCat string
		wantOK  bool
	}{
		{"URL", "url", true},
		{"url", "url", true},
		{"ID", "id", true},
		{"SSH", "ssh", true},
		{"PHP", "php", true},
		{"xyzzy", "", false},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			cat, ok := tok.MatchWord(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchWord(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
			}
			if ok && cat != tt.wantCat {
				t.Errorf("MatchWord(%q) = %q, want %q", tt.word, cat, tt.wantCat)
			}
		})
	}
}

func TestTokeniser_MatchArticle(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word     string
		wantType string
		wantOK   bool
	}{
		{"a", "indefinite", true},
		{"an", "indefinite", true},
		{"the", "definite", true},
		{"A", "indefinite", true},
		{"The", "definite", true},
		{"foo", "", false},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			artType, ok := tok.MatchArticle(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchArticle(%q) ok=%v, want %v", tt.word, ok, tt.wantOK)
			}
			if ok && artType != tt.wantType {
				t.Errorf("MatchArticle(%q) = %q, want %q", tt.word, artType, tt.wantType)
			}
		})
	}
}

func TestTokeniser_Tokenise(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tokens := tok.Tokenise("Deleted the configuration files")

	if len(tokens) != 4 {
		t.Fatalf("Tokenise() returned %d tokens, want 4", len(tokens))
	}

	// "Deleted" → verb, past tense
	if tokens[0].Type != TokenVerb {
		t.Errorf("tokens[0].Type = %v, want TokenVerb", tokens[0].Type)
	}
	if tokens[0].VerbInfo.Tense != "past" {
		t.Errorf("tokens[0].VerbInfo.Tense = %q, want %q", tokens[0].VerbInfo.Tense, "past")
	}

	// "the" → article
	if tokens[1].Type != TokenArticle {
		t.Errorf("tokens[1].Type = %v, want TokenArticle", tokens[1].Type)
	}

	// "configuration" → unknown
	if tokens[2].Type != TokenUnknown {
		t.Errorf("tokens[2].Type = %v, want TokenUnknown", tokens[2].Type)
	}

	// "files" → noun, plural
	if tokens[3].Type != TokenNoun {
		t.Errorf("tokens[3].Type = %v, want TokenNoun", tokens[3].Type)
	}
	if !tokens[3].NounInfo.Plural {
		t.Errorf("tokens[3].NounInfo.Plural = false, want true")
	}
}

func TestTokeniser_Tokenise_Punctuation(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tokens := tok.Tokenise("Building project...")
	hasPunct := false
	for _, tok := range tokens {
		if tok.Type == TokenPunctuation {
			hasPunct = true
		}
	}
	if !hasPunct {
		t.Error("did not detect punctuation in \"Building project...\"")
	}
}

func TestTokeniser_Tokenise_Empty(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tokens := tok.Tokenise("")
	if len(tokens) != 0 {
		t.Errorf("Tokenise(\"\") returned %d tokens, want 0", len(tokens))
	}
}

func TestTokeniser_MatchVerb_Regular(t *testing.T) {
	setup(t)
	tok := NewTokeniser()

	tests := []struct {
		word      string
		wantOK    bool
		wantBase  string
		wantTense string
	}{
		// Regular verbs NOT in grammar tables, detected by reverse morphology + round-trip
		{"walked", true, "walk", "past"},
		{"walking", true, "walk", "gerund"},
		{"processed", true, "process", "past"},
		{"processing", true, "process", "gerund"},
		{"copied", true, "copy", "past"},
		{"copying", true, "copy", "gerund"},
		{"stopped", true, "stop", "past"},
		{"stopping", true, "stop", "gerund"},
	}

	for _, tt := range tests {
		t.Run(tt.word, func(t *testing.T) {
			match, ok := tok.MatchVerb(tt.word)
			if ok != tt.wantOK {
				t.Fatalf("MatchVerb(%q) ok = %v, want %v", tt.word, ok, tt.wantOK)
			}
			if !ok {
				return
			}
			if match.Base != tt.wantBase {
				t.Errorf("MatchVerb(%q).Base = %q, want %q", tt.word, match.Base, tt.wantBase)
			}
			if match.Tense != tt.wantTense {
				t.Errorf("MatchVerb(%q).Tense = %q, want %q", tt.word, match.Tense, tt.wantTense)
			}
		})
	}
}