feat(reversal): add GrammarImprint struct and constructor

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Claude 2026-02-16 23:25:08 +00:00
parent f09cff894f
commit 8b23600632
No known key found for this signature in database
GPG key ID: AF404715446AEB41
2 changed files with 164 additions and 0 deletions

102
reversal/imprint.go Normal file
View file

@ -0,0 +1,102 @@
package reversal
// GrammarImprint is a compact, low-dimensional feature vector describing the
// grammatical make-up of a token stream.
type GrammarImprint struct {
	// VerbDistribution maps a verb base form to its frequency.
	VerbDistribution map[string]float64
	// TenseDistribution maps a tense label ("past", "gerund", "base") to its ratio.
	TenseDistribution map[string]float64
	// NounDistribution maps a noun base form to its frequency.
	NounDistribution map[string]float64
	// PluralRatio is the proportion of nouns that are plural, in the range 0.0-1.0.
	PluralRatio float64
	// DomainVocabulary maps a gram.word category to the number of hits.
	DomainVocabulary map[string]int
	// ArticleUsage maps an article kind ("definite", "indefinite") to its ratio.
	ArticleUsage map[string]float64
	// PunctuationPattern maps a punctuation kind ("label", "progress",
	// "question") to its ratio.
	PunctuationPattern map[string]float64
	// TokenCount is the total number of tokens.
	TokenCount int
	// UniqueVerbs is the number of distinct verb base forms.
	UniqueVerbs int
	// UniqueNouns is the number of distinct noun base forms.
	UniqueNouns int
}
// NewImprint calculates a GrammarImprint from classified tokens.
//
// Every map field of the result is initialised, including when tokens is nil
// or empty, in which case the maps are empty and all counters are zero.
//
// Tokens contribute according to their Type:
//   - TokenVerb: verb base and tense
//   - TokenNoun: noun base and plural flag
//   - TokenArticle: article type
//   - TokenWord: word category
//   - TokenPunctuation: punctuation type
//
// Tokens of any other type only count towards TokenCount.
//
// VerbDistribution, TenseDistribution, NounDistribution, ArticleUsage and
// PunctuationPattern are normalised so that each sums to 1.0 (when non-empty).
// DomainVocabulary is left as raw hit counts.
func NewImprint(tokens []Token) GrammarImprint {
	imp := GrammarImprint{
		VerbDistribution:   make(map[string]float64),
		TenseDistribution:  make(map[string]float64),
		NounDistribution:   make(map[string]float64),
		DomainVocabulary:   make(map[string]int),
		ArticleUsage:       make(map[string]float64),
		PunctuationPattern: make(map[string]float64),
		TokenCount:         len(tokens),
	}
	if len(tokens) == 0 {
		return imp
	}

	var pluralNouns, totalNouns int
	for _, tok := range tokens {
		switch tok.Type {
		case TokenVerb:
			imp.VerbDistribution[tok.VerbInfo.Base]++
			imp.TenseDistribution[tok.VerbInfo.Tense]++
		case TokenNoun:
			imp.NounDistribution[tok.NounInfo.Base]++
			totalNouns++
			if tok.NounInfo.Plural {
				pluralNouns++
			}
		case TokenArticle:
			imp.ArticleUsage[tok.ArtType]++
		case TokenWord:
			imp.DomainVocabulary[tok.WordCat]++
		case TokenPunctuation:
			imp.PunctuationPattern[tok.PunctType]++
		}
	}

	// The distribution maps are keyed by base form, so their sizes are the
	// numbers of distinct verbs and nouns; no separate set is needed.
	imp.UniqueVerbs = len(imp.VerbDistribution)
	imp.UniqueNouns = len(imp.NounDistribution)

	// Guard against division by zero when the input contains no nouns.
	if totalNouns > 0 {
		imp.PluralRatio = float64(pluralNouns) / float64(totalNouns)
	}

	// Turn raw counts into frequencies that sum to 1.0.
	normaliseMap(imp.VerbDistribution)
	normaliseMap(imp.TenseDistribution)
	normaliseMap(imp.NounDistribution)
	normaliseMap(imp.ArticleUsage)
	normaliseMap(imp.PunctuationPattern)

	return imp
}
// normaliseMap rescales the values of m in place so that they add up to 1.0.
// A map that is empty, or whose values sum to zero, is left untouched.
func normaliseMap(m map[string]float64) {
	sum := 0.0
	for key := range m {
		sum += m[key]
	}
	if sum != 0 {
		for key := range m {
			m[key] /= sum
		}
	}
}

62
reversal/imprint_test.go Normal file
View file

@ -0,0 +1,62 @@
package reversal
import (
"testing"
i18n "forge.lthn.ai/core/go-i18n"
)
// TestNewImprint tokenises a short English sentence and checks that the
// resulting imprint reports the expected token count and has non-zero verb,
// noun, past-tense and definite-article features.
func TestNewImprint(t *testing.T) {
	svc, err := i18n.New()
	if err != nil {
		t.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(svc)

	tokeniser := NewTokeniser()
	imprint := NewImprint(tokeniser.Tokenise("Deleted the configuration files successfully"))

	if got := imprint.TokenCount; got != 5 {
		t.Errorf("TokenCount = %d, want 5", got)
	}

	// Each entry pairs an "is this feature missing?" condition with the
	// message to report when it is.
	missing := []struct {
		isZero  bool
		message string
	}{
		{imprint.UniqueVerbs == 0, "UniqueVerbs = 0, want > 0"},
		{imprint.UniqueNouns == 0, "UniqueNouns = 0, want > 0"},
		{imprint.TenseDistribution["past"] == 0, `TenseDistribution["past"] = 0, want > 0`},
		{imprint.ArticleUsage["definite"] == 0, `ArticleUsage["definite"] = 0, want > 0`},
	}
	for _, check := range missing {
		if check.isZero {
			t.Error(check.message)
		}
	}
}
// TestNewImprint_Empty checks that a nil token slice yields a TokenCount of zero.
func TestNewImprint_Empty(t *testing.T) {
	if got := NewImprint(nil).TokenCount; got != 0 {
		t.Errorf("TokenCount = %d, want 0", got)
	}
}
// TestNewImprint_PluralRatio checks that PluralRatio falls on the expected
// side of 0.5: at or above it for input made of plural nouns, and at or below
// it for input made of singular nouns.
func TestNewImprint_PluralRatio(t *testing.T) {
	// Fail fast if the i18n service cannot be created rather than running
	// the tokeniser against an unusable default service.
	svc, err := i18n.New()
	if err != nil {
		t.Fatalf("i18n.New() failed: %v", err)
	}
	i18n.SetDefault(svc)

	tok := NewTokeniser()

	// All plural nouns
	tokens := tok.Tokenise("files branches repositories")
	imp := NewImprint(tokens)
	if imp.PluralRatio < 0.5 {
		t.Errorf("PluralRatio = %f for all-plural input, want >= 0.5", imp.PluralRatio)
	}

	// All singular nouns
	tokens = tok.Tokenise("file branch repository")
	imp = NewImprint(tokens)
	if imp.PluralRatio > 0.5 {
		t.Errorf("PluralRatio = %f for all-singular input, want <= 0.5", imp.PluralRatio)
	}
}