feat(reversal): add GrammarImprint struct and constructor
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f09cff894f
commit
8b23600632
2 changed files with 164 additions and 0 deletions
102
reversal/imprint.go
Normal file
102
reversal/imprint.go
Normal file
|
|
@@ -0,0 +1,102 @@
|
||||||
|
package reversal
|
||||||
|
|
||||||
|
// GrammarImprint is a low-dimensional grammar feature vector.
//
// It summarises a classified token stream as a set of frequency
// distributions plus a few scalar counts, so two texts can be compared
// by grammatical shape rather than exact wording. The float64 maps are
// normalised by NewImprint so each sums to 1.0; DomainVocabulary keeps
// raw hit counts.
type GrammarImprint struct {
	VerbDistribution map[string]float64 // verb base -> frequency (normalised to sum to 1.0)

	TenseDistribution map[string]float64 // "past"/"gerund"/"base" -> ratio

	NounDistribution map[string]float64 // noun base -> frequency (normalised to sum to 1.0)

	PluralRatio float64 // proportion of plural nouns (0.0-1.0)

	DomainVocabulary map[string]int // gram.word category -> raw hit count (not normalised)

	ArticleUsage map[string]float64 // "definite"/"indefinite" -> ratio

	PunctuationPattern map[string]float64 // "label"/"progress"/"question" -> ratio

	TokenCount int // total tokens seen (0 for empty input)

	UniqueVerbs int // number of distinct verb bases

	UniqueNouns int // number of distinct noun bases
}
|
||||||
|
|
||||||
|
// NewImprint calculates a GrammarImprint from classified tokens.
|
||||||
|
func NewImprint(tokens []Token) GrammarImprint {
|
||||||
|
imp := GrammarImprint{
|
||||||
|
VerbDistribution: make(map[string]float64),
|
||||||
|
TenseDistribution: make(map[string]float64),
|
||||||
|
NounDistribution: make(map[string]float64),
|
||||||
|
DomainVocabulary: make(map[string]int),
|
||||||
|
ArticleUsage: make(map[string]float64),
|
||||||
|
PunctuationPattern: make(map[string]float64),
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(tokens) == 0 {
|
||||||
|
return imp
|
||||||
|
}
|
||||||
|
|
||||||
|
imp.TokenCount = len(tokens)
|
||||||
|
|
||||||
|
verbBases := make(map[string]bool)
|
||||||
|
nounBases := make(map[string]bool)
|
||||||
|
var verbCount, nounCount, articleCount, punctCount int
|
||||||
|
var pluralNouns, totalNouns int
|
||||||
|
|
||||||
|
for _, tok := range tokens {
|
||||||
|
switch tok.Type {
|
||||||
|
case TokenVerb:
|
||||||
|
verbCount++
|
||||||
|
base := tok.VerbInfo.Base
|
||||||
|
imp.VerbDistribution[base]++
|
||||||
|
imp.TenseDistribution[tok.VerbInfo.Tense]++
|
||||||
|
verbBases[base] = true
|
||||||
|
|
||||||
|
case TokenNoun:
|
||||||
|
nounCount++
|
||||||
|
base := tok.NounInfo.Base
|
||||||
|
imp.NounDistribution[base]++
|
||||||
|
nounBases[base] = true
|
||||||
|
totalNouns++
|
||||||
|
if tok.NounInfo.Plural {
|
||||||
|
pluralNouns++
|
||||||
|
}
|
||||||
|
|
||||||
|
case TokenArticle:
|
||||||
|
articleCount++
|
||||||
|
imp.ArticleUsage[tok.ArtType]++
|
||||||
|
|
||||||
|
case TokenWord:
|
||||||
|
imp.DomainVocabulary[tok.WordCat]++
|
||||||
|
|
||||||
|
case TokenPunctuation:
|
||||||
|
punctCount++
|
||||||
|
imp.PunctuationPattern[tok.PunctType]++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
imp.UniqueVerbs = len(verbBases)
|
||||||
|
imp.UniqueNouns = len(nounBases)
|
||||||
|
|
||||||
|
// Calculate plural ratio
|
||||||
|
if totalNouns > 0 {
|
||||||
|
imp.PluralRatio = float64(pluralNouns) / float64(totalNouns)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normalise frequency maps to sum to 1.0
|
||||||
|
normaliseMap(imp.VerbDistribution)
|
||||||
|
normaliseMap(imp.TenseDistribution)
|
||||||
|
normaliseMap(imp.NounDistribution)
|
||||||
|
normaliseMap(imp.ArticleUsage)
|
||||||
|
normaliseMap(imp.PunctuationPattern)
|
||||||
|
|
||||||
|
return imp
|
||||||
|
}
|
||||||
|
|
||||||
|
// normaliseMap scales all values in a map so they sum to 1.0.
// If the map is empty or sums to zero, it is left unchanged.
func normaliseMap(m map[string]float64) {
	sum := 0.0
	for _, value := range m {
		sum += value
	}

	// Nothing to scale: empty map, or all entries are zero.
	if sum == 0 {
		return
	}

	for key, value := range m {
		m[key] = value / sum
	}
}
|
||||||
62
reversal/imprint_test.go
Normal file
62
reversal/imprint_test.go
Normal file
|
|
@@ -0,0 +1,62 @@
|
||||||
|
package reversal
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
i18n "forge.lthn.ai/core/go-i18n"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestNewImprint(t *testing.T) {
|
||||||
|
svc, err := i18n.New()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("i18n.New() failed: %v", err)
|
||||||
|
}
|
||||||
|
i18n.SetDefault(svc)
|
||||||
|
|
||||||
|
tok := NewTokeniser()
|
||||||
|
tokens := tok.Tokenise("Deleted the configuration files successfully")
|
||||||
|
imp := NewImprint(tokens)
|
||||||
|
|
||||||
|
if imp.TokenCount != 5 {
|
||||||
|
t.Errorf("TokenCount = %d, want 5", imp.TokenCount)
|
||||||
|
}
|
||||||
|
if imp.UniqueVerbs == 0 {
|
||||||
|
t.Error("UniqueVerbs = 0, want > 0")
|
||||||
|
}
|
||||||
|
if imp.UniqueNouns == 0 {
|
||||||
|
t.Error("UniqueNouns = 0, want > 0")
|
||||||
|
}
|
||||||
|
if imp.TenseDistribution["past"] == 0 {
|
||||||
|
t.Error("TenseDistribution[\"past\"] = 0, want > 0")
|
||||||
|
}
|
||||||
|
if imp.ArticleUsage["definite"] == 0 {
|
||||||
|
t.Error("ArticleUsage[\"definite\"] = 0, want > 0")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewImprint_Empty(t *testing.T) {
|
||||||
|
imp := NewImprint(nil)
|
||||||
|
if imp.TokenCount != 0 {
|
||||||
|
t.Errorf("TokenCount = %d, want 0", imp.TokenCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewImprint_PluralRatio(t *testing.T) {
|
||||||
|
svc, _ := i18n.New()
|
||||||
|
i18n.SetDefault(svc)
|
||||||
|
tok := NewTokeniser()
|
||||||
|
|
||||||
|
// All plural nouns
|
||||||
|
tokens := tok.Tokenise("files branches repositories")
|
||||||
|
imp := NewImprint(tokens)
|
||||||
|
if imp.PluralRatio < 0.5 {
|
||||||
|
t.Errorf("PluralRatio = %f for all-plural input, want >= 0.5", imp.PluralRatio)
|
||||||
|
}
|
||||||
|
|
||||||
|
// All singular nouns
|
||||||
|
tokens = tok.Tokenise("file branch repository")
|
||||||
|
imp = NewImprint(tokens)
|
||||||
|
if imp.PluralRatio > 0.5 {
|
||||||
|
t.Errorf("PluralRatio = %f for all-singular input, want <= 0.5", imp.PluralRatio)
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Add table
Reference in a new issue