feat(reversal): add GrammarImprint struct and constructor
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f09cff894f
commit
8b23600632
2 changed files with 164 additions and 0 deletions
102
reversal/imprint.go
Normal file
102
reversal/imprint.go
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
package reversal
|
||||
|
||||
// GrammarImprint is a low-dimensional grammar feature vector.
//
// NewImprint fills it from a slice of classified tokens. The float64 maps
// hold ratios (NewImprint normalises each of them to sum to 1.0), while
// DomainVocabulary keeps raw hit counts.
type GrammarImprint struct {
	// VerbDistribution maps a verb base form to its frequency.
	VerbDistribution map[string]float64

	// TenseDistribution maps a tense ("past"/"gerund"/"base") to its ratio.
	TenseDistribution map[string]float64

	// NounDistribution maps a noun base form to its frequency.
	NounDistribution map[string]float64

	// PluralRatio is the proportion of plural nouns, in the range 0.0-1.0.
	PluralRatio float64

	// DomainVocabulary maps a gram.word category to its hit count.
	DomainVocabulary map[string]int

	// ArticleUsage maps an article kind ("definite"/"indefinite") to its ratio.
	ArticleUsage map[string]float64

	// PunctuationPattern maps a punctuation kind
	// ("label"/"progress"/"question") to its ratio.
	PunctuationPattern map[string]float64

	// TokenCount is the total number of tokens the imprint was built from.
	TokenCount int

	// UniqueVerbs is the number of distinct verb base forms seen.
	UniqueVerbs int

	// UniqueNouns is the number of distinct noun base forms seen.
	UniqueNouns int
}
|
||||
|
||||
// NewImprint calculates a GrammarImprint from classified tokens.
|
||||
func NewImprint(tokens []Token) GrammarImprint {
|
||||
imp := GrammarImprint{
|
||||
VerbDistribution: make(map[string]float64),
|
||||
TenseDistribution: make(map[string]float64),
|
||||
NounDistribution: make(map[string]float64),
|
||||
DomainVocabulary: make(map[string]int),
|
||||
ArticleUsage: make(map[string]float64),
|
||||
PunctuationPattern: make(map[string]float64),
|
||||
}
|
||||
|
||||
if len(tokens) == 0 {
|
||||
return imp
|
||||
}
|
||||
|
||||
imp.TokenCount = len(tokens)
|
||||
|
||||
verbBases := make(map[string]bool)
|
||||
nounBases := make(map[string]bool)
|
||||
var verbCount, nounCount, articleCount, punctCount int
|
||||
var pluralNouns, totalNouns int
|
||||
|
||||
for _, tok := range tokens {
|
||||
switch tok.Type {
|
||||
case TokenVerb:
|
||||
verbCount++
|
||||
base := tok.VerbInfo.Base
|
||||
imp.VerbDistribution[base]++
|
||||
imp.TenseDistribution[tok.VerbInfo.Tense]++
|
||||
verbBases[base] = true
|
||||
|
||||
case TokenNoun:
|
||||
nounCount++
|
||||
base := tok.NounInfo.Base
|
||||
imp.NounDistribution[base]++
|
||||
nounBases[base] = true
|
||||
totalNouns++
|
||||
if tok.NounInfo.Plural {
|
||||
pluralNouns++
|
||||
}
|
||||
|
||||
case TokenArticle:
|
||||
articleCount++
|
||||
imp.ArticleUsage[tok.ArtType]++
|
||||
|
||||
case TokenWord:
|
||||
imp.DomainVocabulary[tok.WordCat]++
|
||||
|
||||
case TokenPunctuation:
|
||||
punctCount++
|
||||
imp.PunctuationPattern[tok.PunctType]++
|
||||
}
|
||||
}
|
||||
|
||||
imp.UniqueVerbs = len(verbBases)
|
||||
imp.UniqueNouns = len(nounBases)
|
||||
|
||||
// Calculate plural ratio
|
||||
if totalNouns > 0 {
|
||||
imp.PluralRatio = float64(pluralNouns) / float64(totalNouns)
|
||||
}
|
||||
|
||||
// Normalise frequency maps to sum to 1.0
|
||||
normaliseMap(imp.VerbDistribution)
|
||||
normaliseMap(imp.TenseDistribution)
|
||||
normaliseMap(imp.NounDistribution)
|
||||
normaliseMap(imp.ArticleUsage)
|
||||
normaliseMap(imp.PunctuationPattern)
|
||||
|
||||
return imp
|
||||
}
|
||||
|
||||
// normaliseMap rescales the values of m in place so that they add up to 1.0.
// A map that is empty, or whose values add up to zero, is not modified.
func normaliseMap(m map[string]float64) {
	sum := 0.0
	for key := range m {
		sum += m[key]
	}

	// Dividing by a zero sum would produce NaN/Inf entries, so skip it.
	if sum != 0 {
		for key := range m {
			m[key] /= sum
		}
	}
}
|
||||
62
reversal/imprint_test.go
Normal file
62
reversal/imprint_test.go
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
package reversal
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
i18n "forge.lthn.ai/core/go-i18n"
|
||||
)
|
||||
|
||||
func TestNewImprint(t *testing.T) {
|
||||
svc, err := i18n.New()
|
||||
if err != nil {
|
||||
t.Fatalf("i18n.New() failed: %v", err)
|
||||
}
|
||||
i18n.SetDefault(svc)
|
||||
|
||||
tok := NewTokeniser()
|
||||
tokens := tok.Tokenise("Deleted the configuration files successfully")
|
||||
imp := NewImprint(tokens)
|
||||
|
||||
if imp.TokenCount != 5 {
|
||||
t.Errorf("TokenCount = %d, want 5", imp.TokenCount)
|
||||
}
|
||||
if imp.UniqueVerbs == 0 {
|
||||
t.Error("UniqueVerbs = 0, want > 0")
|
||||
}
|
||||
if imp.UniqueNouns == 0 {
|
||||
t.Error("UniqueNouns = 0, want > 0")
|
||||
}
|
||||
if imp.TenseDistribution["past"] == 0 {
|
||||
t.Error("TenseDistribution[\"past\"] = 0, want > 0")
|
||||
}
|
||||
if imp.ArticleUsage["definite"] == 0 {
|
||||
t.Error("ArticleUsage[\"definite\"] = 0, want > 0")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewImprint_Empty(t *testing.T) {
|
||||
imp := NewImprint(nil)
|
||||
if imp.TokenCount != 0 {
|
||||
t.Errorf("TokenCount = %d, want 0", imp.TokenCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewImprint_PluralRatio(t *testing.T) {
|
||||
svc, _ := i18n.New()
|
||||
i18n.SetDefault(svc)
|
||||
tok := NewTokeniser()
|
||||
|
||||
// All plural nouns
|
||||
tokens := tok.Tokenise("files branches repositories")
|
||||
imp := NewImprint(tokens)
|
||||
if imp.PluralRatio < 0.5 {
|
||||
t.Errorf("PluralRatio = %f for all-plural input, want >= 0.5", imp.PluralRatio)
|
||||
}
|
||||
|
||||
// All singular nouns
|
||||
tokens = tok.Tokenise("file branch repository")
|
||||
imp = NewImprint(tokens)
|
||||
if imp.PluralRatio > 0.5 {
|
||||
t.Errorf("PluralRatio = %f for all-singular input, want <= 0.5", imp.PluralRatio)
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue