feat(reversal): add GrammarImprint struct and constructor

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 23:25:08 +00:00 · 2026-02-16 23:25:08 +00:00 · 8b23600632
commit 8b23600632
parent f09cff894f
2 changed files with 164 additions and 0 deletions
--- a/reversal/imprint.go
+++ b/reversal/imprint.go
@ -0,0 +1,102 @@
+package reversal
+
+// GrammarImprint is a low-dimensional grammar feature vector.
+//
+// When built by NewImprint, every float64 map is normalised so that its values
+// sum to 1.0 (unless it is empty), whereas DomainVocabulary keeps raw counts.
+type GrammarImprint struct {
+	VerbDistribution   map[string]float64 // verb base -> frequency
+	TenseDistribution  map[string]float64 // "past"/"gerund"/"base" -> ratio
+	NounDistribution   map[string]float64 // noun base -> frequency
+	PluralRatio        float64            // proportion of plural nouns (0.0-1.0)
+	DomainVocabulary   map[string]int     // gram.word category -> hit count
+	ArticleUsage       map[string]float64 // "definite"/"indefinite" -> ratio
+	PunctuationPattern map[string]float64 // "label"/"progress"/"question" -> ratio
+	TokenCount         int                // number of tokens the imprint was built from
+	UniqueVerbs        int                // number of distinct verb bases
+	UniqueNouns        int                // number of distinct noun bases
+}
+
+// NewImprint calculates a GrammarImprint from classified tokens.
+//
+// Each token is tallied by its Type: verbs feed VerbDistribution and
+// TenseDistribution, nouns feed NounDistribution and PluralRatio, articles
+// feed ArticleUsage, words feed DomainVocabulary and punctuation feeds
+// PunctuationPattern. Tokens of any other type only count towards TokenCount.
+//
+// Before returning, every float64 map is normalised to sum to 1.0;
+// DomainVocabulary keeps its raw hit counts. For a nil or empty slice the
+// result has zero counts and empty, non-nil maps.
+func NewImprint(tokens []Token) GrammarImprint {
+	imp := GrammarImprint{
+		VerbDistribution:   make(map[string]float64),
+		TenseDistribution:  make(map[string]float64),
+		NounDistribution:   make(map[string]float64),
+		DomainVocabulary:   make(map[string]int),
+		ArticleUsage:       make(map[string]float64),
+		PunctuationPattern: make(map[string]float64),
+	}
+
+	if len(tokens) == 0 {
+		return imp
+	}
+
+	imp.TokenCount = len(tokens)
+
+	var pluralNouns, totalNouns int
+
+	for _, tok := range tokens {
+		switch tok.Type {
+		case TokenVerb:
+			imp.VerbDistribution[tok.VerbInfo.Base]++
+			imp.TenseDistribution[tok.VerbInfo.Tense]++
+
+		case TokenNoun:
+			imp.NounDistribution[tok.NounInfo.Base]++
+			totalNouns++
+			if tok.NounInfo.Plural {
+				pluralNouns++
+			}
+
+		case TokenArticle:
+			imp.ArticleUsage[tok.ArtType]++
+
+		case TokenWord:
+			imp.DomainVocabulary[tok.WordCat]++
+
+		case TokenPunctuation:
+			imp.PunctuationPattern[tok.PunctType]++
+		}
+	}
+
+	// The distribution maps are keyed by base form, so their sizes are exactly
+	// the number of distinct bases seen; no separate set is needed.
+	imp.UniqueVerbs = len(imp.VerbDistribution)
+	imp.UniqueNouns = len(imp.NounDistribution)
+
+	// Guard against division by zero when the input contained no nouns.
+	if totalNouns > 0 {
+		imp.PluralRatio = float64(pluralNouns) / float64(totalNouns)
+	}
+
+	// Normalise frequency maps to sum to 1.0
+	normaliseMap(imp.VerbDistribution)
+	normaliseMap(imp.TenseDistribution)
+	normaliseMap(imp.NounDistribution)
+	normaliseMap(imp.ArticleUsage)
+	normaliseMap(imp.PunctuationPattern)
+
+	return imp
+}
+
+// normaliseMap rescales the values of m in place so that they add up to 1.0.
+// A map that is empty, or whose values add up to exactly zero, is not modified.
+func normaliseMap(m map[string]float64) {
+	sum := 0.0
+	for key := range m {
+		sum += m[key]
+	}
+	if sum != 0 {
+		for key := range m {
+			m[key] /= sum
+		}
+	}
+}
--- a/reversal/imprint_test.go
+++ b/reversal/imprint_test.go
@ -0,0 +1,62 @@
+package reversal
+
+import (
+	"testing"
+
+	i18n "forge.lthn.ai/core/go-i18n"
+)
+
+// TestNewImprint tokenises a short past-tense sentence and checks that the
+// resulting imprint reports the token count, at least one verb and one noun,
+// a non-zero "past" tense share and a non-zero "definite" article share.
+func TestNewImprint(t *testing.T) {
+	service, err := i18n.New()
+	if err != nil {
+		t.Fatalf("i18n.New() failed: %v", err)
+	}
+	i18n.SetDefault(service)
+
+	imp := NewImprint(NewTokeniser().Tokenise("Deleted the configuration files successfully"))
+
+	if got := imp.TokenCount; got != 5 {
+		t.Errorf("TokenCount = %d, want 5", got)
+	}
+	if imp.UniqueVerbs == 0 {
+		t.Error("UniqueVerbs = 0, want > 0")
+	}
+	if imp.UniqueNouns == 0 {
+		t.Error("UniqueNouns = 0, want > 0")
+	}
+	if share := imp.TenseDistribution["past"]; share == 0 {
+		t.Error("TenseDistribution[\"past\"] = 0, want > 0")
+	}
+	if share := imp.ArticleUsage["definite"]; share == 0 {
+		t.Error("ArticleUsage[\"definite\"] = 0, want > 0")
+	}
+}
+
+// TestNewImprint_Empty checks that a nil token slice yields a zero TokenCount.
+func TestNewImprint_Empty(t *testing.T) {
+	if got := NewImprint(nil).TokenCount; got != 0 {
+		t.Errorf("TokenCount = %d, want 0", got)
+	}
+}
+
+// TestNewImprint_PluralRatio checks that PluralRatio falls on the expected
+// side of 0.5 for an all-plural and an all-singular noun phrase.
+//
+// NOTE(review): the thresholds are loose (0.5 rather than 1.0 / 0.0),
+// presumably to tolerate tokeniser misclassification — confirm.
+func TestNewImprint_PluralRatio(t *testing.T) {
+	// Fail fast if i18n cannot initialise, so a setup problem is not reported
+	// as a misleading ratio failure.
+	svc, err := i18n.New()
+	if err != nil {
+		t.Fatalf("i18n.New() failed: %v", err)
+	}
+	i18n.SetDefault(svc)
+	tok := NewTokeniser()
+
+	// All plural nouns
+	tokens := tok.Tokenise("files branches repositories")
+	imp := NewImprint(tokens)
+	if imp.PluralRatio < 0.5 {
+		t.Errorf("PluralRatio = %f for all-plural input, want >= 0.5", imp.PluralRatio)
+	}
+
+	// All singular nouns
+	tokens = tok.Tokenise("file branch repository")
+	imp = NewImprint(tokens)
+	if imp.PluralRatio > 0.5 {
+		t.Errorf("PluralRatio = %f for all-singular input, want <= 0.5", imp.PluralRatio)
+	}
+}