From 8b23600632883806b0f2ae669251fc9bf0bbfa33 Mon Sep 17 00:00:00 2001
From: Claude
Date: Mon, 16 Feb 2026 23:25:08 +0000
Subject: [PATCH] feat(reversal): add GrammarImprint struct and constructor

Co-Authored-By: Claude Opus 4.6
---
 reversal/imprint.go      | 102 +++++++++++++++++++++++++++++++++++++++
 reversal/imprint_test.go |  62 ++++++++++++++++++++++++
 2 files changed, 164 insertions(+)
 create mode 100644 reversal/imprint.go
 create mode 100644 reversal/imprint_test.go

diff --git a/reversal/imprint.go b/reversal/imprint.go
new file mode 100644
index 0000000..53cabba
--- /dev/null
+++ b/reversal/imprint.go
@@ -0,0 +1,102 @@
+package reversal
+
+// GrammarImprint is a low-dimensional grammar feature vector; NewImprint builds one from a slice of classified tokens.
+type GrammarImprint struct {
+	VerbDistribution   map[string]float64 // verb base -> relative frequency (normalised to sum to 1.0)
+	TenseDistribution  map[string]float64 // "past"/"gerund"/"base" -> ratio (normalised to sum to 1.0)
+	NounDistribution   map[string]float64 // noun base -> relative frequency (normalised to sum to 1.0)
+	PluralRatio        float64            // proportion of plural nouns (0.0-1.0); 0 when there are no nouns
+	DomainVocabulary   map[string]int     // gram.word category -> raw hit count (not normalised)
+	ArticleUsage       map[string]float64 // "definite"/"indefinite" -> ratio (normalised to sum to 1.0)
+	PunctuationPattern map[string]float64 // "label"/"progress"/"question" -> ratio (normalised to sum to 1.0)
+	TokenCount         int                // number of tokens the imprint was built from
+	UniqueVerbs        int                // number of distinct verb bases seen
+	UniqueNouns        int                // number of distinct noun bases seen
+}
+
+// NewImprint calculates a GrammarImprint from classified tokens.
+//
+// The verb, tense, noun, article and punctuation maps are normalised so that
+// each sums to 1.0; DomainVocabulary keeps raw hit counts. All maps in the
+// result are non-nil, including when tokens is empty.
+func NewImprint(tokens []Token) GrammarImprint {
+	imp := GrammarImprint{
+		VerbDistribution:   make(map[string]float64),
+		TenseDistribution:  make(map[string]float64),
+		NounDistribution:   make(map[string]float64),
+		DomainVocabulary:   make(map[string]int),
+		ArticleUsage:       make(map[string]float64),
+		PunctuationPattern: make(map[string]float64),
+	}
+
+	if len(tokens) == 0 {
+		// Nothing to count: return the imprint with empty, non-nil maps.
+		return imp
+	}
+
+	imp.TokenCount = len(tokens)
+
+	var pluralNouns, totalNouns int
+
+	// Single pass: tally raw counts per category. The float maps hold raw
+	// counts at this point and are converted to ratios after the loop.
+	for _, tok := range tokens {
+		switch tok.Type {
+		case TokenVerb:
+			imp.VerbDistribution[tok.VerbInfo.Base]++
+			imp.TenseDistribution[tok.VerbInfo.Tense]++
+
+		case TokenNoun:
+			imp.NounDistribution[tok.NounInfo.Base]++
+			totalNouns++
+			if tok.NounInfo.Plural {
+				pluralNouns++
+			}
+
+		case TokenArticle:
+			imp.ArticleUsage[tok.ArtType]++
+
+		case TokenWord:
+			// Kept as raw hit counts; this map is not normalised below.
+			imp.DomainVocabulary[tok.WordCat]++
+
+		case TokenPunctuation:
+			imp.PunctuationPattern[tok.PunctType]++
+		}
+	}
+
+	// The distribution maps are keyed by base form, so their sizes are exactly
+	// the unique verb/noun counts; no separate set needs to be maintained.
+	// (normaliseMap rescales values but never adds or removes keys.)
+	imp.UniqueVerbs = len(imp.VerbDistribution)
+	imp.UniqueNouns = len(imp.NounDistribution)
+
+	// Calculate plural ratio
+	if totalNouns > 0 {
+		imp.PluralRatio = float64(pluralNouns) / float64(totalNouns)
+	}
+
+	// Normalise frequency maps to sum to 1.0
+	normaliseMap(imp.VerbDistribution)
+	normaliseMap(imp.TenseDistribution)
+	normaliseMap(imp.NounDistribution)
+	normaliseMap(imp.ArticleUsage)
+	normaliseMap(imp.PunctuationPattern)
+
+	return imp
+}
+
+// normaliseMap scales all values in a map so they sum to 1.0.
+// If the map is empty or sums to zero, it is left unchanged.
+func normaliseMap(m map[string]float64) {
+	var total float64
+	for _, v := range m {
+		total += v
+	}
+	if total == 0 { // empty map or values summing to zero: nothing to scale
+		return
+	}
+	for k, v := range m {
+		m[k] = v / total // only existing keys are overwritten, so ranging stays well-defined
+	}
+}
diff --git a/reversal/imprint_test.go b/reversal/imprint_test.go
new file mode 100644
index 0000000..66e1441
--- /dev/null
+++ b/reversal/imprint_test.go
@@ -0,0 +1,62 @@
+package reversal
+
+import (
+	"testing"
+
+	i18n "forge.lthn.ai/core/go-i18n"
+)
+
+func TestNewImprint(t *testing.T) {
+	svc, err := i18n.New()
+	if err != nil {
+		t.Fatalf("i18n.New() failed: %v", err)
+	}
+	i18n.SetDefault(svc)
+
+	tok := NewTokeniser()
+	tokens := tok.Tokenise("Deleted the configuration files successfully")
+	imp := NewImprint(tokens)
+
+	if imp.TokenCount != 5 {
+		t.Errorf("TokenCount = %d, want 5", imp.TokenCount)
+	}
+	if imp.UniqueVerbs == 0 {
+		t.Error("UniqueVerbs = 0, want > 0")
+	}
+	if imp.UniqueNouns == 0 {
+		t.Error("UniqueNouns = 0, want > 0")
+	}
+	if imp.TenseDistribution["past"] == 0 {
+		t.Error("TenseDistribution[\"past\"] = 0, want > 0")
+	}
+	if imp.ArticleUsage["definite"] == 0 {
+		t.Error("ArticleUsage[\"definite\"] = 0, want > 0")
+	}
+}
+
+func TestNewImprint_Empty(t *testing.T) {
+	if imp := NewImprint(nil); imp.TokenCount != 0 {
+		t.Errorf("TokenCount = %d, want 0", imp.TokenCount)
+	}
+}
+
+func TestNewImprint_PluralRatio(t *testing.T) {
+	svc, err := i18n.New()
+	if err != nil {
+		t.Fatalf("i18n.New() failed: %v", err)
+	}
+	i18n.SetDefault(svc)
+	tok := NewTokeniser()
+
+	// All plural nouns
+	imp := NewImprint(tok.Tokenise("files branches repositories"))
+	if imp.PluralRatio < 0.5 {
+		t.Errorf("PluralRatio = %f for all-plural input, want >= 0.5", imp.PluralRatio)
+	}
+
+	// All singular nouns
+	imp = NewImprint(tok.Tokenise("file branch repository"))
+	if imp.PluralRatio > 0.5 {
+		t.Errorf("PluralRatio = %f for all-singular input, want <= 0.5", imp.PluralRatio)
+	}
+}