go-i18n/reversal/imprint.go

package reversal

import "math"

// GrammarImprint is a low-dimensional grammar feature vector.
type GrammarImprint struct {
	VerbDistribution   map[string]float64 // verb base -> frequency
	TenseDistribution  map[string]float64 // "past"/"gerund"/"base" -> ratio
	NounDistribution   map[string]float64 // noun base -> frequency
	PluralRatio        float64            // proportion of plural nouns (0.0-1.0)
	DomainVocabulary   map[string]int     // gram.word category -> hit count
	ArticleUsage       map[string]float64 // "definite"/"indefinite" -> ratio
	PunctuationPattern map[string]float64 // "label"/"progress"/"question" -> ratio
	TokenCount         int
	UniqueVerbs        int
	UniqueNouns        int
}

// NewImprint calculates a GrammarImprint from classified tokens.
func NewImprint(tokens []Token) GrammarImprint {
	imp := GrammarImprint{
		VerbDistribution:   make(map[string]float64),
		TenseDistribution:  make(map[string]float64),
		NounDistribution:   make(map[string]float64),
		DomainVocabulary:   make(map[string]int),
		ArticleUsage:       make(map[string]float64),
		PunctuationPattern: make(map[string]float64),
	}

	if len(tokens) == 0 {
		return imp
	}

	imp.TokenCount = len(tokens)

	verbBases := make(map[string]bool)
	nounBases := make(map[string]bool)
	var verbCount, nounCount, articleCount, punctCount int
	var pluralNouns, totalNouns int

	for _, tok := range tokens {
		switch tok.Type {
		case TokenVerb:
			verbCount++
			base := tok.VerbInfo.Base
			imp.VerbDistribution[base]++
			imp.TenseDistribution[tok.VerbInfo.Tense]++
			verbBases[base] = true

		case TokenNoun:
			nounCount++
			base := tok.NounInfo.Base
			imp.NounDistribution[base]++
			nounBases[base] = true
			totalNouns++
			if tok.NounInfo.Plural {
				pluralNouns++
			}

		case TokenArticle:
			articleCount++
			imp.ArticleUsage[tok.ArtType]++

		case TokenWord:
			imp.DomainVocabulary[tok.WordCat]++

		case TokenPunctuation:
			punctCount++
			imp.PunctuationPattern[tok.PunctType]++
		}
	}

	imp.UniqueVerbs = len(verbBases)
	imp.UniqueNouns = len(nounBases)

	// Calculate plural ratio
	if totalNouns > 0 {
		imp.PluralRatio = float64(pluralNouns) / float64(totalNouns)
	}

	// Normalise frequency maps to sum to 1.0
	normaliseMap(imp.VerbDistribution)
	normaliseMap(imp.TenseDistribution)
	normaliseMap(imp.NounDistribution)
	normaliseMap(imp.ArticleUsage)
	normaliseMap(imp.PunctuationPattern)

	return imp
}

// normaliseMap scales all values in a map so they sum to 1.0.
// If the map is empty or sums to zero, it is left unchanged.
func normaliseMap(m map[string]float64) {
	var total float64
	for _, v := range m {
		total += v
	}
	if total == 0 {
		return
	}
	for k, v := range m {
		m[k] = v / total
	}
}

// Similar returns weighted cosine similarity between two imprints (0.0-1.0).
// Weights: verb(0.30), tense(0.20), noun(0.25), article(0.15), punct(0.10).
func (a GrammarImprint) Similar(b GrammarImprint) float64 {
	// Two empty imprints are identical.
	if a.TokenCount == 0 && b.TokenCount == 0 {
		return 1.0
	}

	type component struct {
		weight float64
		a, b   map[string]float64
	}

	components := []component{
		{0.30, a.VerbDistribution, b.VerbDistribution},
		{0.20, a.TenseDistribution, b.TenseDistribution},
		{0.25, a.NounDistribution, b.NounDistribution},
		{0.15, a.ArticleUsage, b.ArticleUsage},
		{0.10, a.PunctuationPattern, b.PunctuationPattern},
	}

	var totalWeight float64
	var weightedSum float64

	for _, c := range components {
		// Skip components where both maps are empty (no signal).
		if len(c.a) == 0 && len(c.b) == 0 {
			continue
		}
		totalWeight += c.weight
		weightedSum += c.weight * mapSimilarity(c.a, c.b)
	}

	if totalWeight == 0 {
		return 1.0
	}

	return weightedSum / totalWeight
}

// mapSimilarity computes cosine similarity between two frequency maps.
// Returns 1.0 for identical distributions, 0.0 for completely disjoint.
func mapSimilarity(a, b map[string]float64) float64 {
	if len(a) == 0 && len(b) == 0 {
		return 1.0
	}
	if len(a) == 0 || len(b) == 0 {
		return 0.0
	}

	// Collect the union of keys.
	keys := make(map[string]bool)
	for k := range a {
		keys[k] = true
	}
	for k := range b {
		keys[k] = true
	}

	var dot, magA, magB float64
	for k := range keys {
		va := a[k]
		vb := b[k]
		dot += va * vb
		magA += va * va
		magB += vb * vb
	}

	denom := math.Sqrt(magA) * math.Sqrt(magB)
	if denom == 0 {
		return 0.0
	}

	return dot / denom
}
feat(reversal): add GrammarImprint struct and constructor Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-16 23:25:08 +00:00			`package reversal`

feat(reversal): add imprint similarity comparison Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-16 23:26:29 +00:00			`import "math"`

feat(reversal): add GrammarImprint struct and constructor Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-16 23:25:08 +00:00			`// GrammarImprint is a low-dimensional grammar feature vector.`
			`type GrammarImprint struct {`
			`VerbDistribution map[string]float64 // verb base -> frequency`
			`TenseDistribution map[string]float64 // "past"/"gerund"/"base" -> ratio`
			`NounDistribution map[string]float64 // noun base -> frequency`
			`PluralRatio float64 // proportion of plural nouns (0.0-1.0)`
			`DomainVocabulary map[string]int // gram.word category -> hit count`
			`ArticleUsage map[string]float64 // "definite"/"indefinite" -> ratio`
			`PunctuationPattern map[string]float64 // "label"/"progress"/"question" -> ratio`
			`TokenCount int`
			`UniqueVerbs int`
			`UniqueNouns int`
			`}`

			`// NewImprint calculates a GrammarImprint from classified tokens.`
			`func NewImprint(tokens []Token) GrammarImprint {`
			`imp := GrammarImprint{`
			`VerbDistribution: make(map[string]float64),`
			`TenseDistribution: make(map[string]float64),`
			`NounDistribution: make(map[string]float64),`
			`DomainVocabulary: make(map[string]int),`
			`ArticleUsage: make(map[string]float64),`
			`PunctuationPattern: make(map[string]float64),`
			`}`

			`if len(tokens) == 0 {`
			`return imp`
			`}`

			`imp.TokenCount = len(tokens)`

			`verbBases := make(map[string]bool)`
			`nounBases := make(map[string]bool)`
			`var verbCount, nounCount, articleCount, punctCount int`
			`var pluralNouns, totalNouns int`

			`for _, tok := range tokens {`
			`switch tok.Type {`
			`case TokenVerb:`
			`verbCount++`
			`base := tok.VerbInfo.Base`
			`imp.VerbDistribution[base]++`
			`imp.TenseDistribution[tok.VerbInfo.Tense]++`
			`verbBases[base] = true`

			`case TokenNoun:`
			`nounCount++`
			`base := tok.NounInfo.Base`
			`imp.NounDistribution[base]++`
			`nounBases[base] = true`
			`totalNouns++`
			`if tok.NounInfo.Plural {`
			`pluralNouns++`
			`}`

			`case TokenArticle:`
			`articleCount++`
			`imp.ArticleUsage[tok.ArtType]++`

			`case TokenWord:`
			`imp.DomainVocabulary[tok.WordCat]++`

			`case TokenPunctuation:`
			`punctCount++`
			`imp.PunctuationPattern[tok.PunctType]++`
			`}`
			`}`

			`imp.UniqueVerbs = len(verbBases)`
			`imp.UniqueNouns = len(nounBases)`

			`// Calculate plural ratio`
			`if totalNouns > 0 {`
			`imp.PluralRatio = float64(pluralNouns) / float64(totalNouns)`
			`}`

			`// Normalise frequency maps to sum to 1.0`
			`normaliseMap(imp.VerbDistribution)`
			`normaliseMap(imp.TenseDistribution)`
			`normaliseMap(imp.NounDistribution)`
			`normaliseMap(imp.ArticleUsage)`
			`normaliseMap(imp.PunctuationPattern)`

			`return imp`
			`}`

			`// normaliseMap scales all values in a map so they sum to 1.0.`
			`// If the map is empty or sums to zero, it is left unchanged.`
			`func normaliseMap(m map[string]float64) {`
			`var total float64`
			`for _, v := range m {`
			`total += v`
			`}`
			`if total == 0 {`
			`return`
			`}`
			`for k, v := range m {`
			`m[k] = v / total`
			`}`
			`}`
feat(reversal): add imprint similarity comparison Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-16 23:26:29 +00:00
			`// Similar returns weighted cosine similarity between two imprints (0.0-1.0).`
			`// Weights: verb(0.30), tense(0.20), noun(0.25), article(0.15), punct(0.10).`
			`func (a GrammarImprint) Similar(b GrammarImprint) float64 {`
			`// Two empty imprints are identical.`
			`if a.TokenCount == 0 && b.TokenCount == 0 {`
			`return 1.0`
			`}`

			`type component struct {`
			`weight float64`
			`a, b map[string]float64`
			`}`

			`components := []component{`
			`{0.30, a.VerbDistribution, b.VerbDistribution},`
			`{0.20, a.TenseDistribution, b.TenseDistribution},`
			`{0.25, a.NounDistribution, b.NounDistribution},`
			`{0.15, a.ArticleUsage, b.ArticleUsage},`
			`{0.10, a.PunctuationPattern, b.PunctuationPattern},`
			`}`

			`var totalWeight float64`
			`var weightedSum float64`

			`for _, c := range components {`
			`// Skip components where both maps are empty (no signal).`
			`if len(c.a) == 0 && len(c.b) == 0 {`
			`continue`
			`}`
			`totalWeight += c.weight`
			`weightedSum += c.weight * mapSimilarity(c.a, c.b)`
			`}`

			`if totalWeight == 0 {`
			`return 1.0`
			`}`

			`return weightedSum / totalWeight`
			`}`

			`// mapSimilarity computes cosine similarity between two frequency maps.`
			`// Returns 1.0 for identical distributions, 0.0 for completely disjoint.`
			`func mapSimilarity(a, b map[string]float64) float64 {`
			`if len(a) == 0 && len(b) == 0 {`
			`return 1.0`
			`}`
			`if len(a) == 0 \|\| len(b) == 0 {`
			`return 0.0`
			`}`

			`// Collect the union of keys.`
			`keys := make(map[string]bool)`
			`for k := range a {`
			`keys[k] = true`
			`}`
			`for k := range b {`
			`keys[k] = true`
			`}`

			`var dot, magA, magB float64`
			`for k := range keys {`
			`va := a[k]`
			`vb := b[k]`
			`dot += va * vb`
			`magA += va * va`
			`magB += vb * vb`
			`}`

			`denom := math.Sqrt(magA) * math.Sqrt(magB)`
			`if denom == 0 {`
			`return 0.0`
			`}`

			`return dot / denom`
			`}`