From 8f8252cfc563290acea4146970380455f398aae1 Mon Sep 17 00:00:00 2001 From: Virgil Date: Thu, 2 Apr 2026 06:53:06 +0000 Subject: [PATCH] refactor(reversal): expose default disambiguation weights Co-Authored-By: Virgil --- reversal/tokeniser.go | 17 +++++++++-------- reversal/tokeniser_test.go | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index 67761af..ccbda25 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -101,7 +101,7 @@ type Tokeniser struct { verbInf map[string]bool // signal: infinitive markers verbNeg map[string]bool // signal: negation cues withSignals bool // allocate SignalBreakdown on ambiguous tokens - weights map[string]float64 // signal weights (F3: configurable) + weights map[string]float64 // signal weights used during disambiguation } // TokeniserOption configures a Tokeniser. @@ -122,7 +122,7 @@ func WithWeights(w map[string]float64) TokeniserOption { } // Start from the defaults so callers can override only the weights they // care about without accidentally disabling the rest of the signal set. - copied := defaultWeights() + copied := DefaultWeights() for key, value := range w { copied[key] = value } @@ -156,7 +156,7 @@ func NewTokeniserForLang(lang string, opts ...TokeniserOption) *Tokeniser { t.buildDualClassIndex() t.buildSignalIndex() if t.weights == nil { - t.weights = defaultWeights() + t.weights = DefaultWeights() } return t } @@ -630,7 +630,8 @@ func defaultVerbAuxiliaries() []string { } } -func defaultWeights() map[string]float64 { +// DefaultWeights returns a copy of the tokeniser's built-in signal weights. +func DefaultWeights() map[string]float64 { return map[string]float64{ "noun_determiner": 0.35, "verb_auxiliary": 0.25, @@ -885,7 +886,7 @@ func matchFrenchAttachedArticle(lower string) (string, bool) { const tokenAmbiguous TokenType = -1 // clauseBoundaries lists words that delimit clause boundaries for -// the verb_saturation signal (D2 review fix). +// the verb_saturation signal. var clauseBoundaries = map[string]bool{ "and": true, "or": true, "but": true, "because": true, "when": true, "while": true, "if": true, "then": true, "so": true, @@ -1456,7 +1457,7 @@ func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) { // hasConfidentVerbInClause scans for a confident verb (Confidence >= 1.0) // within the same clause as the token at idx. Clause boundaries are -// punctuation tokens and clause-boundary conjunctions/subordinators (D2). +// punctuation tokens and clause-boundary conjunctions/subordinators. func (t *Tokeniser) hasConfidentVerbInClause(tokens []Token, idx int) bool { // Scan backwards from idx to find clause start. start := 0 @@ -1516,8 +1517,8 @@ func (t *Tokeniser) checkInflectionEcho(tokens []Token, idx int) (bool, bool) { func (t *Tokeniser) resolveToken(tok *Token, verbScore, nounScore float64, components []SignalComponent) { total := verbScore + nounScore - // B3 review fix: if total < 0.10 (only default prior fired), - // use low-information confidence floor. + // If only the default prior fired, keep confidence near chance rather than + // pretending the classification is strongly supported. if total < 0.10 { if verbScore >= nounScore { tok.Type = TokenVerb diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index ef7b6a8..9c36611 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -1259,6 +1259,20 @@ func TestWithWeights_PartialOverrideKeepsDefaults(t *testing.T) { } } +func TestDefaultWeights_ReturnsCopy(t *testing.T) { + first := DefaultWeights() + second := DefaultWeights() + + if first["noun_determiner"] != 0.35 { + t.Fatalf("DefaultWeights()[noun_determiner] = %v, want 0.35", first["noun_determiner"]) + } + first["noun_determiner"] = 0 + + if second["noun_determiner"] != 0.35 { + t.Fatalf("DefaultWeights() should return a fresh copy, got %v", second["noun_determiner"]) + } +} + // --- Benchmarks --- func benchSetup(b *testing.B) { -- 2.45.3