From 03cd99e0924e2275acf8a7fc74e5bdda5f37b44a Mon Sep 17 00:00:00 2001 From: Snider Date: Thu, 19 Feb 2026 16:08:23 +0000 Subject: [PATCH] feat(reversal): add TokeniserOption, dual-class and signal indexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NewTokeniser now accepts variadic options (backwards compatible). Builds dual-class index from verb∩noun overlap and signal word lookup sets from gram.signal data. Configurable weights via WithWeights() for future calibration. Co-Authored-By: Virgil Co-Authored-By: Claude Opus 4.6 --- reversal/tokeniser.go | 98 ++++++++++++++++++++++++++++++++++++-- reversal/tokeniser_test.go | 25 ++++++++++ 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go index e4f5cea..f91a7e2 100644 --- a/reversal/tokeniser.go +++ b/reversal/tokeniser.go @@ -89,16 +89,36 @@ type Tokeniser struct { baseNouns map[string]bool // "file" → true words map[string]string // word translations lang string + + dualClass map[string]bool // words in both verb AND noun tables + nounDet map[string]bool // signal: noun determiners + verbAux map[string]bool // signal: verb auxiliaries + verbInf map[string]bool // signal: infinitive markers + withSignals bool // allocate SignalBreakdown on ambiguous tokens + weights map[string]float64 // signal weights (F3: configurable) +} + +// TokeniserOption configures a Tokeniser. +type TokeniserOption func(*Tokeniser) + +// WithSignals enables detailed SignalBreakdown on ambiguous tokens. +func WithSignals() TokeniserOption { + return func(t *Tokeniser) { t.withSignals = true } +} + +// WithWeights overrides the default signal weights for disambiguation. +func WithWeights(w map[string]float64) TokeniserOption { + return func(t *Tokeniser) { t.weights = w } } // NewTokeniser creates a Tokeniser for English ("en"). -func NewTokeniser() *Tokeniser { - return NewTokeniserForLang("en") +func NewTokeniser(opts ...TokeniserOption) *Tokeniser { + return NewTokeniserForLang("en", opts...) } // NewTokeniserForLang creates a Tokeniser for the specified language, // building inverse indexes from the grammar data. -func NewTokeniserForLang(lang string) *Tokeniser { +func NewTokeniserForLang(lang string, opts ...TokeniserOption) *Tokeniser { t := &Tokeniser{ pastToBase: make(map[string]string), gerundToBase: make(map[string]string), @@ -108,9 +128,17 @@ func NewTokeniserForLang(lang string) *Tokeniser { words: make(map[string]string), lang: lang, } + for _, opt := range opts { + opt(t) + } t.buildVerbIndex() t.buildNounIndex() t.buildWordIndex() + t.buildDualClassIndex() + t.buildSignalIndex() + if t.weights == nil { + t.weights = defaultWeights() + } return t } @@ -465,6 +493,70 @@ func (t *Tokeniser) buildWordIndex() { } } +// IsDualClass returns true if the word exists in both verb and noun tables. +func (t *Tokeniser) IsDualClass(word string) bool { + return t.dualClass[strings.ToLower(word)] +} + +func (t *Tokeniser) buildDualClassIndex() { + t.dualClass = make(map[string]bool) + for base := range t.baseVerbs { + if t.baseNouns[base] { + t.dualClass[base] = true + } + } +} + +func (t *Tokeniser) buildSignalIndex() { + t.nounDet = make(map[string]bool) + t.verbAux = make(map[string]bool) + t.verbInf = make(map[string]bool) + + data := i18n.GetGrammarData(t.lang) + if data != nil && len(data.Signals.NounDeterminers) > 0 { + for _, w := range data.Signals.NounDeterminers { + t.nounDet[strings.ToLower(w)] = true + } + for _, w := range data.Signals.VerbAuxiliaries { + t.verbAux[strings.ToLower(w)] = true + } + for _, w := range data.Signals.VerbInfinitive { + t.verbInf[strings.ToLower(w)] = true + } + return + } + + // Fallback: hardcoded English defaults + for _, w := range []string{ + "the", "a", "an", "this", "that", "these", "those", + "my", "your", "his", "her", "its", "our", "their", + "every", "each", "some", "any", "no", + "many", "few", "several", "all", "both", + } { + t.nounDet[w] = true + } + for _, w := range []string{ + "is", "are", "was", "were", "has", "had", "have", + "do", "does", "did", "will", "would", "could", "should", + "can", "may", "might", "shall", "must", + } { + t.verbAux[w] = true + } + t.verbInf["to"] = true +} + +func defaultWeights() map[string]float64 { + return map[string]float64{ + "noun_determiner": 0.35, + "verb_auxiliary": 0.25, + "following_class": 0.15, + "sentence_position": 0.10, + "verb_saturation": 0.10, + "inflection_echo": 0.03, + "default_prior": 0.02, + } +} + // MatchWord performs a case-insensitive lookup in the words map. // Returns the category key and true if found, or ("", false) otherwise. func (t *Tokeniser) MatchWord(word string) (string, bool) { diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go index 3f9329e..d8e447a 100644 --- a/reversal/tokeniser_test.go +++ b/reversal/tokeniser_test.go @@ -309,6 +309,31 @@ func TestTokeniser_MatchVerb_Regular(t *testing.T) { } } +func TestTokeniser_WithSignals(t *testing.T) { + setup(t) + tok := NewTokeniser(WithSignals()) + _ = tok // verify it compiles and accepts the option +} + +func TestTokeniser_DualClassDetection(t *testing.T) { + setup(t) + tok := NewTokeniser() + + dualClass := []string{"commit", "run", "test", "check", "file", "build"} + for _, word := range dualClass { + if !tok.IsDualClass(word) { + t.Errorf("%q should be dual-class", word) + } + } + + notDual := []string{"delete", "go", "push", "branch", "repo"} + for _, word := range notDual { + if tok.IsDualClass(word) { + t.Errorf("%q should not be dual-class", word) + } + } +} + func TestToken_ConfidenceField(t *testing.T) { setup(t) tok := NewTokeniser()