diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index f49887b..a36169d 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -115,7 +115,19 @@ func WithSignals() TokeniserOption {
 // WithWeights overrides the default signal weights for disambiguation.
 // All signal keys must be present; omitted keys silently disable those signals.
 func WithWeights(w map[string]float64) TokeniserOption {
-	return func(t *Tokeniser) { t.weights = w }
+	return func(t *Tokeniser) {
+		if len(w) == 0 {
+			t.weights = nil
+			return
+		}
+		// Copy the map so callers can safely reuse or mutate their input after
+		// constructing the tokeniser.
+		copied := make(map[string]float64, len(w))
+		for key, value := range w {
+			copied[key] = value
+		}
+		t.weights = copied
+	}
 }
 
 // NewTokeniser creates a Tokeniser for English ("en").
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index be505fc..05a9364 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -896,7 +896,7 @@ func TestDisambiguationStats_NoAmbiguous(t *testing.T) {
 func TestWithWeights_Override(t *testing.T) {
 	setup(t)
 	// Override noun_determiner to 0 — "The commit" should no longer resolve as noun
-	tok := NewTokeniser(WithWeights(map[string]float64{
+	weights := map[string]float64{
 		"noun_determiner":   0.0,
 		"verb_auxiliary":    0.25,
 		"following_class":   0.15,
@@ -904,7 +904,8 @@ func TestWithWeights_Override(t *testing.T) {
 		"verb_saturation":   0.10,
 		"inflection_echo":   0.03,
 		"default_prior":     0.02,
-	}))
+	}
+	tok := NewTokeniser(WithWeights(weights))
 	tokens := tok.Tokenise("The commit")
 	// With noun_determiner zeroed, default_prior (verb) should win
 	if tokens[1].Type != TokenVerb {
@@ -912,6 +913,29 @@ func TestWithWeights_Override(t *testing.T) {
 	}
 }
 
+func TestWithWeights_CopiesInputMap(t *testing.T) {
+	setup(t)
+	weights := map[string]float64{
+		"noun_determiner":   0.35,
+		"verb_auxiliary":    0.25,
+		"following_class":   0.15,
+		"sentence_position": 0.10,
+		"verb_saturation":   0.10,
+		"inflection_echo":   0.03,
+		"default_prior":     0.02,
+	}
+	tok := NewTokeniser(WithWeights(weights))
+
+	// Mutate the caller's map after construction; the tokeniser should keep
+	// using the original copied values.
+	weights["noun_determiner"] = 0
+
+	tokens := tok.Tokenise("The commit")
+	if tokens[1].Type != TokenNoun {
+		t.Fatalf("with copied weights, 'commit' Type = %v, want TokenNoun", tokens[1].Type)
+	}
+}
+
 // --- Benchmarks ---
 
 func benchSetup(b *testing.B) {