From 0004dd91ac6333c704cfc62aea810b3bdd5434a0 Mon Sep 17 00:00:00 2001
From: Virgil
Date: Mon, 30 Mar 2026 00:10:41 +0000
Subject: [PATCH] feat(reversal): add sentence boundary punctuation tokens

---
 reversal/tokeniser.go      | 10 +++++++++-
 reversal/tokeniser_test.go | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index 85c32d8..4e34f57 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -945,7 +945,7 @@ func splitTrailingPunct(s string) (string, string) {
 	// Check single-char trailing punctuation.
 	if len(s) > 1 {
 		last := s[len(s)-1]
-		if last == '?' || last == ':' || last == '!' || last == ';' || last == ',' {
+		if last == '?' || last == ':' || last == '!' || last == ';' || last == ',' || last == '.' || last == ')' || last == ']' || last == '}' {
 			return s[:len(s)-1], string(last)
 		}
 	}
@@ -968,6 +968,14 @@ func matchPunctuation(punct string) (string, bool) {
 		return "separator", true
 	case ",":
 		return "comma", true
+	case ".":
+		return "sentence_end", true
+	case ")":
+		return "close_paren", true
+	case "]":
+		return "close_bracket", true
+	case "}":
+		return "close_brace", true
 	}
 	return "", false
 }
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index 97c6b7d..234e957 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -259,6 +259,39 @@ func TestTokeniser_Tokenise_Punctuation(t *testing.T) {
 	}
 }
 
+func TestTokeniser_Tokenise_ClauseBoundarySentence(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("run tests. \ncommit")
+	hasSentenceEnd := false
+	hasCommit := false
+
+	for _, token := range tokens {
+		if token.Raw == "run" && token.Type != TokenVerb {
+			t.Errorf("'run' should remain TokenVerb, got %v", token.Type)
+		}
+		if token.Type == TokenPunctuation && token.PunctType == "sentence_end" {
+			hasSentenceEnd = true
+		}
+		if token.Lower == "commit" {
+			hasCommit = true
+			// Without sentence-end boundary support, this can be demoted by verb saturation.
+			// With boundary detection, it should still classify as a verb.
+			if token.Type != TokenVerb {
+				t.Errorf("'commit' after period should be TokenVerb, got %v", token.Type)
+			}
+		}
+	}
+
+	if !hasSentenceEnd {
+		t.Error("did not detect sentence-end punctuation in \"run tests. commit\"")
+	}
+	if !hasCommit {
+		t.Error("did not find a 'commit' token in \"run tests. commit\"")
+	}
+}
+
 func TestTokeniser_Tokenise_Empty(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser()
-- 
2.45.3