feat(reversal): add Token type and Tokenise function

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Claude 2026-02-16 23:22:40 +00:00
parent 6d72540530
commit f09cff894f
No known key found for this signature in database
GPG key ID: AF404715446AEB41
2 changed files with 173 additions and 0 deletions

View file

@ -35,6 +35,30 @@ type NounMatch struct {
Form string // The original form
}
// TokenType classifies a token identified during tokenisation.
type TokenType int

// Token classifications. Tokenise assigns these by trying matchers in
// priority order: article, verb, noun, grammar word, then unknown;
// punctuation tokens are emitted separately for stripped suffixes.
const (
	TokenUnknown TokenType = iota // Unrecognised word
	TokenVerb                     // Matched verb (see VerbInfo)
	TokenNoun                     // Matched noun (see NounInfo)
	TokenArticle                  // Matched article ("a", "an", "the")
	TokenWord                     // Matched word from grammar word map
	TokenPunctuation              // Punctuation ("...", "?")
)
// Token represents a single classified token from a text string.
// Only the auxiliary field selected by Type is populated by Tokenise;
// the remaining auxiliary fields hold their zero values.
type Token struct {
	Raw       string    // Original text as it appeared in input
	Lower     string    // Lowercased form
	Type      TokenType // Classification
	VerbInfo  VerbMatch // Set when Type == TokenVerb
	NounInfo  NounMatch // Set when Type == TokenNoun
	WordCat   string    // Set when Type == TokenWord
	ArtType   string    // Set when Type == TokenArticle
	PunctType string    // Set when Type == TokenPunctuation
}
// Tokeniser provides reverse grammar lookups by maintaining inverse
// indexes built from the forward grammar tables.
type Tokeniser struct {
@ -449,3 +473,89 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
return "", false
}
// Tokenise splits text on whitespace and classifies each word.
// Priority: punctuation → article → verb → noun → word → unknown.
// Trailing punctuation is stripped from words before matching and
// emitted as its own TokenPunctuation token when recognised.
func (t *Tokeniser) Tokenise(text string) []Token {
	trimmed := strings.TrimSpace(text)
	if trimmed == "" {
		return nil
	}
	var out []Token
	for _, field := range strings.Fields(trimmed) {
		// Separate the word from any trailing punctuation.
		word, punct := splitTrailingPunct(field)
		if word != "" {
			out = append(out, t.classifyWord(field, word))
		}
		if punct == "" {
			continue
		}
		// Emit a punctuation token if the stripped suffix is recognised.
		if punctType, ok := matchPunctuation(punct); ok {
			out = append(out, Token{
				Raw:       punct,
				Lower:     punct,
				Type:      TokenPunctuation,
				PunctType: punctType,
			})
		}
	}
	return out
}

// classifyWord builds a Token for word, trying each matcher in priority
// order (article, verb, noun, grammar word) and falling back to
// TokenUnknown. raw is the original field text before punctuation was
// stripped; it is preserved in Token.Raw.
func (t *Tokeniser) classifyWord(raw, word string) Token {
	tok := Token{Raw: raw, Lower: strings.ToLower(word)}
	if artType, ok := t.MatchArticle(word); ok {
		tok.Type = TokenArticle
		tok.ArtType = artType
		return tok
	}
	if vm, ok := t.MatchVerb(word); ok {
		tok.Type = TokenVerb
		tok.VerbInfo = vm
		return tok
	}
	if nm, ok := t.MatchNoun(word); ok {
		tok.Type = TokenNoun
		tok.NounInfo = nm
		return tok
	}
	if cat, ok := t.MatchWord(word); ok {
		tok.Type = TokenWord
		tok.WordCat = cat
		return tok
	}
	tok.Type = TokenUnknown
	return tok
}
// splitTrailingPunct separates a word from its trailing punctuation.
// Returns the word and the punctuation suffix. Punctuation patterns
// recognised: "..." (progress), "?" (question), ":" (label).
//
// A token that is nothing but punctuation (e.g. a lone "?") yields an
// empty word and the punctuation itself, matching how a lone "..." was
// already handled.
func splitTrailingPunct(s string) (string, string) {
	// Check for "..." suffix first (3-char pattern).
	if strings.HasSuffix(s, "...") {
		return strings.TrimSuffix(s, "..."), "..."
	}
	// Check single-char trailing punctuation. The previous len(s) > 1
	// guard made a lone "?" or ":" pass through unclassified, which was
	// inconsistent with a lone "..." — any non-empty string may now end
	// in punctuation.
	if len(s) > 0 {
		switch last := s[len(s)-1]; last {
		case '?', ':':
			return s[:len(s)-1], string(last)
		}
	}
	return s, ""
}
// matchPunctuation detects known punctuation patterns.
// Returns the punctuation type and true if recognised; otherwise an
// empty string and false.
func matchPunctuation(punct string) (string, bool) {
	if punct == "..." {
		return "progress", true
	}
	if punct == "?" {
		return "question", true
	}
	if punct == ":" {
		return "label", true
	}
	return "", false
}

View file

@ -206,6 +206,69 @@ func TestTokeniser_MatchArticle(t *testing.T) {
}
}
// TestTokeniser_Tokenise verifies that a mixed sentence produces the
// expected sequence of token classifications and per-token details.
func TestTokeniser_Tokenise(t *testing.T) {
	setup(t)
	tz := NewTokeniser()
	got := tz.Tokenise("Deleted the configuration files")
	if len(got) != 4 {
		t.Fatalf("Tokenise() returned %d tokens, want 4", len(got))
	}
	// "Deleted" should match as a past-tense verb.
	if got[0].Type != TokenVerb {
		t.Errorf("tokens[0].Type = %v, want TokenVerb", got[0].Type)
	}
	if got[0].VerbInfo.Tense != "past" {
		t.Errorf("tokens[0].VerbInfo.Tense = %q, want %q", got[0].VerbInfo.Tense, "past")
	}
	// "the" should match as an article.
	if got[1].Type != TokenArticle {
		t.Errorf("tokens[1].Type = %v, want TokenArticle", got[1].Type)
	}
	// "configuration" is not in any table, so it stays unknown.
	if got[2].Type != TokenUnknown {
		t.Errorf("tokens[2].Type = %v, want TokenUnknown", got[2].Type)
	}
	// "files" should match as a plural noun.
	if got[3].Type != TokenNoun {
		t.Errorf("tokens[3].Type = %v, want TokenNoun", got[3].Type)
	}
	if !got[3].NounInfo.Plural {
		t.Errorf("tokens[3].NounInfo.Plural = false, want true")
	}
}
// TestTokeniser_Tokenise_Punctuation verifies that trailing "..." is
// surfaced as a punctuation token.
func TestTokeniser_Tokenise_Punctuation(t *testing.T) {
	setup(t)
	tz := NewTokeniser()
	sawPunct := false
	for _, tk := range tz.Tokenise("Building project...") {
		if tk.Type == TokenPunctuation {
			sawPunct = true
			break
		}
	}
	if !sawPunct {
		t.Error("did not detect punctuation in \"Building project...\"")
	}
}
// TestTokeniser_Tokenise_Empty verifies that empty input yields no
// tokens at all.
func TestTokeniser_Tokenise_Empty(t *testing.T) {
	setup(t)
	tz := NewTokeniser()
	if got := tz.Tokenise(""); len(got) != 0 {
		t.Errorf("Tokenise(\"\") returned %d tokens, want 0", len(got))
	}
}
func TestTokeniser_MatchVerb_Regular(t *testing.T) {
setup(t)
tok := NewTokeniser()