feat(reversal): add Token type and Tokenise function
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
parent 6d72540530
commit f09cff894f
2 changed files with 173 additions and 0 deletions
@@ -35,6 +35,30 @@ type NounMatch struct {
    Form string // The original form
}

// TokenType classifies a token identified during tokenisation.
type TokenType int

const (
    TokenUnknown     TokenType = iota // Unrecognised word
    TokenVerb                         // Matched verb (see VerbInfo)
    TokenNoun                         // Matched noun (see NounInfo)
    TokenArticle                      // Matched article ("a", "an", "the")
    TokenWord                         // Matched word from grammar word map
    TokenPunctuation                  // Punctuation ("...", "?")
)

// Token represents a single classified token from a text string.
type Token struct {
    Raw       string    // Original text as it appeared in input
    Lower     string    // Lowercased form
    Type      TokenType // Classification
    VerbInfo  VerbMatch // Set when Type == TokenVerb
    NounInfo  NounMatch // Set when Type == TokenNoun
    WordCat   string    // Set when Type == TokenWord
    ArtType   string    // Set when Type == TokenArticle
    PunctType string    // Set when Type == TokenPunctuation
}

// Tokeniser provides reverse grammar lookups by maintaining inverse
// indexes built from the forward grammar tables.
type Tokeniser struct {
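Taken together, the two types above are the public surface of the tokeniser: callers walk the returned slice and branch on Type, reading whichever payload field that type guarantees. A minimal consumer sketch (the describe helper is illustrative, not part of this commit; Tokenise and NewTokeniser are as defined elsewhere in the diff):

// describe renders each token as "kind:value" for logging or debugging.
// Hypothetical helper; only the Token API it consumes comes from this commit.
func describe(t *Tokeniser, text string) []string {
    var out []string
    for _, tok := range t.Tokenise(text) {
        switch tok.Type {
        case TokenVerb:
            out = append(out, "verb:"+tok.Lower)
        case TokenNoun:
            out = append(out, "noun:"+tok.Lower)
        case TokenArticle:
            out = append(out, "article:"+tok.ArtType)
        case TokenWord:
            out = append(out, "word:"+tok.WordCat)
        case TokenPunctuation:
            out = append(out, "punct:"+tok.PunctType)
        default: // TokenUnknown
            out = append(out, "unknown:"+tok.Lower)
        }
    }
    return out
}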
@@ -449,3 +473,89 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
    return "", false
}

// Tokenise splits text on whitespace and classifies each word.
// Priority: punctuation → article → verb → noun → word → unknown.
// Trailing punctuation is stripped from words before matching.
func (t *Tokeniser) Tokenise(text string) []Token {
    text = strings.TrimSpace(text)
    if text == "" {
        return nil
    }

    parts := strings.Fields(text)
    var tokens []Token

    for _, raw := range parts {
        // Strip trailing punctuation to get the clean word.
        word, punct := splitTrailingPunct(raw)

        // Classify the word portion (if any).
        if word != "" {
            tok := Token{Raw: raw, Lower: strings.ToLower(word)}

            if artType, ok := t.MatchArticle(word); ok {
                tok.Type = TokenArticle
                tok.ArtType = artType
            } else if vm, ok := t.MatchVerb(word); ok {
                tok.Type = TokenVerb
                tok.VerbInfo = vm
            } else if nm, ok := t.MatchNoun(word); ok {
                tok.Type = TokenNoun
                tok.NounInfo = nm
            } else if cat, ok := t.MatchWord(word); ok {
                tok.Type = TokenWord
                tok.WordCat = cat
            } else {
                tok.Type = TokenUnknown
            }
            tokens = append(tokens, tok)
        }

        // Emit a punctuation token if trailing punctuation was found.
        if punct != "" {
            if punctType, ok := matchPunctuation(punct); ok {
                tokens = append(tokens, Token{
                    Raw:       punct,
                    Lower:     punct,
                    Type:      TokenPunctuation,
                    PunctType: punctType,
                })
            }
        }
    }

    return tokens
}

// splitTrailingPunct separates a word from its trailing punctuation.
// Returns the word and the punctuation suffix. Punctuation patterns
// recognised: "..." (progress), "?" (question), ":" (label).
func splitTrailingPunct(s string) (string, string) {
    // Check for "..." suffix first (3-char pattern).
    if strings.HasSuffix(s, "...") {
        return s[:len(s)-3], "..."
    }
    // Check single-char trailing punctuation.
    if len(s) > 1 {
        last := s[len(s)-1]
        if last == '?' || last == ':' {
            return s[:len(s)-1], string(last)
        }
    }
    return s, ""
}

// matchPunctuation detects known punctuation patterns.
// Returns the punctuation type and true if recognised.
func matchPunctuation(punct string) (string, bool) {
    switch punct {
    case "...":
        return "progress", true
    case "?":
        return "question", true
    case ":":
        return "label", true
    }
    return "", false
}
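Since splitTrailingPunct drives everything Tokenise emits, it is worth pinning down its contract. The expected results below are read directly off the code above (a reading aid, not additional API):

splitTrailingPunct("files...") // → ("files", "...")
splitTrailingPunct("ready?")   // → ("ready", "?")
splitTrailingPunct("target:")  // → ("target", ":")
splitTrailingPunct("?")        // → ("?", "")  single-character input is kept whole
splitTrailingPunct("done")     // → ("done", "")

Note the asymmetry: a bare "?" survives as a word thanks to the len(s) > 1 guard, while a bare "..." is stripped to an empty word, so Tokenise emits only the punctuation token for it.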
@@ -206,6 +206,69 @@ func TestTokeniser_MatchArticle(t *testing.T) {
    }
}

func TestTokeniser_Tokenise(t *testing.T) {
    setup(t)
    tok := NewTokeniser()

    tokens := tok.Tokenise("Deleted the configuration files")

    if len(tokens) != 4 {
        t.Fatalf("Tokenise() returned %d tokens, want 4", len(tokens))
    }

    // "Deleted" → verb, past tense
    if tokens[0].Type != TokenVerb {
        t.Errorf("tokens[0].Type = %v, want TokenVerb", tokens[0].Type)
    }
    if tokens[0].VerbInfo.Tense != "past" {
        t.Errorf("tokens[0].VerbInfo.Tense = %q, want %q", tokens[0].VerbInfo.Tense, "past")
    }

    // "the" → article
    if tokens[1].Type != TokenArticle {
        t.Errorf("tokens[1].Type = %v, want TokenArticle", tokens[1].Type)
    }

    // "configuration" → unknown
    if tokens[2].Type != TokenUnknown {
        t.Errorf("tokens[2].Type = %v, want TokenUnknown", tokens[2].Type)
    }

    // "files" → noun, plural
    if tokens[3].Type != TokenNoun {
        t.Errorf("tokens[3].Type = %v, want TokenNoun", tokens[3].Type)
    }
    if !tokens[3].NounInfo.Plural {
        t.Errorf("tokens[3].NounInfo.Plural = false, want true")
    }
}

func TestTokeniser_Tokenise_Punctuation(t *testing.T) {
    setup(t)
    tok := NewTokeniser()

    tokens := tok.Tokenise("Building project...")
    hasPunct := false
    for _, tk := range tokens {
        if tk.Type == TokenPunctuation {
            hasPunct = true
        }
    }
    if !hasPunct {
        t.Error("did not detect punctuation in \"Building project...\"")
    }
}

func TestTokeniser_Tokenise_Empty(t *testing.T) {
    setup(t)
    tok := NewTokeniser()

    tokens := tok.Tokenise("")
    if len(tokens) != 0 {
        t.Errorf("Tokenise(\"\") returned %d tokens, want 0", len(tokens))
    }
}

func TestTokeniser_MatchVerb_Regular(t *testing.T) {
    setup(t)
    tok := NewTokeniser()
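One deterministic case the new tests leave uncovered is punctuation-only input, which exercises the empty-word path in Tokenise. A possible additional test, sketched under the same setup/NewTokeniser conventions as the tests above (not part of this commit):

func TestTokeniser_Tokenise_PunctuationOnly(t *testing.T) {
    setup(t)
    tok := NewTokeniser()

    // splitTrailingPunct("...") leaves no word portion, so the only
    // token emitted should be the punctuation itself.
    tokens := tok.Tokenise("...")
    if len(tokens) != 1 {
        t.Fatalf("Tokenise(\"...\") returned %d tokens, want 1", len(tokens))
    }
    if tokens[0].Type != TokenPunctuation || tokens[0].PunctType != "progress" {
        t.Errorf("got Type=%v, PunctType=%q; want TokenPunctuation, %q",
            tokens[0].Type, tokens[0].PunctType, "progress")
    }
}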