diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index f06aaf2..e96b290 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -35,6 +35,30 @@ type NounMatch struct {
 	Form string // The original form
 }
 
+// TokenType classifies a token identified during tokenisation.
+type TokenType int
+
+const (
+	TokenUnknown     TokenType = iota // Unrecognised word
+	TokenVerb                         // Matched verb (see VerbInfo)
+	TokenNoun                         // Matched noun (see NounInfo)
+	TokenArticle                      // Matched article ("a", "an", "the")
+	TokenWord                         // Matched word from grammar word map
+	TokenPunctuation                  // Punctuation ("...", "?")
+)
+
+// Token represents a single classified token from a text string.
+type Token struct {
+	Raw       string    // Original text as it appeared in input
+	Lower     string    // Lowercased form
+	Type      TokenType // Classification
+	VerbInfo  VerbMatch // Set when Type == TokenVerb
+	NounInfo  NounMatch // Set when Type == TokenNoun
+	WordCat   string    // Set when Type == TokenWord
+	ArtType   string    // Set when Type == TokenArticle
+	PunctType string    // Set when Type == TokenPunctuation
+}
+
 // Tokeniser provides reverse grammar lookups by maintaining inverse
 // indexes built from the forward grammar tables.
 type Tokeniser struct {
@@ -449,3 +473,89 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
 
 	return "", false
 }
+
+// Tokenise splits text on whitespace and classifies each word.
+// Priority: punctuation → article → verb → noun → word → unknown.
+// Trailing punctuation is stripped from words before matching.
+func (t *Tokeniser) Tokenise(text string) []Token {
+	text = strings.TrimSpace(text)
+	if text == "" {
+		return nil
+	}
+
+	parts := strings.Fields(text)
+	var tokens []Token
+
+	for _, raw := range parts {
+		// Strip trailing punctuation to get the clean word.
+		word, punct := splitTrailingPunct(raw)
+
+		// Classify the word portion (if any).
+		if word != "" {
+			tok := Token{Raw: raw, Lower: strings.ToLower(word)}
+
+			if artType, ok := t.MatchArticle(word); ok {
+				tok.Type = TokenArticle
+				tok.ArtType = artType
+			} else if vm, ok := t.MatchVerb(word); ok {
+				tok.Type = TokenVerb
+				tok.VerbInfo = vm
+			} else if nm, ok := t.MatchNoun(word); ok {
+				tok.Type = TokenNoun
+				tok.NounInfo = nm
+			} else if cat, ok := t.MatchWord(word); ok {
+				tok.Type = TokenWord
+				tok.WordCat = cat
+			} else {
+				tok.Type = TokenUnknown
+			}
+			tokens = append(tokens, tok)
+		}
+
+		// Emit a punctuation token if trailing punctuation was found.
+		if punct != "" {
+			if punctType, ok := matchPunctuation(punct); ok {
+				tokens = append(tokens, Token{
+					Raw:       punct,
+					Lower:     punct,
+					Type:      TokenPunctuation,
+					PunctType: punctType,
+				})
+			}
+		}
+	}
+
+	return tokens
+}
+
+// splitTrailingPunct separates a word from its trailing punctuation.
+// Returns the word and the punctuation suffix. Punctuation patterns
+// recognised: "..." (progress), "?" (question), ":" (label).
+func splitTrailingPunct(s string) (string, string) {
+	// Check for "..." suffix first (3-char pattern).
+	if strings.HasSuffix(s, "...") {
+		return s[:len(s)-3], "..."
+	}
+	// Check single-char trailing punctuation (a bare "?" or ":" yields an empty word).
+	if len(s) > 0 {
+		last := s[len(s)-1]
+		if last == '?' || last == ':' {
+			return s[:len(s)-1], string(last)
+		}
+	}
+	return s, ""
+}
+
+// matchPunctuation detects known punctuation patterns.
+// Returns the punctuation type and true if recognised.
+func matchPunctuation(punct string) (string, bool) {
+	switch punct {
+	case "...":
+		return "progress", true
+	case "?":
+		return "question", true
+	case ":":
+		return "label", true
+	}
+	return "", false
+}
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index 64942e8..22450b8 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -206,6 +206,69 @@ func TestTokeniser_MatchArticle(t *testing.T) {
 	}
 }
 
+func TestTokeniser_Tokenise(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("Deleted the configuration files")
+
+	if len(tokens) != 4 {
+		t.Fatalf("Tokenise() returned %d tokens, want 4", len(tokens))
+	}
+
+	// "Deleted" → verb, past tense
+	if tokens[0].Type != TokenVerb {
+		t.Errorf("tokens[0].Type = %v, want TokenVerb", tokens[0].Type)
+	}
+	if tokens[0].VerbInfo.Tense != "past" {
+		t.Errorf("tokens[0].VerbInfo.Tense = %q, want %q", tokens[0].VerbInfo.Tense, "past")
+	}
+
+	// "the" → article
+	if tokens[1].Type != TokenArticle {
+		t.Errorf("tokens[1].Type = %v, want TokenArticle", tokens[1].Type)
+	}
+
+	// "configuration" → unknown
+	if tokens[2].Type != TokenUnknown {
+		t.Errorf("tokens[2].Type = %v, want TokenUnknown", tokens[2].Type)
+	}
+
+	// "files" → noun, plural
+	if tokens[3].Type != TokenNoun {
+		t.Errorf("tokens[3].Type = %v, want TokenNoun", tokens[3].Type)
+	}
+	if !tokens[3].NounInfo.Plural {
+		t.Errorf("tokens[3].NounInfo.Plural = false, want true")
+	}
+}
+
+func TestTokeniser_Tokenise_Punctuation(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("Building project...")
+	hasPunct := false
+	for _, tk := range tokens {
+		if tk.Type == TokenPunctuation {
+			hasPunct = true
+		}
+	}
+	if !hasPunct {
+		t.Error("did not detect punctuation in \"Building project...\"")
+	}
+}
+
+func TestTokeniser_Tokenise_Empty(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("")
+	if len(tokens) != 0 {
+		t.Errorf("Tokenise(\"\") returned %d tokens, want 0", len(tokens))
+	}
+}
+
 func TestTokeniser_MatchVerb_Regular(t *testing.T) {
 	setup(t)
 	tok := NewTokeniser()