feat(reversal): add Token type and Tokenise function
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
parent 6d72540530
commit f09cff894f
2 changed files with 173 additions and 0 deletions
@@ -35,6 +35,30 @@ type NounMatch struct {
    Form string // The original form
}

// TokenType classifies a token identified during tokenisation.
type TokenType int

const (
    TokenUnknown     TokenType = iota // Unrecognised word
    TokenVerb                         // Matched verb (see VerbInfo)
    TokenNoun                         // Matched noun (see NounInfo)
    TokenArticle                      // Matched article ("a", "an", "the")
    TokenWord                         // Matched word from grammar word map
    TokenPunctuation                  // Punctuation ("...", "?")
)

// Token represents a single classified token from a text string.
type Token struct {
    Raw       string    // Original text as it appeared in input
    Lower     string    // Lowercased form
    Type      TokenType // Classification
    VerbInfo  VerbMatch // Set when Type == TokenVerb
    NounInfo  NounMatch // Set when Type == TokenNoun
    WordCat   string    // Set when Type == TokenWord
    ArtType   string    // Set when Type == TokenArticle
    PunctType string    // Set when Type == TokenPunctuation
}

// Tokeniser provides reverse grammar lookups by maintaining inverse
// indexes built from the forward grammar tables.
type Tokeniser struct {
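Taken together, the two types above are the public surface of the tokeniser: callers walk the returned slice and branch on Type, reading whichever payload field that type guarantees. A minimal consumer sketch (the describe helper is illustrative, not part of this commit; Tokenise and NewTokeniser are as defined elsewhere in the diff):

// describe renders each token as "kind:value" for logging or debugging.
// Hypothetical helper; only the Token API it consumes comes from this commit.
func describe(t *Tokeniser, text string) []string {
    var out []string
    for _, tok := range t.Tokenise(text) {
        switch tok.Type {
        case TokenVerb:
            out = append(out, "verb:"+tok.Lower)
        case TokenNoun:
            out = append(out, "noun:"+tok.Lower)
        case TokenArticle:
            out = append(out, "article:"+tok.ArtType)
        case TokenWord:
            out = append(out, "word:"+tok.WordCat)
        case TokenPunctuation:
            out = append(out, "punct:"+tok.PunctType)
        default: // TokenUnknown
            out = append(out, "unknown:"+tok.Lower)
        }
    }
    return out
}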
@@ -449,3 +473,89 @@ func (t *Tokeniser) MatchArticle(word string) (string, bool) {
    return "", false
}

// Tokenise splits text on whitespace and classifies each word.
// Priority: punctuation → article → verb → noun → word → unknown.
// Trailing punctuation is stripped from words before matching.
func (t *Tokeniser) Tokenise(text string) []Token {
    text = strings.TrimSpace(text)
    if text == "" {
        return nil
    }

    parts := strings.Fields(text)
    var tokens []Token

    for _, raw := range parts {
        // Strip trailing punctuation to get the clean word.
        word, punct := splitTrailingPunct(raw)

        // Classify the word portion (if any).
        if word != "" {
            tok := Token{Raw: raw, Lower: strings.ToLower(word)}

            if artType, ok := t.MatchArticle(word); ok {
                tok.Type = TokenArticle
                tok.ArtType = artType
            } else if vm, ok := t.MatchVerb(word); ok {
                tok.Type = TokenVerb
                tok.VerbInfo = vm
            } else if nm, ok := t.MatchNoun(word); ok {
                tok.Type = TokenNoun
                tok.NounInfo = nm
            } else if cat, ok := t.MatchWord(word); ok {
                tok.Type = TokenWord
                tok.WordCat = cat
            } else {
                tok.Type = TokenUnknown
            }
            tokens = append(tokens, tok)
        }

        // Emit a punctuation token if trailing punctuation was found.
        if punct != "" {
            if punctType, ok := matchPunctuation(punct); ok {
                tokens = append(tokens, Token{
                    Raw:       punct,
                    Lower:     punct,
                    Type:      TokenPunctuation,
                    PunctType: punctType,
                })
            }
        }
    }

    return tokens
}

// splitTrailingPunct separates a word from its trailing punctuation.
// Returns the word and the punctuation suffix. Punctuation patterns
// recognised: "..." (progress), "?" (question), ":" (label).
func splitTrailingPunct(s string) (string, string) {
    // Check for "..." suffix first (3-char pattern).
    if strings.HasSuffix(s, "...") {
        return s[:len(s)-3], "..."
    }
    // Check single-char trailing punctuation.
    if len(s) > 1 {
        last := s[len(s)-1]
        if last == '?' || last == ':' {
            return s[:len(s)-1], string(last)
        }
    }
    return s, ""
}

// matchPunctuation detects known punctuation patterns.
// Returns the punctuation type and true if recognised.
func matchPunctuation(punct string) (string, bool) {
    switch punct {
    case "...":
        return "progress", true
    case "?":
        return "question", true
    case ":":
        return "label", true
    }
    return "", false
}
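Since splitTrailingPunct drives everything Tokenise emits, it is worth pinning down its contract. The expected results below are read directly off the code above (a reading aid, not additional API):

splitTrailingPunct("files...") // → ("files", "...")
splitTrailingPunct("ready?")   // → ("ready", "?")
splitTrailingPunct("target:")  // → ("target", ":")
splitTrailingPunct("?")        // → ("?", "")  single-character input is kept whole
splitTrailingPunct("done")     // → ("done", "")

Note the asymmetry: a bare "?" survives as a word thanks to the len(s) > 1 guard, while a bare "..." is stripped to an empty word, so Tokenise emits only the punctuation token for it.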
@@ -206,6 +206,69 @@ func TestTokeniser_MatchArticle(t *testing.T) {
    }
}

func TestTokeniser_Tokenise(t *testing.T) {
    setup(t)
    tok := NewTokeniser()

    tokens := tok.Tokenise("Deleted the configuration files")

    if len(tokens) != 4 {
        t.Fatalf("Tokenise() returned %d tokens, want 4", len(tokens))
    }

    // "Deleted" → verb, past tense
    if tokens[0].Type != TokenVerb {
        t.Errorf("tokens[0].Type = %v, want TokenVerb", tokens[0].Type)
    }
    if tokens[0].VerbInfo.Tense != "past" {
        t.Errorf("tokens[0].VerbInfo.Tense = %q, want %q", tokens[0].VerbInfo.Tense, "past")
    }

    // "the" → article
    if tokens[1].Type != TokenArticle {
        t.Errorf("tokens[1].Type = %v, want TokenArticle", tokens[1].Type)
    }

    // "configuration" → unknown
    if tokens[2].Type != TokenUnknown {
        t.Errorf("tokens[2].Type = %v, want TokenUnknown", tokens[2].Type)
    }

    // "files" → noun, plural
    if tokens[3].Type != TokenNoun {
        t.Errorf("tokens[3].Type = %v, want TokenNoun", tokens[3].Type)
    }
    if !tokens[3].NounInfo.Plural {
        t.Errorf("tokens[3].NounInfo.Plural = false, want true")
    }
}

func TestTokeniser_Tokenise_Punctuation(t *testing.T) {
    setup(t)
    tok := NewTokeniser()

    tokens := tok.Tokenise("Building project...")
    hasPunct := false
    for _, tk := range tokens {
        if tk.Type == TokenPunctuation {
            hasPunct = true
        }
    }
    if !hasPunct {
        t.Error("did not detect punctuation in \"Building project...\"")
    }
}

func TestTokeniser_Tokenise_Empty(t *testing.T) {
    setup(t)
    tok := NewTokeniser()

    tokens := tok.Tokenise("")
    if len(tokens) != 0 {
        t.Errorf("Tokenise(\"\") returned %d tokens, want 0", len(tokens))
    }
}

func TestTokeniser_MatchVerb_Regular(t *testing.T) {
    setup(t)
    tok := NewTokeniser()
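One deterministic case the new tests leave uncovered is punctuation-only input, which exercises the empty-word path in Tokenise. A possible additional test, sketched under the same setup/NewTokeniser conventions as the tests above (not part of this commit):

func TestTokeniser_Tokenise_PunctuationOnly(t *testing.T) {
    setup(t)
    tok := NewTokeniser()

    // splitTrailingPunct("...") leaves no word portion, so the only
    // token emitted should be the punctuation itself.
    tokens := tok.Tokenise("...")
    if len(tokens) != 1 {
        t.Fatalf("Tokenise(\"...\") returned %d tokens, want 1", len(tokens))
    }
    if tokens[0].Type != TokenPunctuation || tokens[0].PunctType != "progress" {
        t.Errorf("got Type=%v, PunctType=%q; want TokenPunctuation, %q",
            tokens[0].Type, tokens[0].PunctType, "progress")
    }
}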