diff --git a/reversal/imprint_test.go b/reversal/imprint_test.go
index 7f4421c..198e483 100644
--- a/reversal/imprint_test.go
+++ b/reversal/imprint_test.go
@@ -34,6 +34,21 @@ func TestNewImprint(t *testing.T) {
 	}
 }
 
+func TestNewImprint_WordPhrase(t *testing.T) {
+	svc, err := i18n.New()
+	if err != nil {
+		t.Fatalf("i18n.New() failed: %v", err)
+	}
+	i18n.SetDefault(svc)
+
+	tok := NewTokeniser()
+	imp := NewImprint(tok.Tokenise("up to date"))
+
+	if imp.DomainVocabulary["up_to_date"] != 1 {
+		t.Fatalf("DomainVocabulary[\"up_to_date\"] = %d, want 1", imp.DomainVocabulary["up_to_date"])
+	}
+}
+
 func TestNewImprint_Empty(t *testing.T) {
 	imp := NewImprint(nil)
 	if imp.TokenCount != 0 {
diff --git a/reversal/tokeniser.go b/reversal/tokeniser.go
index 34ccd95..802d3fc 100644
--- a/reversal/tokeniser.go
+++ b/reversal/tokeniser.go
@@ -89,6 +89,7 @@ type Tokeniser struct {
 	pluralToBase map[string]string // "files" → "file"
 	baseNouns    map[string]bool   // "file" → true
 	words        map[string]string // word translations
+	phraseLen    int               // longest multi-word gram.word entry
 	lang         string
 
 	dualClass map[string]bool // words in both verb AND noun tables
@@ -491,7 +492,11 @@ func (t *Tokeniser) buildWordIndex() {
 		// Map the key itself (already lowercase)
 		t.words[core.Lower(key)] = key
 		// Map the display form (e.g., "URL" → "url", "SSH" → "ssh")
-		t.words[core.Lower(display)] = key
+		lowerDisplay := core.Lower(display)
+		t.words[lowerDisplay] = key
+		if words := strings.Fields(lowerDisplay); len(words) > 1 && len(words) > t.phraseLen {
+			t.phraseLen = len(words)
+		}
 	}
 }
 
@@ -636,7 +641,17 @@ func (t *Tokeniser) Tokenise(text string) []Token {
 	var tokens []Token
 
 	// --- Pass 1: Classify & Mark ---
-	for _, raw := range parts {
+	for i := 0; i < len(parts); i++ {
+		if consumed, tok, punctTok := t.matchWordPhrase(parts, i); consumed > 0 {
+			tokens = append(tokens, tok)
+			if punctTok != nil {
+				tokens = append(tokens, *punctTok)
+			}
+			i += consumed - 1
+			continue
+		}
+
+		raw := parts[i]
 		if prefix, rest, ok := t.splitFrenchElision(raw); ok {
 			if artType, ok := t.MatchArticle(prefix); ok {
 				tokens = append(tokens, Token{
@@ -729,6 +744,87 @@
 	return tokens
 }
 
+// matchWordPhrase tries to match a multi-word dictionary entry (e.g. "up to
+// date") starting at parts[start], longest match first. It returns the number
+// of parts consumed, the matched word token, and an optional trailing
+// punctuation token; a consumed count of 0 means no phrase matched.
+func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Token) {
+	if t.phraseLen < 2 || start >= len(parts) {
+		return 0, Token{}, nil
+	}
+
+	maxLen := t.phraseLen
+	if remaining := len(parts) - start; remaining < maxLen {
+		maxLen = remaining
+	}
+
+	for n := maxLen; n >= 2; n-- {
+		phraseWords := make([]string, 0, n)
+		rawParts := make([]string, 0, n)
+		var punct string
+		valid := true
+
+		for j := 0; j < n; j++ {
+			part := parts[start+j]
+			if prefix, _, ok := t.splitFrenchElision(part); ok && prefix != part {
+				valid = false
+				break
+			}
+
+			word, partPunct := splitTrailingPunct(part)
+			if word == "" {
+				valid = false
+				break
+			}
+			if partPunct != "" && j != n-1 {
+				valid = false
+				break
+			}
+
+			rawParts = append(rawParts, word)
+			phraseWords = append(phraseWords, core.Lower(word))
+			if j == n-1 {
+				punct = partPunct
+			}
+		}
+
+		if !valid {
+			continue
+		}
+
+		phrase := strings.Join(phraseWords, " ")
+		cat, ok := t.words[phrase]
+		if !ok {
+			continue
+		}
+
+		tok := Token{
+			Raw:        strings.Join(rawParts, " "),
+			Lower:      phrase,
+			Type:       TokenWord,
+			WordCat:    cat,
+			Confidence: 1.0,
+		}
+
+		if punct != "" {
+			if punctType, ok := matchPunctuation(punct); ok {
+				punctTok := Token{
+					Raw:        punct,
+					Lower:      punct,
+					Type:       TokenPunctuation,
+					PunctType:  punctType,
+					Confidence: 1.0,
+				}
+				return n, tok, &punctTok
+			}
+		}
+
+		return n, tok, nil
+	}
+
+	return 0, Token{}, nil
+}
+
 // resolveAmbiguous iterates all tokens and resolves any marked as
 // tokenAmbiguous using the weighted scoring function.
 func (t *Tokeniser) resolveAmbiguous(tokens []Token) {
diff --git a/reversal/tokeniser_test.go b/reversal/tokeniser_test.go
index bbe988e..7b75bab 100644
--- a/reversal/tokeniser_test.go
+++ b/reversal/tokeniser_test.go
@@ -159,6 +159,7 @@ func TestTokeniser_MatchWord(t *testing.T) {
 		{"url", "url", true},
 		{"ID", "id", true},
 		{"SSH", "ssh", true},
+		{"up to date", "up_to_date", true},
 		{"PHP", "php", true},
 		{"xyzzy", "", false},
 	}
@@ -249,6 +250,38 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
 	}
 }
 
+func TestTokeniser_Tokenise_WordPhrase(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("up to date")
+	if len(tokens) != 1 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 1", "up to date", len(tokens))
+	}
+	if tokens[0].Type != TokenWord {
+		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", "up to date", tokens[0].Type)
+	}
+	if tokens[0].WordCat != "up_to_date" {
+		t.Fatalf("Tokenise(%q)[0].WordCat = %q, want %q", "up to date", tokens[0].WordCat, "up_to_date")
+	}
+}
+
+func TestTokeniser_Tokenise_WordPhraseWithPunctuation(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("up to date.")
+	if len(tokens) != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "up to date.", len(tokens))
+	}
+	if tokens[0].Type != TokenWord {
+		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", "up to date.", tokens[0].Type)
+	}
+	if tokens[1].Type != TokenPunctuation {
+		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenPunctuation", "up to date.", tokens[1].Type)
+	}
+}
+
 func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
 	setup(t)
 	tok := NewTokeniserForLang("fr")