[agent/codex:gpt-5.4-mini] Read ~/spec/code/core/go/i18n/RFC.md fully. Find ONE feature... #24
3 changed files with 142 additions and 2 deletions
@@ -34,6 +34,21 @@ func TestNewImprint(t *testing.T) {
 	}
 }
 
+func TestNewImprint_WordPhrase(t *testing.T) {
+	svc, err := i18n.New()
+	if err != nil {
+		t.Fatalf("i18n.New() failed: %v", err)
+	}
+	i18n.SetDefault(svc)
+
+	tok := NewTokeniser()
+	imp := NewImprint(tok.Tokenise("up to date"))
+
+	if imp.DomainVocabulary["up_to_date"] != 1 {
+		t.Fatalf("DomainVocabulary[\"up_to_date\"] = %d, want 1", imp.DomainVocabulary["up_to_date"])
+	}
+}
+
 func TestNewImprint_Empty(t *testing.T) {
 	imp := NewImprint(nil)
 	if imp.TokenCount != 0 {
@@ -89,6 +89,7 @@ type Tokeniser struct {
 	pluralToBase map[string]string // "files" → "file"
 	baseNouns    map[string]bool   // "file" → true
 	words        map[string]string // word translations
+	phraseLen    int               // longest multi-word gram.word entry
 	lang         string
 
 	dualClass map[string]bool // words in both verb AND noun tables
@@ -491,7 +492,11 @@ func (t *Tokeniser) buildWordIndex() {
 		// Map the key itself (already lowercase)
 		t.words[core.Lower(key)] = key
 		// Map the display form (e.g., "URL" → "url", "SSH" → "ssh")
-		t.words[core.Lower(display)] = key
+		lowerDisplay := core.Lower(display)
+		t.words[lowerDisplay] = key
+		if words := strings.Fields(lowerDisplay); len(words) > 1 && len(words) > t.phraseLen {
+			t.phraseLen = len(words)
+		}
 	}
 }
 
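Reviewer note: the hunk above only records the word count of the longest multi-word display form, so later lookups know how many words to try at most. A minimal standalone sketch of that bookkeeping, with a hypothetical word list and strings.ToLower standing in for core.Lower (not the repository's real data or helpers):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical display forms; the real entries come from the gram.word table.
	displays := []string{"URL", "up to date", "out of the box"}

	phraseLen := 0
	for _, d := range displays {
		// Only multi-word entries contribute; keep the largest word count seen.
		if words := strings.Fields(strings.ToLower(d)); len(words) > 1 && len(words) > phraseLen {
			phraseLen = len(words)
		}
	}
	fmt.Println(phraseLen) // 4: "out of the box" is the longest multi-word entry
}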
@@ -636,7 +641,17 @@ func (t *Tokeniser) Tokenise(text string) []Token {
 	var tokens []Token
 
 	// --- Pass 1: Classify & Mark ---
-	for _, raw := range parts {
+	for i := 0; i < len(parts); i++ {
+		if consumed, tok, punctTok := t.matchWordPhrase(parts, i); consumed > 0 {
+			tokens = append(tokens, tok)
+			if punctTok != nil {
+				tokens = append(tokens, *punctTok)
+			}
+			i += consumed - 1
+			continue
+		}
+
+		raw := parts[i]
 		if prefix, rest, ok := t.splitFrenchElision(raw); ok {
 			if artType, ok := t.MatchArticle(prefix); ok {
 				tokens = append(tokens, Token{
@@ -729,6 +744,83 @@ func (t *Tokeniser) Tokenise(text string) []Token {
 	return tokens
 }
 
+func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Token) {
+	if t.phraseLen < 2 || start >= len(parts) {
+		return 0, Token{}, nil
+	}
+
+	maxLen := t.phraseLen
+	if remaining := len(parts) - start; remaining < maxLen {
+		maxLen = remaining
+	}
+
+	for n := maxLen; n >= 2; n-- {
+		phraseWords := make([]string, 0, n)
+		rawParts := make([]string, 0, n)
+		var punct string
+		valid := true
+
+		for j := 0; j < n; j++ {
+			part := parts[start+j]
+			if prefix, _, ok := t.splitFrenchElision(part); ok && prefix != part {
+				valid = false
+				break
+			}
+
+			word, partPunct := splitTrailingPunct(part)
+			if word == "" {
+				valid = false
+				break
+			}
+			if partPunct != "" && j != n-1 {
+				valid = false
+				break
+			}
+
+			rawParts = append(rawParts, word)
+			phraseWords = append(phraseWords, core.Lower(word))
+			if j == n-1 {
+				punct = partPunct
+			}
+		}
+
+		if !valid {
+			continue
+		}
+
+		phrase := strings.Join(phraseWords, " ")
+		cat, ok := t.words[phrase]
+		if !ok {
+			continue
+		}
+
+		tok := Token{
+			Raw:        strings.Join(rawParts, " "),
+			Lower:      phrase,
+			Type:       TokenWord,
+			WordCat:    cat,
+			Confidence: 1.0,
+		}
+
+		if punct != "" {
+			if punctType, ok := matchPunctuation(punct); ok {
+				punctTok := Token{
+					Raw:        punct,
+					Lower:      punct,
+					Type:       TokenPunctuation,
+					PunctType:  punctType,
+					Confidence: 1.0,
+				}
+				return n, tok, &punctTok
+			}
+		}
+
+		return n, tok, nil
+	}
+
+	return 0, Token{}, nil
+}
+
 // resolveAmbiguous iterates all tokens and resolves any marked as
 // tokenAmbiguous using the weighted scoring function.
 func (t *Tokeniser) resolveAmbiguous(tokens []Token) {
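Reviewer note: a simplified, self-contained sketch of the longest-match-first scan that matchWordPhrase performs, using plain strings and a map instead of the tokeniser's Token type, elision and punctuation handling. All names below are illustrative only, not part of the repository:

package main

import (
	"fmt"
	"strings"
)

// matchPhrase tries the longest candidate first, mirroring the n := maxLen; n >= 2; n-- loop above.
func matchPhrase(words map[string]string, phraseLen int, parts []string, start int) (consumed int, key string) {
	maxLen := phraseLen
	if remaining := len(parts) - start; remaining < maxLen {
		maxLen = remaining
	}
	for n := maxLen; n >= 2; n-- {
		phrase := strings.ToLower(strings.Join(parts[start:start+n], " "))
		if key, ok := words[phrase]; ok {
			return n, key
		}
	}
	return 0, ""
}

func main() {
	// Hypothetical index; the real one is built by buildWordIndex.
	words := map[string]string{"up to date": "up_to_date"}
	parts := strings.Fields("everything is up to date")

	for i := 0; i < len(parts); i++ {
		if consumed, key := matchPhrase(words, 3, parts, i); consumed > 0 {
			fmt.Println("phrase:", key) // phrase: up_to_date
			i += consumed - 1           // skip the words the phrase consumed
			continue
		}
		fmt.Println("word:", parts[i])
	}
}

The single-word fallback here is just a Println; in the real Tokenise loop it is the existing per-word classification pass.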
@@ -159,6 +159,7 @@ func TestTokeniser_MatchWord(t *testing.T) {
 		{"url", "url", true},
 		{"ID", "id", true},
 		{"SSH", "ssh", true},
+		{"up to date", "up_to_date", true},
 		{"PHP", "php", true},
 		{"xyzzy", "", false},
 	}
@@ -249,6 +250,38 @@ func TestTokeniser_MatchArticle_FrenchGendered(t *testing.T) {
 	}
 }
 
+func TestTokeniser_Tokenise_WordPhrase(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("up to date")
+	if len(tokens) != 1 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 1", "up to date", len(tokens))
+	}
+	if tokens[0].Type != TokenWord {
+		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", "up to date", tokens[0].Type)
+	}
+	if tokens[0].WordCat != "up_to_date" {
+		t.Fatalf("Tokenise(%q)[0].WordCat = %q, want %q", "up to date", tokens[0].WordCat, "up_to_date")
+	}
+}
+
+func TestTokeniser_Tokenise_WordPhraseWithPunctuation(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+
+	tokens := tok.Tokenise("up to date.")
+	if len(tokens) != 2 {
+		t.Fatalf("Tokenise(%q) returned %d tokens, want 2", "up to date.", len(tokens))
+	}
+	if tokens[0].Type != TokenWord {
+		t.Fatalf("Tokenise(%q)[0].Type = %v, want TokenWord", "up to date.", tokens[0].Type)
+	}
+	if tokens[1].Type != TokenPunctuation {
+		t.Fatalf("Tokenise(%q)[1].Type = %v, want TokenPunctuation", "up to date.", tokens[1].Type)
+	}
+}
+
 func TestTokeniser_MatchArticle_FrenchExtended(t *testing.T) {
 	setup(t)
 	tok := NewTokeniserForLang("fr")