1401 lines
40 KiB
Go
1401 lines
40 KiB
Go
// Package reversal provides reverse grammar lookups.
//
// The forward engine (go-i18n) maps base forms to inflected forms:
//
//	PastTense("delete") → "deleted"
//	Gerund("run")       → "running"
//
// The reversal engine reads those same tables backwards, turning
// inflected forms back into base forms with tense metadata:
//
//	MatchVerb("deleted") → {Base: "delete", Tense: "past"}
//	MatchVerb("running") → {Base: "run", Tense: "gerund"}
//
// Lookups use three tiers: JSON grammar data → irregular verb maps → regular
// morphology rules (verified by round-tripping through forward functions).
|
||
package reversal
|
||
|
||
import (
|
||
"strings"
|
||
"unicode/utf8"
|
||
|
||
"dappco.re/go/core"
|
||
i18n "dappco.re/go/core/i18n"
|
||
)
|
||
|
||
// frenchElisionPrefixes lists the lowercase stems that take an apostrophe
// when elided in French (l', d', j', m', t', s', n', c', qu').
// NOTE(review): presumably consumed by splitFrenchElision — confirm.
var frenchElisionPrefixes = []string{
	"l", "d", "j", "m", "t", "s", "n", "c",
	"qu",
}
|
||
|
||
// VerbMatch is the outcome of resolving a verb form back to its base form.
type VerbMatch struct {
	// Base is the base form of the verb, e.g. "delete" or "run".
	Base string
	// Tense is one of "past", "gerund" or "base".
	Tense string
	// Form is the form that was looked up (MatchVerb stores the trimmed,
	// lowercased input here).
	Form string
}
|
||
|
||
// NounMatch is the outcome of resolving a noun form back to its singular.
type NounMatch struct {
	// Base is the base (singular) form of the noun.
	Base string
	// Plural reports whether the matched form was a plural.
	Plural bool
	// Form is the form that was looked up (MatchNoun stores the trimmed,
	// lowercased input here).
	Form string
}
|
||
|
||
// TokenType identifies the class assigned to a token during tokenisation.
type TokenType int

// Token classes, numbered from zero in declaration order.
const (
	// TokenUnknown marks an unrecognised word.
	TokenUnknown TokenType = iota
	// TokenVerb marks a matched verb (see Token.VerbInfo).
	TokenVerb
	// TokenNoun marks a matched noun (see Token.NounInfo).
	TokenNoun
	// TokenArticle marks a matched article ("a", "an", "the").
	TokenArticle
	// TokenWord marks a word found in the grammar word map.
	TokenWord
	// TokenPunctuation marks punctuation ("...", "?").
	TokenPunctuation
)
|
||
|
||
// Token represents a single classified token from a text string.
|
||
type Token struct {
|
||
Raw string // Original text as it appeared in input
|
||
Lower string // Lowercased form
|
||
Type TokenType // Classification
|
||
Confidence float64 // 0.0-1.0 classification confidence
|
||
AltType TokenType // Runner-up classification (dual-class only)
|
||
AltConf float64 // Runner-up confidence
|
||
VerbInfo VerbMatch // Set when Type OR AltType == TokenVerb
|
||
NounInfo NounMatch // Set when Type OR AltType == TokenNoun
|
||
WordCat string // Set when Type == TokenWord
|
||
ArtType string // Set when Type == TokenArticle
|
||
PunctType string // Set when Type == TokenPunctuation
|
||
Signals *SignalBreakdown // Non-nil only when WithSignals() option is set
|
||
}
|
||
|
||
// SignalBreakdown provides detailed scoring for dual-class disambiguation.
|
||
type SignalBreakdown struct {
|
||
VerbScore float64
|
||
NounScore float64
|
||
Components []SignalComponent
|
||
}
|
||
|
||
// SignalComponent records what one signal contributed to a disambiguation.
type SignalComponent struct {
	// Name is the signal key, e.g. "noun_determiner" or "verb_auxiliary".
	Name string
	// Weight is the signal weight (0.0-1.0).
	Weight float64
	// Value is the signal firing strength (0.0-1.0).
	Value float64
	// Contrib is weight x value.
	Contrib float64
	// Reason is a human-readable explanation, e.g. "preceded by 'the'".
	Reason string
}
|
||
|
||
// Tokeniser performs reverse grammar lookups using inverse indexes that are
// built from the forward grammar tables.
type Tokeniser struct {
	// pastToBase maps a past form to its base, e.g. "deleted" → "delete".
	pastToBase map[string]string
	// gerundToBase maps a gerund to its base, e.g. "deleting" → "delete".
	gerundToBase map[string]string
	// baseVerbs is the set of known base verbs, e.g. "delete".
	baseVerbs map[string]bool
	// pluralToBase maps a plural to its singular, e.g. "files" → "file".
	pluralToBase map[string]string
	// baseNouns is the set of known base nouns, e.g. "file".
	baseNouns map[string]bool
	// words holds the word translations index.
	words map[string]string
	// phraseLen is the word count of the longest multi-word gram.word entry.
	phraseLen int
	// lang is the language code the indexes were built for.
	lang string

	// dualClass is the set of words present in both verb and noun tables.
	dualClass map[string]bool
	// nounDet is the noun-determiner signal set.
	nounDet map[string]bool
	// verbAux is the verb-auxiliary signal set.
	verbAux map[string]bool
	// verbInf is the infinitive-marker signal set.
	verbInf map[string]bool
	// verbNeg is the negation-cue signal set.
	verbNeg map[string]bool
	// withSignals requests a SignalBreakdown on ambiguous tokens.
	withSignals bool
	// weights holds the signal weights (F3: configurable).
	weights map[string]float64
}
|
||
|
||
// TokeniserOption configures a Tokeniser.
|
||
type TokeniserOption func(*Tokeniser)
|
||
|
||
// WithSignals enables detailed SignalBreakdown on ambiguous tokens.
|
||
func WithSignals() TokeniserOption {
|
||
return func(t *Tokeniser) { t.withSignals = true }
|
||
}
|
||
|
||
// WithWeights overrides the default signal weights for disambiguation.
|
||
// All signal keys must be present; omitted keys silently disable those signals.
|
||
func WithWeights(w map[string]float64) TokeniserOption {
|
||
return func(t *Tokeniser) { t.weights = w }
|
||
}
|
||
|
||
// NewTokeniser creates a Tokeniser for English ("en").
|
||
func NewTokeniser(opts ...TokeniserOption) *Tokeniser {
|
||
return NewTokeniserForLang("en", opts...)
|
||
}
|
||
|
||
// NewTokeniserForLang creates a Tokeniser for the specified language,
|
||
// building inverse indexes from the grammar data.
|
||
func NewTokeniserForLang(lang string, opts ...TokeniserOption) *Tokeniser {
|
||
t := &Tokeniser{
|
||
pastToBase: make(map[string]string),
|
||
gerundToBase: make(map[string]string),
|
||
baseVerbs: make(map[string]bool),
|
||
pluralToBase: make(map[string]string),
|
||
baseNouns: make(map[string]bool),
|
||
words: make(map[string]string),
|
||
lang: lang,
|
||
}
|
||
for _, opt := range opts {
|
||
opt(t)
|
||
}
|
||
t.buildVerbIndex()
|
||
t.buildNounIndex()
|
||
t.buildWordIndex()
|
||
t.buildDualClassIndex()
|
||
t.buildSignalIndex()
|
||
if t.weights == nil {
|
||
t.weights = defaultWeights()
|
||
}
|
||
return t
|
||
}
|
||
|
||
// buildVerbIndex reads grammar tables and irregular verb maps to build
|
||
// inverse lookup maps: inflected form → base form.
|
||
func (t *Tokeniser) buildVerbIndex() {
|
||
// Tier 1: Read from JSON grammar data (via GetGrammarData).
|
||
data := i18n.GetGrammarData(t.lang)
|
||
if data != nil && data.Verbs != nil {
|
||
for base, forms := range data.Verbs {
|
||
t.baseVerbs[base] = true
|
||
if forms.Past != "" {
|
||
t.pastToBase[forms.Past] = base
|
||
}
|
||
if forms.Gerund != "" {
|
||
t.gerundToBase[forms.Gerund] = base
|
||
}
|
||
}
|
||
}
|
||
|
||
// Tier 2: Read from the exported irregularVerbs map.
|
||
// Build inverse maps directly from the authoritative source.
|
||
for base, forms := range i18n.IrregularVerbs() {
|
||
t.baseVerbs[base] = true
|
||
if forms.Past != "" {
|
||
if _, exists := t.pastToBase[forms.Past]; !exists {
|
||
t.pastToBase[forms.Past] = base
|
||
}
|
||
}
|
||
if forms.Gerund != "" {
|
||
if _, exists := t.gerundToBase[forms.Gerund]; !exists {
|
||
t.gerundToBase[forms.Gerund] = base
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// buildNounIndex reads grammar tables and irregular noun maps to build
|
||
// inverse lookup maps: plural form → base form.
|
||
func (t *Tokeniser) buildNounIndex() {
|
||
// Tier 1: Read from JSON grammar data (via GetGrammarData).
|
||
data := i18n.GetGrammarData(t.lang)
|
||
if data != nil && data.Nouns != nil {
|
||
for base, forms := range data.Nouns {
|
||
t.baseNouns[base] = true
|
||
if forms.Other != "" && forms.Other != base {
|
||
t.pluralToBase[forms.Other] = base
|
||
}
|
||
}
|
||
}
|
||
|
||
// Tier 2: Read from the exported irregularNouns map.
|
||
for base, plural := range i18n.IrregularNouns() {
|
||
t.baseNouns[base] = true
|
||
if plural != base {
|
||
if _, exists := t.pluralToBase[plural]; !exists {
|
||
t.pluralToBase[plural] = base
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// MatchNoun performs a 3-tier reverse lookup for a noun form.
|
||
//
|
||
// Tier 1: Check if the word is a known base noun.
|
||
// Tier 2: Check the pluralToBase inverse map.
|
||
// Tier 3: Try reverse morphology rules and round-trip verify via
|
||
// the forward function PluralForm().
|
||
func (t *Tokeniser) MatchNoun(word string) (NounMatch, bool) {
|
||
word = core.Lower(core.Trim(word))
|
||
if word == "" {
|
||
return NounMatch{}, false
|
||
}
|
||
|
||
// Tier 1: Is it a base noun?
|
||
if t.baseNouns[word] {
|
||
return NounMatch{Base: word, Plural: false, Form: word}, true
|
||
}
|
||
|
||
// Tier 2: Check inverse map from grammar tables + irregular nouns.
|
||
if base, ok := t.pluralToBase[word]; ok {
|
||
return NounMatch{Base: base, Plural: true, Form: word}, true
|
||
}
|
||
|
||
// Tier 3: Reverse morphology with round-trip verification.
|
||
candidates := t.reverseRegularPlural(word)
|
||
for _, c := range candidates {
|
||
if i18n.PluralForm(c) == word {
|
||
return NounMatch{Base: c, Plural: true, Form: word}, true
|
||
}
|
||
}
|
||
|
||
return NounMatch{}, false
|
||
}
|
||
|
||
// reverseRegularPlural generates candidate base forms by reversing regular
|
||
// plural suffixes. Returns multiple candidates ordered by likelihood.
|
||
//
|
||
// The forward engine applies rules in this order:
|
||
// 1. ends in s/ss/sh/ch/x/z → +es
|
||
// 2. ends in consonant+y → ies
|
||
// 3. ends in f → ves, fe → ves
|
||
// 4. default → +s
|
||
//
|
||
// We generate candidates for each possible reverse rule. Round-trip
|
||
// verification ensures only correct candidates pass.
|
||
func (t *Tokeniser) reverseRegularPlural(word string) []string {
|
||
var candidates []string
|
||
|
||
// Rule: consonant + "ies" → consonant + "y" (e.g., "entries" → "entry")
|
||
if core.HasSuffix(word, "ies") && len(word) > 3 {
|
||
base := word[:len(word)-3] + "y"
|
||
candidates = append(candidates, base)
|
||
}
|
||
|
||
// Rule: "ves" → "f" or "fe" (e.g., "wolves" → "wolf", "knives" → "knife")
|
||
if core.HasSuffix(word, "ves") && len(word) > 3 {
|
||
candidates = append(candidates, word[:len(word)-3]+"f")
|
||
candidates = append(candidates, word[:len(word)-3]+"fe")
|
||
}
|
||
|
||
// Rule: sibilant + "es" (e.g., "processes" → "process", "branches" → "branch")
|
||
if core.HasSuffix(word, "ses") || core.HasSuffix(word, "xes") ||
|
||
core.HasSuffix(word, "zes") || core.HasSuffix(word, "ches") ||
|
||
core.HasSuffix(word, "shes") {
|
||
base := word[:len(word)-2] // strip "es"
|
||
candidates = append(candidates, base)
|
||
}
|
||
|
||
// Rule: drop "s" (e.g., "servers" → "server")
|
||
if core.HasSuffix(word, "s") && len(word) > 1 {
|
||
base := word[:len(word)-1]
|
||
candidates = append(candidates, base)
|
||
}
|
||
|
||
return candidates
|
||
}
|
||
|
||
// MatchVerb performs a 3-tier reverse lookup for a verb form.
|
||
//
|
||
// Tier 1: Check if the word is a known base verb.
|
||
// Tier 2: Check the pastToBase and gerundToBase inverse maps.
|
||
// Tier 3: Try reverse morphology rules and round-trip verify via
|
||
// the forward functions PastTense() and Gerund().
|
||
func (t *Tokeniser) MatchVerb(word string) (VerbMatch, bool) {
|
||
word = core.Lower(core.Trim(word))
|
||
if word == "" {
|
||
return VerbMatch{}, false
|
||
}
|
||
|
||
// Tier 1: Is it a base verb?
|
||
if t.baseVerbs[word] {
|
||
return VerbMatch{Base: word, Tense: "base", Form: word}, true
|
||
}
|
||
|
||
// Tier 2: Check inverse maps from grammar tables + irregular verbs.
|
||
if base, ok := t.pastToBase[word]; ok {
|
||
return VerbMatch{Base: base, Tense: "past", Form: word}, true
|
||
}
|
||
if base, ok := t.gerundToBase[word]; ok {
|
||
return VerbMatch{Base: base, Tense: "gerund", Form: word}, true
|
||
}
|
||
|
||
// Tier 3: Reverse morphology with round-trip verification.
|
||
// Try past tense candidates.
|
||
if base := t.bestRoundTrip(word, t.reverseRegularPast(word), i18n.PastTense); base != "" {
|
||
return VerbMatch{Base: base, Tense: "past", Form: word}, true
|
||
}
|
||
|
||
// Try gerund candidates.
|
||
if base := t.bestRoundTrip(word, t.reverseRegularGerund(word), i18n.Gerund); base != "" {
|
||
return VerbMatch{Base: base, Tense: "gerund", Form: word}, true
|
||
}
|
||
|
||
return VerbMatch{}, false
|
||
}
|
||
|
||
// bestRoundTrip selects the best candidate from a list by round-tripping
|
||
// each through a forward function. When multiple candidates round-trip
|
||
// successfully (ambiguity), it uses the following priority:
|
||
// 1. Candidates that are known base verbs (in grammar tables / irregular maps)
|
||
// 2. Candidates ending in a VCe pattern (vowel-consonant-e, the "magic e"
|
||
// pattern common in real English verbs like "delete", "create", "use").
|
||
// This avoids phantom verbs like "walke" or "processe" which have a
|
||
// CCe pattern (consonant-consonant-e) that doesn't occur naturally.
|
||
// 3. Candidates NOT ending in "e" (the default morphology path)
|
||
// 4. First match in candidate order as final tiebreaker
|
||
func (t *Tokeniser) bestRoundTrip(target string, candidates []string, forward func(string) string) string {
|
||
var matches []string
|
||
for _, c := range candidates {
|
||
if forward(c) == target {
|
||
matches = append(matches, c)
|
||
}
|
||
}
|
||
if len(matches) == 0 {
|
||
return ""
|
||
}
|
||
if len(matches) == 1 {
|
||
return matches[0]
|
||
}
|
||
|
||
// Priority 1: known base verb
|
||
for _, m := range matches {
|
||
if t.baseVerbs[m] {
|
||
return m
|
||
}
|
||
}
|
||
|
||
// Priority 2: prefer VCe-ending candidate (real English verb pattern)
|
||
for _, m := range matches {
|
||
if hasVCeEnding(m) {
|
||
return m
|
||
}
|
||
}
|
||
|
||
// Priority 3: prefer candidate not ending in "e" (avoids phantom verbs
|
||
// with CCe endings like "walke", "processe")
|
||
for _, m := range matches {
|
||
if !core.HasSuffix(m, "e") {
|
||
return m
|
||
}
|
||
}
|
||
|
||
return matches[0]
|
||
}
|
||
|
||
// hasVCeEnding reports whether word ends in vowel-consonant-e, the "magic e"
// shape of verbs such as "delete" (-ete), "create" (-ate), "use" (-use) and
// "close" (-ose). Phantom verbs from naive suffix stripping, such as "walke"
// (-lke) or "processe" (-sse), end in consonant-consonant-e and yield false.
// The check is byte-based, so words shorter than three bytes yield false.
func hasVCeEnding(word string) bool {
	n := len(word)
	if n < 3 {
		return false
	}
	if word[n-1] != 'e' {
		return false
	}
	penultimate, antepenultimate := word[n-2], word[n-3]
	if isVowelByte(penultimate) {
		return false
	}
	return isVowelByte(antepenultimate)
}

// isVowelByte reports whether b is one of the lowercase ASCII vowels
// a, e, i, o or u. Uppercase letters and 'y' are not treated as vowels.
func isVowelByte(b byte) bool {
	return b == 'a' || b == 'e' || b == 'i' || b == 'o' || b == 'u'
}
|
||
|
||
// reverseRegularPast generates candidate base forms by reversing regular
|
||
// past tense suffixes. Returns multiple candidates ordered by likelihood.
|
||
//
|
||
// The forward engine applies rules in this order:
|
||
// 1. ends in "e" → +d (create → created)
|
||
// 2. ends in "y" + consonant → ied (copy → copied)
|
||
// 3. shouldDoubleConsonant → double+ed (stop → stopped)
|
||
// 4. default → +ed (walk → walked)
|
||
//
|
||
// We generate candidates for each possible reverse rule. Round-trip
|
||
// verification (in bestRoundTrip) ensures only correct candidates pass.
|
||
func (t *Tokeniser) reverseRegularPast(word string) []string {
|
||
var candidates []string
|
||
|
||
if !core.HasSuffix(word, "ed") {
|
||
return candidates
|
||
}
|
||
|
||
// Rule: consonant + "ied" → consonant + "y" (e.g., "copied" → "copy")
|
||
if core.HasSuffix(word, "ied") && len(word) > 3 {
|
||
base := word[:len(word)-3] + "y"
|
||
candidates = append(candidates, base)
|
||
}
|
||
|
||
// Rule: doubled consonant + "ed" → single consonant (e.g., "stopped" → "stop")
|
||
if len(word) > 4 {
|
||
beforeEd := word[:len(word)-2]
|
||
lastChar := beforeEd[len(beforeEd)-1]
|
||
if len(beforeEd) >= 2 && beforeEd[len(beforeEd)-2] == lastChar {
|
||
base := beforeEd[:len(beforeEd)-1]
|
||
candidates = append(candidates, base)
|
||
}
|
||
}
|
||
|
||
// Rule: stem + "d" where stem ends in "e" (e.g., "created" → "create")
|
||
if len(word) > 2 {
|
||
stemPlusE := word[:len(word)-1] // strip "d", leaving stem + "e"
|
||
candidates = append(candidates, stemPlusE)
|
||
}
|
||
|
||
// Rule: stem + "ed" (e.g., "walked" → "walk")
|
||
if len(word) > 2 {
|
||
stem := word[:len(word)-2]
|
||
candidates = append(candidates, stem)
|
||
}
|
||
|
||
return candidates
|
||
}
|
||
|
||
// reverseRegularGerund generates candidate base forms by reversing regular
|
||
// gerund suffixes. Returns multiple candidates ordered by likelihood.
|
||
//
|
||
// Rules reversed:
|
||
// - verb + "ing" (e.g., "walking" → "walk")
|
||
// - verb[:-1] + "ing" (e.g., "creating" → "create", drop e)
|
||
// - doubled consonant (e.g., "stopping" → "stop")
|
||
// - verb[:-2] + "ying" (e.g., "dying" → "die")
|
||
func (t *Tokeniser) reverseRegularGerund(word string) []string {
|
||
var candidates []string
|
||
|
||
if !core.HasSuffix(word, "ing") || len(word) < 4 {
|
||
return candidates
|
||
}
|
||
|
||
stem := word[:len(word)-3] // strip "ing"
|
||
|
||
// Rule: "ying" → "ie" (e.g., "dying" → "die")
|
||
if core.HasSuffix(word, "ying") && len(word) > 4 {
|
||
base := word[:len(word)-4] + "ie"
|
||
candidates = append(candidates, base)
|
||
}
|
||
|
||
// Rule: doubled consonant + "ing" → single consonant (e.g., "stopping" → "stop")
|
||
if len(stem) >= 2 && stem[len(stem)-1] == stem[len(stem)-2] {
|
||
base := stem[:len(stem)-1]
|
||
candidates = append(candidates, base)
|
||
}
|
||
|
||
// Rule: direct strip "ing" (e.g., "walking" → "walk")
|
||
// This must come before the stem+"e" rule to avoid false positives
|
||
// like "walke" round-tripping through Gerund("walke") = "walking".
|
||
candidates = append(candidates, stem)
|
||
|
||
// Rule: stem + "e" was dropped before "ing" (e.g., "creating" → "create")
|
||
// Try adding "e" back.
|
||
candidates = append(candidates, stem+"e")
|
||
|
||
return candidates
|
||
}
|
||
|
||
// buildWordIndex reads GrammarData.Words and builds a reverse lookup map.
|
||
// Both the key (e.g., "url") and the display form (e.g., "URL") map back
|
||
// to the key, enabling case-insensitive lookups.
|
||
func (t *Tokeniser) buildWordIndex() {
|
||
data := i18n.GetGrammarData(t.lang)
|
||
if data == nil || data.Words == nil {
|
||
return
|
||
}
|
||
for key, display := range data.Words {
|
||
// Map the key itself (already lowercase)
|
||
t.words[core.Lower(key)] = key
|
||
// Map the display form (e.g., "URL" → "url", "SSH" → "ssh")
|
||
lowerDisplay := core.Lower(display)
|
||
t.words[lowerDisplay] = key
|
||
if words := strings.Fields(lowerDisplay); len(words) > 1 && len(words) > t.phraseLen {
|
||
t.phraseLen = len(words)
|
||
}
|
||
}
|
||
}
|
||
|
||
// IsDualClass returns true if the word exists in both verb and noun tables.
|
||
func (t *Tokeniser) IsDualClass(word string) bool {
|
||
return t.dualClass[core.Lower(word)]
|
||
}
|
||
|
||
func (t *Tokeniser) buildDualClassIndex() {
|
||
t.dualClass = make(map[string]bool)
|
||
for base := range t.baseVerbs {
|
||
if t.baseNouns[base] {
|
||
t.dualClass[base] = true
|
||
}
|
||
}
|
||
}
|
||
|
||
func (t *Tokeniser) buildSignalIndex() {
|
||
t.nounDet = make(map[string]bool)
|
||
t.verbAux = make(map[string]bool)
|
||
t.verbInf = make(map[string]bool)
|
||
t.verbNeg = make(map[string]bool)
|
||
|
||
data := i18n.GetGrammarData(t.lang)
|
||
|
||
// Guard each signal list independently so partial locale data
|
||
// falls back per-field rather than silently disabling signals.
|
||
if data != nil && len(data.Signals.NounDeterminers) > 0 {
|
||
for _, w := range data.Signals.NounDeterminers {
|
||
t.nounDet[core.Lower(w)] = true
|
||
}
|
||
} else {
|
||
for _, w := range []string{
|
||
"the", "a", "an", "this", "that", "these", "those",
|
||
"my", "your", "his", "her", "its", "our", "their",
|
||
"every", "each", "some", "any", "no",
|
||
"many", "few", "several", "all", "both",
|
||
} {
|
||
t.nounDet[w] = true
|
||
}
|
||
}
|
||
|
||
if data != nil && len(data.Signals.VerbAuxiliaries) > 0 {
|
||
for _, w := range data.Signals.VerbAuxiliaries {
|
||
t.verbAux[core.Lower(w)] = true
|
||
}
|
||
} else {
|
||
for _, w := range defaultVerbAuxiliaries() {
|
||
t.verbAux[w] = true
|
||
}
|
||
}
|
||
|
||
if data != nil && len(data.Signals.VerbInfinitive) > 0 {
|
||
for _, w := range data.Signals.VerbInfinitive {
|
||
t.verbInf[core.Lower(w)] = true
|
||
}
|
||
} else {
|
||
t.verbInf["to"] = true
|
||
}
|
||
|
||
if data != nil && len(data.Signals.VerbNegation) > 0 {
|
||
for _, w := range data.Signals.VerbNegation {
|
||
t.verbNeg[core.Lower(w)] = true
|
||
}
|
||
} else {
|
||
// Keep the fallback conservative: these are weak cues, not hard
|
||
// negation parsing.
|
||
for _, w := range []string{"not", "never"} {
|
||
t.verbNeg[w] = true
|
||
}
|
||
}
|
||
}
|
||
|
||
// defaultVerbAuxiliaries returns a fresh slice holding the built-in English
// auxiliary verbs: forms of be/have/do, the modals, and their negative
// contractions.
func defaultVerbAuxiliaries() []string {
	be := []string{"am", "is", "are", "was", "were"}
	have := []string{"has", "had", "have"}
	do := []string{"do", "does", "did"}
	modals := []string{
		"will", "would", "could", "should",
		"can", "may", "might", "shall", "must",
	}
	contractions := []string{
		"don't", "can't", "won't", "shouldn't", "couldn't", "wouldn't",
		"doesn't", "didn't", "isn't", "aren't", "wasn't", "weren't",
		"hasn't", "hadn't", "haven't",
	}

	all := make([]string, 0, len(be)+len(have)+len(do)+len(modals)+len(contractions))
	all = append(all, be...)
	all = append(all, have...)
	all = append(all, do...)
	all = append(all, modals...)
	all = append(all, contractions...)
	return all
}
|
||
|
||
// defaultWeights returns a fresh map of the built-in signal weights, keyed
// by signal name.
func defaultWeights() map[string]float64 {
	weights := make(map[string]float64, 8)
	weights["noun_determiner"] = 0.35
	weights["verb_auxiliary"] = 0.25
	weights["verb_negation"] = 0.05
	weights["following_class"] = 0.15
	weights["sentence_position"] = 0.10
	weights["verb_saturation"] = 0.10
	weights["inflection_echo"] = 0.03
	weights["default_prior"] = 0.02
	return weights
}
|
||
|
||
// MatchWord performs a case-insensitive lookup in the words map.
|
||
// Returns the category key and true if found, or ("", false) otherwise.
|
||
func (t *Tokeniser) MatchWord(word string) (string, bool) {
|
||
cat, ok := t.words[core.Lower(word)]
|
||
return cat, ok
|
||
}
|
||
|
||
// MatchArticle checks whether a word is an article (definite or indefinite).
|
||
// Returns the article type ("indefinite" or "definite") and true if matched,
|
||
// or ("", false) otherwise.
|
||
func (t *Tokeniser) MatchArticle(word string) (string, bool) {
|
||
data := i18n.GetGrammarData(t.lang)
|
||
if data == nil {
|
||
return "", false
|
||
}
|
||
|
||
lower := core.Lower(word)
|
||
|
||
if lower == core.Lower(data.Articles.IndefiniteDefault) ||
|
||
lower == core.Lower(data.Articles.IndefiniteVowel) {
|
||
return "indefinite", true
|
||
}
|
||
if lower == core.Lower(data.Articles.Definite) {
|
||
return "definite", true
|
||
}
|
||
for _, article := range data.Articles.ByGender {
|
||
if lower == core.Lower(article) {
|
||
return "definite", true
|
||
}
|
||
}
|
||
if t.isFrenchLanguage() {
|
||
switch lower {
|
||
case "l'", "l’", "d'", "d’", "j'", "j’", "m'", "m’", "t'", "t’", "s'", "s’", "n'", "n’", "c'", "c’", "qu'", "qu’", "de l'", "de l’", "les", "au", "aux", "du":
|
||
return "definite", true
|
||
case "un", "une", "des":
|
||
return "indefinite", true
|
||
}
|
||
}
|
||
|
||
return "", false
|
||
}
|
||
|
||
// tokenAmbiguous is an internal sentinel used during Pass 1 to mark
|
||
// dual-class base forms that need disambiguation in Pass 2.
|
||
const tokenAmbiguous TokenType = -1
|
||
|
||
// clauseBoundaries is the set of words treated as clause delimiters by the
// verb_saturation signal (D2 review fix).
var clauseBoundaries = map[string]bool{
	"and":     true,
	"or":      true,
	"but":     true,
	"because": true,
	"when":    true,
	"while":   true,
	"if":      true,
	"then":    true,
	"so":      true,
}
|
||
|
||
// Tokenise splits text on whitespace and classifies each word using a
|
||
// two-pass algorithm:
|
||
//
|
||
// Pass 1 classifies unambiguous tokens and marks dual-class base forms.
|
||
// Pass 2 resolves ambiguous tokens using weighted disambiguation signals.
|
||
func (t *Tokeniser) Tokenise(text string) []Token {
|
||
text = core.Trim(text)
|
||
if text == "" {
|
||
return nil
|
||
}
|
||
|
||
parts := strings.Fields(text)
|
||
var tokens []Token
|
||
|
||
// --- Pass 1: Classify & Mark ---
|
||
for i := 0; i < len(parts); i++ {
|
||
if consumed, tok, punctTok := t.matchWordPhrase(parts, i); consumed > 0 {
|
||
tokens = append(tokens, tok)
|
||
if punctTok != nil {
|
||
tokens = append(tokens, *punctTok)
|
||
}
|
||
i += consumed - 1
|
||
continue
|
||
}
|
||
if consumed, tok, extraTok, punctTok := t.matchFrenchArticlePhrase(parts, i); consumed > 0 {
|
||
tokens = append(tokens, tok)
|
||
if extraTok != nil {
|
||
tokens = append(tokens, *extraTok)
|
||
}
|
||
if punctTok != nil {
|
||
tokens = append(tokens, *punctTok)
|
||
}
|
||
i += consumed - 1
|
||
continue
|
||
}
|
||
|
||
raw := parts[i]
|
||
if prefix, rest, ok := t.splitFrenchElision(raw); ok {
|
||
if artType, ok := t.MatchArticle(prefix); ok {
|
||
tokens = append(tokens, Token{
|
||
Raw: prefix,
|
||
Lower: core.Lower(prefix),
|
||
Type: TokenArticle,
|
||
ArtType: artType,
|
||
Confidence: 1.0,
|
||
})
|
||
}
|
||
raw = rest
|
||
if raw == "" {
|
||
continue
|
||
}
|
||
}
|
||
|
||
// Strip trailing punctuation to get the clean word.
|
||
word, punct := splitTrailingPunct(raw)
|
||
|
||
// Classify the word portion (if any).
|
||
if word != "" {
|
||
tok := Token{Raw: raw, Lower: core.Lower(word)}
|
||
|
||
if artType, ok := t.MatchArticle(word); ok {
|
||
// Articles are unambiguous.
|
||
tok.Type = TokenArticle
|
||
tok.ArtType = artType
|
||
tok.Confidence = 1.0
|
||
} else {
|
||
// For non-articles, check BOTH verb and noun.
|
||
vm, verbOK := t.MatchVerb(word)
|
||
nm, nounOK := t.MatchNoun(word)
|
||
|
||
if verbOK && nounOK && t.dualClass[tok.Lower] {
|
||
// Dual-class word: check for self-resolving inflections.
|
||
if vm.Tense != "base" {
|
||
// Inflected verb form self-resolves.
|
||
tok.Type = TokenVerb
|
||
tok.VerbInfo = vm
|
||
tok.NounInfo = nm
|
||
tok.Confidence = 1.0
|
||
} else if nm.Plural {
|
||
// Inflected noun form self-resolves.
|
||
tok.Type = TokenNoun
|
||
tok.VerbInfo = vm
|
||
tok.NounInfo = nm
|
||
tok.Confidence = 1.0
|
||
} else {
|
||
// Base form: ambiguous, stash both and defer to Pass 2.
|
||
tok.Type = tokenAmbiguous
|
||
tok.VerbInfo = vm
|
||
tok.NounInfo = nm
|
||
}
|
||
} else if verbOK {
|
||
tok.Type = TokenVerb
|
||
tok.VerbInfo = vm
|
||
tok.Confidence = 1.0
|
||
} else if nounOK {
|
||
tok.Type = TokenNoun
|
||
tok.NounInfo = nm
|
||
tok.Confidence = 1.0
|
||
} else if cat, ok := t.MatchWord(word); ok {
|
||
tok.Type = TokenWord
|
||
tok.WordCat = cat
|
||
tok.Confidence = 1.0
|
||
} else {
|
||
tok.Type = TokenUnknown
|
||
}
|
||
}
|
||
tokens = append(tokens, tok)
|
||
}
|
||
|
||
// Emit a punctuation token if trailing punctuation was found.
|
||
if punct != "" {
|
||
if punctType, ok := matchPunctuation(punct); ok {
|
||
tokens = append(tokens, Token{
|
||
Raw: punct,
|
||
Lower: punct,
|
||
Type: TokenPunctuation,
|
||
PunctType: punctType,
|
||
Confidence: 1.0,
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// --- Pass 2: Resolve Ambiguous ---
|
||
t.resolveAmbiguous(tokens)
|
||
|
||
return tokens
|
||
}
|
||
|
||
func (t *Tokeniser) matchWordPhrase(parts []string, start int) (int, Token, *Token) {
|
||
if t.phraseLen < 2 || start >= len(parts) {
|
||
return 0, Token{}, nil
|
||
}
|
||
|
||
maxLen := t.phraseLen
|
||
if remaining := len(parts) - start; remaining < maxLen {
|
||
maxLen = remaining
|
||
}
|
||
|
||
for n := maxLen; n >= 2; n-- {
|
||
phraseWords := make([]string, 0, n)
|
||
rawParts := make([]string, 0, n)
|
||
var punct string
|
||
valid := true
|
||
|
||
for j := 0; j < n; j++ {
|
||
part := parts[start+j]
|
||
if prefix, _, ok := t.splitFrenchElision(part); ok && prefix != part {
|
||
valid = false
|
||
break
|
||
}
|
||
|
||
word, partPunct := splitTrailingPunct(part)
|
||
if word == "" {
|
||
valid = false
|
||
break
|
||
}
|
||
if partPunct != "" && j != n-1 {
|
||
valid = false
|
||
break
|
||
}
|
||
|
||
rawParts = append(rawParts, word)
|
||
phraseWords = append(phraseWords, core.Lower(word))
|
||
if j == n-1 {
|
||
punct = partPunct
|
||
}
|
||
}
|
||
|
||
if !valid {
|
||
continue
|
||
}
|
||
|
||
phrase := strings.Join(phraseWords, " ")
|
||
cat, ok := t.words[phrase]
|
||
if !ok {
|
||
continue
|
||
}
|
||
|
||
tok := Token{
|
||
Raw: strings.Join(rawParts, " "),
|
||
Lower: phrase,
|
||
Type: TokenWord,
|
||
WordCat: cat,
|
||
Confidence: 1.0,
|
||
}
|
||
|
||
if punct != "" {
|
||
if punctType, ok := matchPunctuation(punct); ok {
|
||
punctTok := Token{
|
||
Raw: punct,
|
||
Lower: punct,
|
||
Type: TokenPunctuation,
|
||
PunctType: punctType,
|
||
Confidence: 1.0,
|
||
}
|
||
return n, tok, &punctTok
|
||
}
|
||
}
|
||
|
||
return n, tok, nil
|
||
}
|
||
|
||
return 0, Token{}, nil
|
||
}
|
||
|
||
func (t *Tokeniser) matchFrenchArticlePhrase(parts []string, start int) (int, Token, *Token, *Token) {
|
||
if !t.isFrenchLanguage() || start+1 >= len(parts) {
|
||
return 0, Token{}, nil, nil
|
||
}
|
||
|
||
first, firstPunct := splitTrailingPunct(parts[start])
|
||
if first == "" || firstPunct != "" {
|
||
return 0, Token{}, nil, nil
|
||
}
|
||
second, secondPunct := splitTrailingPunct(parts[start+1])
|
||
if second == "" {
|
||
return 0, Token{}, nil, nil
|
||
}
|
||
|
||
switch core.Lower(first) {
|
||
case "de":
|
||
if core.Lower(second) != "la" {
|
||
if prefix, rest, ok := t.splitFrenchElision(second); ok && (prefix == "l'" || prefix == "l’") && rest != "" {
|
||
tok := Token{
|
||
Raw: first + " " + prefix,
|
||
Lower: core.Lower(first + " " + prefix),
|
||
Type: TokenArticle,
|
||
ArtType: "definite",
|
||
Confidence: 1.0,
|
||
}
|
||
extra := t.classifyElidedFrenchWord(rest)
|
||
var punctTok *Token
|
||
if secondPunct != "" {
|
||
if punctType, ok := matchPunctuation(secondPunct); ok {
|
||
punctTok = &Token{
|
||
Raw: secondPunct,
|
||
Lower: secondPunct,
|
||
Type: TokenPunctuation,
|
||
PunctType: punctType,
|
||
Confidence: 1.0,
|
||
}
|
||
}
|
||
}
|
||
return 2, tok, &extra, punctTok
|
||
}
|
||
return 0, Token{}, nil, nil
|
||
}
|
||
tok := Token{
|
||
Raw: first + " " + second,
|
||
Lower: "de la",
|
||
Type: TokenArticle,
|
||
ArtType: "definite",
|
||
Confidence: 1.0,
|
||
}
|
||
if secondPunct != "" {
|
||
if punctType, ok := matchPunctuation(secondPunct); ok {
|
||
punctTok := Token{
|
||
Raw: secondPunct,
|
||
Lower: secondPunct,
|
||
Type: TokenPunctuation,
|
||
PunctType: punctType,
|
||
Confidence: 1.0,
|
||
}
|
||
return 2, tok, nil, &punctTok
|
||
}
|
||
}
|
||
return 2, tok, nil, nil
|
||
}
|
||
|
||
return 0, Token{}, nil, nil
|
||
}
|
||
|
||
func (t *Tokeniser) classifyElidedFrenchWord(word string) Token {
|
||
tok := Token{Raw: word, Lower: core.Lower(word)}
|
||
|
||
if artType, ok := t.MatchArticle(word); ok {
|
||
tok.Type = TokenArticle
|
||
tok.ArtType = artType
|
||
tok.Confidence = 1.0
|
||
return tok
|
||
}
|
||
|
||
vm, verbOK := t.MatchVerb(word)
|
||
nm, nounOK := t.MatchNoun(word)
|
||
if verbOK && nounOK && t.dualClass[tok.Lower] {
|
||
if vm.Tense != "base" {
|
||
tok.Type = TokenVerb
|
||
tok.VerbInfo = vm
|
||
tok.NounInfo = nm
|
||
tok.Confidence = 1.0
|
||
} else if nm.Plural {
|
||
tok.Type = TokenNoun
|
||
tok.VerbInfo = vm
|
||
tok.NounInfo = nm
|
||
tok.Confidence = 1.0
|
||
} else {
|
||
tok.Type = tokenAmbiguous
|
||
tok.VerbInfo = vm
|
||
tok.NounInfo = nm
|
||
}
|
||
return tok
|
||
}
|
||
if verbOK {
|
||
tok.Type = TokenVerb
|
||
tok.VerbInfo = vm
|
||
tok.Confidence = 1.0
|
||
return tok
|
||
}
|
||
if nounOK {
|
||
tok.Type = TokenNoun
|
||
tok.NounInfo = nm
|
||
tok.Confidence = 1.0
|
||
return tok
|
||
}
|
||
if cat, ok := t.MatchWord(word); ok {
|
||
tok.Type = TokenWord
|
||
tok.WordCat = cat
|
||
tok.Confidence = 1.0
|
||
return tok
|
||
}
|
||
|
||
tok.Type = TokenUnknown
|
||
return tok
|
||
}
|
||
|
||
// resolveAmbiguous iterates all tokens and resolves any marked as
|
||
// tokenAmbiguous using the weighted scoring function.
|
||
func (t *Tokeniser) resolveAmbiguous(tokens []Token) {
|
||
for i := range tokens {
|
||
if tokens[i].Type != tokenAmbiguous {
|
||
continue
|
||
}
|
||
verbScore, nounScore, components := t.scoreAmbiguous(tokens, i)
|
||
t.resolveToken(&tokens[i], verbScore, nounScore, components)
|
||
}
|
||
}
|
||
|
||
// scoreAmbiguous evaluates 8 weighted signals to determine whether an
|
||
// ambiguous token should be classified as verb or noun.
|
||
func (t *Tokeniser) scoreAmbiguous(tokens []Token, idx int) (float64, float64, []SignalComponent) {
|
||
var verbScore, nounScore float64
|
||
var components []SignalComponent
|
||
|
||
// 1. noun_determiner: preceding token is a noun determiner
|
||
if w, ok := t.weights["noun_determiner"]; ok && idx > 0 {
|
||
prev := tokens[idx-1]
|
||
if t.nounDet[prev.Lower] {
|
||
nounScore += w * 1.0
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "noun_determiner", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: "preceded by '" + prev.Lower + "'",
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// 2. verb_auxiliary: preceding token is an auxiliary or infinitive marker
|
||
if w, ok := t.weights["verb_auxiliary"]; ok && idx > 0 {
|
||
prev := tokens[idx-1]
|
||
if t.verbAux[prev.Lower] || t.verbInf[prev.Lower] {
|
||
verbScore += w * 1.0
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "verb_auxiliary", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: "preceded by '" + prev.Lower + "'",
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// 3. verb_negation: preceding negation weakly signals a verb
|
||
if w, ok := t.weights["verb_negation"]; ok && idx > 0 {
|
||
prev := tokens[idx-1]
|
||
if t.verbNeg[prev.Lower] || t.hasNoLongerBefore(tokens, idx) {
|
||
verbScore += w * 1.0
|
||
if t.withSignals {
|
||
reason := "preceded by '" + prev.Lower + "'"
|
||
if t.hasNoLongerBefore(tokens, idx) {
|
||
reason = "preceded by 'no longer'"
|
||
}
|
||
components = append(components, SignalComponent{
|
||
Name: "verb_negation", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: reason,
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// 4. following_class: next token's class informs this token's role
|
||
if w, ok := t.weights["following_class"]; ok && idx+1 < len(tokens) {
|
||
next := tokens[idx+1]
|
||
if next.Type != tokenAmbiguous {
|
||
if next.Type == TokenArticle || t.nounDet[next.Lower] || next.Type == TokenNoun {
|
||
// Followed by article/determiner/noun → verb signal
|
||
verbScore += w * 1.0
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "following_class", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: "followed by " + next.Lower + " (article/noun)",
|
||
})
|
||
}
|
||
} else if next.Type == TokenVerb {
|
||
// Followed by verb → noun signal
|
||
nounScore += w * 1.0
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "following_class", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: "followed by verb '" + next.Lower + "'",
|
||
})
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// 5. sentence_position: first token in sentence → verb signal (imperative)
|
||
if w, ok := t.weights["sentence_position"]; ok && idx == 0 {
|
||
verbScore += w * 1.0
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "sentence_position", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: "sentence-initial position (imperative)",
|
||
})
|
||
}
|
||
}
|
||
|
||
// 6. verb_saturation: if a confident verb already exists in the same clause
|
||
if w, ok := t.weights["verb_saturation"]; ok {
|
||
if t.hasConfidentVerbInClause(tokens, idx) {
|
||
nounScore += w * 1.0
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "verb_saturation", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: "confident verb already in clause",
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// 7. inflection_echo: another token shares the same base in inflected form
|
||
if w, ok := t.weights["inflection_echo"]; ok {
|
||
echoVerb, echoNoun := t.checkInflectionEcho(tokens, idx)
|
||
if echoNoun {
|
||
// Another token uses same base as inflected noun → signal verb
|
||
verbScore += w * 1.0
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "inflection_echo", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: "inflected noun echo found",
|
||
})
|
||
}
|
||
}
|
||
if echoVerb {
|
||
// Another token uses same base as inflected verb → signal noun
|
||
nounScore += w * 1.0
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "inflection_echo", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: "inflected verb echo found",
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// 8. default_prior: corpus-derived priors take precedence; otherwise fall back to the static verb prior.
|
||
if priorVerb, priorNoun, ok := t.corpusPrior(tokens[idx].Lower); ok {
|
||
verbScore += priorVerb
|
||
nounScore += priorNoun
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "default_prior", Weight: 1.0, Value: priorVerb, Contrib: priorVerb,
|
||
Reason: "corpus-derived prior",
|
||
})
|
||
if priorNoun > 0 {
|
||
components = append(components, SignalComponent{
|
||
Name: "default_prior", Weight: 1.0, Value: priorNoun, Contrib: priorNoun,
|
||
Reason: "corpus-derived prior",
|
||
})
|
||
}
|
||
}
|
||
} else if w, ok := t.weights["default_prior"]; ok {
|
||
verbScore += w * 1.0
|
||
if t.withSignals {
|
||
components = append(components, SignalComponent{
|
||
Name: "default_prior", Weight: w, Value: 1.0, Contrib: w,
|
||
Reason: "default verb prior",
|
||
})
|
||
}
|
||
}
|
||
|
||
return verbScore, nounScore, components
|
||
}
|
||
|
||
func (t *Tokeniser) hasNoLongerBefore(tokens []Token, idx int) bool {
|
||
if idx < 2 {
|
||
return false
|
||
}
|
||
return tokens[idx-2].Lower == "no" && tokens[idx-1].Lower == "longer"
|
||
}
|
||
|
||
func (t *Tokeniser) corpusPrior(word string) (float64, float64, bool) {
|
||
data := i18n.GetGrammarData(t.lang)
|
||
if data == nil || len(data.Signals.Priors) == 0 {
|
||
return 0, 0, false
|
||
}
|
||
bucket, ok := data.Signals.Priors[core.Lower(word)]
|
||
if !ok || len(bucket) == 0 {
|
||
return 0, 0, false
|
||
}
|
||
verb := bucket["verb"]
|
||
noun := bucket["noun"]
|
||
total := verb + noun
|
||
if total <= 0 {
|
||
return 0, 0, false
|
||
}
|
||
return verb / total, noun / total, true
|
||
}
|
||
|
||
// hasConfidentVerbInClause scans for a confident verb (Confidence >= 1.0)
|
||
// within the same clause as the token at idx. Clause boundaries are
|
||
// punctuation tokens and clause-boundary conjunctions/subordinators (D2).
|
||
func (t *Tokeniser) hasConfidentVerbInClause(tokens []Token, idx int) bool {
|
||
// Scan backwards from idx to find clause start.
|
||
start := 0
|
||
for i := idx - 1; i >= 0; i-- {
|
||
if tokens[i].Type == TokenPunctuation || clauseBoundaries[tokens[i].Lower] {
|
||
start = i + 1
|
||
break
|
||
}
|
||
}
|
||
// Scan forwards from idx to find clause end.
|
||
end := len(tokens)
|
||
for i := idx + 1; i < len(tokens); i++ {
|
||
if tokens[i].Type == TokenPunctuation || clauseBoundaries[tokens[i].Lower] {
|
||
end = i
|
||
break
|
||
}
|
||
}
|
||
// Look for a confident verb in [start, end), excluding idx itself.
|
||
for i := start; i < end; i++ {
|
||
if i == idx {
|
||
continue
|
||
}
|
||
if tokens[i].Type == TokenVerb && tokens[i].Confidence >= 1.0 {
|
||
return true
|
||
}
|
||
}
|
||
return false
|
||
}
|
||
|
||
// checkInflectionEcho checks whether another token shares the same base
|
||
// as this ambiguous token but in an inflected form. Returns (echoVerb, echoNoun)
|
||
// where echoVerb means another token has the same base as an inflected verb,
|
||
// and echoNoun means another token has the same base as an inflected noun.
|
||
func (t *Tokeniser) checkInflectionEcho(tokens []Token, idx int) (bool, bool) {
|
||
target := tokens[idx]
|
||
var echoVerb, echoNoun bool
|
||
|
||
for i, tok := range tokens {
|
||
if i == idx {
|
||
continue
|
||
}
|
||
// Check if another token is a verb with the same base
|
||
if tok.Type == TokenVerb && tok.VerbInfo.Base == target.VerbInfo.Base && tok.VerbInfo.Tense != "base" {
|
||
echoVerb = true
|
||
}
|
||
// Check if another token is a noun with the same base
|
||
if tok.Type == TokenNoun && tok.NounInfo.Base == target.NounInfo.Base && tok.NounInfo.Plural {
|
||
echoNoun = true
|
||
}
|
||
}
|
||
|
||
return echoVerb, echoNoun
|
||
}
|
||
|
||
// resolveToken assigns the final classification to an ambiguous token
|
||
// based on verb and noun scores from disambiguation signals.
|
||
func (t *Tokeniser) resolveToken(tok *Token, verbScore, nounScore float64, components []SignalComponent) {
|
||
total := verbScore + nounScore
|
||
|
||
// B3 review fix: if total < 0.10 (only default prior fired),
|
||
// use low-information confidence floor.
|
||
if total < 0.10 {
|
||
if verbScore >= nounScore {
|
||
tok.Type = TokenVerb
|
||
tok.Confidence = 0.55
|
||
tok.AltType = TokenNoun
|
||
tok.AltConf = 0.45
|
||
} else {
|
||
tok.Type = TokenNoun
|
||
tok.Confidence = 0.55
|
||
tok.AltType = TokenVerb
|
||
tok.AltConf = 0.45
|
||
}
|
||
} else {
|
||
if verbScore >= nounScore {
|
||
tok.Type = TokenVerb
|
||
tok.Confidence = verbScore / total
|
||
tok.AltType = TokenNoun
|
||
tok.AltConf = nounScore / total
|
||
} else {
|
||
tok.Type = TokenNoun
|
||
tok.Confidence = nounScore / total
|
||
tok.AltType = TokenVerb
|
||
tok.AltConf = verbScore / total
|
||
}
|
||
}
|
||
|
||
if t.withSignals {
|
||
tok.Signals = &SignalBreakdown{
|
||
VerbScore: verbScore,
|
||
NounScore: nounScore,
|
||
Components: components,
|
||
}
|
||
}
|
||
}
|
||
|
||
// splitTrailingPunct separates a word from its trailing punctuation.
|
||
// Returns the word and the punctuation suffix. It also recognises
|
||
// standalone punctuation tokens such as "." and ")".
|
||
func splitTrailingPunct(s string) (string, string) {
|
||
// Standalone punctuation token.
|
||
if _, ok := matchPunctuation(s); ok {
|
||
return "", s
|
||
}
|
||
|
||
// Check for "..." suffix first (3-char pattern).
|
||
if core.HasSuffix(s, "...") {
|
||
return s[:len(s)-3], "..."
|
||
}
|
||
// Check single-char trailing punctuation.
|
||
if len(s) > 1 {
|
||
last := s[len(s)-1]
|
||
if last == '?' || last == ':' || last == '!' || last == ';' || last == ',' || last == '.' || last == ')' || last == ']' || last == '}' {
|
||
return s[:len(s)-1], string(last)
|
||
}
|
||
}
|
||
return s, ""
|
||
}
|
||
|
||
func (t *Tokeniser) splitFrenchElision(raw string) (string, string, bool) {
|
||
if !t.isFrenchLanguage() || len(raw) == 0 {
|
||
return "", raw, false
|
||
}
|
||
|
||
lower := core.Lower(raw)
|
||
if len(lower) < 2 {
|
||
return "", raw, false
|
||
}
|
||
|
||
for _, prefix := range frenchElisionPrefixes {
|
||
if !strings.HasPrefix(lower, prefix) {
|
||
continue
|
||
}
|
||
idx := len(prefix)
|
||
if idx >= len(raw) {
|
||
continue
|
||
}
|
||
if idx < len(raw) {
|
||
r, size := utf8.DecodeRuneInString(raw[idx:])
|
||
if r != '\'' && r != '’' {
|
||
continue
|
||
}
|
||
if size > 0 {
|
||
return raw[:idx+size], raw[idx+size:], true
|
||
}
|
||
}
|
||
}
|
||
|
||
return "", raw, false
|
||
}
|
||
|
||
func (t *Tokeniser) isFrenchLanguage() bool {
|
||
lang := core.Lower(t.lang)
|
||
return lang == "fr" || core.HasPrefix(lang, "fr-")
|
||
}
|
||
|
||
// matchPunctuation maps a punctuation token to its type name.
// It returns the type and true for a recognised token, or "" and false
// for anything else. The match is exact: the whole string must be the token.
func matchPunctuation(punct string) (string, bool) {
	var kind string
	switch punct {
	case "...":
		kind = "progress"
	case "?":
		kind = "question"
	case "!":
		kind = "exclamation"
	case ":":
		kind = "label"
	case ";":
		kind = "separator"
	case ",":
		kind = "comma"
	case ".":
		kind = "sentence_end"
	case ")":
		kind = "close_paren"
	case "]":
		kind = "close_bracket"
	case "}":
		kind = "close_brace"
	default:
		return "", false
	}
	return kind, true
}
|
||
|
||
// DisambiguationStats holds aggregate figures describing how a token slice
// was disambiguated.
type DisambiguationStats struct {
	// TotalTokens is the number of tokens examined.
	TotalTokens int
	// AmbiguousTokens counts tokens that carry an alternative reading.
	AmbiguousTokens int
	// ResolvedAsVerb counts ambiguous tokens whose final type is verb.
	ResolvedAsVerb int
	// ResolvedAsNoun counts ambiguous tokens whose final type is noun.
	ResolvedAsNoun int
	// AvgConfidence is the mean confidence over the tokens that were scored.
	AvgConfidence float64
	// LowConfidence counts scored tokens with confidence below 0.7.
	LowConfidence int
}
|
||
|
||
// DisambiguationStatsFromTokens computes aggregate disambiguation stats from a token slice.
|
||
func DisambiguationStatsFromTokens(tokens []Token) DisambiguationStats {
|
||
var s DisambiguationStats
|
||
s.TotalTokens = len(tokens)
|
||
var confSum float64
|
||
var confCount int
|
||
|
||
for _, tok := range tokens {
|
||
if tok.AltType != 0 && tok.AltConf > 0 {
|
||
s.AmbiguousTokens++
|
||
if tok.Type == TokenVerb {
|
||
s.ResolvedAsVerb++
|
||
} else if tok.Type == TokenNoun {
|
||
s.ResolvedAsNoun++
|
||
}
|
||
}
|
||
if tok.Type != TokenUnknown && tok.Confidence > 0 {
|
||
confSum += tok.Confidence
|
||
confCount++
|
||
if tok.Confidence < 0.7 {
|
||
s.LowConfidence++
|
||
}
|
||
}
|
||
}
|
||
|
||
if confCount > 0 {
|
||
s.AvgConfidence = confSum / float64(confCount)
|
||
}
|
||
return s
|
||
}
|