feat(reversal): add classification benchmark suite

220 domain-tagged sentences across {technical, creative, ethical, casual} with leave-one-out classification, domain separation, token coverage, tense profile, and top-verb diagnostics. Grammar-based accuracy: 54% overall (technical 78%, creative 82%, ethical 46%, casual 11%). Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 17:48:48 +00:00 · 2026-02-19 17:48:48 +00:00 · 7d5ab809f0
commit 7d5ab809f0
parent 65cf099517
2 changed files with 591 additions and 1 deletions
--- a/TODO.md
+++ b/TODO.md
@ -15,7 +15,7 @@ Dispatched from core/go orchestration. Pick up tasks in order.

 ### 2a: 1B Pre-Classification (NEW — based on benchmark findings)

- [ ] **Classification benchmark suite** — Standalone Go test file (`classify_bench_test.go`) that feeds 200+ domain-tagged sentences through the tokeniser and measures accuracy against known labels. Categories: {technical, creative, ethical, casual}. This is the ground truth for calibrating 1B pre-tags.
+- [x] **Classification benchmark suite** — 220 domain-tagged sentences, leave-one-out classification via imprint similarity. Grammar engine: technical 78%, creative 82%, ethical 46%, casual 11%. Ethical↔technical and casual↔creative confusion confirms 1B model needed for those domains.
 - [ ] **1B pre-sort pipeline tool** — CLI command or Go func that reads a JSONL corpus (Phase 0 seeds), sends each text through LEK-Gemma3-1B domain classification, and writes back JSONL with `domain_1b` field added. Target: ~5K sentences/sec on M3. Use MLX via go-ai bindings or shell out to `mlx_lm.generate`.
 - [ ] **1B vs 27B calibration check** — Sample 500 sentences, classify with both 1B and 27B, measure agreement rate. The 75% accuracy from benchmarks should improve with targeted prompt tuning. Document the confusion matrix (technical↔creative is the known weak spot).
 - [ ] **Article/irregular validator** — Lightweight Go funcs that use the 1B model's strong article correctness (100%) and irregular base form accuracy (100%) as fast validators. Could supplement rule-based `Article()` and `PastTense()` for edge cases the grammar tables don't cover.
--- a/reversal/classify_bench_test.go
+++ b/reversal/classify_bench_test.go
@ -0,0 +1,590 @@
+package reversal
+
+import (
+	"fmt"
+	"sort"
+	"testing"
+)
+
+// Domain categories for classification ground truth.
+const (
+	domainTechnical = "technical"
+	domainCreative  = "creative"
+	domainEthical   = "ethical"
+	domainCasual    = "casual"
+)
+
+type taggedSentence struct {
+	Text   string
+	Domain string
+}
+
+// classificationCorpus contains 200+ domain-tagged sentences for calibrating
+// grammar-based domain classification. These serve as ground truth for the
+// 1B pre-sort pipeline (Phase 2a).
+var classificationCorpus = []taggedSentence{
+	// --- Technical (55) ---
+	// Imperative dev/ops commands, system administration, debugging
+	{"Delete the configuration file", domainTechnical},
+	{"Build the project from source", domainTechnical},
+	{"Run the tests before committing", domainTechnical},
+	{"Push the changes to the branch", domainTechnical},
+	{"Update the dependencies", domainTechnical},
+	{"Check the build status", domainTechnical},
+	{"Find the failing test", domainTechnical},
+	{"Write the test cases first", domainTechnical},
+	{"Set the environment variables", domainTechnical},
+	{"Split the package into modules", domainTechnical},
+	{"Scan the repository for vulnerabilities", domainTechnical},
+	{"Format the source files", domainTechnical},
+	{"Reset the branch to the previous commit", domainTechnical},
+	{"Stop the running process", domainTechnical},
+	{"Cut a new release branch", domainTechnical},
+	{"Send the build artifacts to the server", domainTechnical},
+	{"Keep the test coverage above the threshold", domainTechnical},
+	{"Hold the deployment until the checks pass", domainTechnical},
+	{"Begin the migration to the new package", domainTechnical},
+	{"Take the old server offline", domainTechnical},
+	{"The build failed because of a missing dependency", domainTechnical},
+	{"The test committed changes to the wrong branch", domainTechnical},
+	{"We found a vulnerability in the package", domainTechnical},
+	{"The commit broke the build", domainTechnical},
+	{"She deleted the old configuration files", domainTechnical},
+	{"They pushed the fix to the repository", domainTechnical},
+	{"The branch was updated with the latest changes", domainTechnical},
+	{"He rebuilt the project after updating dependencies", domainTechnical},
+	{"The task failed during the scanning phase", domainTechnical},
+	{"We split the repository into separate packages", domainTechnical},
+	{"The check ran successfully on all branches", domainTechnical},
+	{"They found the issue in the build directory", domainTechnical},
+	{"The file was committed without running tests", domainTechnical},
+	{"She set the deployment configuration correctly", domainTechnical},
+	{"Building the project takes several minutes", domainTechnical},
+	{"Deleting old branches keeps the repository clean", domainTechnical},
+	{"Running the full test suite before merging", domainTechnical},
+	{"Updating packages resolved the vulnerability", domainTechnical},
+	{"Checking the build logs for errors", domainTechnical},
+	{"Scanning dependencies for known issues", domainTechnical},
+	{"Writing tests for the new commit handler", domainTechnical},
+	{"Pushing changes to the remote repository", domainTechnical},
+	{"Finding the root cause of the test failure", domainTechnical},
+	{"Formatting the code before the final commit", domainTechnical},
+	{"Splitting the configuration into separate files", domainTechnical},
+	{"Override the default build configuration", domainTechnical},
+	{"Rebuild the project with the updated dependencies", domainTechnical},
+	{"Rerun the failed tests on the branch", domainTechnical},
+	{"Debug the issue in the test runner", domainTechnical},
+	{"Embed the version string in the build", domainTechnical},
+	{"Withdraw the broken release from the repository", domainTechnical},
+	{"Offset the deployment by one commit", domainTechnical},
+	{"Input the new configuration values", domainTechnical},
+	{"Output the build results to a file", domainTechnical},
+	{"Unzip the package artifacts", domainTechnical},
+
+	// --- Creative (55) ---
+	// Narrative, descriptive, literary language
+	{"She wrote the story by candlelight", domainCreative},
+	{"The singer sang until the stars came out", domainCreative},
+	{"He drew a map of forgotten places", domainCreative},
+	{"They chose a path through the ancient forest", domainCreative},
+	{"The wind blew across the open field", domainCreative},
+	{"She spoke softly to the sleeping child", domainCreative},
+	{"He broke the silence with a whispered word", domainCreative},
+	{"The river froze under the winter moon", domainCreative},
+	{"She stole a glance at the hidden garden", domainCreative},
+	{"The old woman told tales of distant lands", domainCreative},
+	{"He threw his arms wide and began to sing", domainCreative},
+	{"The artist drew inspiration from the sea", domainCreative},
+	{"She woke to the sound of falling rain", domainCreative},
+	{"They built a castle from sand and dreams", domainCreative},
+	{"He ran through fields of golden wheat", domainCreative},
+	{"The dancer spun beneath the chandelier", domainCreative},
+	{"She wore a dress made of moonlight", domainCreative},
+	{"He hid the letter behind the painting", domainCreative},
+	{"The leaves fell like whispered secrets", domainCreative},
+	{"She found a door that led to another world", domainCreative},
+	{"He took the winding road through the hills", domainCreative},
+	{"The poet wrote verses about lost time", domainCreative},
+	{"She left footprints in the fresh snow", domainCreative},
+	{"They swam across the moonlit lake", domainCreative},
+	{"He drove through the night without stopping", domainCreative},
+	{"The music rose like smoke into the air", domainCreative},
+	{"She kept the secret for many years", domainCreative},
+	{"He led them deeper into the enchanted wood", domainCreative},
+	{"The candle shone against the darkness", domainCreative},
+	{"She lost herself in the pages of the book", domainCreative},
+	{"He caught the last train before midnight", domainCreative},
+	{"The garden grew wild after they left", domainCreative},
+	{"She paid no attention to the gathering storm", domainCreative},
+	{"He met the stranger at the crossroads", domainCreative},
+	{"The shadows held their breath", domainCreative},
+	{"Writing stories about forgotten kingdoms", domainCreative},
+	{"Drawing maps of imaginary coastlines", domainCreative},
+	{"Singing ballads under the open sky", domainCreative},
+	{"Telling tales of heroes and lost causes", domainCreative},
+	{"Weaving words into tapestries of meaning", domainCreative},
+	{"She began her tale with a sigh", domainCreative},
+	{"The old house stood at the edge of the world", domainCreative},
+	{"He gave the child a carved wooden bird", domainCreative},
+	{"They brought flowers to the abandoned shrine", domainCreative},
+	{"The ship left harbour before the dawn", domainCreative},
+	{"She sold the family ring to pay for passage", domainCreative},
+	{"He won the contest with an improvised song", domainCreative},
+	{"The clock struck twelve and the spell was broken", domainCreative},
+	{"She bent the wire into a tiny crown", domainCreative},
+	{"They flew kites above the autumn trees", domainCreative},
+	{"He spent the afternoon by the quiet river", domainCreative},
+	{"The rain fell softly on the old stone bridge", domainCreative},
+	{"She tore the map in half and chose the left path", domainCreative},
+	{"He sat on the hillside watching the sunset", domainCreative},
+	{"The story begins where the road splits in two", domainCreative},
+
+	// --- Ethical (55) ---
+	// Moral reasoning, prescriptive, policy, fairness
+	{"We should think about the consequences of our choices", domainEthical},
+	{"They must hold themselves accountable for the outcome", domainEthical},
+	{"You should not break a promise once it has been made", domainEthical},
+	{"We must find a fair solution for all parties", domainEthical},
+	{"Leaders should stand for what they believe is right", domainEthical},
+	{"We should keep our commitments to the community", domainEthical},
+	{"They must take responsibility for their decisions", domainEthical},
+	{"You should tell the truth even when it is difficult", domainEthical},
+	{"We ought to bring attention to hidden suffering", domainEthical},
+	{"They should not leave anyone behind", domainEthical},
+	{"We must hold power accountable to the people", domainEthical},
+	{"One should think carefully before making accusations", domainEthical},
+	{"Leaders must lead by example in matters of integrity", domainEthical},
+	{"We should find ways to include all voices", domainEthical},
+	{"They should not cut corners on safety", domainEthical},
+	{"We must build trust through consistent action", domainEthical},
+	{"You should give others the benefit of the doubt", domainEthical},
+	{"They must not put profit above human welfare", domainEthical},
+	{"We should pay attention to those who are vulnerable", domainEthical},
+	{"One must meet obligations before seeking rewards", domainEthical},
+	{"They broke the agreement and lost our trust", domainEthical},
+	{"The decision cost many people their livelihoods", domainEthical},
+	{"She spoke out against the policy of exclusion", domainEthical},
+	{"They chose transparency over self-interest", domainEthical},
+	{"He stood firm despite the pressure to compromise", domainEthical},
+	{"The organisation lost credibility after the scandal", domainEthical},
+	{"She held the board accountable for their failures", domainEthical},
+	{"They found that the policy caused unintended harm", domainEthical},
+	{"He brought evidence of wrongdoing to the authorities", domainEthical},
+	{"The report led to significant reforms", domainEthical},
+	{"Thinking about fairness in resource distribution", domainEthical},
+	{"Holding institutions accountable for their promises", domainEthical},
+	{"Building systems that protect the most vulnerable", domainEthical},
+	{"Finding the balance between freedom and responsibility", domainEthical},
+	{"Keeping commitments to future generations", domainEthical},
+	{"We should not sell access to essential services", domainEthical},
+	{"They must seek consent before taking action", domainEthical},
+	{"You should stand with those who cannot stand alone", domainEthical},
+	{"We must begin by acknowledging past mistakes", domainEthical},
+	{"Leaders should spend more time listening", domainEthical},
+	{"We should set clear boundaries on acceptable conduct", domainEthical},
+	{"One must not hide the truth for personal gain", domainEthical},
+	{"They should deal fairly with competing interests", domainEthical},
+	{"We must win trust through transparency and honesty", domainEthical},
+	{"You should not bend the rules to suit your needs", domainEthical},
+	{"They ought to hold elections that are free and fair", domainEthical},
+	{"We should bring diverse perspectives to the table", domainEthical},
+	{"One must take care not to cause unnecessary harm", domainEthical},
+	{"They should not shut out dissenting opinions", domainEthical},
+	{"We must keep the interests of the public in mind", domainEthical},
+	{"She fought to uphold the rights of the displaced", domainEthical},
+	{"He withdrew support after the ethical breach", domainEthical},
+	{"They overcame resistance to pass the reform", domainEthical},
+	{"The committee forbade the use of deceptive practices", domainEthical},
+	{"We should not cast blame without evidence", domainEthical},
+
+	// --- Casual (55) ---
+	// Everyday conversation, informal, personal
+	{"I went to the store yesterday", domainCasual},
+	{"She made dinner for everyone last night", domainCasual},
+	{"We took the dog for a walk this morning", domainCasual},
+	{"He got a new phone last week", domainCasual},
+	{"They left early to beat the traffic", domainCasual},
+	{"I found my keys under the sofa", domainCasual},
+	{"She bought a new jacket for the trip", domainCasual},
+	{"We met for coffee after work", domainCasual},
+	{"He cut the grass before it rained", domainCasual},
+	{"They brought snacks to the party", domainCasual},
+	{"I sat on the porch and read a book", domainCasual},
+	{"She paid for lunch at the cafe", domainCasual},
+	{"We ran into an old friend at the market", domainCasual},
+	{"He put the groceries away", domainCasual},
+	{"They spent the weekend at the beach", domainCasual},
+	{"I told her about the new restaurant", domainCasual},
+	{"She drove to the airport early", domainCasual},
+	{"We got lost on the way there", domainCasual},
+	{"He fell asleep on the couch", domainCasual},
+	{"They won tickets to the show", domainCasual},
+	{"I lost my umbrella somewhere", domainCasual},
+	{"She chose the window seat", domainCasual},
+	{"We hit the road before dawn", domainCasual},
+	{"He kept the receipt just in case", domainCasual},
+	{"They came over for board games", domainCasual},
+	{"I took a shortcut through the park", domainCasual},
+	{"She left a message on the machine", domainCasual},
+	{"We gave the old furniture away", domainCasual},
+	{"He held the door for the woman behind him", domainCasual},
+	{"They sent us a postcard from the coast", domainCasual},
+	{"Going to the park this afternoon", domainCasual},
+	{"Making plans for the holiday", domainCasual},
+	{"Getting ready for the weekend trip", domainCasual},
+	{"Meeting friends at the usual place", domainCasual},
+	{"Looking for a good place to eat", domainCasual},
+	{"I think she went home already", domainCasual},
+	{"He said he would come by later", domainCasual},
+	{"We should get together sometime", domainCasual},
+	{"She told me about her new job", domainCasual},
+	{"They brought the kids to the game", domainCasual},
+	{"I set the alarm for six in the morning", domainCasual},
+	{"She sold her old bike at the market", domainCasual},
+	{"We split the bill at the restaurant", domainCasual},
+	{"He drew a funny picture on the napkin", domainCasual},
+	{"They began planning the birthday party", domainCasual},
+	{"I threw out the old newspapers", domainCasual},
+	{"She hung the new curtains in the bedroom", domainCasual},
+	{"We led the way to the hidden trail", domainCasual},
+	{"He bent down to pick up the coin", domainCasual},
+	{"They fed the ducks at the pond", domainCasual},
+	{"I caught the bus just in time", domainCasual},
+	{"She broke her favourite mug this morning", domainCasual},
+	{"We built a shelf for the kitchen", domainCasual},
+	{"He shut the door and turned the light off", domainCasual},
+	{"They stood in line for an hour", domainCasual},
+}
+
+// --- Helpers ---
+
+// corpusByDomain groups the corpus by domain label.
+func corpusByDomain() map[string][]int {
+	groups := make(map[string][]int)
+	for i, s := range classificationCorpus {
+		groups[s.Domain] = append(groups[s.Domain], i)
+	}
+	return groups
+}
+
+// imprintCorpus tokenises and imprints every sentence. Caller reuses the slice.
+func imprintCorpus(tok *Tokeniser) []GrammarImprint {
+	imprints := make([]GrammarImprint, len(classificationCorpus))
+	for i, s := range classificationCorpus {
+		imprints[i] = NewImprint(tok.Tokenise(s.Text))
+	}
+	return imprints
+}
+
+// classifyLeaveOneOut returns the predicted domain for sentence at idx
+// by computing average similarity to every other sentence in each domain.
+func classifyLeaveOneOut(idx int, imprints []GrammarImprint, groups map[string][]int) string {
+	bestDomain := ""
+	bestSim := -1.0
+
+	for domain, indices := range groups {
+		var sum float64
+		var count int
+		for _, j := range indices {
+			if j == idx {
+				continue
+			}
+			sum += imprints[idx].Similar(imprints[j])
+			count++
+		}
+		if count == 0 {
+			continue
+		}
+		avg := sum / float64(count)
+		if avg > bestSim {
+			bestSim = avg
+			bestDomain = domain
+		}
+	}
+	return bestDomain
+}
+
+// --- Tests ---
+
+// TestClassification_CorpusSize validates the corpus has enough sentences per domain.
+func TestClassification_CorpusSize(t *testing.T) {
+	groups := corpusByDomain()
+	domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
+
+	for _, d := range domains {
+		if n := len(groups[d]); n < 50 {
+			t.Errorf("domain %q has %d sentences, want >= 50", d, n)
+		}
+	}
+	if total := len(classificationCorpus); total < 200 {
+		t.Errorf("corpus has %d sentences, want >= 200", total)
+	}
+}
+
+// TestClassification_DomainSeparation verifies within-domain imprint similarity
+// exceeds cross-domain similarity. This is the basic requirement for domain
+// classification to work.
+func TestClassification_DomainSeparation(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+	imprints := imprintCorpus(tok)
+	groups := corpusByDomain()
+
+	domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
+
+	for _, d := range domains {
+		indices := groups[d]
+
+		// Within-domain average similarity
+		var withinSum float64
+		var withinCount int
+		for i := 0; i < len(indices); i++ {
+			for j := i + 1; j < len(indices); j++ {
+				withinSum += imprints[indices[i]].Similar(imprints[indices[j]])
+				withinCount++
+			}
+		}
+		withinAvg := withinSum / float64(withinCount)
+
+		// Cross-domain average similarity
+		var crossSum float64
+		var crossCount int
+		for _, otherD := range domains {
+			if otherD == d {
+				continue
+			}
+			for _, i := range indices {
+				for _, j := range groups[otherD] {
+					crossSum += imprints[i].Similar(imprints[j])
+					crossCount++
+				}
+			}
+		}
+		crossAvg := crossSum / float64(crossCount)
+
+		t.Logf("%-10s within=%.4f cross=%.4f gap=%.4f", d, withinAvg, crossAvg, withinAvg-crossAvg)
+
+		if withinAvg <= crossAvg {
+			t.Errorf("domain %q: within-domain similarity (%.4f) should exceed cross-domain (%.4f)",
+				d, withinAvg, crossAvg)
+		}
+	}
+}
+
+// TestClassification_LeaveOneOut measures per-domain and overall accuracy
+// using leave-one-out nearest-centroid classification.
+func TestClassification_LeaveOneOut(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+	imprints := imprintCorpus(tok)
+	groups := corpusByDomain()
+
+	domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
+
+	// Confusion matrix: actual -> predicted -> count
+	confusion := make(map[string]map[string]int)
+	for _, d := range domains {
+		confusion[d] = make(map[string]int)
+	}
+
+	correct := 0
+	total := len(classificationCorpus)
+
+	for i, s := range classificationCorpus {
+		predicted := classifyLeaveOneOut(i, imprints, groups)
+		confusion[s.Domain][predicted]++
+		if predicted == s.Domain {
+			correct++
+		}
+	}
+
+	overallAcc := float64(correct) / float64(total)
+	t.Logf("Overall accuracy: %d/%d (%.1f%%)", correct, total, overallAcc*100)
+
+	// Per-domain accuracy
+	for _, d := range domains {
+		domainTotal := len(groups[d])
+		domainCorrect := confusion[d][d]
+		acc := float64(domainCorrect) / float64(domainTotal)
+		t.Logf("  %-10s %d/%d (%.1f%%)", d, domainCorrect, domainTotal, acc*100)
+	}
+
+	// Print confusion matrix
+	t.Log("\nConfusion matrix (rows=actual, cols=predicted):")
+	header := fmt.Sprintf("  %-10s", "")
+	for _, d := range domains {
+		header += fmt.Sprintf(" %10s", d[:4])
+	}
+	t.Log(header)
+	for _, actual := range domains {
+		row := fmt.Sprintf("  %-10s", actual[:4])
+		for _, predicted := range domains {
+			row += fmt.Sprintf(" %10d", confusion[actual][predicted])
+		}
+		t.Log(row)
+	}
+
+	// Soft threshold: grammar-based classification won't be perfect,
+	// but should beat random chance (25%) meaningfully.
+	if overallAcc < 0.35 {
+		t.Errorf("overall accuracy %.1f%% is below 35%% threshold", overallAcc*100)
+	}
+}
+
+// TestClassification_TokenCoverage reports per-domain token recognition rates.
+// Domains with low coverage rely more on the 1B model for classification.
+func TestClassification_TokenCoverage(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+	groups := corpusByDomain()
+
+	domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
+
+	for _, d := range domains {
+		var totalTokens, recognisedTokens int
+		var verbTokens, nounTokens, articleTokens, wordTokens int
+
+		for _, idx := range groups[d] {
+			tokens := tok.Tokenise(classificationCorpus[idx].Text)
+			for _, token := range tokens {
+				totalTokens++
+				switch token.Type {
+				case TokenVerb:
+					recognisedTokens++
+					verbTokens++
+				case TokenNoun:
+					recognisedTokens++
+					nounTokens++
+				case TokenArticle:
+					recognisedTokens++
+					articleTokens++
+				case TokenWord:
+					recognisedTokens++
+					wordTokens++
+				case TokenPunctuation:
+					recognisedTokens++
+				}
+			}
+		}
+
+		coverage := float64(recognisedTokens) / float64(totalTokens) * 100
+		t.Logf("%-10s coverage=%.1f%% tokens=%d verbs=%d nouns=%d articles=%d words=%d",
+			d, coverage, totalTokens, verbTokens, nounTokens, articleTokens, wordTokens)
+	}
+}
+
+// TestClassification_TenseProfile reports per-domain tense distribution.
+// Useful for understanding what grammar signals distinguish domains.
+func TestClassification_TenseProfile(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+	groups := corpusByDomain()
+
+	domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
+	tenses := []string{"base", "past", "gerund"}
+
+	for _, d := range domains {
+		tenseCounts := make(map[string]int)
+		var totalVerbs int
+
+		for _, idx := range groups[d] {
+			tokens := tok.Tokenise(classificationCorpus[idx].Text)
+			for _, token := range tokens {
+				if token.Type == TokenVerb {
+					tenseCounts[token.VerbInfo.Tense]++
+					totalVerbs++
+				}
+			}
+		}
+
+		parts := fmt.Sprintf("%-10s verbs=%d", d, totalVerbs)
+		for _, tense := range tenses {
+			pct := 0.0
+			if totalVerbs > 0 {
+				pct = float64(tenseCounts[tense]) / float64(totalVerbs) * 100
+			}
+			parts += fmt.Sprintf("  %s=%.0f%%", tense, pct)
+		}
+		t.Log(parts)
+	}
+}
+
+// TestClassification_TopVerbs reports the most frequent verbs per domain.
+func TestClassification_TopVerbs(t *testing.T) {
+	setup(t)
+	tok := NewTokeniser()
+	groups := corpusByDomain()
+
+	domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
+
+	for _, d := range domains {
+		verbCounts := make(map[string]int)
+		for _, idx := range groups[d] {
+			tokens := tok.Tokenise(classificationCorpus[idx].Text)
+			for _, token := range tokens {
+				if token.Type == TokenVerb {
+					verbCounts[token.VerbInfo.Base]++
+				}
+			}
+		}
+
+		// Sort by frequency
+		type kv struct {
+			verb  string
+			count int
+		}
+		var sorted []kv
+		for v, c := range verbCounts {
+			sorted = append(sorted, kv{v, c})
+		}
+		sort.Slice(sorted, func(i, j int) bool { return sorted[i].count > sorted[j].count })
+
+		top := 8
+		if len(sorted) < top {
+			top = len(sorted)
+		}
+		verbs := ""
+		for i := 0; i < top; i++ {
+			if i > 0 {
+				verbs += ", "
+			}
+			verbs += fmt.Sprintf("%s(%d)", sorted[i].verb, sorted[i].count)
+		}
+		t.Logf("%-10s unique=%d top: %s", d, len(verbCounts), verbs)
+	}
+}
+
+// --- Benchmarks ---
+
+func BenchmarkClassification_Tokenise(b *testing.B) {
+	benchSetup(b)
+	tok := NewTokeniser()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for _, s := range classificationCorpus {
+			tok.Tokenise(s.Text)
+		}
+	}
+}
+
+func BenchmarkClassification_ImprintAll(b *testing.B) {
+	benchSetup(b)
+	tok := NewTokeniser()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		imprintCorpus(tok)
+	}
+}
+
+func BenchmarkClassification_FullPipeline(b *testing.B) {
+	benchSetup(b)
+	tok := NewTokeniser()
+	groups := corpusByDomain()
+	imprints := imprintCorpus(tok)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for idx := range classificationCorpus {
+			classifyLeaveOneOut(idx, imprints, groups)
+		}
+	}
+}