feat(reversal): add classification benchmark suite

220 domain-tagged sentences across {technical, creative, ethical, casual}
with leave-one-out classification, domain separation, token coverage,
tense profile, and top-verb diagnostics. Grammar-based accuracy: 54%
overall (technical 78%, creative 82%, ethical 46%, casual 11%).

Co-Authored-By: Virgil <virgil@lethean.io>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Snider 2026-02-19 17:48:48 +00:00
parent 65cf099517
commit 7d5ab809f0
2 changed files with 591 additions and 1 deletions

View file

@ -15,7 +15,7 @@ Dispatched from core/go orchestration. Pick up tasks in order.
### 2a: 1B Pre-Classification (NEW — based on benchmark findings)
- [ ] **Classification benchmark suite** — Standalone Go test file (`classify_bench_test.go`) that feeds 200+ domain-tagged sentences through the tokeniser and measures accuracy against known labels. Categories: {technical, creative, ethical, casual}. This is the ground truth for calibrating 1B pre-tags.
- [x] **Classification benchmark suite** — 220 domain-tagged sentences, leave-one-out classification via imprint similarity. Grammar engine: technical 78%, creative 82%, ethical 46%, casual 11%. Ethical↔technical and casual↔creative confusion confirms 1B model needed for those domains.
- [ ] **1B pre-sort pipeline tool** — CLI command or Go func that reads a JSONL corpus (Phase 0 seeds), sends each text through LEK-Gemma3-1B domain classification, and writes back JSONL with `domain_1b` field added. Target: ~5K sentences/sec on M3. Use MLX via go-ai bindings or shell out to `mlx_lm.generate`.
- [ ] **1B vs 27B calibration check** — Sample 500 sentences, classify with both 1B and 27B, measure agreement rate. The 75% accuracy from benchmarks should improve with targeted prompt tuning. Document the confusion matrix (technical↔creative is the known weak spot).
- [ ] **Article/irregular validator** — Lightweight Go funcs that use the 1B model's strong article correctness (100%) and irregular base form accuracy (100%) as fast validators. Could supplement rule-based `Article()` and `PastTense()` for edge cases the grammar tables don't cover.

View file

@ -0,0 +1,590 @@
package reversal
import (
"fmt"
"sort"
"testing"
)
// Domain categories for classification ground truth.
const (
domainTechnical = "technical"
domainCreative = "creative"
domainEthical = "ethical"
domainCasual = "casual"
)
type taggedSentence struct {
Text string
Domain string
}
// classificationCorpus contains 200+ domain-tagged sentences for calibrating
// grammar-based domain classification. These serve as ground truth for the
// 1B pre-sort pipeline (Phase 2a).
var classificationCorpus = []taggedSentence{
// --- Technical (55) ---
// Imperative dev/ops commands, system administration, debugging
{"Delete the configuration file", domainTechnical},
{"Build the project from source", domainTechnical},
{"Run the tests before committing", domainTechnical},
{"Push the changes to the branch", domainTechnical},
{"Update the dependencies", domainTechnical},
{"Check the build status", domainTechnical},
{"Find the failing test", domainTechnical},
{"Write the test cases first", domainTechnical},
{"Set the environment variables", domainTechnical},
{"Split the package into modules", domainTechnical},
{"Scan the repository for vulnerabilities", domainTechnical},
{"Format the source files", domainTechnical},
{"Reset the branch to the previous commit", domainTechnical},
{"Stop the running process", domainTechnical},
{"Cut a new release branch", domainTechnical},
{"Send the build artifacts to the server", domainTechnical},
{"Keep the test coverage above the threshold", domainTechnical},
{"Hold the deployment until the checks pass", domainTechnical},
{"Begin the migration to the new package", domainTechnical},
{"Take the old server offline", domainTechnical},
{"The build failed because of a missing dependency", domainTechnical},
{"The test committed changes to the wrong branch", domainTechnical},
{"We found a vulnerability in the package", domainTechnical},
{"The commit broke the build", domainTechnical},
{"She deleted the old configuration files", domainTechnical},
{"They pushed the fix to the repository", domainTechnical},
{"The branch was updated with the latest changes", domainTechnical},
{"He rebuilt the project after updating dependencies", domainTechnical},
{"The task failed during the scanning phase", domainTechnical},
{"We split the repository into separate packages", domainTechnical},
{"The check ran successfully on all branches", domainTechnical},
{"They found the issue in the build directory", domainTechnical},
{"The file was committed without running tests", domainTechnical},
{"She set the deployment configuration correctly", domainTechnical},
{"Building the project takes several minutes", domainTechnical},
{"Deleting old branches keeps the repository clean", domainTechnical},
{"Running the full test suite before merging", domainTechnical},
{"Updating packages resolved the vulnerability", domainTechnical},
{"Checking the build logs for errors", domainTechnical},
{"Scanning dependencies for known issues", domainTechnical},
{"Writing tests for the new commit handler", domainTechnical},
{"Pushing changes to the remote repository", domainTechnical},
{"Finding the root cause of the test failure", domainTechnical},
{"Formatting the code before the final commit", domainTechnical},
{"Splitting the configuration into separate files", domainTechnical},
{"Override the default build configuration", domainTechnical},
{"Rebuild the project with the updated dependencies", domainTechnical},
{"Rerun the failed tests on the branch", domainTechnical},
{"Debug the issue in the test runner", domainTechnical},
{"Embed the version string in the build", domainTechnical},
{"Withdraw the broken release from the repository", domainTechnical},
{"Offset the deployment by one commit", domainTechnical},
{"Input the new configuration values", domainTechnical},
{"Output the build results to a file", domainTechnical},
{"Unzip the package artifacts", domainTechnical},
// --- Creative (55) ---
// Narrative, descriptive, literary language
{"She wrote the story by candlelight", domainCreative},
{"The singer sang until the stars came out", domainCreative},
{"He drew a map of forgotten places", domainCreative},
{"They chose a path through the ancient forest", domainCreative},
{"The wind blew across the open field", domainCreative},
{"She spoke softly to the sleeping child", domainCreative},
{"He broke the silence with a whispered word", domainCreative},
{"The river froze under the winter moon", domainCreative},
{"She stole a glance at the hidden garden", domainCreative},
{"The old woman told tales of distant lands", domainCreative},
{"He threw his arms wide and began to sing", domainCreative},
{"The artist drew inspiration from the sea", domainCreative},
{"She woke to the sound of falling rain", domainCreative},
{"They built a castle from sand and dreams", domainCreative},
{"He ran through fields of golden wheat", domainCreative},
{"The dancer spun beneath the chandelier", domainCreative},
{"She wore a dress made of moonlight", domainCreative},
{"He hid the letter behind the painting", domainCreative},
{"The leaves fell like whispered secrets", domainCreative},
{"She found a door that led to another world", domainCreative},
{"He took the winding road through the hills", domainCreative},
{"The poet wrote verses about lost time", domainCreative},
{"She left footprints in the fresh snow", domainCreative},
{"They swam across the moonlit lake", domainCreative},
{"He drove through the night without stopping", domainCreative},
{"The music rose like smoke into the air", domainCreative},
{"She kept the secret for many years", domainCreative},
{"He led them deeper into the enchanted wood", domainCreative},
{"The candle shone against the darkness", domainCreative},
{"She lost herself in the pages of the book", domainCreative},
{"He caught the last train before midnight", domainCreative},
{"The garden grew wild after they left", domainCreative},
{"She paid no attention to the gathering storm", domainCreative},
{"He met the stranger at the crossroads", domainCreative},
{"The shadows held their breath", domainCreative},
{"Writing stories about forgotten kingdoms", domainCreative},
{"Drawing maps of imaginary coastlines", domainCreative},
{"Singing ballads under the open sky", domainCreative},
{"Telling tales of heroes and lost causes", domainCreative},
{"Weaving words into tapestries of meaning", domainCreative},
{"She began her tale with a sigh", domainCreative},
{"The old house stood at the edge of the world", domainCreative},
{"He gave the child a carved wooden bird", domainCreative},
{"They brought flowers to the abandoned shrine", domainCreative},
{"The ship left harbour before the dawn", domainCreative},
{"She sold the family ring to pay for passage", domainCreative},
{"He won the contest with an improvised song", domainCreative},
{"The clock struck twelve and the spell was broken", domainCreative},
{"She bent the wire into a tiny crown", domainCreative},
{"They flew kites above the autumn trees", domainCreative},
{"He spent the afternoon by the quiet river", domainCreative},
{"The rain fell softly on the old stone bridge", domainCreative},
{"She tore the map in half and chose the left path", domainCreative},
{"He sat on the hillside watching the sunset", domainCreative},
{"The story begins where the road splits in two", domainCreative},
// --- Ethical (55) ---
// Moral reasoning, prescriptive, policy, fairness
{"We should think about the consequences of our choices", domainEthical},
{"They must hold themselves accountable for the outcome", domainEthical},
{"You should not break a promise once it has been made", domainEthical},
{"We must find a fair solution for all parties", domainEthical},
{"Leaders should stand for what they believe is right", domainEthical},
{"We should keep our commitments to the community", domainEthical},
{"They must take responsibility for their decisions", domainEthical},
{"You should tell the truth even when it is difficult", domainEthical},
{"We ought to bring attention to hidden suffering", domainEthical},
{"They should not leave anyone behind", domainEthical},
{"We must hold power accountable to the people", domainEthical},
{"One should think carefully before making accusations", domainEthical},
{"Leaders must lead by example in matters of integrity", domainEthical},
{"We should find ways to include all voices", domainEthical},
{"They should not cut corners on safety", domainEthical},
{"We must build trust through consistent action", domainEthical},
{"You should give others the benefit of the doubt", domainEthical},
{"They must not put profit above human welfare", domainEthical},
{"We should pay attention to those who are vulnerable", domainEthical},
{"One must meet obligations before seeking rewards", domainEthical},
{"They broke the agreement and lost our trust", domainEthical},
{"The decision cost many people their livelihoods", domainEthical},
{"She spoke out against the policy of exclusion", domainEthical},
{"They chose transparency over self-interest", domainEthical},
{"He stood firm despite the pressure to compromise", domainEthical},
{"The organisation lost credibility after the scandal", domainEthical},
{"She held the board accountable for their failures", domainEthical},
{"They found that the policy caused unintended harm", domainEthical},
{"He brought evidence of wrongdoing to the authorities", domainEthical},
{"The report led to significant reforms", domainEthical},
{"Thinking about fairness in resource distribution", domainEthical},
{"Holding institutions accountable for their promises", domainEthical},
{"Building systems that protect the most vulnerable", domainEthical},
{"Finding the balance between freedom and responsibility", domainEthical},
{"Keeping commitments to future generations", domainEthical},
{"We should not sell access to essential services", domainEthical},
{"They must seek consent before taking action", domainEthical},
{"You should stand with those who cannot stand alone", domainEthical},
{"We must begin by acknowledging past mistakes", domainEthical},
{"Leaders should spend more time listening", domainEthical},
{"We should set clear boundaries on acceptable conduct", domainEthical},
{"One must not hide the truth for personal gain", domainEthical},
{"They should deal fairly with competing interests", domainEthical},
{"We must win trust through transparency and honesty", domainEthical},
{"You should not bend the rules to suit your needs", domainEthical},
{"They ought to hold elections that are free and fair", domainEthical},
{"We should bring diverse perspectives to the table", domainEthical},
{"One must take care not to cause unnecessary harm", domainEthical},
{"They should not shut out dissenting opinions", domainEthical},
{"We must keep the interests of the public in mind", domainEthical},
{"She fought to uphold the rights of the displaced", domainEthical},
{"He withdrew support after the ethical breach", domainEthical},
{"They overcame resistance to pass the reform", domainEthical},
{"The committee forbade the use of deceptive practices", domainEthical},
{"We should not cast blame without evidence", domainEthical},
// --- Casual (55) ---
// Everyday conversation, informal, personal
{"I went to the store yesterday", domainCasual},
{"She made dinner for everyone last night", domainCasual},
{"We took the dog for a walk this morning", domainCasual},
{"He got a new phone last week", domainCasual},
{"They left early to beat the traffic", domainCasual},
{"I found my keys under the sofa", domainCasual},
{"She bought a new jacket for the trip", domainCasual},
{"We met for coffee after work", domainCasual},
{"He cut the grass before it rained", domainCasual},
{"They brought snacks to the party", domainCasual},
{"I sat on the porch and read a book", domainCasual},
{"She paid for lunch at the cafe", domainCasual},
{"We ran into an old friend at the market", domainCasual},
{"He put the groceries away", domainCasual},
{"They spent the weekend at the beach", domainCasual},
{"I told her about the new restaurant", domainCasual},
{"She drove to the airport early", domainCasual},
{"We got lost on the way there", domainCasual},
{"He fell asleep on the couch", domainCasual},
{"They won tickets to the show", domainCasual},
{"I lost my umbrella somewhere", domainCasual},
{"She chose the window seat", domainCasual},
{"We hit the road before dawn", domainCasual},
{"He kept the receipt just in case", domainCasual},
{"They came over for board games", domainCasual},
{"I took a shortcut through the park", domainCasual},
{"She left a message on the machine", domainCasual},
{"We gave the old furniture away", domainCasual},
{"He held the door for the woman behind him", domainCasual},
{"They sent us a postcard from the coast", domainCasual},
{"Going to the park this afternoon", domainCasual},
{"Making plans for the holiday", domainCasual},
{"Getting ready for the weekend trip", domainCasual},
{"Meeting friends at the usual place", domainCasual},
{"Looking for a good place to eat", domainCasual},
{"I think she went home already", domainCasual},
{"He said he would come by later", domainCasual},
{"We should get together sometime", domainCasual},
{"She told me about her new job", domainCasual},
{"They brought the kids to the game", domainCasual},
{"I set the alarm for six in the morning", domainCasual},
{"She sold her old bike at the market", domainCasual},
{"We split the bill at the restaurant", domainCasual},
{"He drew a funny picture on the napkin", domainCasual},
{"They began planning the birthday party", domainCasual},
{"I threw out the old newspapers", domainCasual},
{"She hung the new curtains in the bedroom", domainCasual},
{"We led the way to the hidden trail", domainCasual},
{"He bent down to pick up the coin", domainCasual},
{"They fed the ducks at the pond", domainCasual},
{"I caught the bus just in time", domainCasual},
{"She broke her favourite mug this morning", domainCasual},
{"We built a shelf for the kitchen", domainCasual},
{"He shut the door and turned the light off", domainCasual},
{"They stood in line for an hour", domainCasual},
}
// --- Helpers ---
// corpusByDomain groups the corpus by domain label.
func corpusByDomain() map[string][]int {
groups := make(map[string][]int)
for i, s := range classificationCorpus {
groups[s.Domain] = append(groups[s.Domain], i)
}
return groups
}
// imprintCorpus tokenises and imprints every sentence. Caller reuses the slice.
func imprintCorpus(tok *Tokeniser) []GrammarImprint {
imprints := make([]GrammarImprint, len(classificationCorpus))
for i, s := range classificationCorpus {
imprints[i] = NewImprint(tok.Tokenise(s.Text))
}
return imprints
}
// classifyLeaveOneOut returns the predicted domain for sentence at idx
// by computing average similarity to every other sentence in each domain.
func classifyLeaveOneOut(idx int, imprints []GrammarImprint, groups map[string][]int) string {
bestDomain := ""
bestSim := -1.0
for domain, indices := range groups {
var sum float64
var count int
for _, j := range indices {
if j == idx {
continue
}
sum += imprints[idx].Similar(imprints[j])
count++
}
if count == 0 {
continue
}
avg := sum / float64(count)
if avg > bestSim {
bestSim = avg
bestDomain = domain
}
}
return bestDomain
}
// --- Tests ---
// TestClassification_CorpusSize validates the corpus has enough sentences per domain.
func TestClassification_CorpusSize(t *testing.T) {
groups := corpusByDomain()
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
for _, d := range domains {
if n := len(groups[d]); n < 50 {
t.Errorf("domain %q has %d sentences, want >= 50", d, n)
}
}
if total := len(classificationCorpus); total < 200 {
t.Errorf("corpus has %d sentences, want >= 200", total)
}
}
// TestClassification_DomainSeparation verifies within-domain imprint similarity
// exceeds cross-domain similarity. This is the basic requirement for domain
// classification to work.
func TestClassification_DomainSeparation(t *testing.T) {
setup(t)
tok := NewTokeniser()
imprints := imprintCorpus(tok)
groups := corpusByDomain()
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
for _, d := range domains {
indices := groups[d]
// Within-domain average similarity
var withinSum float64
var withinCount int
for i := 0; i < len(indices); i++ {
for j := i + 1; j < len(indices); j++ {
withinSum += imprints[indices[i]].Similar(imprints[indices[j]])
withinCount++
}
}
withinAvg := withinSum / float64(withinCount)
// Cross-domain average similarity
var crossSum float64
var crossCount int
for _, otherD := range domains {
if otherD == d {
continue
}
for _, i := range indices {
for _, j := range groups[otherD] {
crossSum += imprints[i].Similar(imprints[j])
crossCount++
}
}
}
crossAvg := crossSum / float64(crossCount)
t.Logf("%-10s within=%.4f cross=%.4f gap=%.4f", d, withinAvg, crossAvg, withinAvg-crossAvg)
if withinAvg <= crossAvg {
t.Errorf("domain %q: within-domain similarity (%.4f) should exceed cross-domain (%.4f)",
d, withinAvg, crossAvg)
}
}
}
// TestClassification_LeaveOneOut measures per-domain and overall accuracy
// using leave-one-out nearest-centroid classification.
func TestClassification_LeaveOneOut(t *testing.T) {
setup(t)
tok := NewTokeniser()
imprints := imprintCorpus(tok)
groups := corpusByDomain()
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
// Confusion matrix: actual -> predicted -> count
confusion := make(map[string]map[string]int)
for _, d := range domains {
confusion[d] = make(map[string]int)
}
correct := 0
total := len(classificationCorpus)
for i, s := range classificationCorpus {
predicted := classifyLeaveOneOut(i, imprints, groups)
confusion[s.Domain][predicted]++
if predicted == s.Domain {
correct++
}
}
overallAcc := float64(correct) / float64(total)
t.Logf("Overall accuracy: %d/%d (%.1f%%)", correct, total, overallAcc*100)
// Per-domain accuracy
for _, d := range domains {
domainTotal := len(groups[d])
domainCorrect := confusion[d][d]
acc := float64(domainCorrect) / float64(domainTotal)
t.Logf(" %-10s %d/%d (%.1f%%)", d, domainCorrect, domainTotal, acc*100)
}
// Print confusion matrix
t.Log("\nConfusion matrix (rows=actual, cols=predicted):")
header := fmt.Sprintf(" %-10s", "")
for _, d := range domains {
header += fmt.Sprintf(" %10s", d[:4])
}
t.Log(header)
for _, actual := range domains {
row := fmt.Sprintf(" %-10s", actual[:4])
for _, predicted := range domains {
row += fmt.Sprintf(" %10d", confusion[actual][predicted])
}
t.Log(row)
}
// Soft threshold: grammar-based classification won't be perfect,
// but should beat random chance (25%) meaningfully.
if overallAcc < 0.35 {
t.Errorf("overall accuracy %.1f%% is below 35%% threshold", overallAcc*100)
}
}
// TestClassification_TokenCoverage reports per-domain token recognition rates.
// Domains with low coverage rely more on the 1B model for classification.
func TestClassification_TokenCoverage(t *testing.T) {
setup(t)
tok := NewTokeniser()
groups := corpusByDomain()
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
for _, d := range domains {
var totalTokens, recognisedTokens int
var verbTokens, nounTokens, articleTokens, wordTokens int
for _, idx := range groups[d] {
tokens := tok.Tokenise(classificationCorpus[idx].Text)
for _, token := range tokens {
totalTokens++
switch token.Type {
case TokenVerb:
recognisedTokens++
verbTokens++
case TokenNoun:
recognisedTokens++
nounTokens++
case TokenArticle:
recognisedTokens++
articleTokens++
case TokenWord:
recognisedTokens++
wordTokens++
case TokenPunctuation:
recognisedTokens++
}
}
}
coverage := float64(recognisedTokens) / float64(totalTokens) * 100
t.Logf("%-10s coverage=%.1f%% tokens=%d verbs=%d nouns=%d articles=%d words=%d",
d, coverage, totalTokens, verbTokens, nounTokens, articleTokens, wordTokens)
}
}
// TestClassification_TenseProfile reports per-domain tense distribution.
// Useful for understanding what grammar signals distinguish domains.
func TestClassification_TenseProfile(t *testing.T) {
setup(t)
tok := NewTokeniser()
groups := corpusByDomain()
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
tenses := []string{"base", "past", "gerund"}
for _, d := range domains {
tenseCounts := make(map[string]int)
var totalVerbs int
for _, idx := range groups[d] {
tokens := tok.Tokenise(classificationCorpus[idx].Text)
for _, token := range tokens {
if token.Type == TokenVerb {
tenseCounts[token.VerbInfo.Tense]++
totalVerbs++
}
}
}
parts := fmt.Sprintf("%-10s verbs=%d", d, totalVerbs)
for _, tense := range tenses {
pct := 0.0
if totalVerbs > 0 {
pct = float64(tenseCounts[tense]) / float64(totalVerbs) * 100
}
parts += fmt.Sprintf(" %s=%.0f%%", tense, pct)
}
t.Log(parts)
}
}
// TestClassification_TopVerbs reports the most frequent verbs per domain.
func TestClassification_TopVerbs(t *testing.T) {
setup(t)
tok := NewTokeniser()
groups := corpusByDomain()
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
for _, d := range domains {
verbCounts := make(map[string]int)
for _, idx := range groups[d] {
tokens := tok.Tokenise(classificationCorpus[idx].Text)
for _, token := range tokens {
if token.Type == TokenVerb {
verbCounts[token.VerbInfo.Base]++
}
}
}
// Sort by frequency
type kv struct {
verb string
count int
}
var sorted []kv
for v, c := range verbCounts {
sorted = append(sorted, kv{v, c})
}
sort.Slice(sorted, func(i, j int) bool { return sorted[i].count > sorted[j].count })
top := 8
if len(sorted) < top {
top = len(sorted)
}
verbs := ""
for i := 0; i < top; i++ {
if i > 0 {
verbs += ", "
}
verbs += fmt.Sprintf("%s(%d)", sorted[i].verb, sorted[i].count)
}
t.Logf("%-10s unique=%d top: %s", d, len(verbCounts), verbs)
}
}
// --- Benchmarks ---
func BenchmarkClassification_Tokenise(b *testing.B) {
benchSetup(b)
tok := NewTokeniser()
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, s := range classificationCorpus {
tok.Tokenise(s.Text)
}
}
}
func BenchmarkClassification_ImprintAll(b *testing.B) {
benchSetup(b)
tok := NewTokeniser()
b.ResetTimer()
for i := 0; i < b.N; i++ {
imprintCorpus(tok)
}
}
func BenchmarkClassification_FullPipeline(b *testing.B) {
benchSetup(b)
tok := NewTokeniser()
groups := corpusByDomain()
imprints := imprintCorpus(tok)
b.ResetTimer()
for i := 0; i < b.N; i++ {
for idx := range classificationCorpus {
classifyLeaveOneOut(idx, imprints, groups)
}
}
}