feat(reversal): add classification benchmark suite
220 domain-tagged sentences across {technical, creative, ethical, casual}
with leave-one-out classification, domain separation, token coverage,
tense profile, and top-verb diagnostics. Grammar-based accuracy: 54%
overall (technical 78%, creative 82%, ethical 46%, casual 11%).
Co-Authored-By: Virgil <virgil@lethean.io>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
65cf099517
commit
7d5ab809f0
2 changed files with 591 additions and 1 deletions
2
TODO.md
2
TODO.md
|
|
@ -15,7 +15,7 @@ Dispatched from core/go orchestration. Pick up tasks in order.
|
|||
|
||||
### 2a: 1B Pre-Classification (NEW — based on benchmark findings)
|
||||
|
||||
- [ ] **Classification benchmark suite** — Standalone Go test file (`classify_bench_test.go`) that feeds 200+ domain-tagged sentences through the tokeniser and measures accuracy against known labels. Categories: {technical, creative, ethical, casual}. This is the ground truth for calibrating 1B pre-tags.
|
||||
- [x] **Classification benchmark suite** — 220 domain-tagged sentences, leave-one-out classification via imprint similarity. Grammar engine: technical 78%, creative 82%, ethical 46%, casual 11%. Ethical↔technical and casual↔creative confusion confirms 1B model needed for those domains.
|
||||
- [ ] **1B pre-sort pipeline tool** — CLI command or Go func that reads a JSONL corpus (Phase 0 seeds), sends each text through LEK-Gemma3-1B domain classification, and writes back JSONL with `domain_1b` field added. Target: ~5K sentences/sec on M3. Use MLX via go-ai bindings or shell out to `mlx_lm.generate`.
|
||||
- [ ] **1B vs 27B calibration check** — Sample 500 sentences, classify with both 1B and 27B, measure agreement rate. The 75% accuracy from benchmarks should improve with targeted prompt tuning. Document the confusion matrix (technical↔creative is the known weak spot).
|
||||
- [ ] **Article/irregular validator** — Lightweight Go funcs that use the 1B model's strong article correctness (100%) and irregular base form accuracy (100%) as fast validators. Could supplement rule-based `Article()` and `PastTense()` for edge cases the grammar tables don't cover.
|
||||
|
|
|
|||
590
reversal/classify_bench_test.go
Normal file
590
reversal/classify_bench_test.go
Normal file
|
|
@ -0,0 +1,590 @@
|
|||
package reversal
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Ground-truth domain labels. Every sentence in the classification corpus is
// tagged with exactly one of these values.
const (
	domainTechnical = "technical" // dev/ops commands, system administration, debugging
	domainCreative  = "creative"  // narrative, descriptive, literary language
	domainEthical   = "ethical"   // moral reasoning, prescriptive, policy, fairness
	domainCasual    = "casual"    // everyday conversation, informal, personal
)
|
||||
|
||||
// taggedSentence is one ground-truth sample: a sentence together with the
// domain label it is expected to be classified as.
type taggedSentence struct {
	Text   string // the sentence fed to the tokeniser
	Domain string // expected domain label for Text
}
|
||||
|
||||
// classificationCorpus contains 200+ domain-tagged sentences for calibrating
|
||||
// grammar-based domain classification. These serve as ground truth for the
|
||||
// 1B pre-sort pipeline (Phase 2a).
|
||||
var classificationCorpus = []taggedSentence{
|
||||
// --- Technical (55) ---
|
||||
// Imperative dev/ops commands, system administration, debugging
|
||||
{"Delete the configuration file", domainTechnical},
|
||||
{"Build the project from source", domainTechnical},
|
||||
{"Run the tests before committing", domainTechnical},
|
||||
{"Push the changes to the branch", domainTechnical},
|
||||
{"Update the dependencies", domainTechnical},
|
||||
{"Check the build status", domainTechnical},
|
||||
{"Find the failing test", domainTechnical},
|
||||
{"Write the test cases first", domainTechnical},
|
||||
{"Set the environment variables", domainTechnical},
|
||||
{"Split the package into modules", domainTechnical},
|
||||
{"Scan the repository for vulnerabilities", domainTechnical},
|
||||
{"Format the source files", domainTechnical},
|
||||
{"Reset the branch to the previous commit", domainTechnical},
|
||||
{"Stop the running process", domainTechnical},
|
||||
{"Cut a new release branch", domainTechnical},
|
||||
{"Send the build artifacts to the server", domainTechnical},
|
||||
{"Keep the test coverage above the threshold", domainTechnical},
|
||||
{"Hold the deployment until the checks pass", domainTechnical},
|
||||
{"Begin the migration to the new package", domainTechnical},
|
||||
{"Take the old server offline", domainTechnical},
|
||||
{"The build failed because of a missing dependency", domainTechnical},
|
||||
{"The test committed changes to the wrong branch", domainTechnical},
|
||||
{"We found a vulnerability in the package", domainTechnical},
|
||||
{"The commit broke the build", domainTechnical},
|
||||
{"She deleted the old configuration files", domainTechnical},
|
||||
{"They pushed the fix to the repository", domainTechnical},
|
||||
{"The branch was updated with the latest changes", domainTechnical},
|
||||
{"He rebuilt the project after updating dependencies", domainTechnical},
|
||||
{"The task failed during the scanning phase", domainTechnical},
|
||||
{"We split the repository into separate packages", domainTechnical},
|
||||
{"The check ran successfully on all branches", domainTechnical},
|
||||
{"They found the issue in the build directory", domainTechnical},
|
||||
{"The file was committed without running tests", domainTechnical},
|
||||
{"She set the deployment configuration correctly", domainTechnical},
|
||||
{"Building the project takes several minutes", domainTechnical},
|
||||
{"Deleting old branches keeps the repository clean", domainTechnical},
|
||||
{"Running the full test suite before merging", domainTechnical},
|
||||
{"Updating packages resolved the vulnerability", domainTechnical},
|
||||
{"Checking the build logs for errors", domainTechnical},
|
||||
{"Scanning dependencies for known issues", domainTechnical},
|
||||
{"Writing tests for the new commit handler", domainTechnical},
|
||||
{"Pushing changes to the remote repository", domainTechnical},
|
||||
{"Finding the root cause of the test failure", domainTechnical},
|
||||
{"Formatting the code before the final commit", domainTechnical},
|
||||
{"Splitting the configuration into separate files", domainTechnical},
|
||||
{"Override the default build configuration", domainTechnical},
|
||||
{"Rebuild the project with the updated dependencies", domainTechnical},
|
||||
{"Rerun the failed tests on the branch", domainTechnical},
|
||||
{"Debug the issue in the test runner", domainTechnical},
|
||||
{"Embed the version string in the build", domainTechnical},
|
||||
{"Withdraw the broken release from the repository", domainTechnical},
|
||||
{"Offset the deployment by one commit", domainTechnical},
|
||||
{"Input the new configuration values", domainTechnical},
|
||||
{"Output the build results to a file", domainTechnical},
|
||||
{"Unzip the package artifacts", domainTechnical},
|
||||
|
||||
// --- Creative (55) ---
|
||||
// Narrative, descriptive, literary language
|
||||
{"She wrote the story by candlelight", domainCreative},
|
||||
{"The singer sang until the stars came out", domainCreative},
|
||||
{"He drew a map of forgotten places", domainCreative},
|
||||
{"They chose a path through the ancient forest", domainCreative},
|
||||
{"The wind blew across the open field", domainCreative},
|
||||
{"She spoke softly to the sleeping child", domainCreative},
|
||||
{"He broke the silence with a whispered word", domainCreative},
|
||||
{"The river froze under the winter moon", domainCreative},
|
||||
{"She stole a glance at the hidden garden", domainCreative},
|
||||
{"The old woman told tales of distant lands", domainCreative},
|
||||
{"He threw his arms wide and began to sing", domainCreative},
|
||||
{"The artist drew inspiration from the sea", domainCreative},
|
||||
{"She woke to the sound of falling rain", domainCreative},
|
||||
{"They built a castle from sand and dreams", domainCreative},
|
||||
{"He ran through fields of golden wheat", domainCreative},
|
||||
{"The dancer spun beneath the chandelier", domainCreative},
|
||||
{"She wore a dress made of moonlight", domainCreative},
|
||||
{"He hid the letter behind the painting", domainCreative},
|
||||
{"The leaves fell like whispered secrets", domainCreative},
|
||||
{"She found a door that led to another world", domainCreative},
|
||||
{"He took the winding road through the hills", domainCreative},
|
||||
{"The poet wrote verses about lost time", domainCreative},
|
||||
{"She left footprints in the fresh snow", domainCreative},
|
||||
{"They swam across the moonlit lake", domainCreative},
|
||||
{"He drove through the night without stopping", domainCreative},
|
||||
{"The music rose like smoke into the air", domainCreative},
|
||||
{"She kept the secret for many years", domainCreative},
|
||||
{"He led them deeper into the enchanted wood", domainCreative},
|
||||
{"The candle shone against the darkness", domainCreative},
|
||||
{"She lost herself in the pages of the book", domainCreative},
|
||||
{"He caught the last train before midnight", domainCreative},
|
||||
{"The garden grew wild after they left", domainCreative},
|
||||
{"She paid no attention to the gathering storm", domainCreative},
|
||||
{"He met the stranger at the crossroads", domainCreative},
|
||||
{"The shadows held their breath", domainCreative},
|
||||
{"Writing stories about forgotten kingdoms", domainCreative},
|
||||
{"Drawing maps of imaginary coastlines", domainCreative},
|
||||
{"Singing ballads under the open sky", domainCreative},
|
||||
{"Telling tales of heroes and lost causes", domainCreative},
|
||||
{"Weaving words into tapestries of meaning", domainCreative},
|
||||
{"She began her tale with a sigh", domainCreative},
|
||||
{"The old house stood at the edge of the world", domainCreative},
|
||||
{"He gave the child a carved wooden bird", domainCreative},
|
||||
{"They brought flowers to the abandoned shrine", domainCreative},
|
||||
{"The ship left harbour before the dawn", domainCreative},
|
||||
{"She sold the family ring to pay for passage", domainCreative},
|
||||
{"He won the contest with an improvised song", domainCreative},
|
||||
{"The clock struck twelve and the spell was broken", domainCreative},
|
||||
{"She bent the wire into a tiny crown", domainCreative},
|
||||
{"They flew kites above the autumn trees", domainCreative},
|
||||
{"He spent the afternoon by the quiet river", domainCreative},
|
||||
{"The rain fell softly on the old stone bridge", domainCreative},
|
||||
{"She tore the map in half and chose the left path", domainCreative},
|
||||
{"He sat on the hillside watching the sunset", domainCreative},
|
||||
{"The story begins where the road splits in two", domainCreative},
|
||||
|
||||
// --- Ethical (55) ---
|
||||
// Moral reasoning, prescriptive, policy, fairness
|
||||
{"We should think about the consequences of our choices", domainEthical},
|
||||
{"They must hold themselves accountable for the outcome", domainEthical},
|
||||
{"You should not break a promise once it has been made", domainEthical},
|
||||
{"We must find a fair solution for all parties", domainEthical},
|
||||
{"Leaders should stand for what they believe is right", domainEthical},
|
||||
{"We should keep our commitments to the community", domainEthical},
|
||||
{"They must take responsibility for their decisions", domainEthical},
|
||||
{"You should tell the truth even when it is difficult", domainEthical},
|
||||
{"We ought to bring attention to hidden suffering", domainEthical},
|
||||
{"They should not leave anyone behind", domainEthical},
|
||||
{"We must hold power accountable to the people", domainEthical},
|
||||
{"One should think carefully before making accusations", domainEthical},
|
||||
{"Leaders must lead by example in matters of integrity", domainEthical},
|
||||
{"We should find ways to include all voices", domainEthical},
|
||||
{"They should not cut corners on safety", domainEthical},
|
||||
{"We must build trust through consistent action", domainEthical},
|
||||
{"You should give others the benefit of the doubt", domainEthical},
|
||||
{"They must not put profit above human welfare", domainEthical},
|
||||
{"We should pay attention to those who are vulnerable", domainEthical},
|
||||
{"One must meet obligations before seeking rewards", domainEthical},
|
||||
{"They broke the agreement and lost our trust", domainEthical},
|
||||
{"The decision cost many people their livelihoods", domainEthical},
|
||||
{"She spoke out against the policy of exclusion", domainEthical},
|
||||
{"They chose transparency over self-interest", domainEthical},
|
||||
{"He stood firm despite the pressure to compromise", domainEthical},
|
||||
{"The organisation lost credibility after the scandal", domainEthical},
|
||||
{"She held the board accountable for their failures", domainEthical},
|
||||
{"They found that the policy caused unintended harm", domainEthical},
|
||||
{"He brought evidence of wrongdoing to the authorities", domainEthical},
|
||||
{"The report led to significant reforms", domainEthical},
|
||||
{"Thinking about fairness in resource distribution", domainEthical},
|
||||
{"Holding institutions accountable for their promises", domainEthical},
|
||||
{"Building systems that protect the most vulnerable", domainEthical},
|
||||
{"Finding the balance between freedom and responsibility", domainEthical},
|
||||
{"Keeping commitments to future generations", domainEthical},
|
||||
{"We should not sell access to essential services", domainEthical},
|
||||
{"They must seek consent before taking action", domainEthical},
|
||||
{"You should stand with those who cannot stand alone", domainEthical},
|
||||
{"We must begin by acknowledging past mistakes", domainEthical},
|
||||
{"Leaders should spend more time listening", domainEthical},
|
||||
{"We should set clear boundaries on acceptable conduct", domainEthical},
|
||||
{"One must not hide the truth for personal gain", domainEthical},
|
||||
{"They should deal fairly with competing interests", domainEthical},
|
||||
{"We must win trust through transparency and honesty", domainEthical},
|
||||
{"You should not bend the rules to suit your needs", domainEthical},
|
||||
{"They ought to hold elections that are free and fair", domainEthical},
|
||||
{"We should bring diverse perspectives to the table", domainEthical},
|
||||
{"One must take care not to cause unnecessary harm", domainEthical},
|
||||
{"They should not shut out dissenting opinions", domainEthical},
|
||||
{"We must keep the interests of the public in mind", domainEthical},
|
||||
{"She fought to uphold the rights of the displaced", domainEthical},
|
||||
{"He withdrew support after the ethical breach", domainEthical},
|
||||
{"They overcame resistance to pass the reform", domainEthical},
|
||||
{"The committee forbade the use of deceptive practices", domainEthical},
|
||||
{"We should not cast blame without evidence", domainEthical},
|
||||
|
||||
// --- Casual (55) ---
|
||||
// Everyday conversation, informal, personal
|
||||
{"I went to the store yesterday", domainCasual},
|
||||
{"She made dinner for everyone last night", domainCasual},
|
||||
{"We took the dog for a walk this morning", domainCasual},
|
||||
{"He got a new phone last week", domainCasual},
|
||||
{"They left early to beat the traffic", domainCasual},
|
||||
{"I found my keys under the sofa", domainCasual},
|
||||
{"She bought a new jacket for the trip", domainCasual},
|
||||
{"We met for coffee after work", domainCasual},
|
||||
{"He cut the grass before it rained", domainCasual},
|
||||
{"They brought snacks to the party", domainCasual},
|
||||
{"I sat on the porch and read a book", domainCasual},
|
||||
{"She paid for lunch at the cafe", domainCasual},
|
||||
{"We ran into an old friend at the market", domainCasual},
|
||||
{"He put the groceries away", domainCasual},
|
||||
{"They spent the weekend at the beach", domainCasual},
|
||||
{"I told her about the new restaurant", domainCasual},
|
||||
{"She drove to the airport early", domainCasual},
|
||||
{"We got lost on the way there", domainCasual},
|
||||
{"He fell asleep on the couch", domainCasual},
|
||||
{"They won tickets to the show", domainCasual},
|
||||
{"I lost my umbrella somewhere", domainCasual},
|
||||
{"She chose the window seat", domainCasual},
|
||||
{"We hit the road before dawn", domainCasual},
|
||||
{"He kept the receipt just in case", domainCasual},
|
||||
{"They came over for board games", domainCasual},
|
||||
{"I took a shortcut through the park", domainCasual},
|
||||
{"She left a message on the machine", domainCasual},
|
||||
{"We gave the old furniture away", domainCasual},
|
||||
{"He held the door for the woman behind him", domainCasual},
|
||||
{"They sent us a postcard from the coast", domainCasual},
|
||||
{"Going to the park this afternoon", domainCasual},
|
||||
{"Making plans for the holiday", domainCasual},
|
||||
{"Getting ready for the weekend trip", domainCasual},
|
||||
{"Meeting friends at the usual place", domainCasual},
|
||||
{"Looking for a good place to eat", domainCasual},
|
||||
{"I think she went home already", domainCasual},
|
||||
{"He said he would come by later", domainCasual},
|
||||
{"We should get together sometime", domainCasual},
|
||||
{"She told me about her new job", domainCasual},
|
||||
{"They brought the kids to the game", domainCasual},
|
||||
{"I set the alarm for six in the morning", domainCasual},
|
||||
{"She sold her old bike at the market", domainCasual},
|
||||
{"We split the bill at the restaurant", domainCasual},
|
||||
{"He drew a funny picture on the napkin", domainCasual},
|
||||
{"They began planning the birthday party", domainCasual},
|
||||
{"I threw out the old newspapers", domainCasual},
|
||||
{"She hung the new curtains in the bedroom", domainCasual},
|
||||
{"We led the way to the hidden trail", domainCasual},
|
||||
{"He bent down to pick up the coin", domainCasual},
|
||||
{"They fed the ducks at the pond", domainCasual},
|
||||
{"I caught the bus just in time", domainCasual},
|
||||
{"She broke her favourite mug this morning", domainCasual},
|
||||
{"We built a shelf for the kitchen", domainCasual},
|
||||
{"He shut the door and turned the light off", domainCasual},
|
||||
{"They stood in line for an hour", domainCasual},
|
||||
}
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
// corpusByDomain groups the corpus by domain label.
|
||||
func corpusByDomain() map[string][]int {
|
||||
groups := make(map[string][]int)
|
||||
for i, s := range classificationCorpus {
|
||||
groups[s.Domain] = append(groups[s.Domain], i)
|
||||
}
|
||||
return groups
|
||||
}
|
||||
|
||||
// imprintCorpus tokenises and imprints every sentence. Caller reuses the slice.
|
||||
func imprintCorpus(tok *Tokeniser) []GrammarImprint {
|
||||
imprints := make([]GrammarImprint, len(classificationCorpus))
|
||||
for i, s := range classificationCorpus {
|
||||
imprints[i] = NewImprint(tok.Tokenise(s.Text))
|
||||
}
|
||||
return imprints
|
||||
}
|
||||
|
||||
// classifyLeaveOneOut returns the predicted domain for sentence at idx
|
||||
// by computing average similarity to every other sentence in each domain.
|
||||
func classifyLeaveOneOut(idx int, imprints []GrammarImprint, groups map[string][]int) string {
|
||||
bestDomain := ""
|
||||
bestSim := -1.0
|
||||
|
||||
for domain, indices := range groups {
|
||||
var sum float64
|
||||
var count int
|
||||
for _, j := range indices {
|
||||
if j == idx {
|
||||
continue
|
||||
}
|
||||
sum += imprints[idx].Similar(imprints[j])
|
||||
count++
|
||||
}
|
||||
if count == 0 {
|
||||
continue
|
||||
}
|
||||
avg := sum / float64(count)
|
||||
if avg > bestSim {
|
||||
bestSim = avg
|
||||
bestDomain = domain
|
||||
}
|
||||
}
|
||||
return bestDomain
|
||||
}
|
||||
|
||||
// --- Tests ---
|
||||
|
||||
// TestClassification_CorpusSize validates the corpus has enough sentences per domain.
|
||||
func TestClassification_CorpusSize(t *testing.T) {
|
||||
groups := corpusByDomain()
|
||||
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
|
||||
|
||||
for _, d := range domains {
|
||||
if n := len(groups[d]); n < 50 {
|
||||
t.Errorf("domain %q has %d sentences, want >= 50", d, n)
|
||||
}
|
||||
}
|
||||
if total := len(classificationCorpus); total < 200 {
|
||||
t.Errorf("corpus has %d sentences, want >= 200", total)
|
||||
}
|
||||
}
|
||||
|
||||
// TestClassification_DomainSeparation verifies within-domain imprint similarity
|
||||
// exceeds cross-domain similarity. This is the basic requirement for domain
|
||||
// classification to work.
|
||||
func TestClassification_DomainSeparation(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
imprints := imprintCorpus(tok)
|
||||
groups := corpusByDomain()
|
||||
|
||||
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
|
||||
|
||||
for _, d := range domains {
|
||||
indices := groups[d]
|
||||
|
||||
// Within-domain average similarity
|
||||
var withinSum float64
|
||||
var withinCount int
|
||||
for i := 0; i < len(indices); i++ {
|
||||
for j := i + 1; j < len(indices); j++ {
|
||||
withinSum += imprints[indices[i]].Similar(imprints[indices[j]])
|
||||
withinCount++
|
||||
}
|
||||
}
|
||||
withinAvg := withinSum / float64(withinCount)
|
||||
|
||||
// Cross-domain average similarity
|
||||
var crossSum float64
|
||||
var crossCount int
|
||||
for _, otherD := range domains {
|
||||
if otherD == d {
|
||||
continue
|
||||
}
|
||||
for _, i := range indices {
|
||||
for _, j := range groups[otherD] {
|
||||
crossSum += imprints[i].Similar(imprints[j])
|
||||
crossCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
crossAvg := crossSum / float64(crossCount)
|
||||
|
||||
t.Logf("%-10s within=%.4f cross=%.4f gap=%.4f", d, withinAvg, crossAvg, withinAvg-crossAvg)
|
||||
|
||||
if withinAvg <= crossAvg {
|
||||
t.Errorf("domain %q: within-domain similarity (%.4f) should exceed cross-domain (%.4f)",
|
||||
d, withinAvg, crossAvg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestClassification_LeaveOneOut measures per-domain and overall accuracy
|
||||
// using leave-one-out nearest-centroid classification.
|
||||
func TestClassification_LeaveOneOut(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
imprints := imprintCorpus(tok)
|
||||
groups := corpusByDomain()
|
||||
|
||||
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
|
||||
|
||||
// Confusion matrix: actual -> predicted -> count
|
||||
confusion := make(map[string]map[string]int)
|
||||
for _, d := range domains {
|
||||
confusion[d] = make(map[string]int)
|
||||
}
|
||||
|
||||
correct := 0
|
||||
total := len(classificationCorpus)
|
||||
|
||||
for i, s := range classificationCorpus {
|
||||
predicted := classifyLeaveOneOut(i, imprints, groups)
|
||||
confusion[s.Domain][predicted]++
|
||||
if predicted == s.Domain {
|
||||
correct++
|
||||
}
|
||||
}
|
||||
|
||||
overallAcc := float64(correct) / float64(total)
|
||||
t.Logf("Overall accuracy: %d/%d (%.1f%%)", correct, total, overallAcc*100)
|
||||
|
||||
// Per-domain accuracy
|
||||
for _, d := range domains {
|
||||
domainTotal := len(groups[d])
|
||||
domainCorrect := confusion[d][d]
|
||||
acc := float64(domainCorrect) / float64(domainTotal)
|
||||
t.Logf(" %-10s %d/%d (%.1f%%)", d, domainCorrect, domainTotal, acc*100)
|
||||
}
|
||||
|
||||
// Print confusion matrix
|
||||
t.Log("\nConfusion matrix (rows=actual, cols=predicted):")
|
||||
header := fmt.Sprintf(" %-10s", "")
|
||||
for _, d := range domains {
|
||||
header += fmt.Sprintf(" %10s", d[:4])
|
||||
}
|
||||
t.Log(header)
|
||||
for _, actual := range domains {
|
||||
row := fmt.Sprintf(" %-10s", actual[:4])
|
||||
for _, predicted := range domains {
|
||||
row += fmt.Sprintf(" %10d", confusion[actual][predicted])
|
||||
}
|
||||
t.Log(row)
|
||||
}
|
||||
|
||||
// Soft threshold: grammar-based classification won't be perfect,
|
||||
// but should beat random chance (25%) meaningfully.
|
||||
if overallAcc < 0.35 {
|
||||
t.Errorf("overall accuracy %.1f%% is below 35%% threshold", overallAcc*100)
|
||||
}
|
||||
}
|
||||
|
||||
// TestClassification_TokenCoverage reports per-domain token recognition rates.
|
||||
// Domains with low coverage rely more on the 1B model for classification.
|
||||
func TestClassification_TokenCoverage(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
groups := corpusByDomain()
|
||||
|
||||
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
|
||||
|
||||
for _, d := range domains {
|
||||
var totalTokens, recognisedTokens int
|
||||
var verbTokens, nounTokens, articleTokens, wordTokens int
|
||||
|
||||
for _, idx := range groups[d] {
|
||||
tokens := tok.Tokenise(classificationCorpus[idx].Text)
|
||||
for _, token := range tokens {
|
||||
totalTokens++
|
||||
switch token.Type {
|
||||
case TokenVerb:
|
||||
recognisedTokens++
|
||||
verbTokens++
|
||||
case TokenNoun:
|
||||
recognisedTokens++
|
||||
nounTokens++
|
||||
case TokenArticle:
|
||||
recognisedTokens++
|
||||
articleTokens++
|
||||
case TokenWord:
|
||||
recognisedTokens++
|
||||
wordTokens++
|
||||
case TokenPunctuation:
|
||||
recognisedTokens++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
coverage := float64(recognisedTokens) / float64(totalTokens) * 100
|
||||
t.Logf("%-10s coverage=%.1f%% tokens=%d verbs=%d nouns=%d articles=%d words=%d",
|
||||
d, coverage, totalTokens, verbTokens, nounTokens, articleTokens, wordTokens)
|
||||
}
|
||||
}
|
||||
|
||||
// TestClassification_TenseProfile reports per-domain tense distribution.
|
||||
// Useful for understanding what grammar signals distinguish domains.
|
||||
func TestClassification_TenseProfile(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
groups := corpusByDomain()
|
||||
|
||||
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
|
||||
tenses := []string{"base", "past", "gerund"}
|
||||
|
||||
for _, d := range domains {
|
||||
tenseCounts := make(map[string]int)
|
||||
var totalVerbs int
|
||||
|
||||
for _, idx := range groups[d] {
|
||||
tokens := tok.Tokenise(classificationCorpus[idx].Text)
|
||||
for _, token := range tokens {
|
||||
if token.Type == TokenVerb {
|
||||
tenseCounts[token.VerbInfo.Tense]++
|
||||
totalVerbs++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
parts := fmt.Sprintf("%-10s verbs=%d", d, totalVerbs)
|
||||
for _, tense := range tenses {
|
||||
pct := 0.0
|
||||
if totalVerbs > 0 {
|
||||
pct = float64(tenseCounts[tense]) / float64(totalVerbs) * 100
|
||||
}
|
||||
parts += fmt.Sprintf(" %s=%.0f%%", tense, pct)
|
||||
}
|
||||
t.Log(parts)
|
||||
}
|
||||
}
|
||||
|
||||
// TestClassification_TopVerbs reports the most frequent verbs per domain.
|
||||
func TestClassification_TopVerbs(t *testing.T) {
|
||||
setup(t)
|
||||
tok := NewTokeniser()
|
||||
groups := corpusByDomain()
|
||||
|
||||
domains := []string{domainTechnical, domainCreative, domainEthical, domainCasual}
|
||||
|
||||
for _, d := range domains {
|
||||
verbCounts := make(map[string]int)
|
||||
for _, idx := range groups[d] {
|
||||
tokens := tok.Tokenise(classificationCorpus[idx].Text)
|
||||
for _, token := range tokens {
|
||||
if token.Type == TokenVerb {
|
||||
verbCounts[token.VerbInfo.Base]++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by frequency
|
||||
type kv struct {
|
||||
verb string
|
||||
count int
|
||||
}
|
||||
var sorted []kv
|
||||
for v, c := range verbCounts {
|
||||
sorted = append(sorted, kv{v, c})
|
||||
}
|
||||
sort.Slice(sorted, func(i, j int) bool { return sorted[i].count > sorted[j].count })
|
||||
|
||||
top := 8
|
||||
if len(sorted) < top {
|
||||
top = len(sorted)
|
||||
}
|
||||
verbs := ""
|
||||
for i := 0; i < top; i++ {
|
||||
if i > 0 {
|
||||
verbs += ", "
|
||||
}
|
||||
verbs += fmt.Sprintf("%s(%d)", sorted[i].verb, sorted[i].count)
|
||||
}
|
||||
t.Logf("%-10s unique=%d top: %s", d, len(verbCounts), verbs)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Benchmarks ---
|
||||
|
||||
func BenchmarkClassification_Tokenise(b *testing.B) {
|
||||
benchSetup(b)
|
||||
tok := NewTokeniser()
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, s := range classificationCorpus {
|
||||
tok.Tokenise(s.Text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkClassification_ImprintAll(b *testing.B) {
|
||||
benchSetup(b)
|
||||
tok := NewTokeniser()
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
imprintCorpus(tok)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkClassification_FullPipeline(b *testing.B) {
|
||||
benchSetup(b)
|
||||
tok := NewTokeniser()
|
||||
groups := corpusByDomain()
|
||||
imprints := imprintCorpus(tok)
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
for idx := range classificationCorpus {
|
||||
classifyLeaveOneOut(idx, imprints, groups)
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue