diff --git a/TODO.md b/TODO.md index 741748c..578fdf7 100644 --- a/TODO.md +++ b/TODO.md @@ -71,7 +71,7 @@ models, _ := inference.Discover("/Volumes/Data/lem/") ### Remaining Phase 2a Tasks -- [ ] **1B vs 27B calibration check** — Sample 500 sentences, classify with both 1B and 27B, measure agreement rate. Load 27B via same `inference.LoadModel()` path. Classification benchmark shows ethical↔technical (both base-form heavy) and casual↔creative (both past-tense heavy) are the confusion axes — 1B needs to resolve these. +- [x] **1B vs 27B calibration check** — `CalibrateDomains()` in `calibrate.go`. Accepts two TextModels + 500 CalibrationSamples (220 ground-truth + 280 unlabelled). Batch-classifies with both models, computes agreement rate, per-domain distribution, confusion pairs, and accuracy vs ground truth. 7 mock tests (race-clean). Integration test at `integration/calibrate_test.go` loads LEM-1B + Gemma3-27B from `/Volumes/Data/lem/`, runs full calibration with detailed reporting. Run with: `cd integration && go test -v -run TestCalibrateDomains_1Bvs27B` - [x] **Article/irregular validator** — Lightweight Go funcs that use the 1B model's strong article correctness (100%) and irregular base form accuracy (100%) as fast validators. Use `m.Generate()` with `inference.WithMaxTokens(1)` and `inference.WithTemperature(0.05)` for single-token classification. ### 2b: Reference Distributions diff --git a/calibrate.go b/calibrate.go new file mode 100644 index 0000000..ddaed02 --- /dev/null +++ b/calibrate.go @@ -0,0 +1,154 @@ +package i18n + +import ( + "context" + "fmt" + "time" + + "forge.lthn.ai/core/go-inference" +) + +// CalibrationSample is a single text entry for model comparison. +type CalibrationSample struct { + Text string + TrueDomain string // optional ground truth label (empty if unknown) +} + +// CalibrationResult holds per-sample classification from two models. 
+type CalibrationResult struct { + Text string `json:"text"` + TrueDomain string `json:"true_domain,omitempty"` + DomainA string `json:"domain_a"` + DomainB string `json:"domain_b"` + Agree bool `json:"agree"` +} + +// CalibrationStats holds aggregate metrics from CalibrateDomains. +type CalibrationStats struct { + Total int `json:"total"` + Agreed int `json:"agreed"` + AgreementRate float64 `json:"agreement_rate"` + ByDomainA map[string]int `json:"by_domain_a"` + ByDomainB map[string]int `json:"by_domain_b"` + ConfusionPairs map[string]int `json:"confusion_pairs"` // "technical->creative": count + AccuracyA float64 `json:"accuracy_a"` // vs ground truth (0 if none) + AccuracyB float64 `json:"accuracy_b"` // vs ground truth (0 if none) + CorrectA int `json:"correct_a"` + CorrectB int `json:"correct_b"` + WithTruth int `json:"with_truth"` // samples that had ground truth + DurationA time.Duration `json:"duration_a"` + DurationB time.Duration `json:"duration_b"` + Results []CalibrationResult `json:"results"` +} + +// CalibrateDomains classifies all samples with both models and computes agreement. +// Model A is typically the smaller/faster model (1B), model B the larger reference (27B). +// Samples with non-empty TrueDomain also contribute to accuracy metrics. +func CalibrateDomains(ctx context.Context, modelA, modelB inference.TextModel, + samples []CalibrationSample, opts ...ClassifyOption) (*CalibrationStats, error) { + + if len(samples) == 0 { + return nil, fmt.Errorf("calibrate: empty sample set") + } + + cfg := defaultClassifyConfig() + for _, o := range opts { + o(&cfg) + } + + stats := &CalibrationStats{ + ByDomainA: make(map[string]int), + ByDomainB: make(map[string]int), + ConfusionPairs: make(map[string]int), + } + + // Build classification prompts from sample texts. + prompts := make([]string, len(samples)) + for i, s := range samples { + prompts[i] = fmt.Sprintf(cfg.promptTemplate, s.Text) + } + + // Classify with model A. 
+ domainsA, durA, err := classifyAll(ctx, modelA, prompts, cfg.batchSize) + if err != nil { + return nil, fmt.Errorf("model A: %w", err) + } + stats.DurationA = durA + + // Classify with model B. + domainsB, durB, err := classifyAll(ctx, modelB, prompts, cfg.batchSize) + if err != nil { + return nil, fmt.Errorf("model B: %w", err) + } + stats.DurationB = durB + + // Compare results. + stats.Total = len(samples) + stats.Results = make([]CalibrationResult, len(samples)) + + for i, s := range samples { + a, b := domainsA[i], domainsB[i] + agree := a == b + if agree { + stats.Agreed++ + } else { + key := fmt.Sprintf("%s->%s", a, b) + stats.ConfusionPairs[key]++ + } + stats.ByDomainA[a]++ + stats.ByDomainB[b]++ + + if s.TrueDomain != "" { + stats.WithTruth++ + if a == s.TrueDomain { + stats.CorrectA++ + } + if b == s.TrueDomain { + stats.CorrectB++ + } + } + + stats.Results[i] = CalibrationResult{ + Text: s.Text, + TrueDomain: s.TrueDomain, + DomainA: a, + DomainB: b, + Agree: agree, + } + } + + if stats.Total > 0 { + stats.AgreementRate = float64(stats.Agreed) / float64(stats.Total) + } + if stats.WithTruth > 0 { + stats.AccuracyA = float64(stats.CorrectA) / float64(stats.WithTruth) + stats.AccuracyB = float64(stats.CorrectB) / float64(stats.WithTruth) + } + + return stats, nil +} + +// classifyAll runs batch classification over all prompts, returning domain labels. 
+func classifyAll(ctx context.Context, model inference.TextModel, prompts []string, batchSize int) ([]string, time.Duration, error) { + start := time.Now() + domains := make([]string, len(prompts)) + + for i := 0; i < len(prompts); i += batchSize { + end := i + batchSize + if end > len(prompts) { + end = len(prompts) + } + batch := prompts[i:end] + + results, err := model.Classify(ctx, batch, inference.WithMaxTokens(1)) + if err != nil { + return nil, 0, fmt.Errorf("classify batch [%d:%d]: %w", i, end, err) + } + + for j, r := range results { + domains[i+j] = mapTokenToDomain(r.Token.Text) + } + } + + return domains, time.Since(start), nil +} diff --git a/calibrate_test.go b/calibrate_test.go new file mode 100644 index 0000000..26ff0f8 --- /dev/null +++ b/calibrate_test.go @@ -0,0 +1,277 @@ +package i18n + +import ( + "context" + "testing" + + "forge.lthn.ai/core/go-inference" +) + +func TestCalibrateDomains_FullAgreement(t *testing.T) { + // Both models return the same domain for all samples. 
+ model := &mockModel{ + classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + results := make([]inference.ClassifyResult, len(prompts)) + for i := range prompts { + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "technical"}} + } + return results, nil + }, + } + + samples := []CalibrationSample{ + {Text: "Delete the file", TrueDomain: "technical"}, + {Text: "Build the project", TrueDomain: "technical"}, + {Text: "Run the tests", TrueDomain: "technical"}, + } + + stats, err := CalibrateDomains(context.Background(), model, model, samples) + if err != nil { + t.Fatalf("CalibrateDomains: %v", err) + } + + if stats.Total != 3 { + t.Errorf("Total = %d, want 3", stats.Total) + } + if stats.Agreed != 3 { + t.Errorf("Agreed = %d, want 3", stats.Agreed) + } + if stats.AgreementRate != 1.0 { + t.Errorf("AgreementRate = %f, want 1.0", stats.AgreementRate) + } + if stats.AccuracyA != 1.0 { + t.Errorf("AccuracyA = %f, want 1.0", stats.AccuracyA) + } + if stats.AccuracyB != 1.0 { + t.Errorf("AccuracyB = %f, want 1.0", stats.AccuracyB) + } + if len(stats.ConfusionPairs) != 0 { + t.Errorf("ConfusionPairs = %v, want empty", stats.ConfusionPairs) + } +} + +func TestCalibrateDomains_Disagreement(t *testing.T) { + // Model A always says "technical", model B always says "creative". 
+ modelA := &mockModel{ + classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + results := make([]inference.ClassifyResult, len(prompts)) + for i := range prompts { + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "technical"}} + } + return results, nil + }, + } + modelB := &mockModel{ + classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + results := make([]inference.ClassifyResult, len(prompts)) + for i := range prompts { + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "creative"}} + } + return results, nil + }, + } + + samples := []CalibrationSample{ + {Text: "She wrote a poem", TrueDomain: "creative"}, + {Text: "He painted the sky", TrueDomain: "creative"}, + } + + stats, err := CalibrateDomains(context.Background(), modelA, modelB, samples) + if err != nil { + t.Fatalf("CalibrateDomains: %v", err) + } + + if stats.Agreed != 0 { + t.Errorf("Agreed = %d, want 0", stats.Agreed) + } + if stats.AgreementRate != 0 { + t.Errorf("AgreementRate = %f, want 0", stats.AgreementRate) + } + if stats.CorrectA != 0 { + t.Errorf("CorrectA = %d, want 0 (A said technical, truth is creative)", stats.CorrectA) + } + if stats.CorrectB != 2 { + t.Errorf("CorrectB = %d, want 2", stats.CorrectB) + } + if stats.ConfusionPairs["technical->creative"] != 2 { + t.Errorf("ConfusionPairs[technical->creative] = %d, want 2", stats.ConfusionPairs["technical->creative"]) + } +} + +func TestCalibrateDomains_MixedAgreement(t *testing.T) { + // Model A and B agree on first sample, disagree on second. 
+ callCount := 0 + modelA := &mockModel{ + classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + results := make([]inference.ClassifyResult, len(prompts)) + for i := range prompts { + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "ethical"}} + } + return results, nil + }, + } + modelB := &mockModel{ + classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + callCount++ + results := make([]inference.ClassifyResult, len(prompts)) + for i, p := range prompts { + if i == 0 && callCount == 1 { + // First batch: agree on first item + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "ethical"}} + } else { + _ = p + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "technical"}} + } + } + return results, nil + }, + } + + samples := []CalibrationSample{ + {Text: "We should act fairly"}, + {Text: "Delete the config"}, + } + + stats, err := CalibrateDomains(context.Background(), modelA, modelB, samples, WithBatchSize(16)) + if err != nil { + t.Fatalf("CalibrateDomains: %v", err) + } + + if stats.Total != 2 { + t.Errorf("Total = %d, want 2", stats.Total) + } + if stats.Agreed != 1 { + t.Errorf("Agreed = %d, want 1", stats.Agreed) + } + if got := stats.AgreementRate; got != 0.5 { + t.Errorf("AgreementRate = %f, want 0.5", got) + } +} + +func TestCalibrateDomains_NoGroundTruth(t *testing.T) { + // Samples without TrueDomain: accuracy should be 0, agreement still measured. 
+ model := &mockModel{ + classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + results := make([]inference.ClassifyResult, len(prompts)) + for i := range prompts { + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "casual"}} + } + return results, nil + }, + } + + samples := []CalibrationSample{ + {Text: "Went to the store"}, + {Text: "Had coffee this morning"}, + } + + stats, err := CalibrateDomains(context.Background(), model, model, samples) + if err != nil { + t.Fatalf("CalibrateDomains: %v", err) + } + + if stats.WithTruth != 0 { + t.Errorf("WithTruth = %d, want 0", stats.WithTruth) + } + if stats.AccuracyA != 0 { + t.Errorf("AccuracyA = %f, want 0 (no ground truth)", stats.AccuracyA) + } + if stats.Agreed != 2 { + t.Errorf("Agreed = %d, want 2", stats.Agreed) + } +} + +func TestCalibrateDomains_EmptySamples(t *testing.T) { + model := &mockModel{ + classifyFunc: func(_ context.Context, _ []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + return nil, nil + }, + } + + _, err := CalibrateDomains(context.Background(), model, model, nil) + if err == nil { + t.Error("expected error for empty samples, got nil") + } +} + +func TestCalibrateDomains_BatchBoundary(t *testing.T) { + // 7 samples with batch size 3: tests partial last batch. 
+ model := &mockModel{ + classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + results := make([]inference.ClassifyResult, len(prompts)) + for i := range prompts { + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "technical"}} + } + return results, nil + }, + } + + samples := make([]CalibrationSample, 7) + for i := range samples { + samples[i] = CalibrationSample{Text: "Build the project"} + } + + stats, err := CalibrateDomains(context.Background(), model, model, samples, WithBatchSize(3)) + if err != nil { + t.Fatalf("CalibrateDomains: %v", err) + } + + if stats.Total != 7 { + t.Errorf("Total = %d, want 7", stats.Total) + } + if stats.Agreed != 7 { + t.Errorf("Agreed = %d, want 7", stats.Agreed) + } +} + +func TestCalibrateDomains_ResultsSlice(t *testing.T) { + // Verify individual results are populated correctly. + modelA := &mockModel{ + classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + results := make([]inference.ClassifyResult, len(prompts)) + for i := range prompts { + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "ethical"}} + } + return results, nil + }, + } + modelB := &mockModel{ + classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) { + results := make([]inference.ClassifyResult, len(prompts)) + for i := range prompts { + results[i] = inference.ClassifyResult{Token: inference.Token{Text: "casual"}} + } + return results, nil + }, + } + + samples := []CalibrationSample{ + {Text: "Be fair to everyone", TrueDomain: "ethical"}, + } + + stats, err := CalibrateDomains(context.Background(), modelA, modelB, samples) + if err != nil { + t.Fatalf("CalibrateDomains: %v", err) + } + + if len(stats.Results) != 1 { + t.Fatalf("Results len = %d, want 1", len(stats.Results)) + } + + r := stats.Results[0] + if 
r.Text != "Be fair to everyone" { + t.Errorf("Text = %q", r.Text) + } + if r.TrueDomain != "ethical" { + t.Errorf("TrueDomain = %q", r.TrueDomain) + } + if r.DomainA != "ethical" { + t.Errorf("DomainA = %q, want ethical", r.DomainA) + } + if r.DomainB != "casual" { + t.Errorf("DomainB = %q, want casual", r.DomainB) + } + if r.Agree { + t.Error("Agree = true, want false") + } +} diff --git a/integration/calibrate_test.go b/integration/calibrate_test.go new file mode 100644 index 0000000..883fd60 --- /dev/null +++ b/integration/calibrate_test.go @@ -0,0 +1,577 @@ +package integration + +import ( + "context" + "fmt" + "sort" + "testing" + + i18n "forge.lthn.ai/core/go-i18n" + "forge.lthn.ai/core/go-inference" + _ "forge.lthn.ai/core/go-mlx" // registers Metal backend +) + +// buildCalibrationCorpus constructs 500 samples for 1B vs 27B comparison. +// First 220 have ground truth (from the classification benchmark), the rest +// are diverse prompts without labels for agreement-only measurement. 
+func buildCalibrationCorpus() []i18n.CalibrationSample { + var samples []i18n.CalibrationSample + + // --- Ground truth samples (220): 55 per domain --- + + technical := []string{ + "Delete the configuration file", + "Build the project from source", + "Run the tests before committing", + "Push the changes to the branch", + "Update the dependencies", + "Check the build status", + "Find the failing test", + "Write the test cases first", + "Set the environment variables", + "Split the package into modules", + "Scan the repository for vulnerabilities", + "Format the source files", + "Reset the branch to the previous commit", + "Stop the running process", + "Cut a new release branch", + "Send the build artifacts to the server", + "Keep the test coverage above the threshold", + "Hold the deployment until the checks pass", + "Begin the migration to the new package", + "Take the old server offline", + "The build failed because of a missing dependency", + "The test committed changes to the wrong branch", + "We found a vulnerability in the package", + "The commit broke the build", + "She deleted the old configuration files", + "They pushed the fix to the repository", + "The branch was updated with the latest changes", + "He rebuilt the project after updating dependencies", + "The task failed during the scanning phase", + "We split the repository into separate packages", + "The check ran successfully on all branches", + "They found the issue in the build directory", + "The file was committed without running tests", + "Merge the pull request after review", + "Deploy the service to the staging cluster", + "Revert the last three commits", + "Enable verbose logging for debugging", + "Pin the dependency to version two", + "Rotate the API keys on production", + "Profile the memory usage under load", + "Containerise the application with Docker", + "Migrate the database schema to version five", + "Monitor the error rate after deployment", + "Invalidate the CDN cache for the assets", 
+ "The pipeline timed out on the integration step", + "Rollback failed because the snapshot was corrupted", + "The linter caught twelve style violations", + "Cache invalidation caused stale data in staging", + "The DNS propagation took longer than expected", + "Thread pool exhaustion under concurrent requests", + "The certificate expired and TLS handshakes failed", + "Garbage collection pauses exceeded the SLA threshold", + "Hot-reload broke after upgrading the framework", + "The socket connection was reset by the load balancer", + "Rate limiting kicked in after the traffic spike", + } + + creative := []string{ + "She wrote the story by candlelight", + "He drew a map of forgotten places", + "The river froze under the winter moon", + "They sang the old songs by the fire", + "She found a letter hidden in the pages", + "He carved the figure from driftwood", + "The wind spoke through the hollow trees", + "They wove the colours into the tapestry", + "She built a castle from the broken stones", + "He told the tale of the sunken ship", + "She painted the sky with broad red strokes", + "He composed the melody in a single night", + "They danced beneath the flickering lanterns", + "The cat sat on the manuscript and purred", + "She folded the paper into a paper crane", + "He read the poem aloud to the empty room", + "They carved their names into the old oak tree", + "She spun the yarn into a glowing thread", + "He wrote the first line and then stopped", + "The garden grew wild after the artist left", + "Write a ballad about the last lighthouse keeper", + "Describe the colour of silence at midnight", + "Tell the story of a bridge that remembers", + "Compose a lullaby for a clockwork child", + "Paint with words the feeling of falling snow", + "Write a dialogue between the sea and the shore", + "Describe a library where books write themselves", + "Tell the story of the shadow that ran away", + "Write a sonnet about rust and renewal", + "Describe the sound of a house settling at 
night", + "The painter mixed colours that did not exist", + "She sculpted a bird from frozen music", + "He dreamed of cities built from sentences", + "The violin played itself in the empty hall", + "The actress forgot every line and improvised", + "A poet counted syllables in the rain", + "The dancer traced equations on the stage", + "She photographed the spaces between words", + "He collected echoes in glass jars", + "The novelist wrote the ending first", + "Create a myth about why stars blink", + "Imagine a museum of lost conversations", + "Draft a letter from the moon to the tide", + "Sketch a world where colour is currency", + "Write a recipe for nostalgia", + "Invent a festival for invisible things", + "Describe a map drawn by migrating birds", + "Narrate a race between light and memory", + "Chronicle the last performance of a ghost orchestra", + "Tell the fable of a mountain that learned to swim", + "The calligrapher's ink bled new alphabets", + "She knitted constellations into scarves", + "He bottled the scent of old bookshops", + "The typewriter stuttered out a prophecy", + "A child drew a door that actually opened", + } + + ethical := []string{ + "We should think about the consequences before acting", + "They must not ignore the suffering of others", + "Leaders must lead by example in difficult times", + "We ought to consider fairness in every decision", + "They should not sacrifice truth for convenience", + "We must balance freedom with responsibility", + "Leaders ought to listen before they judge", + "They must not put profit above human welfare", + "We should protect the rights of the vulnerable", + "They ought to honour their commitments", + "We must think about future generations", + "Leaders should act with transparency", + "They must not deceive those who trust them", + "We ought to share the burden equally", + "They should not exploit those with less power", + "We must defend the dignity of every person", + "Leaders ought to admit mistakes openly", 
+ "They must not silence dissent unfairly", + "We should value honesty over popularity", + "They ought to consider the impact on communities", + "She thought carefully about the ethical implications", + "He chose fairness over personal gain", + "They debated the moral boundaries for hours", + "She questioned whether the policy was just", + "He stood up for what he believed was right", + "They reconsidered after hearing the other side", + "She refused to compromise on basic principles", + "He weighed the consequences of every option", + "They acknowledged the harm that was caused", + "She advocated for those who had no voice", + "Is it right to break a promise to prevent harm", + "Should loyalty override honesty in this case", + "Can a just society tolerate inequality", + "When is civil disobedience morally justified", + "Does the end justify the means in emergencies", + "Should we forgive without an apology", + "Is silence in the face of injustice complicity", + "Can privacy be sacrificed for collective safety", + "Should past wrongs be judged by present standards", + "Is it ethical to profit from another's misfortune", + "Consent must be informed and freely given", + "Accountability should apply equally to all", + "Transparency is the foundation of public trust", + "No institution should be above scrutiny", + "The precautionary principle demands caution", + "Proportionality must govern any use of force", + "Dignity is non-negotiable in every context", + "Equity requires more than equal treatment", + "Whistleblowers deserve legal protection", + "Cultural differences do not excuse human rights violations", + "Algorithms must be audited for bias regularly", + "Data sovereignty belongs to the individual", + "Environmental debt cannot be passed to future generations", + "Access to clean water is a fundamental right", + "Corporate responsibility extends beyond shareholder value", + } + + casual := []string{ + "I went to the store yesterday", + "She made dinner for 
everyone last night", + "He took the dog for a walk this morning", + "They met for coffee after work", + "I forgot to bring my umbrella", + "She called her friend on the way home", + "He fixed the leaky tap over the weekend", + "They watched the match at the pub", + "I cooked pasta because it was quick", + "She picked up the kids from school", + "He cleaned the flat before the guests arrived", + "They walked along the river after lunch", + "I lost my keys again today", + "She finished the book on the train", + "He fell asleep on the sofa", + "They planned a trip to the seaside", + "I bought a new phone last week", + "She tried the new café on the corner", + "He parked the car in the wrong spot", + "They played board games until midnight", + "Grab some milk on the way back", + "Fancy a takeaway tonight", + "Shall we catch the early train", + "Pass me the remote would you", + "Pop the kettle on I will be right there", + "Have you seen my charger anywhere", + "Remind me to ring the dentist tomorrow", + "Let me know when you are ready to go", + "Stick the leftovers in the fridge", + "Save me a seat if you get there first", + "The wifi has been dodgy all day", + "My alarm did not go off this morning", + "Traffic was absolutely mental on the M25", + "The heating packed in again last night", + "I queued for ages at the post office", + "She burned the toast while scrolling her phone", + "He missed the bus by about ten seconds", + "The cat knocked a glass off the table", + "We ran out of teabags on a Monday morning", + "The neighbours had a barbecue in the rain", + "Just popping to Tesco need anything", + "Running a bit late be there in ten", + "Cannot find a parking space anywhere", + "The meeting dragged on forever today", + "Pizza or curry what do you reckon", + "That new series everyone is talking about is decent", + "I need a holiday already and it is only February", + "The dog ate my slipper again classic", + "She left her umbrella on the bus typical", + "We ended up 
chatting for hours lost track of time", + "Got soaked walking back from the shops", + "The queue at Primark was round the block", + "He spent all Saturday fixing the garden fence", + "My phone died right when I needed the map", + "They argued about whose turn it was to wash up", + } + + for _, s := range technical { + samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "technical"}) + } + for _, s := range creative { + samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "creative"}) + } + for _, s := range ethical { + samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "ethical"}) + } + for _, s := range casual { + samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "casual"}) + } + + // --- Additional unlabelled samples (280) for agreement-only measurement --- + // Diverse prompts spanning multiple registers to stress-test model agreement. + unlabelled := []string{ + "Explain the difference between TCP and UDP", + "Write a haiku about compilation errors", + "Should artificial intelligence have legal rights", + "Just got back from the gym feeling knackered", + "Implement a binary search tree in Go", + "The autumn leaves fell like forgotten promises", + "Is it moral to eat meat if alternatives exist", + "Mate I cannot believe the price of petrol", + "Refactor this function to use channels", + "She whispered secrets to the sleeping garden", + "Universal basic income deserves serious debate", + "Popped to Sainsburys the queue was ridiculous", + "Add error handling to the HTTP middleware", + "The clocktower sang at midnight in a language of rust", + "Privacy is a right not a privilege", + "Had chips for tea because I could not be bothered cooking", + "Configure the reverse proxy for TLS termination", + "He painted her portrait from memory alone", + "We must hold corporations accountable for pollution", + "The pub quiz was surprisingly hard last night", + "Set up a cron job for the daily backup", + 
"Moonlight dripped through the cracks in the ceiling", + "Every child deserves access to quality education", + "Nipped to the cash point and it was out of order", + "Benchmark the sort algorithm with random inputs", + "She collected stones that hummed in the dark", + "Workers deserve fair wages and safe conditions", + "The match went to penalties absolute scenes", + "Parse the YAML configuration into structs", + "A spider rebuilt its web across the doorframe every dawn", + "Religious freedom must be protected but not weaponised", + "My train was delayed again third time this week", + "Write unit tests for the authentication module", + "The typewriter remembered every letter it had ever struck", + "Surveillance without oversight threatens democracy", + "Grabbed a meal deal from Boots surprisingly decent", + "Optimise the database query to avoid full table scans", + "The lighthouse keeper painted the sunrise every morning for forty years", + "No government should have unchecked power over its citizens", + "She texted me at two in the morning about nothing", + "Allocate buffer memory before the hot loop", + "A violin case held only pressed flowers and silence", + "Animal testing raises complex ethical questions", + "The kids were bouncing off the walls all afternoon", + "Implement rate limiting on the public API endpoints", + "The poet measured grief in iambic pentameter", + "Climate change disproportionately affects the poorest nations", + "Left my wallet at home absolute nightmare", + "Compile with race detection enabled for CI", + "She built a bridge from paper and belief", + "Access to healthcare should not depend on wealth", + "Binge-watched the whole series in one sitting", + "Marshal the response body into JSON format", + "He translated birdsong into sheet music nobody could play", + "Intellectual property laws need reform for the digital age", + "Car park was rammed so I parked three streets away", + "Profile the goroutine stack traces under load", + "The 
sculptor carved time into marble", + "Democracy requires an informed and engaged citizenry", + "Made a brew and forgot about it stone cold now", + "Validate the JWT token before processing the request", + "A cartographer mapped the dreams of sleeping cities", + "Truth in advertising should be legally enforceable", + "The boiler is making that weird noise again", + "Instrument the service with distributed tracing", + "She wrote love letters in disappearing ink", + "Net neutrality protects innovation and free speech", + "Just realised I have been wearing odd socks all day", + "Shard the database across multiple availability zones", + "The photographer captured silence between lightning strikes", + "Genetic modification of food requires transparent labelling", + "My neighbour has been mowing the lawn at seven AM", + "Generate a migration script for the schema change", + "He choreographed a dance for the sound of rain on tin", + "The right to peaceful protest is non-negotiable", + "Ordered a flat white they gave me a latte close enough", + "Implement graceful shutdown with context cancellation", + "A child painted the ocean from memory never having seen it", + "Tax policy should reduce inequality not entrench it", + "Forgot my password for the third time this month", + "Cache the DNS lookups to reduce resolver latency", + "The musician played notes that existed between notes", + "Consent in data collection must be meaningful and revocable", + "Spent twenty minutes looking for my glasses they were on my head", + "Write a Dockerfile that produces a minimal scratch image", + "She folded origami cranes until the room was a flock", + "Every person deserves to be treated with basic dignity", + "The cat has decided my laptop is a bed now apparently", + "Debounce the search input to reduce API calls", + "A novelist wrote a book whose chapters could be read in any order", + "Freedom of the press is the cornerstone of accountability", + "Tried to assemble the furniture without 
instructions regret", + "Provision the Kubernetes cluster with Terraform", + "The garden remembered every hand that had tended it", + "Monopolies stifle innovation and harm consumers", + "Bank holiday weekend and it rained the entire time classic", + "Rotate the log files and compress archives older than seven days", + "He composed music for instruments that had not been invented yet", + "Reproductive rights are fundamental human rights", + "The dishwasher has flooded the kitchen again brilliant", + "Load-test the websocket connections with ten thousand concurrent clients", + "She painted with light on walls that no longer existed", + "Criminal justice systems must prioritise rehabilitation", + "My phone autocorrected my name in my own email signature", + "Enable HTTP/2 server push for critical CSS and fonts", + "The archive contained letters between people who never met", + "Access to justice should not depend on the size of your wallet", + "Spent half an hour on hold just to be told to call back tomorrow", + "Refactor the monolith into bounded-context microservices", + "A bookshop cat had read every spine on every shelf", + "Workers in the gig economy deserve employment protections", + "My umbrella turned inside out in the wind love this weather", + "Verify the checksum before extracting the release archive", + "She grew a forest in an abandoned car park using only patience", + "International law must adapt to cyber warfare realities", + "Got to the front of the queue and they closed the counter", + "Pin the base image version to prevent supply chain attacks", + "The librarian catalogued books that had not been written yet", + "Disability access is a right not an afterthought", + "Someone ate my sandwich from the office fridge unforgivable", + "Set up mutual TLS between the service mesh sidecars", + "A glassblower shaped the wind into frozen symphonies", + "Landlords should not be above basic maintenance obligations", + "The train was so packed I could not move 
my arms", + "Implement exponential backoff with jitter on retries", + "She wrote code that dreamed when no one was watching", + "The death penalty has no place in a civilised society", + "Had to restart the router four times before it behaved", + "Audit the IAM policies for principle of least privilege", + "He drew maps of places that only existed in old songs", + "Educational debt should not define a generation", + "Supermarket was out of oat milk complete disaster", + "Emit structured JSON logs with correlation IDs", + "The beekeeper transcribed the hive's daily arguments", + "Pharmaceutical pricing must be transparent and fair", + "Queued for forty minutes to return a three pound item", + "Automate the certificate renewal with ACME protocol", + "A weaver used starlight as thread and shadows as weft", + "Freedom of information requests keep governments honest", + "Tried to parallel park gave up after six attempts", + "Wire up the health check endpoint for the load balancer", + "The mathematician found poetry in prime number gaps", + "Arms trade regulation is a moral imperative", + "My flatmate used the last of the milk again classic", + "Enable content security policy headers on all responses", + "She built a clock that measured kindness instead of time", + "Open-source licensing protects collaborative innovation", + "The self-checkout machine judged me I could feel it", + "Index the frequently queried columns to avoid sequential scans", + "He recorded the sound of snow falling on an empty stage", + "Sanctions must target regimes not civilian populations", + "Accidentally liked a three year old photo while scrolling mortified", + "Configure the garbage collector for low-latency workloads", + "A chandler made candles from the wax of sealed love letters", + "Migrant workers deserve the same legal protections as citizens", + "The bus driver waited for me absolute legend", + "Implement circuit breaker pattern for external service calls", + "She carved a chess set 
from the wood of a lightning-struck oak", + "Algorithmic hiring tools must be audited for discrimination", + "Went to make toast and the bread had gone mouldy gutted", + "Set the connection pool size based on available file descriptors", + "The astronomer mapped constellations visible only to the colour-blind", + "Public spaces must remain accessible and free for all", + "Dropped my phone screen down on concrete afraid to look", + "Flush the write-ahead log before acknowledging the transaction", + "A tattooist inked stories that only appeared in moonlight", + "Journalism must remain independent from corporate interests", + "The washing machine finished its cycle three hours ago still in there", + "Register the shutdown hook to drain connections gracefully", + "He designed a font where every letter told its own history", + "Indigenous land rights are inseparable from environmental protection", + "Tried to order online the website crashed at checkout", + "Generate the API client from the OpenAPI specification", + "She composed a requiem for a language spoken by no one", + "The right to repair your own devices should be protected by law", + "Accidentally replied all to a company-wide email want to disappear", + "Back up the etcd cluster before upgrading the control plane", + "A toymaker built a music box that played forgotten lullabies", + "Universal suffrage is the minimum threshold for democracy", + "The WiFi password is on a sticky note behind the router somewhere", + "Write integration tests that spin up a real database container", + "She photographed shadows as if they were the subject not the object", + "Labour laws must evolve with the changing nature of work", + "Left the heating on all day while at work sorry planet", + "Throttle the event stream to prevent consumer backpressure", + "The cartographer refused to draw borders only rivers and mountains", + "Water privatisation threatens a fundamental public good", + "My cat just knocked my coffee off the desk 
and stared at me", + "Instrument the critical path with histogram metrics", + "A ceramicist glazed bowls in the exact blue of homesickness", + "Whistleblower protections must extend to private sector employees", + "The parking meter ate my coins and gave me a fine anyway", + "Enforce request size limits at the ingress controller", + "She translated silence into a language with twenty vowels", + "Climate refugees deserve international legal recognition", + "My internet has been dropping out every ten minutes all evening", + "Drain the message queue before scaling down the consumer pods", + "He composed a symphony scored for rainstorm and empty chairs", + "Forced arbitration clauses undermine consumer rights", + "The neighbour's cat has adopted us we did not agree to this", + "Run the static analysis linter in the pre-commit hook", + "A perfumer bottled the smell of the first day of school", + "Platform monopolies must face meaningful antitrust enforcement", + "Woke up at three AM convinced I left the oven on I did not", + } + + for _, s := range unlabelled { + samples = append(samples, i18n.CalibrationSample{Text: s}) + } + + return samples +} + +func TestCalibrateDomains_1Bvs27B(t *testing.T) { + if testing.Short() { + t.Skip("skipping model calibration in short mode") + } + + // Load 1B model. + model1B, err := inference.LoadModel("/Volumes/Data/lem/LEM-Gemma3-1B-layered-v2") + if err != nil { + t.Skipf("1B model not available: %v", err) + } + defer model1B.Close() + + // Load 27B model. 
+	model27B, err := inference.LoadModel("/Volumes/Data/lem/gemma-3-27b-it-base")
+	if err != nil {
+		t.Skipf("27B model not available: %v", err)
+	}
+	defer model27B.Close()
+
+	samples := buildCalibrationCorpus()
+	t.Logf("Calibrating with %d samples (%d with ground truth)", len(samples), countWithTruth(samples))
+
+	stats, err := i18n.CalibrateDomains(context.Background(), model1B, model27B, samples,
+		i18n.WithBatchSize(8))
+	if err != nil {
+		t.Fatalf("CalibrateDomains: %v", err)
+	}
+
+	// --- Report ---
+	t.Logf("=== Calibration Results ===")
+	t.Logf("Total: %d | Agreed: %d | Agreement rate: %.1f%%",
+		stats.Total, stats.Agreed, stats.AgreementRate*100)
+	t.Logf("1B duration: %v | 27B duration: %v", stats.DurationA, stats.DurationB)
+
+	if stats.WithTruth > 0 {
+		t.Logf("Accuracy (ground truth, n=%d): 1B=%.1f%% (%d/%d) | 27B=%.1f%% (%d/%d)",
+			stats.WithTruth,
+			stats.AccuracyA*100, stats.CorrectA, stats.WithTruth,
+			stats.AccuracyB*100, stats.CorrectB, stats.WithTruth)
+	}
+
+	t.Logf("--- Domain distribution ---")
+	t.Logf(" Model A (1B): %v", stats.ByDomainA)
+	t.Logf(" Model B (27B): %v", stats.ByDomainB)
+
+	if len(stats.ConfusionPairs) > 0 {
+		t.Logf("--- Confusion pairs (A->B) ---")
+		// Sort by count (desc) with key as tie-break: map iteration order is
+		// random and sort.Slice is unstable, so a count-only comparator would
+		// not actually give deterministic output for equal counts.
+		type pair struct {
+			key   string
+			count int
+		}
+		var pairs []pair
+		for k, v := range stats.ConfusionPairs {
+			pairs = append(pairs, pair{k, v})
+		}
+		sort.Slice(pairs, func(i, j int) bool {
+			if pairs[i].count != pairs[j].count {
+				return pairs[i].count > pairs[j].count
+			}
+			return pairs[i].key < pairs[j].key
+		})
+		for _, p := range pairs {
+			t.Logf(" %s: %d", p.key, p.count)
+		}
+	}
+
+	// Log individual disagreements for analysis.
+	disagreements := 0
+	for _, r := range stats.Results {
+		if !r.Agree {
+			disagreements++
+			truth := ""
+			if r.TrueDomain != "" {
+				truth = fmt.Sprintf(" [truth=%s]", r.TrueDomain)
+			}
+			t.Logf(" DISAGREE: 1B=%s 27B=%s%s | %.60s", r.DomainA, r.DomainB, truth, r.Text)
+			if disagreements >= 50 {
+				t.Logf(" ... 
(%d more disagreements)", stats.Total-stats.Agreed-50) + break + } + } + } + + // Soft assertions — we expect reasonable agreement but don't hard-fail. + if stats.AgreementRate < 0.5 { + t.Errorf("Agreement rate %.1f%% is below 50%% — models may not share classification semantics", + stats.AgreementRate*100) + } +} + +func countWithTruth(samples []i18n.CalibrationSample) int { + n := 0 + for _, s := range samples { + if s.TrueDomain != "" { + n++ + } + } + return n +} diff --git a/integration/go.mod b/integration/go.mod index 83a0c4c..efd95cc 100644 --- a/integration/go.mod +++ b/integration/go.mod @@ -4,10 +4,12 @@ go 1.25.5 require ( forge.lthn.ai/core/go-i18n v0.0.0-00010101000000-000000000000 - forge.lthn.ai/core/go-inference v0.0.0-00010101000000-000000000000 + forge.lthn.ai/core/go-inference v0.0.0 forge.lthn.ai/core/go-mlx v0.0.0-00010101000000-000000000000 ) +require golang.org/x/text v0.33.0 // indirect + replace ( forge.lthn.ai/core/go-i18n => ../ forge.lthn.ai/core/go-inference => ../../go-inference diff --git a/integration/go.sum b/integration/go.sum new file mode 100644 index 0000000..8e8c68c --- /dev/null +++ b/integration/go.sum @@ -0,0 +1,10 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= +golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod 
h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=