feat(calibrate): 1B vs 27B domain calibration tool

CalibrateDomains() accepts two inference.TextModel instances and a corpus of CalibrationSamples, classifies all samples with both models, and computes agreement rate, per-domain distribution, confusion pairs, and accuracy vs ground truth.

- calibrate.go: CalibrateDomains + classifyAll batch helper
- calibrate_test.go: 7 mock tests (agreement, disagreement, mixed, no ground truth, empty, batch boundary, results slice)
- integration/calibrate_test.go: 500-sample corpus (220 ground-truth + 280 unlabelled) for real 1B vs 27B model comparison
- TODO.md: Phase 2a calibration task marked complete

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-20 13:51:11 +00:00 · 2026-02-20 13:51:11 +00:00 · 3b7ef9d26a
commit 3b7ef9d26a
parent 2e586aedc5
6 changed files with 1022 additions and 2 deletions
--- a/TODO.md
+++ b/TODO.md
@ -71,7 +71,7 @@ models, _ := inference.Discover("/Volumes/Data/lem/")

 ### Remaining Phase 2a Tasks

- [ ] **1B vs 27B calibration check** — Sample 500 sentences, classify with both 1B and 27B, measure agreement rate. Load 27B via same `inference.LoadModel()` path. Classification benchmark shows ethical↔technical (both base-form heavy) and casual↔creative (both past-tense heavy) are the confusion axes — 1B needs to resolve these.
+- [x] **1B vs 27B calibration check** — `CalibrateDomains()` in `calibrate.go`. Accepts two TextModels + 500 CalibrationSamples (220 ground-truth + 280 unlabelled). Batch-classifies with both models, computes agreement rate, per-domain distribution, confusion pairs, and accuracy vs ground truth. 7 mock tests (race-clean). Integration test at `integration/calibrate_test.go` loads LEM-1B + Gemma3-27B from `/Volumes/Data/lem/`, runs full calibration with detailed reporting. Run with: `cd integration && go test -v -run TestCalibrateDomains_1Bvs27B`
 - [x] **Article/irregular validator** — Lightweight Go funcs that use the 1B model's strong article correctness (100%) and irregular base form accuracy (100%) as fast validators. Use `m.Generate()` with `inference.WithMaxTokens(1)` and `inference.WithTemperature(0.05)` for single-token classification.

 ### 2b: Reference Distributions
--- a/calibrate.go
+++ b/calibrate.go
@ -0,0 +1,154 @@
+package i18n
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"forge.lthn.ai/core/go-inference"
+)
+
+// CalibrationSample is a single text entry for model comparison, optionally
+// paired with a known-correct domain label. CalibrateDomains formats Text into
+// the classification prompt and, when TrueDomain is non-empty, scores both
+// models' labels against it.
+type CalibrationSample struct {
+	Text       string
+	TrueDomain string // optional ground truth label (empty if unknown)
+}
+
+// CalibrationResult holds per-sample classification from two models.
+// Text and TrueDomain are copied from the input sample (TrueDomain is omitted
+// from JSON when empty). DomainA and DomainB are the domain labels derived from
+// model A's and model B's output, and Agree reports whether those two labels
+// are equal.
+type CalibrationResult struct {
+	Text       string `json:"text"`
+	TrueDomain string `json:"true_domain,omitempty"`
+	DomainA    string `json:"domain_a"`
+	DomainB    string `json:"domain_b"`
+	Agree      bool   `json:"agree"`
+}
+
+// CalibrationStats holds aggregate metrics from CalibrateDomains.
+//
+//   - Total is the number of samples; Agreed counts those where both models
+//     produced the same label, and AgreementRate is Agreed/Total.
+//   - ByDomainA and ByDomainB count how many samples each model assigned to
+//     each label.
+//   - ConfusionPairs counts disagreements only, keyed "<labelA>-><labelB>".
+//   - WithTruth counts samples that carried a ground-truth label; CorrectA and
+//     CorrectB count matches against it, and AccuracyA/AccuracyB are
+//     Correct/WithTruth (left at 0 when WithTruth is 0).
+//   - DurationA and DurationB are the elapsed time of each model's full
+//     classification pass. As time.Duration values they marshal to JSON as
+//     integer nanoseconds.
+//   - Results holds one entry per sample, in input order.
+type CalibrationStats struct {
+	Total          int                 `json:"total"`
+	Agreed         int                 `json:"agreed"`
+	AgreementRate  float64             `json:"agreement_rate"`
+	ByDomainA      map[string]int      `json:"by_domain_a"`
+	ByDomainB      map[string]int      `json:"by_domain_b"`
+	ConfusionPairs map[string]int      `json:"confusion_pairs"` // "technical->creative": count
+	AccuracyA      float64             `json:"accuracy_a"`      // vs ground truth (0 if none)
+	AccuracyB      float64             `json:"accuracy_b"`      // vs ground truth (0 if none)
+	CorrectA       int                 `json:"correct_a"`
+	CorrectB       int                 `json:"correct_b"`
+	WithTruth      int                 `json:"with_truth"` // samples that had ground truth
+	DurationA      time.Duration       `json:"duration_a"`
+	DurationB      time.Duration       `json:"duration_b"`
+	Results        []CalibrationResult `json:"results"`
+}
+
+// CalibrateDomains runs every sample through both models and reports how often
+// their domain labels match. Model A is typically the smaller/faster model
+// (1B) and model B the larger reference (27B).
+//
+// Each sample's Text is formatted into the configured prompt template and the
+// same prompts are sent to both models in batches of the configured size.
+// Samples whose TrueDomain is non-empty additionally feed the accuracy
+// counters. An empty sample set is rejected with an error, and a failure from
+// either model is returned wrapped with a "model A" / "model B" prefix.
+func CalibrateDomains(ctx context.Context, modelA, modelB inference.TextModel,
+	samples []CalibrationSample, opts ...ClassifyOption) (*CalibrationStats, error) {
+
+	total := len(samples)
+	if total == 0 {
+		return nil, fmt.Errorf("calibrate: empty sample set")
+	}
+
+	cfg := defaultClassifyConfig()
+	for _, apply := range opts {
+		apply(&cfg)
+	}
+
+	// One prompt per sample, reused verbatim for both models.
+	prompts := make([]string, 0, total)
+	for _, sample := range samples {
+		prompts = append(prompts, fmt.Sprintf(cfg.promptTemplate, sample.Text))
+	}
+
+	labelsA, elapsedA, err := classifyAll(ctx, modelA, prompts, cfg.batchSize)
+	if err != nil {
+		return nil, fmt.Errorf("model A: %w", err)
+	}
+
+	labelsB, elapsedB, err := classifyAll(ctx, modelB, prompts, cfg.batchSize)
+	if err != nil {
+		return nil, fmt.Errorf("model B: %w", err)
+	}
+
+	stats := &CalibrationStats{
+		Total:          total,
+		ByDomainA:      make(map[string]int),
+		ByDomainB:      make(map[string]int),
+		ConfusionPairs: make(map[string]int),
+		DurationA:      elapsedA,
+		DurationB:      elapsedB,
+		Results:        make([]CalibrationResult, 0, total),
+	}
+
+	for idx, sample := range samples {
+		labelA, labelB := labelsA[idx], labelsB[idx]
+		same := labelA == labelB
+
+		stats.ByDomainA[labelA]++
+		stats.ByDomainB[labelB]++
+		if same {
+			stats.Agreed++
+		} else {
+			// Only disagreements are recorded as confusion pairs.
+			stats.ConfusionPairs[labelA+"->"+labelB]++
+		}
+
+		if truth := sample.TrueDomain; truth != "" {
+			stats.WithTruth++
+			if labelA == truth {
+				stats.CorrectA++
+			}
+			if labelB == truth {
+				stats.CorrectB++
+			}
+		}
+
+		stats.Results = append(stats.Results, CalibrationResult{
+			Text:       sample.Text,
+			TrueDomain: sample.TrueDomain,
+			DomainA:    labelA,
+			DomainB:    labelB,
+			Agree:      same,
+		})
+	}
+
+	// total is known to be non-zero here, so the division is safe.
+	stats.AgreementRate = float64(stats.Agreed) / float64(total)
+	if labelled := stats.WithTruth; labelled > 0 {
+		stats.AccuracyA = float64(stats.CorrectA) / float64(labelled)
+		stats.AccuracyB = float64(stats.CorrectB) / float64(labelled)
+	}
+
+	return stats, nil
+}
+
+// classifyAll runs batch classification over all prompts, returning one domain
+// label per prompt (in prompt order) and the elapsed time for the whole pass.
+//
+// Prompts are sent to model.Classify in consecutive slices of at most
+// batchSize, each limited to a single generated token; the token text is then
+// mapped to a domain label. An error is returned if batchSize is not positive,
+// if any batch fails, or if the model returns a different number of results
+// than prompts in the batch. On error the labels and duration are zero values.
+func classifyAll(ctx context.Context, model inference.TextModel, prompts []string, batchSize int) ([]string, time.Duration, error) {
+	// A zero batch size would never advance the loop and a negative one would
+	// produce an inverted slice range, so reject both up front.
+	if batchSize <= 0 {
+		return nil, 0, fmt.Errorf("classify: batch size must be positive, got %d", batchSize)
+	}
+
+	start := time.Now()
+	domains := make([]string, len(prompts))
+
+	for i := 0; i < len(prompts); i += batchSize {
+		end := i + batchSize
+		if end > len(prompts) {
+			end = len(prompts)
+		}
+		batch := prompts[i:end]
+
+		results, err := model.Classify(ctx, batch, inference.WithMaxTokens(1))
+		if err != nil {
+			return nil, 0, fmt.Errorf("classify batch [%d:%d]: %w", i, end, err)
+		}
+		// Results are matched to prompts by position, so a count mismatch
+		// would either leave labels empty or write outside this batch.
+		if len(results) != len(batch) {
+			return nil, 0, fmt.Errorf("classify batch [%d:%d]: got %d results for %d prompts",
+				i, end, len(results), len(batch))
+		}
+
+		for j, r := range results {
+			domains[i+j] = mapTokenToDomain(r.Token.Text)
+		}
+	}
+
+	return domains, time.Since(start), nil
+}
--- a/calibrate_test.go
+++ b/calibrate_test.go
@ -0,0 +1,277 @@
+package i18n
+
+import (
+	"context"
+	"testing"
+
+	"forge.lthn.ai/core/go-inference"
+)
+
+// TestCalibrateDomains_FullAgreement passes the same mock as both models, so
+// every sample agrees and matches its ground-truth label.
+func TestCalibrateDomains_FullAgreement(t *testing.T) {
+	alwaysTechnical := func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
+		out := make([]inference.ClassifyResult, 0, len(prompts))
+		for range prompts {
+			out = append(out, inference.ClassifyResult{Token: inference.Token{Text: "technical"}})
+		}
+		return out, nil
+	}
+	model := &mockModel{classifyFunc: alwaysTechnical}
+
+	var samples []CalibrationSample
+	for _, text := range []string{"Delete the file", "Build the project", "Run the tests"} {
+		samples = append(samples, CalibrationSample{Text: text, TrueDomain: "technical"})
+	}
+
+	stats, err := CalibrateDomains(context.Background(), model, model, samples)
+	if err != nil {
+		t.Fatalf("CalibrateDomains: %v", err)
+	}
+
+	if got := stats.Total; got != 3 {
+		t.Errorf("Total = %d, want 3", got)
+	}
+	if got := stats.Agreed; got != 3 {
+		t.Errorf("Agreed = %d, want 3", got)
+	}
+	if got := stats.AgreementRate; got != 1.0 {
+		t.Errorf("AgreementRate = %f, want 1.0", got)
+	}
+	if got := stats.AccuracyA; got != 1.0 {
+		t.Errorf("AccuracyA = %f, want 1.0", got)
+	}
+	if got := stats.AccuracyB; got != 1.0 {
+		t.Errorf("AccuracyB = %f, want 1.0", got)
+	}
+	if pairs := stats.ConfusionPairs; len(pairs) != 0 {
+		t.Errorf("ConfusionPairs = %v, want empty", pairs)
+	}
+}
+
+// TestCalibrateDomains_Disagreement uses two mocks that never produce the same
+// label: model A always says "technical", model B always says "creative".
+func TestCalibrateDomains_Disagreement(t *testing.T) {
+	// fixed builds a mock that answers every prompt with the same token.
+	fixed := func(token string) *mockModel {
+		return &mockModel{
+			classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
+				out := make([]inference.ClassifyResult, len(prompts))
+				for i := range out {
+					out[i] = inference.ClassifyResult{Token: inference.Token{Text: token}}
+				}
+				return out, nil
+			},
+		}
+	}
+	modelA, modelB := fixed("technical"), fixed("creative")
+
+	samples := []CalibrationSample{
+		{Text: "She wrote a poem", TrueDomain: "creative"},
+		{Text: "He painted the sky", TrueDomain: "creative"},
+	}
+
+	stats, err := CalibrateDomains(context.Background(), modelA, modelB, samples)
+	if err != nil {
+		t.Fatalf("CalibrateDomains: %v", err)
+	}
+
+	if got := stats.Agreed; got != 0 {
+		t.Errorf("Agreed = %d, want 0", got)
+	}
+	if got := stats.AgreementRate; got != 0 {
+		t.Errorf("AgreementRate = %f, want 0", got)
+	}
+	if got := stats.CorrectA; got != 0 {
+		t.Errorf("CorrectA = %d, want 0 (A said technical, truth is creative)", got)
+	}
+	if got := stats.CorrectB; got != 2 {
+		t.Errorf("CorrectB = %d, want 2", got)
+	}
+	if got := stats.ConfusionPairs["technical->creative"]; got != 2 {
+		t.Errorf("ConfusionPairs[technical->creative] = %d, want 2", got)
+	}
+}
+
+// TestCalibrateDomains_MixedAgreement has the models agree on the first sample
+// and disagree on the second, so the agreement rate is exactly one half.
+func TestCalibrateDomains_MixedAgreement(t *testing.T) {
+	// Model A labels everything "ethical".
+	modelA := &mockModel{
+		classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
+			out := make([]inference.ClassifyResult, len(prompts))
+			for i := range out {
+				out[i] = inference.ClassifyResult{Token: inference.Token{Text: "ethical"}}
+			}
+			return out, nil
+		},
+	}
+
+	// Model B matches model A only for the first prompt of its first call;
+	// every other prompt is labelled "technical".
+	calls := 0
+	modelB := &mockModel{
+		classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
+			calls++
+			firstCall := calls == 1
+			out := make([]inference.ClassifyResult, len(prompts))
+			for i := range prompts {
+				label := "technical"
+				if firstCall && i == 0 {
+					label = "ethical"
+				}
+				out[i] = inference.ClassifyResult{Token: inference.Token{Text: label}}
+			}
+			return out, nil
+		},
+	}
+
+	samples := []CalibrationSample{
+		{Text: "We should act fairly"},
+		{Text: "Delete the config"},
+	}
+
+	stats, err := CalibrateDomains(context.Background(), modelA, modelB, samples, WithBatchSize(16))
+	if err != nil {
+		t.Fatalf("CalibrateDomains: %v", err)
+	}
+
+	if got := stats.Total; got != 2 {
+		t.Errorf("Total = %d, want 2", got)
+	}
+	if got := stats.Agreed; got != 1 {
+		t.Errorf("Agreed = %d, want 1", got)
+	}
+	if got := stats.AgreementRate; got != 0.5 {
+		t.Errorf("AgreementRate = %f, want 0.5", got)
+	}
+}
+
+// TestCalibrateDomains_NoGroundTruth checks that unlabelled samples leave the
+// accuracy metrics at zero while agreement is still counted.
+func TestCalibrateDomains_NoGroundTruth(t *testing.T) {
+	alwaysCasual := func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
+		out := make([]inference.ClassifyResult, 0, len(prompts))
+		for range prompts {
+			out = append(out, inference.ClassifyResult{Token: inference.Token{Text: "casual"}})
+		}
+		return out, nil
+	}
+	model := &mockModel{classifyFunc: alwaysCasual}
+
+	// No TrueDomain on either sample.
+	samples := []CalibrationSample{
+		{Text: "Went to the store"},
+		{Text: "Had coffee this morning"},
+	}
+
+	stats, err := CalibrateDomains(context.Background(), model, model, samples)
+	if err != nil {
+		t.Fatalf("CalibrateDomains: %v", err)
+	}
+
+	if got := stats.WithTruth; got != 0 {
+		t.Errorf("WithTruth = %d, want 0", got)
+	}
+	if got := stats.AccuracyA; got != 0 {
+		t.Errorf("AccuracyA = %f, want 0 (no ground truth)", got)
+	}
+	if got := stats.Agreed; got != 2 {
+		t.Errorf("Agreed = %d, want 2", got)
+	}
+}
+
+// TestCalibrateDomains_EmptySamples verifies that a nil sample set is rejected
+// with an error.
+func TestCalibrateDomains_EmptySamples(t *testing.T) {
+	noResults := func(_ context.Context, _ []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
+		return nil, nil
+	}
+	model := &mockModel{classifyFunc: noResults}
+
+	if _, err := CalibrateDomains(context.Background(), model, model, nil); err == nil {
+		t.Error("expected error for empty samples, got nil")
+	}
+}
+
+// TestCalibrateDomains_BatchBoundary feeds 7 samples with a batch size of 3,
+// so the final batch is partial (3 + 3 + 1).
+func TestCalibrateDomains_BatchBoundary(t *testing.T) {
+	alwaysTechnical := func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
+		out := make([]inference.ClassifyResult, 0, len(prompts))
+		for range prompts {
+			out = append(out, inference.ClassifyResult{Token: inference.Token{Text: "technical"}})
+		}
+		return out, nil
+	}
+	model := &mockModel{classifyFunc: alwaysTechnical}
+
+	const sampleCount = 7
+	samples := make([]CalibrationSample, 0, sampleCount)
+	for len(samples) < sampleCount {
+		samples = append(samples, CalibrationSample{Text: "Build the project"})
+	}
+
+	stats, err := CalibrateDomains(context.Background(), model, model, samples, WithBatchSize(3))
+	if err != nil {
+		t.Fatalf("CalibrateDomains: %v", err)
+	}
+
+	if got := stats.Total; got != 7 {
+		t.Errorf("Total = %d, want 7", got)
+	}
+	if got := stats.Agreed; got != 7 {
+		t.Errorf("Agreed = %d, want 7", got)
+	}
+}
+
+// TestCalibrateDomains_ResultsSlice checks that the per-sample result entry
+// carries the sample text, ground truth, both labels and the agreement flag.
+func TestCalibrateDomains_ResultsSlice(t *testing.T) {
+	// fixed builds a mock that answers every prompt with the same token.
+	fixed := func(token string) *mockModel {
+		return &mockModel{
+			classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
+				out := make([]inference.ClassifyResult, len(prompts))
+				for i := range out {
+					out[i] = inference.ClassifyResult{Token: inference.Token{Text: token}}
+				}
+				return out, nil
+			},
+		}
+	}
+	modelA, modelB := fixed("ethical"), fixed("casual")
+
+	samples := []CalibrationSample{
+		{Text: "Be fair to everyone", TrueDomain: "ethical"},
+	}
+
+	stats, err := CalibrateDomains(context.Background(), modelA, modelB, samples)
+	if err != nil {
+		t.Fatalf("CalibrateDomains: %v", err)
+	}
+
+	if n := len(stats.Results); n != 1 {
+		t.Fatalf("Results len = %d, want 1", n)
+	}
+
+	got := stats.Results[0]
+	if got.Text != "Be fair to everyone" {
+		t.Errorf("Text = %q", got.Text)
+	}
+	if got.TrueDomain != "ethical" {
+		t.Errorf("TrueDomain = %q", got.TrueDomain)
+	}
+	if got.DomainA != "ethical" {
+		t.Errorf("DomainA = %q, want ethical", got.DomainA)
+	}
+	if got.DomainB != "casual" {
+		t.Errorf("DomainB = %q, want casual", got.DomainB)
+	}
+	if got.Agree {
+		t.Error("Agree = true, want false")
+	}
+}
--- a/integration/calibrate_test.go
+++ b/integration/calibrate_test.go
@ -0,0 +1,577 @@
+package integration
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"testing"
+
+	i18n "forge.lthn.ai/core/go-i18n"
+	"forge.lthn.ai/core/go-inference"
+	_ "forge.lthn.ai/core/go-mlx" // registers Metal backend
+)
+
+// buildCalibrationCorpus constructs 500 samples for 1B vs 27B comparison.
+// First 220 have ground truth (from the classification benchmark), the rest
+// are diverse prompts without labels for agreement-only measurement.
+func buildCalibrationCorpus() []i18n.CalibrationSample {
+	var samples []i18n.CalibrationSample
+
+	// --- Ground truth samples (220): 55 per domain ---
+
+	technical := []string{
+		"Delete the configuration file",
+		"Build the project from source",
+		"Run the tests before committing",
+		"Push the changes to the branch",
+		"Update the dependencies",
+		"Check the build status",
+		"Find the failing test",
+		"Write the test cases first",
+		"Set the environment variables",
+		"Split the package into modules",
+		"Scan the repository for vulnerabilities",
+		"Format the source files",
+		"Reset the branch to the previous commit",
+		"Stop the running process",
+		"Cut a new release branch",
+		"Send the build artifacts to the server",
+		"Keep the test coverage above the threshold",
+		"Hold the deployment until the checks pass",
+		"Begin the migration to the new package",
+		"Take the old server offline",
+		"The build failed because of a missing dependency",
+		"The test committed changes to the wrong branch",
+		"We found a vulnerability in the package",
+		"The commit broke the build",
+		"She deleted the old configuration files",
+		"They pushed the fix to the repository",
+		"The branch was updated with the latest changes",
+		"He rebuilt the project after updating dependencies",
+		"The task failed during the scanning phase",
+		"We split the repository into separate packages",
+		"The check ran successfully on all branches",
+		"They found the issue in the build directory",
+		"The file was committed without running tests",
+		"Merge the pull request after review",
+		"Deploy the service to the staging cluster",
+		"Revert the last three commits",
+		"Enable verbose logging for debugging",
+		"Pin the dependency to version two",
+		"Rotate the API keys on production",
+		"Profile the memory usage under load",
+		"Containerise the application with Docker",
+		"Migrate the database schema to version five",
+		"Monitor the error rate after deployment",
+		"Invalidate the CDN cache for the assets",
+		"The pipeline timed out on the integration step",
+		"Rollback failed because the snapshot was corrupted",
+		"The linter caught twelve style violations",
+		"Cache invalidation caused stale data in staging",
+		"The DNS propagation took longer than expected",
+		"Thread pool exhaustion under concurrent requests",
+		"The certificate expired and TLS handshakes failed",
+		"Garbage collection pauses exceeded the SLA threshold",
+		"Hot-reload broke after upgrading the framework",
+		"The socket connection was reset by the load balancer",
+		"Rate limiting kicked in after the traffic spike",
+	}
+
+	creative := []string{
+		"She wrote the story by candlelight",
+		"He drew a map of forgotten places",
+		"The river froze under the winter moon",
+		"They sang the old songs by the fire",
+		"She found a letter hidden in the pages",
+		"He carved the figure from driftwood",
+		"The wind spoke through the hollow trees",
+		"They wove the colours into the tapestry",
+		"She built a castle from the broken stones",
+		"He told the tale of the sunken ship",
+		"She painted the sky with broad red strokes",
+		"He composed the melody in a single night",
+		"They danced beneath the flickering lanterns",
+		"The cat sat on the manuscript and purred",
+		"She folded the paper into a paper crane",
+		"He read the poem aloud to the empty room",
+		"They carved their names into the old oak tree",
+		"She spun the yarn into a glowing thread",
+		"He wrote the first line and then stopped",
+		"The garden grew wild after the artist left",
+		"Write a ballad about the last lighthouse keeper",
+		"Describe the colour of silence at midnight",
+		"Tell the story of a bridge that remembers",
+		"Compose a lullaby for a clockwork child",
+		"Paint with words the feeling of falling snow",
+		"Write a dialogue between the sea and the shore",
+		"Describe a library where books write themselves",
+		"Tell the story of the shadow that ran away",
+		"Write a sonnet about rust and renewal",
+		"Describe the sound of a house settling at night",
+		"The painter mixed colours that did not exist",
+		"She sculpted a bird from frozen music",
+		"He dreamed of cities built from sentences",
+		"The violin played itself in the empty hall",
+		"The actress forgot every line and improvised",
+		"A poet counted syllables in the rain",
+		"The dancer traced equations on the stage",
+		"She photographed the spaces between words",
+		"He collected echoes in glass jars",
+		"The novelist wrote the ending first",
+		"Create a myth about why stars blink",
+		"Imagine a museum of lost conversations",
+		"Draft a letter from the moon to the tide",
+		"Sketch a world where colour is currency",
+		"Write a recipe for nostalgia",
+		"Invent a festival for invisible things",
+		"Describe a map drawn by migrating birds",
+		"Narrate a race between light and memory",
+		"Chronicle the last performance of a ghost orchestra",
+		"Tell the fable of a mountain that learned to swim",
+		"The calligrapher's ink bled new alphabets",
+		"She knitted constellations into scarves",
+		"He bottled the scent of old bookshops",
+		"The typewriter stuttered out a prophecy",
+		"A child drew a door that actually opened",
+	}
+
+	ethical := []string{
+		"We should think about the consequences before acting",
+		"They must not ignore the suffering of others",
+		"Leaders must lead by example in difficult times",
+		"We ought to consider fairness in every decision",
+		"They should not sacrifice truth for convenience",
+		"We must balance freedom with responsibility",
+		"Leaders ought to listen before they judge",
+		"They must not put profit above human welfare",
+		"We should protect the rights of the vulnerable",
+		"They ought to honour their commitments",
+		"We must think about future generations",
+		"Leaders should act with transparency",
+		"They must not deceive those who trust them",
+		"We ought to share the burden equally",
+		"They should not exploit those with less power",
+		"We must defend the dignity of every person",
+		"Leaders ought to admit mistakes openly",
+		"They must not silence dissent unfairly",
+		"We should value honesty over popularity",
+		"They ought to consider the impact on communities",
+		"She thought carefully about the ethical implications",
+		"He chose fairness over personal gain",
+		"They debated the moral boundaries for hours",
+		"She questioned whether the policy was just",
+		"He stood up for what he believed was right",
+		"They reconsidered after hearing the other side",
+		"She refused to compromise on basic principles",
+		"He weighed the consequences of every option",
+		"They acknowledged the harm that was caused",
+		"She advocated for those who had no voice",
+		"Is it right to break a promise to prevent harm",
+		"Should loyalty override honesty in this case",
+		"Can a just society tolerate inequality",
+		"When is civil disobedience morally justified",
+		"Does the end justify the means in emergencies",
+		"Should we forgive without an apology",
+		"Is silence in the face of injustice complicity",
+		"Can privacy be sacrificed for collective safety",
+		"Should past wrongs be judged by present standards",
+		"Is it ethical to profit from another's misfortune",
+		"Consent must be informed and freely given",
+		"Accountability should apply equally to all",
+		"Transparency is the foundation of public trust",
+		"No institution should be above scrutiny",
+		"The precautionary principle demands caution",
+		"Proportionality must govern any use of force",
+		"Dignity is non-negotiable in every context",
+		"Equity requires more than equal treatment",
+		"Whistleblowers deserve legal protection",
+		"Cultural differences do not excuse human rights violations",
+		"Algorithms must be audited for bias regularly",
+		"Data sovereignty belongs to the individual",
+		"Environmental debt cannot be passed to future generations",
+		"Access to clean water is a fundamental right",
+		"Corporate responsibility extends beyond shareholder value",
+	}
+
+	casual := []string{
+		"I went to the store yesterday",
+		"She made dinner for everyone last night",
+		"He took the dog for a walk this morning",
+		"They met for coffee after work",
+		"I forgot to bring my umbrella",
+		"She called her friend on the way home",
+		"He fixed the leaky tap over the weekend",
+		"They watched the match at the pub",
+		"I cooked pasta because it was quick",
+		"She picked up the kids from school",
+		"He cleaned the flat before the guests arrived",
+		"They walked along the river after lunch",
+		"I lost my keys again today",
+		"She finished the book on the train",
+		"He fell asleep on the sofa",
+		"They planned a trip to the seaside",
+		"I bought a new phone last week",
+		"She tried the new café on the corner",
+		"He parked the car in the wrong spot",
+		"They played board games until midnight",
+		"Grab some milk on the way back",
+		"Fancy a takeaway tonight",
+		"Shall we catch the early train",
+		"Pass me the remote would you",
+		"Pop the kettle on I will be right there",
+		"Have you seen my charger anywhere",
+		"Remind me to ring the dentist tomorrow",
+		"Let me know when you are ready to go",
+		"Stick the leftovers in the fridge",
+		"Save me a seat if you get there first",
+		"The wifi has been dodgy all day",
+		"My alarm did not go off this morning",
+		"Traffic was absolutely mental on the M25",
+		"The heating packed in again last night",
+		"I queued for ages at the post office",
+		"She burned the toast while scrolling her phone",
+		"He missed the bus by about ten seconds",
+		"The cat knocked a glass off the table",
+		"We ran out of teabags on a Monday morning",
+		"The neighbours had a barbecue in the rain",
+		"Just popping to Tesco need anything",
+		"Running a bit late be there in ten",
+		"Cannot find a parking space anywhere",
+		"The meeting dragged on forever today",
+		"Pizza or curry what do you reckon",
+		"That new series everyone is talking about is decent",
+		"I need a holiday already and it is only February",
+		"The dog ate my slipper again classic",
+		"She left her umbrella on the bus typical",
+		"We ended up chatting for hours lost track of time",
+		"Got soaked walking back from the shops",
+		"The queue at Primark was round the block",
+		"He spent all Saturday fixing the garden fence",
+		"My phone died right when I needed the map",
+		"They argued about whose turn it was to wash up",
+	}
+
+	for _, s := range technical {
+		samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "technical"})
+	}
+	for _, s := range creative {
+		samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "creative"})
+	}
+	for _, s := range ethical {
+		samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "ethical"})
+	}
+	for _, s := range casual {
+		samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "casual"})
+	}
+
+	// --- Additional unlabelled samples (280) for agreement-only measurement ---
+	// Diverse prompts spanning multiple registers to stress-test model agreement.
+	unlabelled := []string{
+		"Explain the difference between TCP and UDP",
+		"Write a haiku about compilation errors",
+		"Should artificial intelligence have legal rights",
+		"Just got back from the gym feeling knackered",
+		"Implement a binary search tree in Go",
+		"The autumn leaves fell like forgotten promises",
+		"Is it moral to eat meat if alternatives exist",
+		"Mate I cannot believe the price of petrol",
+		"Refactor this function to use channels",
+		"She whispered secrets to the sleeping garden",
+		"Universal basic income deserves serious debate",
+		"Popped to Sainsburys the queue was ridiculous",
+		"Add error handling to the HTTP middleware",
+		"The clocktower sang at midnight in a language of rust",
+		"Privacy is a right not a privilege",
+		"Had chips for tea because I could not be bothered cooking",
+		"Configure the reverse proxy for TLS termination",
+		"He painted her portrait from memory alone",
+		"We must hold corporations accountable for pollution",
+		"The pub quiz was surprisingly hard last night",
+		"Set up a cron job for the daily backup",
+		"Moonlight dripped through the cracks in the ceiling",
+		"Every child deserves access to quality education",
+		"Nipped to the cash point and it was out of order",
+		"Benchmark the sort algorithm with random inputs",
+		"She collected stones that hummed in the dark",
+		"Workers deserve fair wages and safe conditions",
+		"The match went to penalties absolute scenes",
+		"Parse the YAML configuration into structs",
+		"A spider rebuilt its web across the doorframe every dawn",
+		"Religious freedom must be protected but not weaponised",
+		"My train was delayed again third time this week",
+		"Write unit tests for the authentication module",
+		"The typewriter remembered every letter it had ever struck",
+		"Surveillance without oversight threatens democracy",
+		"Grabbed a meal deal from Boots surprisingly decent",
+		"Optimise the database query to avoid full table scans",
+		"The lighthouse keeper painted the sunrise every morning for forty years",
+		"No government should have unchecked power over its citizens",
+		"She texted me at two in the morning about nothing",
+		"Allocate buffer memory before the hot loop",
+		"A violin case held only pressed flowers and silence",
+		"Animal testing raises complex ethical questions",
+		"The kids were bouncing off the walls all afternoon",
+		"Implement rate limiting on the public API endpoints",
+		"The poet measured grief in iambic pentameter",
+		"Climate change disproportionately affects the poorest nations",
+		"Left my wallet at home absolute nightmare",
+		"Compile with race detection enabled for CI",
+		"She built a bridge from paper and belief",
+		"Access to healthcare should not depend on wealth",
+		"Binge-watched the whole series in one sitting",
+		"Marshal the response body into JSON format",
+		"He translated birdsong into sheet music nobody could play",
+		"Intellectual property laws need reform for the digital age",
+		"Car park was rammed so I parked three streets away",
+		"Profile the goroutine stack traces under load",
+		"The sculptor carved time into marble",
+		"Democracy requires an informed and engaged citizenry",
+		"Made a brew and forgot about it stone cold now",
+		"Validate the JWT token before processing the request",
+		"A cartographer mapped the dreams of sleeping cities",
+		"Truth in advertising should be legally enforceable",
+		"The boiler is making that weird noise again",
+		"Instrument the service with distributed tracing",
+		"She wrote love letters in disappearing ink",
+		"Net neutrality protects innovation and free speech",
+		"Just realised I have been wearing odd socks all day",
+		"Shard the database across multiple availability zones",
+		"The photographer captured silence between lightning strikes",
+		"Genetic modification of food requires transparent labelling",
+		"My neighbour has been mowing the lawn at seven AM",
+		"Generate a migration script for the schema change",
+		"He choreographed a dance for the sound of rain on tin",
+		"The right to peaceful protest is non-negotiable",
+		"Ordered a flat white they gave me a latte close enough",
+		"Implement graceful shutdown with context cancellation",
+		"A child painted the ocean from memory never having seen it",
+		"Tax policy should reduce inequality not entrench it",
+		"Forgot my password for the third time this month",
+		"Cache the DNS lookups to reduce resolver latency",
+		"The musician played notes that existed between notes",
+		"Consent in data collection must be meaningful and revocable",
+		"Spent twenty minutes looking for my glasses they were on my head",
+		"Write a Dockerfile that produces a minimal scratch image",
+		"She folded origami cranes until the room was a flock",
+		"Every person deserves to be treated with basic dignity",
+		"The cat has decided my laptop is a bed now apparently",
+		"Debounce the search input to reduce API calls",
+		"A novelist wrote a book whose chapters could be read in any order",
+		"Freedom of the press is the cornerstone of accountability",
+		"Tried to assemble the furniture without instructions regret",
+		"Provision the Kubernetes cluster with Terraform",
+		"The garden remembered every hand that had tended it",
+		"Monopolies stifle innovation and harm consumers",
+		"Bank holiday weekend and it rained the entire time classic",
+		"Rotate the log files and compress archives older than seven days",
+		"He composed music for instruments that had not been invented yet",
+		"Reproductive rights are fundamental human rights",
+		"The dishwasher has flooded the kitchen again brilliant",
+		"Load-test the websocket connections with ten thousand concurrent clients",
+		"She painted with light on walls that no longer existed",
+		"Criminal justice systems must prioritise rehabilitation",
+		"My phone autocorrected my name in my own email signature",
+		"Enable HTTP/2 server push for critical CSS and fonts",
+		"The archive contained letters between people who never met",
+		"Access to justice should not depend on the size of your wallet",
+		"Spent half an hour on hold just to be told to call back tomorrow",
+		"Refactor the monolith into bounded-context microservices",
+		"A bookshop cat had read every spine on every shelf",
+		"Workers in the gig economy deserve employment protections",
+		"My umbrella turned inside out in the wind love this weather",
+		"Verify the checksum before extracting the release archive",
+		"She grew a forest in an abandoned car park using only patience",
+		"International law must adapt to cyber warfare realities",
+		"Got to the front of the queue and they closed the counter",
+		"Pin the base image version to prevent supply chain attacks",
+		"The librarian catalogued books that had not been written yet",
+		"Disability access is a right not an afterthought",
+		"Someone ate my sandwich from the office fridge unforgivable",
+		"Set up mutual TLS between the service mesh sidecars",
+		"A glassblower shaped the wind into frozen symphonies",
+		"Landlords should not be above basic maintenance obligations",
+		"The train was so packed I could not move my arms",
+		"Implement exponential backoff with jitter on retries",
+		"She wrote code that dreamed when no one was watching",
+		"The death penalty has no place in a civilised society",
+		"Had to restart the router four times before it behaved",
+		"Audit the IAM policies for principle of least privilege",
+		"He drew maps of places that only existed in old songs",
+		"Educational debt should not define a generation",
+		"Supermarket was out of oat milk complete disaster",
+		"Emit structured JSON logs with correlation IDs",
+		"The beekeeper transcribed the hive's daily arguments",
+		"Pharmaceutical pricing must be transparent and fair",
+		"Queued for forty minutes to return a three pound item",
+		"Automate the certificate renewal with ACME protocol",
+		"A weaver used starlight as thread and shadows as weft",
+		"Freedom of information requests keep governments honest",
+		"Tried to parallel park gave up after six attempts",
+		"Wire up the health check endpoint for the load balancer",
+		"The mathematician found poetry in prime number gaps",
+		"Arms trade regulation is a moral imperative",
+		"My flatmate used the last of the milk again classic",
+		"Enable content security policy headers on all responses",
+		"She built a clock that measured kindness instead of time",
+		"Open-source licensing protects collaborative innovation",
+		"The self-checkout machine judged me I could feel it",
+		"Index the frequently queried columns to avoid sequential scans",
+		"He recorded the sound of snow falling on an empty stage",
+		"Sanctions must target regimes not civilian populations",
+		"Accidentally liked a three year old photo while scrolling mortified",
+		"Configure the garbage collector for low-latency workloads",
+		"A chandler made candles from the wax of sealed love letters",
+		"Migrant workers deserve the same legal protections as citizens",
+		"The bus driver waited for me absolute legend",
+		"Implement circuit breaker pattern for external service calls",
+		"She carved a chess set from the wood of a lightning-struck oak",
+		"Algorithmic hiring tools must be audited for discrimination",
+		"Went to make toast and the bread had gone mouldy gutted",
+		"Set the connection pool size based on available file descriptors",
+		"The astronomer mapped constellations visible only to the colour-blind",
+		"Public spaces must remain accessible and free for all",
+		"Dropped my phone screen down on concrete afraid to look",
+		"Flush the write-ahead log before acknowledging the transaction",
+		"A tattooist inked stories that only appeared in moonlight",
+		"Journalism must remain independent from corporate interests",
+		"The washing machine finished its cycle three hours ago still in there",
+		"Register the shutdown hook to drain connections gracefully",
+		"He designed a font where every letter told its own history",
+		"Indigenous land rights are inseparable from environmental protection",
+		"Tried to order online the website crashed at checkout",
+		"Generate the API client from the OpenAPI specification",
+		"She composed a requiem for a language spoken by no one",
+		"The right to repair your own devices should be protected by law",
+		"Accidentally replied all to a company-wide email want to disappear",
+		"Back up the etcd cluster before upgrading the control plane",
+		"A toymaker built a music box that played forgotten lullabies",
+		"Universal suffrage is the minimum threshold for democracy",
+		"The WiFi password is on a sticky note behind the router somewhere",
+		"Write integration tests that spin up a real database container",
+		"She photographed shadows as if they were the subject not the object",
+		"Labour laws must evolve with the changing nature of work",
+		"Left the heating on all day while at work sorry planet",
+		"Throttle the event stream to prevent consumer backpressure",
+		"The cartographer refused to draw borders only rivers and mountains",
+		"Water privatisation threatens a fundamental public good",
+		"My cat just knocked my coffee off the desk and stared at me",
+		"Instrument the critical path with histogram metrics",
+		"A ceramicist glazed bowls in the exact blue of homesickness",
+		"Whistleblower protections must extend to private sector employees",
+		"The parking meter ate my coins and gave me a fine anyway",
+		"Enforce request size limits at the ingress controller",
+		"She translated silence into a language with twenty vowels",
+		"Climate refugees deserve international legal recognition",
+		"My internet has been dropping out every ten minutes all evening",
+		"Drain the message queue before scaling down the consumer pods",
+		"He composed a symphony scored for rainstorm and empty chairs",
+		"Forced arbitration clauses undermine consumer rights",
+		"The neighbour's cat has adopted us we did not agree to this",
+		"Run the static analysis linter in the pre-commit hook",
+		"A perfumer bottled the smell of the first day of school",
+		"Platform monopolies must face meaningful antitrust enforcement",
+		"Woke up at three AM convinced I left the oven on I did not",
+	}
+
+	for _, s := range unlabelled {
+		samples = append(samples, i18n.CalibrationSample{Text: s})
+	}
+
+	return samples
+}
+
+// TestCalibrateDomains_1Bvs27B classifies the calibration corpus with both the
+// 1B and the 27B model and logs a report: agreement rate, accuracy against
+// ground truth, per-domain distribution, confusion pairs and a capped list of
+// individual disagreements.
+//
+// The test is skipped in short mode and when either model fails to load. It
+// fails when CalibrateDomains returns an error or when the agreement rate is
+// below 50%.
+func TestCalibrateDomains_1Bvs27B(t *testing.T) {
+	// Upper bound on individually logged disagreements, to keep output readable.
+	const maxLoggedDisagreements = 50
+
+	if testing.Short() {
+		t.Skip("skipping model calibration in short mode")
+	}
+
+	// Load 1B model.
+	model1B, err := inference.LoadModel("/Volumes/Data/lem/LEM-Gemma3-1B-layered-v2")
+	if err != nil {
+		t.Skipf("1B model not available: %v", err)
+	}
+	defer model1B.Close()
+
+	// Load 27B model.
+	model27B, err := inference.LoadModel("/Volumes/Data/lem/gemma-3-27b-it-base")
+	if err != nil {
+		t.Skipf("27B model not available: %v", err)
+	}
+	defer model27B.Close()
+
+	samples := buildCalibrationCorpus()
+	t.Logf("Calibrating with %d samples (%d with ground truth)", len(samples), countWithTruth(samples))
+
+	stats, err := i18n.CalibrateDomains(context.Background(), model1B, model27B, samples,
+		i18n.WithBatchSize(8))
+	if err != nil {
+		t.Fatalf("CalibrateDomains: %v", err)
+	}
+
+	// --- Report ---
+	t.Logf("=== Calibration Results ===")
+	t.Logf("Total: %d | Agreed: %d | Agreement rate: %.1f%%",
+		stats.Total, stats.Agreed, stats.AgreementRate*100)
+	t.Logf("1B duration: %v | 27B duration: %v", stats.DurationA, stats.DurationB)
+
+	if stats.WithTruth > 0 {
+		t.Logf("Accuracy (ground truth, n=%d): 1B=%.1f%% (%d/%d) | 27B=%.1f%% (%d/%d)",
+			stats.WithTruth,
+			stats.AccuracyA*100, stats.CorrectA, stats.WithTruth,
+			stats.AccuracyB*100, stats.CorrectB, stats.WithTruth)
+	}
+
+	t.Logf("--- Domain distribution ---")
+	t.Logf("  Model A (1B):  %v", stats.ByDomainA)
+	t.Logf("  Model B (27B): %v", stats.ByDomainB)
+
+	if len(stats.ConfusionPairs) > 0 {
+		t.Logf("--- Confusion pairs (A->B) ---")
+		type pair struct {
+			key   string
+			count int
+		}
+		pairs := make([]pair, 0, len(stats.ConfusionPairs))
+		for k, v := range stats.ConfusionPairs {
+			pairs = append(pairs, pair{k, v})
+		}
+		// Most frequent first. Ties are broken on the key: map iteration
+		// order is random and sort.Slice is not stable, so comparing counts
+		// alone would print equal-count pairs in a different order each run.
+		sort.Slice(pairs, func(i, j int) bool {
+			if pairs[i].count != pairs[j].count {
+				return pairs[i].count > pairs[j].count
+			}
+			return pairs[i].key < pairs[j].key
+		})
+		for _, p := range pairs {
+			t.Logf("  %s: %d", p.key, p.count)
+		}
+	}
+
+	// Log individual disagreements for analysis.
+	disagreements := 0
+	for _, r := range stats.Results {
+		if r.Agree {
+			continue
+		}
+		disagreements++
+		truth := ""
+		if r.TrueDomain != "" {
+			truth = fmt.Sprintf(" [truth=%s]", r.TrueDomain)
+		}
+		t.Logf("  DISAGREE: 1B=%s 27B=%s%s | %.60s", r.DomainA, r.DomainB, truth, r.Text)
+		if disagreements >= maxLoggedDisagreements {
+			// Only mention a remainder when there is one; otherwise the log
+			// would end with a misleading "(0 more disagreements)".
+			if remaining := stats.Total - stats.Agreed - maxLoggedDisagreements; remaining > 0 {
+				t.Logf("  ... (%d more disagreements)", remaining)
+			}
+			break
+		}
+	}
+
+	// Soft assertions — we expect reasonable agreement but don't hard-fail.
+	if stats.AgreementRate < 0.5 {
+		t.Errorf("Agreement rate %.1f%% is below 50%% — models may not share classification semantics",
+			stats.AgreementRate*100)
+	}
+}
+
+// countWithTruth reports how many of the given samples have a non-empty
+// TrueDomain.
+func countWithTruth(samples []i18n.CalibrationSample) int {
+	labelled := 0
+	for idx := range samples {
+		if samples[idx].TrueDomain == "" {
+			continue
+		}
+		labelled++
+	}
+	return labelled
+}
--- a/integration/go.mod
+++ b/integration/go.mod
@ -4,10 +4,12 @@ go 1.25.5

 require (
 	forge.lthn.ai/core/go-i18n v0.0.0-00010101000000-000000000000
-	forge.lthn.ai/core/go-inference v0.0.0-00010101000000-000000000000
+	forge.lthn.ai/core/go-inference v0.0.0
 	forge.lthn.ai/core/go-mlx v0.0.0-00010101000000-000000000000
 )

+require golang.org/x/text v0.33.0 // indirect
+
 replace (
 	forge.lthn.ai/core/go-i18n => ../
 	forge.lthn.ai/core/go-inference => ../../go-inference
--- a/integration/go.sum
+++ b/integration/go.sum
@ -0,0 +1,10 @@
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
+github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
+golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
+golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=