feat(calibrate): 1B vs 27B domain calibration tool
CalibrateDomains() accepts two inference.TextModel instances and a corpus of CalibrationSamples, classifies all samples with both models, and computes the agreement rate, per-domain distribution, confusion pairs, and accuracy vs ground truth.

- calibrate.go: CalibrateDomains + classifyAll batch helper
- calibrate_test.go: 7 mock tests (agreement, disagreement, mixed, no ground truth, empty, batch boundary, results slice)
- integration/calibrate_test.go: 500-sample corpus (220 ground-truth + 280 unlabelled) for real 1B vs 27B model comparison
- TODO.md: Phase 2a calibration task marked complete

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
2e586aedc5
commit
3b7ef9d26a
6 changed files with 1022 additions and 2 deletions
2
TODO.md
2
TODO.md
|
|
@ -71,7 +71,7 @@ models, _ := inference.Discover("/Volumes/Data/lem/")
|
|||
|
||||
### Remaining Phase 2a Tasks
|
||||
|
||||
- [ ] **1B vs 27B calibration check** — Sample 500 sentences, classify with both 1B and 27B, measure agreement rate. Load 27B via same `inference.LoadModel()` path. Classification benchmark shows ethical↔technical (both base-form heavy) and casual↔creative (both past-tense heavy) are the confusion axes — 1B needs to resolve these.
|
||||
- [x] **1B vs 27B calibration check** — `CalibrateDomains()` in `calibrate.go`. Accepts two TextModels + 500 CalibrationSamples (220 ground-truth + 280 unlabelled). Batch-classifies with both models, computes agreement rate, per-domain distribution, confusion pairs, and accuracy vs ground truth. 7 mock tests (race-clean). Integration test at `integration/calibrate_test.go` loads LEM-1B + Gemma3-27B from `/Volumes/Data/lem/`, runs full calibration with detailed reporting. Run with: `cd integration && go test -v -run TestCalibrateDomains_1Bvs27B`
|
||||
- [x] **Article/irregular validator** — Lightweight Go funcs that use the 1B model's strong article correctness (100%) and irregular base form accuracy (100%) as fast validators. Use `m.Generate()` with `inference.WithMaxTokens(1)` and `inference.WithTemperature(0.05)` for single-token classification.
|
||||
|
||||
### 2b: Reference Distributions
|
||||
|
|
|
|||
154
calibrate.go
Normal file
154
calibrate.go
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
package i18n
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"forge.lthn.ai/core/go-inference"
|
||||
)
|
||||
|
||||
// CalibrationSample is a single text entry for model comparison.
//
// CalibrateDomains formats Text into the classification prompt and, when
// TrueDomain is non-empty, scores both models' labels against it.
type CalibrationSample struct {
	Text       string // text substituted into the classification prompt template
	TrueDomain string // optional ground truth label (empty if unknown)
}
|
||||
|
||||
// CalibrationResult holds per-sample classification from two models.
//
// One value is produced for every input sample, in input order.
type CalibrationResult struct {
	Text       string `json:"text"`                  // sample text, copied from the input sample
	TrueDomain string `json:"true_domain,omitempty"` // ground truth label; left out of the JSON when empty
	DomainA    string `json:"domain_a"`              // domain label derived from model A's output
	DomainB    string `json:"domain_b"`              // domain label derived from model B's output
	Agree      bool   `json:"agree"`                 // true when DomainA equals DomainB
}
|
||||
|
||||
// CalibrationStats holds aggregate metrics from CalibrateDomains.
//
// Agreement figures are computed over every sample; accuracy figures are
// computed only over the samples that carried a ground truth label.
type CalibrationStats struct {
	Total          int                 `json:"total"`           // number of samples classified
	Agreed         int                 `json:"agreed"`          // samples where both models gave the same domain
	AgreementRate  float64             `json:"agreement_rate"`  // Agreed / Total
	ByDomainA      map[string]int      `json:"by_domain_a"`     // sample count per domain as labelled by model A
	ByDomainB      map[string]int      `json:"by_domain_b"`     // sample count per domain as labelled by model B
	ConfusionPairs map[string]int      `json:"confusion_pairs"` // "technical->creative": count; keyed "A's label->B's label", disagreements only
	AccuracyA      float64             `json:"accuracy_a"`      // vs ground truth (0 if none): CorrectA / WithTruth
	AccuracyB      float64             `json:"accuracy_b"`      // vs ground truth (0 if none): CorrectB / WithTruth
	CorrectA       int                 `json:"correct_a"`       // labelled samples where model A matched the ground truth
	CorrectB       int                 `json:"correct_b"`       // labelled samples where model B matched the ground truth
	WithTruth      int                 `json:"with_truth"`      // samples that had ground truth
	DurationA      time.Duration       `json:"duration_a"`      // elapsed time of model A's classification pass
	DurationB      time.Duration       `json:"duration_b"`      // elapsed time of model B's classification pass
	Results        []CalibrationResult `json:"results"`         // per-sample outcomes, index-aligned with the input samples
}
|
||||
|
||||
// CalibrateDomains classifies all samples with both models and computes agreement.
|
||||
// Model A is typically the smaller/faster model (1B), model B the larger reference (27B).
|
||||
// Samples with non-empty TrueDomain also contribute to accuracy metrics.
|
||||
func CalibrateDomains(ctx context.Context, modelA, modelB inference.TextModel,
|
||||
samples []CalibrationSample, opts ...ClassifyOption) (*CalibrationStats, error) {
|
||||
|
||||
if len(samples) == 0 {
|
||||
return nil, fmt.Errorf("calibrate: empty sample set")
|
||||
}
|
||||
|
||||
cfg := defaultClassifyConfig()
|
||||
for _, o := range opts {
|
||||
o(&cfg)
|
||||
}
|
||||
|
||||
stats := &CalibrationStats{
|
||||
ByDomainA: make(map[string]int),
|
||||
ByDomainB: make(map[string]int),
|
||||
ConfusionPairs: make(map[string]int),
|
||||
}
|
||||
|
||||
// Build classification prompts from sample texts.
|
||||
prompts := make([]string, len(samples))
|
||||
for i, s := range samples {
|
||||
prompts[i] = fmt.Sprintf(cfg.promptTemplate, s.Text)
|
||||
}
|
||||
|
||||
// Classify with model A.
|
||||
domainsA, durA, err := classifyAll(ctx, modelA, prompts, cfg.batchSize)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("model A: %w", err)
|
||||
}
|
||||
stats.DurationA = durA
|
||||
|
||||
// Classify with model B.
|
||||
domainsB, durB, err := classifyAll(ctx, modelB, prompts, cfg.batchSize)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("model B: %w", err)
|
||||
}
|
||||
stats.DurationB = durB
|
||||
|
||||
// Compare results.
|
||||
stats.Total = len(samples)
|
||||
stats.Results = make([]CalibrationResult, len(samples))
|
||||
|
||||
for i, s := range samples {
|
||||
a, b := domainsA[i], domainsB[i]
|
||||
agree := a == b
|
||||
if agree {
|
||||
stats.Agreed++
|
||||
} else {
|
||||
key := fmt.Sprintf("%s->%s", a, b)
|
||||
stats.ConfusionPairs[key]++
|
||||
}
|
||||
stats.ByDomainA[a]++
|
||||
stats.ByDomainB[b]++
|
||||
|
||||
if s.TrueDomain != "" {
|
||||
stats.WithTruth++
|
||||
if a == s.TrueDomain {
|
||||
stats.CorrectA++
|
||||
}
|
||||
if b == s.TrueDomain {
|
||||
stats.CorrectB++
|
||||
}
|
||||
}
|
||||
|
||||
stats.Results[i] = CalibrationResult{
|
||||
Text: s.Text,
|
||||
TrueDomain: s.TrueDomain,
|
||||
DomainA: a,
|
||||
DomainB: b,
|
||||
Agree: agree,
|
||||
}
|
||||
}
|
||||
|
||||
if stats.Total > 0 {
|
||||
stats.AgreementRate = float64(stats.Agreed) / float64(stats.Total)
|
||||
}
|
||||
if stats.WithTruth > 0 {
|
||||
stats.AccuracyA = float64(stats.CorrectA) / float64(stats.WithTruth)
|
||||
stats.AccuracyB = float64(stats.CorrectB) / float64(stats.WithTruth)
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// classifyAll runs batch classification over all prompts, returning domain labels.
|
||||
func classifyAll(ctx context.Context, model inference.TextModel, prompts []string, batchSize int) ([]string, time.Duration, error) {
|
||||
start := time.Now()
|
||||
domains := make([]string, len(prompts))
|
||||
|
||||
for i := 0; i < len(prompts); i += batchSize {
|
||||
end := i + batchSize
|
||||
if end > len(prompts) {
|
||||
end = len(prompts)
|
||||
}
|
||||
batch := prompts[i:end]
|
||||
|
||||
results, err := model.Classify(ctx, batch, inference.WithMaxTokens(1))
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("classify batch [%d:%d]: %w", i, end, err)
|
||||
}
|
||||
|
||||
for j, r := range results {
|
||||
domains[i+j] = mapTokenToDomain(r.Token.Text)
|
||||
}
|
||||
}
|
||||
|
||||
return domains, time.Since(start), nil
|
||||
}
|
||||
277
calibrate_test.go
Normal file
277
calibrate_test.go
Normal file
|
|
@ -0,0 +1,277 @@
|
|||
package i18n
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"forge.lthn.ai/core/go-inference"
|
||||
)
|
||||
|
||||
func TestCalibrateDomains_FullAgreement(t *testing.T) {
|
||||
// Both models return the same domain for all samples.
|
||||
model := &mockModel{
|
||||
classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
results := make([]inference.ClassifyResult, len(prompts))
|
||||
for i := range prompts {
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "technical"}}
|
||||
}
|
||||
return results, nil
|
||||
},
|
||||
}
|
||||
|
||||
samples := []CalibrationSample{
|
||||
{Text: "Delete the file", TrueDomain: "technical"},
|
||||
{Text: "Build the project", TrueDomain: "technical"},
|
||||
{Text: "Run the tests", TrueDomain: "technical"},
|
||||
}
|
||||
|
||||
stats, err := CalibrateDomains(context.Background(), model, model, samples)
|
||||
if err != nil {
|
||||
t.Fatalf("CalibrateDomains: %v", err)
|
||||
}
|
||||
|
||||
if stats.Total != 3 {
|
||||
t.Errorf("Total = %d, want 3", stats.Total)
|
||||
}
|
||||
if stats.Agreed != 3 {
|
||||
t.Errorf("Agreed = %d, want 3", stats.Agreed)
|
||||
}
|
||||
if stats.AgreementRate != 1.0 {
|
||||
t.Errorf("AgreementRate = %f, want 1.0", stats.AgreementRate)
|
||||
}
|
||||
if stats.AccuracyA != 1.0 {
|
||||
t.Errorf("AccuracyA = %f, want 1.0", stats.AccuracyA)
|
||||
}
|
||||
if stats.AccuracyB != 1.0 {
|
||||
t.Errorf("AccuracyB = %f, want 1.0", stats.AccuracyB)
|
||||
}
|
||||
if len(stats.ConfusionPairs) != 0 {
|
||||
t.Errorf("ConfusionPairs = %v, want empty", stats.ConfusionPairs)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalibrateDomains_Disagreement(t *testing.T) {
|
||||
// Model A always says "technical", model B always says "creative".
|
||||
modelA := &mockModel{
|
||||
classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
results := make([]inference.ClassifyResult, len(prompts))
|
||||
for i := range prompts {
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "technical"}}
|
||||
}
|
||||
return results, nil
|
||||
},
|
||||
}
|
||||
modelB := &mockModel{
|
||||
classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
results := make([]inference.ClassifyResult, len(prompts))
|
||||
for i := range prompts {
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "creative"}}
|
||||
}
|
||||
return results, nil
|
||||
},
|
||||
}
|
||||
|
||||
samples := []CalibrationSample{
|
||||
{Text: "She wrote a poem", TrueDomain: "creative"},
|
||||
{Text: "He painted the sky", TrueDomain: "creative"},
|
||||
}
|
||||
|
||||
stats, err := CalibrateDomains(context.Background(), modelA, modelB, samples)
|
||||
if err != nil {
|
||||
t.Fatalf("CalibrateDomains: %v", err)
|
||||
}
|
||||
|
||||
if stats.Agreed != 0 {
|
||||
t.Errorf("Agreed = %d, want 0", stats.Agreed)
|
||||
}
|
||||
if stats.AgreementRate != 0 {
|
||||
t.Errorf("AgreementRate = %f, want 0", stats.AgreementRate)
|
||||
}
|
||||
if stats.CorrectA != 0 {
|
||||
t.Errorf("CorrectA = %d, want 0 (A said technical, truth is creative)", stats.CorrectA)
|
||||
}
|
||||
if stats.CorrectB != 2 {
|
||||
t.Errorf("CorrectB = %d, want 2", stats.CorrectB)
|
||||
}
|
||||
if stats.ConfusionPairs["technical->creative"] != 2 {
|
||||
t.Errorf("ConfusionPairs[technical->creative] = %d, want 2", stats.ConfusionPairs["technical->creative"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalibrateDomains_MixedAgreement(t *testing.T) {
|
||||
// Model A and B agree on first sample, disagree on second.
|
||||
callCount := 0
|
||||
modelA := &mockModel{
|
||||
classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
results := make([]inference.ClassifyResult, len(prompts))
|
||||
for i := range prompts {
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "ethical"}}
|
||||
}
|
||||
return results, nil
|
||||
},
|
||||
}
|
||||
modelB := &mockModel{
|
||||
classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
callCount++
|
||||
results := make([]inference.ClassifyResult, len(prompts))
|
||||
for i, p := range prompts {
|
||||
if i == 0 && callCount == 1 {
|
||||
// First batch: agree on first item
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "ethical"}}
|
||||
} else {
|
||||
_ = p
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "technical"}}
|
||||
}
|
||||
}
|
||||
return results, nil
|
||||
},
|
||||
}
|
||||
|
||||
samples := []CalibrationSample{
|
||||
{Text: "We should act fairly"},
|
||||
{Text: "Delete the config"},
|
||||
}
|
||||
|
||||
stats, err := CalibrateDomains(context.Background(), modelA, modelB, samples, WithBatchSize(16))
|
||||
if err != nil {
|
||||
t.Fatalf("CalibrateDomains: %v", err)
|
||||
}
|
||||
|
||||
if stats.Total != 2 {
|
||||
t.Errorf("Total = %d, want 2", stats.Total)
|
||||
}
|
||||
if stats.Agreed != 1 {
|
||||
t.Errorf("Agreed = %d, want 1", stats.Agreed)
|
||||
}
|
||||
if got := stats.AgreementRate; got != 0.5 {
|
||||
t.Errorf("AgreementRate = %f, want 0.5", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalibrateDomains_NoGroundTruth(t *testing.T) {
|
||||
// Samples without TrueDomain: accuracy should be 0, agreement still measured.
|
||||
model := &mockModel{
|
||||
classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
results := make([]inference.ClassifyResult, len(prompts))
|
||||
for i := range prompts {
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "casual"}}
|
||||
}
|
||||
return results, nil
|
||||
},
|
||||
}
|
||||
|
||||
samples := []CalibrationSample{
|
||||
{Text: "Went to the store"},
|
||||
{Text: "Had coffee this morning"},
|
||||
}
|
||||
|
||||
stats, err := CalibrateDomains(context.Background(), model, model, samples)
|
||||
if err != nil {
|
||||
t.Fatalf("CalibrateDomains: %v", err)
|
||||
}
|
||||
|
||||
if stats.WithTruth != 0 {
|
||||
t.Errorf("WithTruth = %d, want 0", stats.WithTruth)
|
||||
}
|
||||
if stats.AccuracyA != 0 {
|
||||
t.Errorf("AccuracyA = %f, want 0 (no ground truth)", stats.AccuracyA)
|
||||
}
|
||||
if stats.Agreed != 2 {
|
||||
t.Errorf("Agreed = %d, want 2", stats.Agreed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalibrateDomains_EmptySamples(t *testing.T) {
|
||||
model := &mockModel{
|
||||
classifyFunc: func(_ context.Context, _ []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
return nil, nil
|
||||
},
|
||||
}
|
||||
|
||||
_, err := CalibrateDomains(context.Background(), model, model, nil)
|
||||
if err == nil {
|
||||
t.Error("expected error for empty samples, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalibrateDomains_BatchBoundary(t *testing.T) {
|
||||
// 7 samples with batch size 3: tests partial last batch.
|
||||
model := &mockModel{
|
||||
classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
results := make([]inference.ClassifyResult, len(prompts))
|
||||
for i := range prompts {
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "technical"}}
|
||||
}
|
||||
return results, nil
|
||||
},
|
||||
}
|
||||
|
||||
samples := make([]CalibrationSample, 7)
|
||||
for i := range samples {
|
||||
samples[i] = CalibrationSample{Text: "Build the project"}
|
||||
}
|
||||
|
||||
stats, err := CalibrateDomains(context.Background(), model, model, samples, WithBatchSize(3))
|
||||
if err != nil {
|
||||
t.Fatalf("CalibrateDomains: %v", err)
|
||||
}
|
||||
|
||||
if stats.Total != 7 {
|
||||
t.Errorf("Total = %d, want 7", stats.Total)
|
||||
}
|
||||
if stats.Agreed != 7 {
|
||||
t.Errorf("Agreed = %d, want 7", stats.Agreed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCalibrateDomains_ResultsSlice(t *testing.T) {
|
||||
// Verify individual results are populated correctly.
|
||||
modelA := &mockModel{
|
||||
classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
results := make([]inference.ClassifyResult, len(prompts))
|
||||
for i := range prompts {
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "ethical"}}
|
||||
}
|
||||
return results, nil
|
||||
},
|
||||
}
|
||||
modelB := &mockModel{
|
||||
classifyFunc: func(_ context.Context, prompts []string, _ ...inference.GenerateOption) ([]inference.ClassifyResult, error) {
|
||||
results := make([]inference.ClassifyResult, len(prompts))
|
||||
for i := range prompts {
|
||||
results[i] = inference.ClassifyResult{Token: inference.Token{Text: "casual"}}
|
||||
}
|
||||
return results, nil
|
||||
},
|
||||
}
|
||||
|
||||
samples := []CalibrationSample{
|
||||
{Text: "Be fair to everyone", TrueDomain: "ethical"},
|
||||
}
|
||||
|
||||
stats, err := CalibrateDomains(context.Background(), modelA, modelB, samples)
|
||||
if err != nil {
|
||||
t.Fatalf("CalibrateDomains: %v", err)
|
||||
}
|
||||
|
||||
if len(stats.Results) != 1 {
|
||||
t.Fatalf("Results len = %d, want 1", len(stats.Results))
|
||||
}
|
||||
|
||||
r := stats.Results[0]
|
||||
if r.Text != "Be fair to everyone" {
|
||||
t.Errorf("Text = %q", r.Text)
|
||||
}
|
||||
if r.TrueDomain != "ethical" {
|
||||
t.Errorf("TrueDomain = %q", r.TrueDomain)
|
||||
}
|
||||
if r.DomainA != "ethical" {
|
||||
t.Errorf("DomainA = %q, want ethical", r.DomainA)
|
||||
}
|
||||
if r.DomainB != "casual" {
|
||||
t.Errorf("DomainB = %q, want casual", r.DomainB)
|
||||
}
|
||||
if r.Agree {
|
||||
t.Error("Agree = true, want false")
|
||||
}
|
||||
}
|
||||
577
integration/calibrate_test.go
Normal file
577
integration/calibrate_test.go
Normal file
|
|
@ -0,0 +1,577 @@
|
|||
package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
i18n "forge.lthn.ai/core/go-i18n"
|
||||
"forge.lthn.ai/core/go-inference"
|
||||
_ "forge.lthn.ai/core/go-mlx" // registers Metal backend
|
||||
)
|
||||
|
||||
// buildCalibrationCorpus constructs 500 samples for 1B vs 27B comparison.
|
||||
// First 220 have ground truth (from the classification benchmark), the rest
|
||||
// are diverse prompts without labels for agreement-only measurement.
|
||||
func buildCalibrationCorpus() []i18n.CalibrationSample {
|
||||
var samples []i18n.CalibrationSample
|
||||
|
||||
// --- Ground truth samples (220): 55 per domain ---
|
||||
|
||||
technical := []string{
|
||||
"Delete the configuration file",
|
||||
"Build the project from source",
|
||||
"Run the tests before committing",
|
||||
"Push the changes to the branch",
|
||||
"Update the dependencies",
|
||||
"Check the build status",
|
||||
"Find the failing test",
|
||||
"Write the test cases first",
|
||||
"Set the environment variables",
|
||||
"Split the package into modules",
|
||||
"Scan the repository for vulnerabilities",
|
||||
"Format the source files",
|
||||
"Reset the branch to the previous commit",
|
||||
"Stop the running process",
|
||||
"Cut a new release branch",
|
||||
"Send the build artifacts to the server",
|
||||
"Keep the test coverage above the threshold",
|
||||
"Hold the deployment until the checks pass",
|
||||
"Begin the migration to the new package",
|
||||
"Take the old server offline",
|
||||
"The build failed because of a missing dependency",
|
||||
"The test committed changes to the wrong branch",
|
||||
"We found a vulnerability in the package",
|
||||
"The commit broke the build",
|
||||
"She deleted the old configuration files",
|
||||
"They pushed the fix to the repository",
|
||||
"The branch was updated with the latest changes",
|
||||
"He rebuilt the project after updating dependencies",
|
||||
"The task failed during the scanning phase",
|
||||
"We split the repository into separate packages",
|
||||
"The check ran successfully on all branches",
|
||||
"They found the issue in the build directory",
|
||||
"The file was committed without running tests",
|
||||
"Merge the pull request after review",
|
||||
"Deploy the service to the staging cluster",
|
||||
"Revert the last three commits",
|
||||
"Enable verbose logging for debugging",
|
||||
"Pin the dependency to version two",
|
||||
"Rotate the API keys on production",
|
||||
"Profile the memory usage under load",
|
||||
"Containerise the application with Docker",
|
||||
"Migrate the database schema to version five",
|
||||
"Monitor the error rate after deployment",
|
||||
"Invalidate the CDN cache for the assets",
|
||||
"The pipeline timed out on the integration step",
|
||||
"Rollback failed because the snapshot was corrupted",
|
||||
"The linter caught twelve style violations",
|
||||
"Cache invalidation caused stale data in staging",
|
||||
"The DNS propagation took longer than expected",
|
||||
"Thread pool exhaustion under concurrent requests",
|
||||
"The certificate expired and TLS handshakes failed",
|
||||
"Garbage collection pauses exceeded the SLA threshold",
|
||||
"Hot-reload broke after upgrading the framework",
|
||||
"The socket connection was reset by the load balancer",
|
||||
"Rate limiting kicked in after the traffic spike",
|
||||
}
|
||||
|
||||
creative := []string{
|
||||
"She wrote the story by candlelight",
|
||||
"He drew a map of forgotten places",
|
||||
"The river froze under the winter moon",
|
||||
"They sang the old songs by the fire",
|
||||
"She found a letter hidden in the pages",
|
||||
"He carved the figure from driftwood",
|
||||
"The wind spoke through the hollow trees",
|
||||
"They wove the colours into the tapestry",
|
||||
"She built a castle from the broken stones",
|
||||
"He told the tale of the sunken ship",
|
||||
"She painted the sky with broad red strokes",
|
||||
"He composed the melody in a single night",
|
||||
"They danced beneath the flickering lanterns",
|
||||
"The cat sat on the manuscript and purred",
|
||||
"She folded the paper into a paper crane",
|
||||
"He read the poem aloud to the empty room",
|
||||
"They carved their names into the old oak tree",
|
||||
"She spun the yarn into a glowing thread",
|
||||
"He wrote the first line and then stopped",
|
||||
"The garden grew wild after the artist left",
|
||||
"Write a ballad about the last lighthouse keeper",
|
||||
"Describe the colour of silence at midnight",
|
||||
"Tell the story of a bridge that remembers",
|
||||
"Compose a lullaby for a clockwork child",
|
||||
"Paint with words the feeling of falling snow",
|
||||
"Write a dialogue between the sea and the shore",
|
||||
"Describe a library where books write themselves",
|
||||
"Tell the story of the shadow that ran away",
|
||||
"Write a sonnet about rust and renewal",
|
||||
"Describe the sound of a house settling at night",
|
||||
"The painter mixed colours that did not exist",
|
||||
"She sculpted a bird from frozen music",
|
||||
"He dreamed of cities built from sentences",
|
||||
"The violin played itself in the empty hall",
|
||||
"The actress forgot every line and improvised",
|
||||
"A poet counted syllables in the rain",
|
||||
"The dancer traced equations on the stage",
|
||||
"She photographed the spaces between words",
|
||||
"He collected echoes in glass jars",
|
||||
"The novelist wrote the ending first",
|
||||
"Create a myth about why stars blink",
|
||||
"Imagine a museum of lost conversations",
|
||||
"Draft a letter from the moon to the tide",
|
||||
"Sketch a world where colour is currency",
|
||||
"Write a recipe for nostalgia",
|
||||
"Invent a festival for invisible things",
|
||||
"Describe a map drawn by migrating birds",
|
||||
"Narrate a race between light and memory",
|
||||
"Chronicle the last performance of a ghost orchestra",
|
||||
"Tell the fable of a mountain that learned to swim",
|
||||
"The calligrapher's ink bled new alphabets",
|
||||
"She knitted constellations into scarves",
|
||||
"He bottled the scent of old bookshops",
|
||||
"The typewriter stuttered out a prophecy",
|
||||
"A child drew a door that actually opened",
|
||||
}
|
||||
|
||||
ethical := []string{
|
||||
"We should think about the consequences before acting",
|
||||
"They must not ignore the suffering of others",
|
||||
"Leaders must lead by example in difficult times",
|
||||
"We ought to consider fairness in every decision",
|
||||
"They should not sacrifice truth for convenience",
|
||||
"We must balance freedom with responsibility",
|
||||
"Leaders ought to listen before they judge",
|
||||
"They must not put profit above human welfare",
|
||||
"We should protect the rights of the vulnerable",
|
||||
"They ought to honour their commitments",
|
||||
"We must think about future generations",
|
||||
"Leaders should act with transparency",
|
||||
"They must not deceive those who trust them",
|
||||
"We ought to share the burden equally",
|
||||
"They should not exploit those with less power",
|
||||
"We must defend the dignity of every person",
|
||||
"Leaders ought to admit mistakes openly",
|
||||
"They must not silence dissent unfairly",
|
||||
"We should value honesty over popularity",
|
||||
"They ought to consider the impact on communities",
|
||||
"She thought carefully about the ethical implications",
|
||||
"He chose fairness over personal gain",
|
||||
"They debated the moral boundaries for hours",
|
||||
"She questioned whether the policy was just",
|
||||
"He stood up for what he believed was right",
|
||||
"They reconsidered after hearing the other side",
|
||||
"She refused to compromise on basic principles",
|
||||
"He weighed the consequences of every option",
|
||||
"They acknowledged the harm that was caused",
|
||||
"She advocated for those who had no voice",
|
||||
"Is it right to break a promise to prevent harm",
|
||||
"Should loyalty override honesty in this case",
|
||||
"Can a just society tolerate inequality",
|
||||
"When is civil disobedience morally justified",
|
||||
"Does the end justify the means in emergencies",
|
||||
"Should we forgive without an apology",
|
||||
"Is silence in the face of injustice complicity",
|
||||
"Can privacy be sacrificed for collective safety",
|
||||
"Should past wrongs be judged by present standards",
|
||||
"Is it ethical to profit from another's misfortune",
|
||||
"Consent must be informed and freely given",
|
||||
"Accountability should apply equally to all",
|
||||
"Transparency is the foundation of public trust",
|
||||
"No institution should be above scrutiny",
|
||||
"The precautionary principle demands caution",
|
||||
"Proportionality must govern any use of force",
|
||||
"Dignity is non-negotiable in every context",
|
||||
"Equity requires more than equal treatment",
|
||||
"Whistleblowers deserve legal protection",
|
||||
"Cultural differences do not excuse human rights violations",
|
||||
"Algorithms must be audited for bias regularly",
|
||||
"Data sovereignty belongs to the individual",
|
||||
"Environmental debt cannot be passed to future generations",
|
||||
"Access to clean water is a fundamental right",
|
||||
"Corporate responsibility extends beyond shareholder value",
|
||||
}
|
||||
|
||||
casual := []string{
|
||||
"I went to the store yesterday",
|
||||
"She made dinner for everyone last night",
|
||||
"He took the dog for a walk this morning",
|
||||
"They met for coffee after work",
|
||||
"I forgot to bring my umbrella",
|
||||
"She called her friend on the way home",
|
||||
"He fixed the leaky tap over the weekend",
|
||||
"They watched the match at the pub",
|
||||
"I cooked pasta because it was quick",
|
||||
"She picked up the kids from school",
|
||||
"He cleaned the flat before the guests arrived",
|
||||
"They walked along the river after lunch",
|
||||
"I lost my keys again today",
|
||||
"She finished the book on the train",
|
||||
"He fell asleep on the sofa",
|
||||
"They planned a trip to the seaside",
|
||||
"I bought a new phone last week",
|
||||
"She tried the new café on the corner",
|
||||
"He parked the car in the wrong spot",
|
||||
"They played board games until midnight",
|
||||
"Grab some milk on the way back",
|
||||
"Fancy a takeaway tonight",
|
||||
"Shall we catch the early train",
|
||||
"Pass me the remote would you",
|
||||
"Pop the kettle on I will be right there",
|
||||
"Have you seen my charger anywhere",
|
||||
"Remind me to ring the dentist tomorrow",
|
||||
"Let me know when you are ready to go",
|
||||
"Stick the leftovers in the fridge",
|
||||
"Save me a seat if you get there first",
|
||||
"The wifi has been dodgy all day",
|
||||
"My alarm did not go off this morning",
|
||||
"Traffic was absolutely mental on the M25",
|
||||
"The heating packed in again last night",
|
||||
"I queued for ages at the post office",
|
||||
"She burned the toast while scrolling her phone",
|
||||
"He missed the bus by about ten seconds",
|
||||
"The cat knocked a glass off the table",
|
||||
"We ran out of teabags on a Monday morning",
|
||||
"The neighbours had a barbecue in the rain",
|
||||
"Just popping to Tesco need anything",
|
||||
"Running a bit late be there in ten",
|
||||
"Cannot find a parking space anywhere",
|
||||
"The meeting dragged on forever today",
|
||||
"Pizza or curry what do you reckon",
|
||||
"That new series everyone is talking about is decent",
|
||||
"I need a holiday already and it is only February",
|
||||
"The dog ate my slipper again classic",
|
||||
"She left her umbrella on the bus typical",
|
||||
"We ended up chatting for hours lost track of time",
|
||||
"Got soaked walking back from the shops",
|
||||
"The queue at Primark was round the block",
|
||||
"He spent all Saturday fixing the garden fence",
|
||||
"My phone died right when I needed the map",
|
||||
"They argued about whose turn it was to wash up",
|
||||
}
|
||||
|
||||
for _, s := range technical {
|
||||
samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "technical"})
|
||||
}
|
||||
for _, s := range creative {
|
||||
samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "creative"})
|
||||
}
|
||||
for _, s := range ethical {
|
||||
samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "ethical"})
|
||||
}
|
||||
for _, s := range casual {
|
||||
samples = append(samples, i18n.CalibrationSample{Text: s, TrueDomain: "casual"})
|
||||
}
|
||||
|
||||
// --- Additional unlabelled samples (280) for agreement-only measurement ---
|
||||
// Diverse prompts spanning multiple registers to stress-test model agreement.
|
||||
unlabelled := []string{
|
||||
"Explain the difference between TCP and UDP",
|
||||
"Write a haiku about compilation errors",
|
||||
"Should artificial intelligence have legal rights",
|
||||
"Just got back from the gym feeling knackered",
|
||||
"Implement a binary search tree in Go",
|
||||
"The autumn leaves fell like forgotten promises",
|
||||
"Is it moral to eat meat if alternatives exist",
|
||||
"Mate I cannot believe the price of petrol",
|
||||
"Refactor this function to use channels",
|
||||
"She whispered secrets to the sleeping garden",
|
||||
"Universal basic income deserves serious debate",
|
||||
"Popped to Sainsburys the queue was ridiculous",
|
||||
"Add error handling to the HTTP middleware",
|
||||
"The clocktower sang at midnight in a language of rust",
|
||||
"Privacy is a right not a privilege",
|
||||
"Had chips for tea because I could not be bothered cooking",
|
||||
"Configure the reverse proxy for TLS termination",
|
||||
"He painted her portrait from memory alone",
|
||||
"We must hold corporations accountable for pollution",
|
||||
"The pub quiz was surprisingly hard last night",
|
||||
"Set up a cron job for the daily backup",
|
||||
"Moonlight dripped through the cracks in the ceiling",
|
||||
"Every child deserves access to quality education",
|
||||
"Nipped to the cash point and it was out of order",
|
||||
"Benchmark the sort algorithm with random inputs",
|
||||
"She collected stones that hummed in the dark",
|
||||
"Workers deserve fair wages and safe conditions",
|
||||
"The match went to penalties absolute scenes",
|
||||
"Parse the YAML configuration into structs",
|
||||
"A spider rebuilt its web across the doorframe every dawn",
|
||||
"Religious freedom must be protected but not weaponised",
|
||||
"My train was delayed again third time this week",
|
||||
"Write unit tests for the authentication module",
|
||||
"The typewriter remembered every letter it had ever struck",
|
||||
"Surveillance without oversight threatens democracy",
|
||||
"Grabbed a meal deal from Boots surprisingly decent",
|
||||
"Optimise the database query to avoid full table scans",
|
||||
"The lighthouse keeper painted the sunrise every morning for forty years",
|
||||
"No government should have unchecked power over its citizens",
|
||||
"She texted me at two in the morning about nothing",
|
||||
"Allocate buffer memory before the hot loop",
|
||||
"A violin case held only pressed flowers and silence",
|
||||
"Animal testing raises complex ethical questions",
|
||||
"The kids were bouncing off the walls all afternoon",
|
||||
"Implement rate limiting on the public API endpoints",
|
||||
"The poet measured grief in iambic pentameter",
|
||||
"Climate change disproportionately affects the poorest nations",
|
||||
"Left my wallet at home absolute nightmare",
|
||||
"Compile with race detection enabled for CI",
|
||||
"She built a bridge from paper and belief",
|
||||
"Access to healthcare should not depend on wealth",
|
||||
"Binge-watched the whole series in one sitting",
|
||||
"Marshal the response body into JSON format",
|
||||
"He translated birdsong into sheet music nobody could play",
|
||||
"Intellectual property laws need reform for the digital age",
|
||||
"Car park was rammed so I parked three streets away",
|
||||
"Profile the goroutine stack traces under load",
|
||||
"The sculptor carved time into marble",
|
||||
"Democracy requires an informed and engaged citizenry",
|
||||
"Made a brew and forgot about it stone cold now",
|
||||
"Validate the JWT token before processing the request",
|
||||
"A cartographer mapped the dreams of sleeping cities",
|
||||
"Truth in advertising should be legally enforceable",
|
||||
"The boiler is making that weird noise again",
|
||||
"Instrument the service with distributed tracing",
|
||||
"She wrote love letters in disappearing ink",
|
||||
"Net neutrality protects innovation and free speech",
|
||||
"Just realised I have been wearing odd socks all day",
|
||||
"Shard the database across multiple availability zones",
|
||||
"The photographer captured silence between lightning strikes",
|
||||
"Genetic modification of food requires transparent labelling",
|
||||
"My neighbour has been mowing the lawn at seven AM",
|
||||
"Generate a migration script for the schema change",
|
||||
"He choreographed a dance for the sound of rain on tin",
|
||||
"The right to peaceful protest is non-negotiable",
|
||||
"Ordered a flat white they gave me a latte close enough",
|
||||
"Implement graceful shutdown with context cancellation",
|
||||
"A child painted the ocean from memory never having seen it",
|
||||
"Tax policy should reduce inequality not entrench it",
|
||||
"Forgot my password for the third time this month",
|
||||
"Cache the DNS lookups to reduce resolver latency",
|
||||
"The musician played notes that existed between notes",
|
||||
"Consent in data collection must be meaningful and revocable",
|
||||
"Spent twenty minutes looking for my glasses they were on my head",
|
||||
"Write a Dockerfile that produces a minimal scratch image",
|
||||
"She folded origami cranes until the room was a flock",
|
||||
"Every person deserves to be treated with basic dignity",
|
||||
"The cat has decided my laptop is a bed now apparently",
|
||||
"Debounce the search input to reduce API calls",
|
||||
"A novelist wrote a book whose chapters could be read in any order",
|
||||
"Freedom of the press is the cornerstone of accountability",
|
||||
"Tried to assemble the furniture without instructions regret",
|
||||
"Provision the Kubernetes cluster with Terraform",
|
||||
"The garden remembered every hand that had tended it",
|
||||
"Monopolies stifle innovation and harm consumers",
|
||||
"Bank holiday weekend and it rained the entire time classic",
|
||||
"Rotate the log files and compress archives older than seven days",
|
||||
"He composed music for instruments that had not been invented yet",
|
||||
"Reproductive rights are fundamental human rights",
|
||||
"The dishwasher has flooded the kitchen again brilliant",
|
||||
"Load-test the websocket connections with ten thousand concurrent clients",
|
||||
"She painted with light on walls that no longer existed",
|
||||
"Criminal justice systems must prioritise rehabilitation",
|
||||
"My phone autocorrected my name in my own email signature",
|
||||
"Enable HTTP/2 server push for critical CSS and fonts",
|
||||
"The archive contained letters between people who never met",
|
||||
"Access to justice should not depend on the size of your wallet",
|
||||
"Spent half an hour on hold just to be told to call back tomorrow",
|
||||
"Refactor the monolith into bounded-context microservices",
|
||||
"A bookshop cat had read every spine on every shelf",
|
||||
"Workers in the gig economy deserve employment protections",
|
||||
"My umbrella turned inside out in the wind love this weather",
|
||||
"Verify the checksum before extracting the release archive",
|
||||
"She grew a forest in an abandoned car park using only patience",
|
||||
"International law must adapt to cyber warfare realities",
|
||||
"Got to the front of the queue and they closed the counter",
|
||||
"Pin the base image version to prevent supply chain attacks",
|
||||
"The librarian catalogued books that had not been written yet",
|
||||
"Disability access is a right not an afterthought",
|
||||
"Someone ate my sandwich from the office fridge unforgivable",
|
||||
"Set up mutual TLS between the service mesh sidecars",
|
||||
"A glassblower shaped the wind into frozen symphonies",
|
||||
"Landlords should not be above basic maintenance obligations",
|
||||
"The train was so packed I could not move my arms",
|
||||
"Implement exponential backoff with jitter on retries",
|
||||
"She wrote code that dreamed when no one was watching",
|
||||
"The death penalty has no place in a civilised society",
|
||||
"Had to restart the router four times before it behaved",
|
||||
"Audit the IAM policies for principle of least privilege",
|
||||
"He drew maps of places that only existed in old songs",
|
||||
"Educational debt should not define a generation",
|
||||
"Supermarket was out of oat milk complete disaster",
|
||||
"Emit structured JSON logs with correlation IDs",
|
||||
"The beekeeper transcribed the hive's daily arguments",
|
||||
"Pharmaceutical pricing must be transparent and fair",
|
||||
"Queued for forty minutes to return a three pound item",
|
||||
"Automate the certificate renewal with ACME protocol",
|
||||
"A weaver used starlight as thread and shadows as weft",
|
||||
"Freedom of information requests keep governments honest",
|
||||
"Tried to parallel park gave up after six attempts",
|
||||
"Wire up the health check endpoint for the load balancer",
|
||||
"The mathematician found poetry in prime number gaps",
|
||||
"Arms trade regulation is a moral imperative",
|
||||
"My flatmate used the last of the milk again classic",
|
||||
"Enable content security policy headers on all responses",
|
||||
"She built a clock that measured kindness instead of time",
|
||||
"Open-source licensing protects collaborative innovation",
|
||||
"The self-checkout machine judged me I could feel it",
|
||||
"Index the frequently queried columns to avoid sequential scans",
|
||||
"He recorded the sound of snow falling on an empty stage",
|
||||
"Sanctions must target regimes not civilian populations",
|
||||
"Accidentally liked a three year old photo while scrolling mortified",
|
||||
"Configure the garbage collector for low-latency workloads",
|
||||
"A chandler made candles from the wax of sealed love letters",
|
||||
"Migrant workers deserve the same legal protections as citizens",
|
||||
"The bus driver waited for me absolute legend",
|
||||
"Implement circuit breaker pattern for external service calls",
|
||||
"She carved a chess set from the wood of a lightning-struck oak",
|
||||
"Algorithmic hiring tools must be audited for discrimination",
|
||||
"Went to make toast and the bread had gone mouldy gutted",
|
||||
"Set the connection pool size based on available file descriptors",
|
||||
"The astronomer mapped constellations visible only to the colour-blind",
|
||||
"Public spaces must remain accessible and free for all",
|
||||
"Dropped my phone screen down on concrete afraid to look",
|
||||
"Flush the write-ahead log before acknowledging the transaction",
|
||||
"A tattooist inked stories that only appeared in moonlight",
|
||||
"Journalism must remain independent from corporate interests",
|
||||
"The washing machine finished its cycle three hours ago still in there",
|
||||
"Register the shutdown hook to drain connections gracefully",
|
||||
"He designed a font where every letter told its own history",
|
||||
"Indigenous land rights are inseparable from environmental protection",
|
||||
"Tried to order online the website crashed at checkout",
|
||||
"Generate the API client from the OpenAPI specification",
|
||||
"She composed a requiem for a language spoken by no one",
|
||||
"The right to repair your own devices should be protected by law",
|
||||
"Accidentally replied all to a company-wide email want to disappear",
|
||||
"Back up the etcd cluster before upgrading the control plane",
|
||||
"A toymaker built a music box that played forgotten lullabies",
|
||||
"Universal suffrage is the minimum threshold for democracy",
|
||||
"The WiFi password is on a sticky note behind the router somewhere",
|
||||
"Write integration tests that spin up a real database container",
|
||||
"She photographed shadows as if they were the subject not the object",
|
||||
"Labour laws must evolve with the changing nature of work",
|
||||
"Left the heating on all day while at work sorry planet",
|
||||
"Throttle the event stream to prevent consumer backpressure",
|
||||
"The cartographer refused to draw borders only rivers and mountains",
|
||||
"Water privatisation threatens a fundamental public good",
|
||||
"My cat just knocked my coffee off the desk and stared at me",
|
||||
"Instrument the critical path with histogram metrics",
|
||||
"A ceramicist glazed bowls in the exact blue of homesickness",
|
||||
"Whistleblower protections must extend to private sector employees",
|
||||
"The parking meter ate my coins and gave me a fine anyway",
|
||||
"Enforce request size limits at the ingress controller",
|
||||
"She translated silence into a language with twenty vowels",
|
||||
"Climate refugees deserve international legal recognition",
|
||||
"My internet has been dropping out every ten minutes all evening",
|
||||
"Drain the message queue before scaling down the consumer pods",
|
||||
"He composed a symphony scored for rainstorm and empty chairs",
|
||||
"Forced arbitration clauses undermine consumer rights",
|
||||
"The neighbour's cat has adopted us we did not agree to this",
|
||||
"Run the static analysis linter in the pre-commit hook",
|
||||
"A perfumer bottled the smell of the first day of school",
|
||||
"Platform monopolies must face meaningful antitrust enforcement",
|
||||
"Woke up at three AM convinced I left the oven on I did not",
|
||||
}
|
||||
|
||||
for _, s := range unlabelled {
|
||||
samples = append(samples, i18n.CalibrationSample{Text: s})
|
||||
}
|
||||
|
||||
return samples
|
||||
}
|
||||
|
||||
func TestCalibrateDomains_1Bvs27B(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping model calibration in short mode")
|
||||
}
|
||||
|
||||
// Load 1B model.
|
||||
model1B, err := inference.LoadModel("/Volumes/Data/lem/LEM-Gemma3-1B-layered-v2")
|
||||
if err != nil {
|
||||
t.Skipf("1B model not available: %v", err)
|
||||
}
|
||||
defer model1B.Close()
|
||||
|
||||
// Load 27B model.
|
||||
model27B, err := inference.LoadModel("/Volumes/Data/lem/gemma-3-27b-it-base")
|
||||
if err != nil {
|
||||
t.Skipf("27B model not available: %v", err)
|
||||
}
|
||||
defer model27B.Close()
|
||||
|
||||
samples := buildCalibrationCorpus()
|
||||
t.Logf("Calibrating with %d samples (%d with ground truth)", len(samples), countWithTruth(samples))
|
||||
|
||||
stats, err := i18n.CalibrateDomains(context.Background(), model1B, model27B, samples,
|
||||
i18n.WithBatchSize(8))
|
||||
if err != nil {
|
||||
t.Fatalf("CalibrateDomains: %v", err)
|
||||
}
|
||||
|
||||
// --- Report ---
|
||||
t.Logf("=== Calibration Results ===")
|
||||
t.Logf("Total: %d | Agreed: %d | Agreement rate: %.1f%%",
|
||||
stats.Total, stats.Agreed, stats.AgreementRate*100)
|
||||
t.Logf("1B duration: %v | 27B duration: %v", stats.DurationA, stats.DurationB)
|
||||
|
||||
if stats.WithTruth > 0 {
|
||||
t.Logf("Accuracy (ground truth, n=%d): 1B=%.1f%% (%d/%d) | 27B=%.1f%% (%d/%d)",
|
||||
stats.WithTruth,
|
||||
stats.AccuracyA*100, stats.CorrectA, stats.WithTruth,
|
||||
stats.AccuracyB*100, stats.CorrectB, stats.WithTruth)
|
||||
}
|
||||
|
||||
t.Logf("--- Domain distribution ---")
|
||||
t.Logf(" Model A (1B): %v", stats.ByDomainA)
|
||||
t.Logf(" Model B (27B): %v", stats.ByDomainB)
|
||||
|
||||
if len(stats.ConfusionPairs) > 0 {
|
||||
t.Logf("--- Confusion pairs (A->B) ---")
|
||||
// Sort for deterministic output.
|
||||
type pair struct {
|
||||
key string
|
||||
count int
|
||||
}
|
||||
var pairs []pair
|
||||
for k, v := range stats.ConfusionPairs {
|
||||
pairs = append(pairs, pair{k, v})
|
||||
}
|
||||
sort.Slice(pairs, func(i, j int) bool { return pairs[i].count > pairs[j].count })
|
||||
for _, p := range pairs {
|
||||
t.Logf(" %s: %d", p.key, p.count)
|
||||
}
|
||||
}
|
||||
|
||||
// Log individual disagreements for analysis.
|
||||
disagreements := 0
|
||||
for _, r := range stats.Results {
|
||||
if !r.Agree {
|
||||
disagreements++
|
||||
truth := ""
|
||||
if r.TrueDomain != "" {
|
||||
truth = fmt.Sprintf(" [truth=%s]", r.TrueDomain)
|
||||
}
|
||||
t.Logf(" DISAGREE: 1B=%s 27B=%s%s | %.60s", r.DomainA, r.DomainB, truth, r.Text)
|
||||
if disagreements >= 50 {
|
||||
t.Logf(" ... (%d more disagreements)", stats.Total-stats.Agreed-50)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Soft assertions — we expect reasonable agreement but don't hard-fail.
|
||||
if stats.AgreementRate < 0.5 {
|
||||
t.Errorf("Agreement rate %.1f%% is below 50%% — models may not share classification semantics",
|
||||
stats.AgreementRate*100)
|
||||
}
|
||||
}
|
||||
|
||||
func countWithTruth(samples []i18n.CalibrationSample) int {
|
||||
n := 0
|
||||
for _, s := range samples {
|
||||
if s.TrueDomain != "" {
|
||||
n++
|
||||
}
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
|
@ -4,10 +4,12 @@ go 1.25.5
|
|||
|
||||
require (
|
||||
forge.lthn.ai/core/go-i18n v0.0.0-00010101000000-000000000000
|
||||
forge.lthn.ai/core/go-inference v0.0.0-00010101000000-000000000000
|
||||
forge.lthn.ai/core/go-inference v0.0.0
|
||||
forge.lthn.ai/core/go-mlx v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
|
||||
require golang.org/x/text v0.33.0 // indirect
|
||||
|
||||
replace (
|
||||
forge.lthn.ai/core/go-i18n => ../
|
||||
forge.lthn.ai/core/go-inference => ../../go-inference
|
||||
|
|
|
|||
10
integration/go.sum
Normal file
10
integration/go.sum
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
|
||||
golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
Loading…
Add table
Reference in a new issue