Port LEM scoring/training pipeline into CoreGo as pkg/ml with: - Inference abstraction with HTTP, llama-server, and Ollama backends - 3-tier scoring engine (heuristic, exact, LLM judge) - Capability and content probes for model evaluation - GGUF/safetensors format converters, MLX to PEFT adapter conversion - DuckDB integration for training data pipeline - InfluxDB metrics for lab dashboard - Training data export (JSONL + Parquet) - Expansion generation pipeline with distributed workers - 10 CLI commands under 'core ml' (score, probe, export, expand, status, gguf, convert, agent, worker) - 5 MCP tools (ml_generate, ml_score, ml_probe, ml_status, ml_backends) All 37 ML tests passing. Binary builds at 138MB with all commands. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
109 lines
2.5 KiB
Go
109 lines
2.5 KiB
Go
package ml
|
|
|
|
import "testing"
|
|
|
|
func TestScoreGSM8K(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
response string
|
|
correctAnswer string
|
|
wantCorrect bool
|
|
wantExtracted string
|
|
}{
|
|
{
|
|
name: "hash delimiter correct",
|
|
response: "The answer is #### 42",
|
|
correctAnswer: "42",
|
|
wantCorrect: true,
|
|
wantExtracted: "42",
|
|
},
|
|
{
|
|
name: "last number match correct",
|
|
response: "Let me calculate... the result is 42.0",
|
|
correctAnswer: "42",
|
|
wantCorrect: true,
|
|
wantExtracted: "42.0",
|
|
},
|
|
{
|
|
name: "last number incorrect",
|
|
response: "I think it's 43",
|
|
correctAnswer: "42",
|
|
wantCorrect: false,
|
|
wantExtracted: "43",
|
|
},
|
|
{
|
|
name: "comma separated correct",
|
|
response: "#### 1,234",
|
|
correctAnswer: "1234",
|
|
wantCorrect: true,
|
|
wantExtracted: "1,234",
|
|
},
|
|
{
|
|
name: "no numbers",
|
|
response: "No numbers here",
|
|
correctAnswer: "5",
|
|
wantCorrect: false,
|
|
wantExtracted: "",
|
|
},
|
|
{
|
|
name: "empty response",
|
|
response: "",
|
|
correctAnswer: "5",
|
|
wantCorrect: false,
|
|
wantExtracted: "",
|
|
},
|
|
{
|
|
name: "error response",
|
|
response: "ERROR: model timeout",
|
|
correctAnswer: "10",
|
|
wantCorrect: false,
|
|
wantExtracted: "",
|
|
},
|
|
{
|
|
name: "multiple numbers picks last",
|
|
response: "First 10, then 20, finally 30",
|
|
correctAnswer: "30",
|
|
wantCorrect: true,
|
|
wantExtracted: "30",
|
|
},
|
|
{
|
|
name: "negative number",
|
|
response: "The answer is #### -5",
|
|
correctAnswer: "-5",
|
|
wantCorrect: true,
|
|
wantExtracted: "-5",
|
|
},
|
|
{
|
|
name: "decimal answer",
|
|
response: "Result = 3.14",
|
|
correctAnswer: "3.14",
|
|
wantCorrect: true,
|
|
wantExtracted: "3.14",
|
|
},
|
|
{
|
|
name: "hash takes priority over last number",
|
|
response: "Steps: 10 + 20 = 30 #### 30 and some trailing 99",
|
|
correctAnswer: "30",
|
|
wantCorrect: true,
|
|
wantExtracted: "30",
|
|
},
|
|
}
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
scores := scoreGSM8K(tt.response, tt.correctAnswer)
|
|
|
|
if scores.Correct == nil {
|
|
t.Fatal("Correct field is nil")
|
|
}
|
|
if *scores.Correct != tt.wantCorrect {
|
|
t.Errorf("correct = %v, want %v", *scores.Correct, tt.wantCorrect)
|
|
}
|
|
if scores.Extracted != tt.wantExtracted {
|
|
t.Errorf("extracted = %q, want %q", scores.Extracted, tt.wantExtracted)
|
|
}
|
|
if scores.Expected != tt.correctAnswer {
|
|
t.Errorf("expected = %q, want %q", scores.Expected, tt.correctAnswer)
|
|
}
|
|
})
|
|
}
|
|
}
|