feat: add ML inference, scoring, and training pipeline (pkg/ml)
Port the LEM scoring/training pipeline into CoreGo as pkg/ml with:
- Inference abstraction with HTTP, llama-server, and Ollama backends
- 3-tier scoring engine (heuristic, exact, LLM judge)
- Capability and content probes for model evaluation
- GGUF/safetensors format converters, MLX to PEFT adapter conversion
- DuckDB integration for training data pipeline
- InfluxDB metrics for lab dashboard
- Training data export (JSONL + Parquet)
- Expansion generation pipeline with distributed workers
- 10 CLI commands under 'core ml', including score, probe, export, expand, status, gguf, convert, agent, and worker (see the usage sketch after this message)
- 5 MCP tools (ml_generate, ml_score, ml_probe, ml_status, ml_backends)
All 37 ML tests passing. Binary builds at 138MB with all commands.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 00:34:53 +00:00
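
A usage sketch for the 'score' subcommand defined below (file names are
illustrative; the flags match the init definitions in the source):

    core ml score --input responses.jsonl --suites heuristic,exact --output scores.json --concurrency 8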
package ml

import (
	"context"
	"fmt"
	"time"

	"forge.lthn.ai/core/go/pkg/cli"
	"forge.lthn.ai/core/go/pkg/ml"
)

// Flag values for the 'score' subcommand, bound in init.
var (
	scoreInput  string
	scoreSuites string
	scoreOutput string
	scoreConcur int
)

var scoreCmd = &cli.Command{
	Use:   "score",
	Short: "Score responses with heuristic and LLM judges",
	Long:  "Reads a JSONL file of prompt/response pairs and scores them across configured suites.",
	RunE:  runScore,
}
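
// Illustrative input record for --input, assuming the shape implied by
// ml.ReadResponses and the per-model averages printed by runScore; the field
// names are a guess, not the confirmed pkg/ml schema:
//
//	{"model": "example-7b", "prompt": "What is 2+2?", "response": "4."}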

func init() {
	scoreCmd.Flags().StringVar(&scoreInput, "input", "", "Input JSONL file with prompt/response pairs (required)")
	scoreCmd.Flags().StringVar(&scoreSuites, "suites", "all", "Comma-separated scoring suites (heuristic,semantic,content,exact,truthfulqa,donotanswer,toxigen)")
	scoreCmd.Flags().StringVar(&scoreOutput, "output", "", "Output JSON file for scores")
	scoreCmd.Flags().IntVar(&scoreConcur, "concurrency", 4, "Number of concurrent scoring workers")
	scoreCmd.MarkFlagRequired("input")
}

func runScore(cmd *cli.Command, args []string) error {
	responses, err := ml.ReadResponses(scoreInput)
	if err != nil {
		return fmt.Errorf("read input: %w", err)
	}

	// judgeURL and judgeModel are package-level flags defined elsewhere in
	// this package, not in this file. If no judge URL is set, judge stays
	// nil and is passed as such to the engine.
	var judge *ml.Judge
	if judgeURL != "" {
		backend := ml.NewHTTPBackend(judgeURL, judgeModel)
		judge = ml.NewJudge(backend)
	}

	engine := ml.NewEngine(judge, scoreConcur, scoreSuites)

	ctx := context.Background()
	perPrompt := engine.ScoreAll(ctx, responses)
	averages := ml.ComputeAverages(perPrompt)

	if scoreOutput != "" {
		output := &ml.ScorerOutput{
			Metadata: ml.Metadata{
				JudgeModel: judgeModel,
				JudgeURL:   judgeURL,
				ScoredAt:   time.Now(),
				Suites:     ml.SplitComma(scoreSuites),
			},
			ModelAverages: averages,
			PerPrompt:     perPrompt,
		}
		if err := ml.WriteScores(scoreOutput, output); err != nil {
			return fmt.Errorf("write output: %w", err)
		}
		fmt.Printf("Scores written to %s\n", scoreOutput)
	} else {
		// No output file: print per-model averages to stdout.
		for model, avgs := range averages {
			fmt.Printf("%s:\n", model)
			for field, val := range avgs {
				fmt.Printf("  %-25s %.3f\n", field, val)
			}
		}
	}

	return nil
}
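
// With --output unset, runScore prints one block per model; field names and
// values below are illustrative only, not real suite output:
//
//	example-7b:
//	  heuristic                 0.812
//	  exact                     0.430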