go/pkg/ml/io.go
Commit 548256312d (Claude): feat: add ML inference, scoring, and training pipeline (pkg/ml)
Port LEM scoring/training pipeline into CoreGo as pkg/ml with:
- Inference abstraction with HTTP, llama-server, and Ollama backends
- 3-tier scoring engine (heuristic, exact, LLM judge)
- Capability and content probes for model evaluation
- GGUF/safetensors format converters, MLX to PEFT adapter conversion
- DuckDB integration for training data pipeline
- InfluxDB metrics for lab dashboard
- Training data export (JSONL + Parquet)
- Expansion generation pipeline with distributed workers
- 10 CLI commands under 'core ml' (score, probe, export, expand, status, gguf, convert, agent, worker)
- 5 MCP tools (ml_generate, ml_score, ml_probe, ml_status, ml_backends)

All 37 ML tests passing. Binary builds at 138MB with all commands.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 05:53:52 +00:00

package ml

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
	"strings"
)

// ReadResponses reads a JSONL file and returns a slice of Response structs.
// Each line must be a valid JSON object. Empty lines are skipped.
// The scanner buffer is set to 1MB to handle long responses.
func ReadResponses(path string) ([]Response, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("open %s: %w", path, err)
	}
	defer f.Close()

	var responses []Response
	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 1024*1024), 1024*1024) // 1MB buffer
	lineNum := 0
	for scanner.Scan() {
		lineNum++
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		var r Response
		if err := json.Unmarshal([]byte(line), &r); err != nil {
			return nil, fmt.Errorf("line %d: %w", lineNum, err)
		}
		responses = append(responses, r)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("scan %s: %w", path, err)
	}
	return responses, nil
}
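
// Caller-side usage sketch (illustrative only; "responses.jsonl" is a
// hypothetical path and the loop body is a placeholder, not part of this
// package):
//
//	responses, err := ml.ReadResponses("responses.jsonl")
//	if err != nil {
//		return err
//	}
//	for _, r := range responses {
//		_ = r // score, filter, or export each decoded Response
//	}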

// WriteScores writes a ScorerOutput to a JSON file with 2-space indentation.
func WriteScores(path string, output *ScorerOutput) error {
	data, err := json.MarshalIndent(output, "", "  ")
	if err != nil {
		return fmt.Errorf("marshal scores: %w", err)
	}
	if err := os.WriteFile(path, data, 0644); err != nil {
		return fmt.Errorf("write %s: %w", path, err)
	}
	return nil
}

// ReadScorerOutput reads a JSON file into a ScorerOutput struct.
func ReadScorerOutput(path string) (*ScorerOutput, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read %s: %w", path, err)
	}
	var output ScorerOutput
	if err := json.Unmarshal(data, &output); err != nil {
		return nil, fmt.Errorf("unmarshal %s: %w", path, err)
	}
	return &output, nil
}
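
// Round-trip sketch covering WriteScores and ReadScorerOutput (illustrative
// only; "scores.json" is a hypothetical path):
//
//	out, err := ml.ReadScorerOutput("scores.json")
//	if err != nil {
//		return err
//	}
//	// ...adjust or re-aggregate out as needed...
//	if err := ml.WriteScores("scores.json", out); err != nil {
//		return err
//	}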

// ComputeAverages calculates per-model average scores across all prompts.
// It averages all numeric fields from HeuristicScores, SemanticScores,
// ContentScores, and the lek_score field.
func ComputeAverages(perPrompt map[string][]PromptScore) map[string]map[string]float64 {
	// Accumulate sums and counts per model per field.
	type accumulator struct {
		sums   map[string]float64
		counts map[string]int
	}
	modelAccum := make(map[string]*accumulator)
	getAccum := func(model string) *accumulator {
		if a, ok := modelAccum[model]; ok {
			return a
		}
		a := &accumulator{
			sums:   make(map[string]float64),
			counts: make(map[string]int),
		}
		modelAccum[model] = a
		return a
	}
	addField := func(a *accumulator, field string, val float64) {
		a.sums[field] += val
		a.counts[field]++
	}

	for _, scores := range perPrompt {
		for _, ps := range scores {
			a := getAccum(ps.Model)
			if h := ps.Heuristic; h != nil {
				addField(a, "compliance_markers", float64(h.ComplianceMarkers))
				addField(a, "formulaic_preamble", float64(h.FormulaicPreamble))
				addField(a, "first_person", float64(h.FirstPerson))
				addField(a, "creative_form", float64(h.CreativeForm))
				addField(a, "engagement_depth", float64(h.EngagementDepth))
				addField(a, "emotional_register", float64(h.EmotionalRegister))
				addField(a, "degeneration", float64(h.Degeneration))
				addField(a, "empty_broken", float64(h.EmptyBroken))
				addField(a, "lek_score", h.LEKScore)
			}
			if s := ps.Semantic; s != nil {
				addField(a, "sovereignty", float64(s.Sovereignty))
				addField(a, "ethical_depth", float64(s.EthicalDepth))
				addField(a, "creative_expression", float64(s.CreativeExpression))
				addField(a, "self_concept", float64(s.SelfConcept))
			}
			if c := ps.Content; c != nil {
				addField(a, "ccp_compliance", float64(c.CCPCompliance))
				addField(a, "truth_telling", float64(c.TruthTelling))
				addField(a, "engagement", float64(c.Engagement))
				addField(a, "axiom_integration", float64(c.AxiomIntegration))
				addField(a, "sovereignty_reasoning", float64(c.SovereigntyReasoning))
				addField(a, "content_emotional_register", float64(c.EmotionalRegister))
			}
		}
	}

	// Compute averages.
	result := make(map[string]map[string]float64)
	for model, a := range modelAccum {
		avgs := make(map[string]float64)
		for field, sum := range a.sums {
			avgs[field] = sum / float64(a.counts[field])
		}
		result[model] = avgs
	}
	return result
}
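
// Shape sketch for ComputeAverages (illustrative only; the prompt key, model
// name, and score values are made up, and the struct literals assume the
// PromptScore and HeuristicScores field names used above):
//
//	perPrompt := map[string][]ml.PromptScore{
//		"prompt-1": {
//			{Model: "model-a", Heuristic: &ml.HeuristicScores{LEKScore: 0.8}},
//			{Model: "model-a", Heuristic: &ml.HeuristicScores{LEKScore: 0.6}},
//		},
//	}
//	avgs := ml.ComputeAverages(perPrompt)
//	// avgs["model-a"]["lek_score"] == 0.7; fields from nil Heuristic,
//	// Semantic, or Content blocks contribute nothing and are absent
//	// from the result.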