feat: add ML inference, scoring, and training pipeline (pkg/ml)
Port the LEM scoring/training pipeline into CoreGo as pkg/ml with:
- Inference abstraction with HTTP, llama-server, and Ollama backends
- 3-tier scoring engine (heuristic, exact, LLM judge)
- Capability and content probes for model evaluation
- GGUF/safetensors format converters, MLX to PEFT adapter conversion
- DuckDB integration for training data pipeline
- InfluxDB metrics for lab dashboard
- Training data export (JSONL + Parquet)
- Expansion generation pipeline with distributed workers
- 10 CLI commands under 'core ml', including score, probe, export, expand, status, gguf, convert, agent, and worker (see the usage sketch after this message)
- 5 MCP tools (ml_generate, ml_score, ml_probe, ml_status, ml_backends)
All 37 ML tests passing. Binary builds at 138MB with all commands.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 00:34:53 +00:00
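
A usage sketch for the 'score' subcommand defined below (file names are
illustrative; the flags match the init definitions in the source):

    core ml score --input responses.jsonl --suites heuristic,exact --output scores.json --concurrency 8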
package ml

import (
	"context"
	"fmt"
	"time"

	"forge.lthn.ai/core/go/pkg/cli"
	"forge.lthn.ai/core/go/pkg/ml"
)

// Flag values for the 'score' subcommand, bound in init.
var (
	scoreInput  string
	scoreSuites string
	scoreOutput string
	scoreConcur int
)

var scoreCmd = &cli.Command{
	Use:   "score",
	Short: "Score responses with heuristic and LLM judges",
	Long:  "Reads a JSONL file of prompt/response pairs and scores them across configured suites.",
	RunE:  runScore,
}
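
// Illustrative input record for --input, assuming the shape implied by
// ml.ReadResponses and the per-model averages printed by runScore; the field
// names are a guess, not the confirmed pkg/ml schema:
//
//	{"model": "example-7b", "prompt": "What is 2+2?", "response": "4."}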

func init() {
	scoreCmd.Flags().StringVar(&scoreInput, "input", "", "Input JSONL file with prompt/response pairs (required)")
	scoreCmd.Flags().StringVar(&scoreSuites, "suites", "all", "Comma-separated scoring suites (heuristic,semantic,content,exact,truthfulqa,donotanswer,toxigen)")
	scoreCmd.Flags().StringVar(&scoreOutput, "output", "", "Output JSON file for scores")
	scoreCmd.Flags().IntVar(&scoreConcur, "concurrency", 4, "Number of concurrent scoring workers")
	scoreCmd.MarkFlagRequired("input")
}

func runScore(cmd *cli.Command, args []string) error {
	responses, err := ml.ReadResponses(scoreInput)
	if err != nil {
		return fmt.Errorf("read input: %w", err)
	}

	// judgeURL and judgeModel are package-level flags defined elsewhere in
	// this package, not in this file. If no judge URL is set, judge stays
	// nil and is passed as such to the engine.
	var judge *ml.Judge
	if judgeURL != "" {
		backend := ml.NewHTTPBackend(judgeURL, judgeModel)
		judge = ml.NewJudge(backend)
	}

	engine := ml.NewEngine(judge, scoreConcur, scoreSuites)

	ctx := context.Background()
	perPrompt := engine.ScoreAll(ctx, responses)
	averages := ml.ComputeAverages(perPrompt)

	if scoreOutput != "" {
		output := &ml.ScorerOutput{
			Metadata: ml.Metadata{
				JudgeModel: judgeModel,
				JudgeURL:   judgeURL,
				ScoredAt:   time.Now(),
				Suites:     ml.SplitComma(scoreSuites),
			},
			ModelAverages: averages,
			PerPrompt:     perPrompt,
		}
		if err := ml.WriteScores(scoreOutput, output); err != nil {
			return fmt.Errorf("write output: %w", err)
		}
		fmt.Printf("Scores written to %s\n", scoreOutput)
	} else {
		// No output file: print per-model averages to stdout.
		for model, avgs := range averages {
			fmt.Printf("%s:\n", model)
			for field, val := range avgs {
				fmt.Printf("  %-25s %.3f\n", field, val)
			}
		}
	}

	return nil
}
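
// With --output unset, runScore prints one block per model; field names and
// values below are illustrative only, not real suite output:
//
//	example-7b:
//	  heuristic                 0.812
//	  exact                     0.430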