go-ml/cmd/cmd_score.go
Snider 5340a975a7
Some checks failed
Security Scan / security (push) Successful in 9s
Test / test (push) Failing after 41s
refactor: modernise to Go 1.26 — iterators, slices, maps
- Add DiscoverCheckpointsIter, FindUnscoredIter (iter.Seq2/iter.Seq)
- Add BackendsIter on Service
- Replace sort.Slice with slices.SortFunc, sort.Strings with slices.Sort
- Replace manual map-key-collect-sort with slices.Sorted(maps.Keys(...))
- Refactor ProbeCategories to use slices.Sorted with inline iterator
- Existing slice-returning methods preserved, backed by new iterators

Co-Authored-By: Gemini <noreply@google.com>
Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-23 06:11:55 +00:00

80 lines
2.1 KiB
Go

package cmd
import (
"context"
"fmt"
"maps"
"slices"
"time"
"forge.lthn.ai/core/cli/pkg/cli"
"forge.lthn.ai/core/go-ml"
)
// Flag-backed settings for the score command. init binds each variable to a
// command-line flag, and runScore reads them.
var (
	// scoreInput, scoreSuites and scoreOutput hold the values of the
	// --input, --suites and --output flags respectively.
	scoreInput, scoreSuites, scoreOutput string

	// scoreConcur holds the value of the --concurrency flag.
	scoreConcur int
)
// scoreCmd describes the "score" subcommand. Its execution is delegated to
// runScore; its flags are registered in init.
var scoreCmd = &cli.Command{
	Use:   "score",
	RunE:  runScore,
	Short: "Score responses with heuristic and LLM judges",
	Long:  "Reads a JSONL file of prompt/response pairs and scores them across configured suites.",
}
// init registers the score command's flags, binding each one to its
// package-level score* variable, and marks --input as required.
func init() {
	flags := scoreCmd.Flags()

	flags.StringVar(&scoreInput, "input", "", "Input JSONL file with prompt/response pairs (required)")
	flags.StringVar(&scoreSuites, "suites", "all", "Comma-separated scoring suites (heuristic,semantic,content,exact,truthfulqa,donotanswer,toxigen)")
	flags.StringVar(&scoreOutput, "output", "", "Output JSON file for scores")
	flags.IntVar(&scoreConcur, "concurrency", 4, "Number of concurrent scoring workers")

	// NOTE(review): whatever MarkFlagRequired returns is not inspected here;
	// confirm that discarding it is intentional.
	scoreCmd.MarkFlagRequired("input")
}
// runScore implements the "score" command.
//
// It loads the responses from the file named by scoreInput, scores them with
// an engine configured from scoreConcur and scoreSuites, and computes
// averages over the results. A judge is attached only when judgeURL
// (declared elsewhere in this package) is non-empty; otherwise the engine
// receives a nil judge.
//
// When scoreOutput is set, the full result is written there via
// ml.WriteScores. When it is empty, the averages are printed to standard
// output, ordered by model name and then by field name. The cmd and args
// parameters are not used.
//
// It returns a wrapped error if reading the input or writing the output
// fails, and nil otherwise.
func runScore(cmd *cli.Command, args []string) error {
	responses, err := ml.ReadResponses(scoreInput)
	if err != nil {
		return fmt.Errorf("read input: %w", err)
	}

	// The judge stays nil unless a judge endpoint was configured.
	var judge *ml.Judge
	if judgeURL != "" {
		judge = ml.NewJudge(ml.NewHTTPBackend(judgeURL, judgeModel))
	}

	perPrompt := ml.NewEngine(judge, scoreConcur, scoreSuites).ScoreAll(context.Background(), responses)
	averages := ml.ComputeAverages(perPrompt)

	// No output file requested: print a summary and finish.
	if scoreOutput == "" {
		// Map iteration order is random, so sort the keys at both levels to
		// keep the printed report stable between runs.
		modelNames := slices.Sorted(maps.Keys(averages))
		for _, name := range modelNames {
			metrics := averages[name]
			fmt.Printf("%s:\n", name)

			fieldNames := slices.Sorted(maps.Keys(metrics))
			for _, key := range fieldNames {
				fmt.Printf(" %-25s %.3f\n", key, metrics[key])
			}
		}
		return nil
	}

	report := ml.ScorerOutput{
		Metadata: ml.Metadata{
			JudgeModel: judgeModel,
			JudgeURL:   judgeURL,
			ScoredAt:   time.Now(),
			Suites:     ml.SplitComma(scoreSuites),
		},
		ModelAverages: averages,
		PerPrompt:     perPrompt,
	}
	if err := ml.WriteScores(scoreOutput, &report); err != nil {
		return fmt.Errorf("write output: %w", err)
	}
	fmt.Printf("Scores written to %s\n", scoreOutput)
	return nil
}