LEM/pkg/lem/score_cmd.go
Snider 56eda1a081 refactor: migrate all 25 commands from passthrough to cobra framework
Replace passthrough() + stdlib flag.FlagSet anti-pattern with proper
cobra integration. Every Run* function now takes a typed *Opts struct
and returns error. Flags registered via cli.StringFlag/IntFlag/etc.
Commands participate in Core lifecycle with full cobra flag parsing.

- 6 command groups: gen, score, data, export, infra, mon
- 25 commands converted, 0 passthrough() calls remain
- Delete passthrough() helper from lem.go
- Update export_test.go to use ExportOpts struct

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-23 03:32:53 +00:00

164 lines
3.9 KiB
Go

package lem
import (
"fmt"
"log"
"os"
"time"
)
// ScoreOpts carries the settings consumed by RunScore.
type ScoreOpts struct {
	// Input names the responses to load; it is handed to ReadResponses and
	// RunScore rejects an empty value.
	Input string
	// Suites is forwarded to NewEngine to select what gets scored.
	Suites string
	// JudgeModel is the model name given to the judge client and recorded in
	// the output metadata.
	JudgeModel string
	// JudgeURL is the endpoint given to the judge client and recorded in the
	// output metadata.
	JudgeURL string
	// Concurrency is forwarded to NewEngine.
	Concurrency int
	// Output is the path scores are written to, and read back from when
	// resuming.
	Output string
	// Resume, when true and Output already exists, makes RunScore skip
	// responses whose IDs appear in Output and merge the earlier scores into
	// the new result.
	Resume bool
}
// RunScore loads the responses named by cfg.Input, scores them with a judge
// model, and writes the result to cfg.Output.
//
// When cfg.Resume is set and cfg.Output can be stat'ed, responses whose IDs
// already appear in that file are skipped. After scoring, the file is read a
// second time and its per-model scores are placed ahead of the new ones.
//
// An error is returned when cfg.Input is empty, or when reading the
// responses, reading the previous scores, or writing the output fails.
func RunScore(cfg ScoreOpts) error {
	if cfg.Input == "" {
		return fmt.Errorf("--input is required")
	}

	pending, err := ReadResponses(cfg.Input)
	if err != nil {
		return fmt.Errorf("read responses: %w", err)
	}
	log.Printf("loaded %d responses from %s", len(pending), cfg.Input)

	// Any stat error (not only "does not exist") counts as "no previous output".
	outputExists := func() bool {
		_, statErr := os.Stat(cfg.Output)
		return statErr == nil
	}

	// Resume, pass 1: drop every response the existing output already covers.
	if cfg.Resume && outputExists() {
		prior, readErr := ReadScorerOutput(cfg.Output)
		if readErr != nil {
			return fmt.Errorf("read existing scores for resume: %w", readErr)
		}

		// IDs are collected across all models; the model key is not part of
		// the lookup.
		alreadyScored := map[string]bool{}
		for _, modelScores := range prior.PerPrompt {
			for i := range modelScores {
				alreadyScored[modelScores[i].ID] = true
			}
		}

		var remaining []Response
		for _, resp := range pending {
			if alreadyScored[resp.ID] {
				continue
			}
			remaining = append(remaining, resp)
		}

		skipped := len(pending) - len(remaining)
		log.Printf("resume: skipping %d already-scored, %d remaining",
			skipped, len(remaining))

		pending = remaining
		if len(pending) == 0 {
			log.Println("all responses already scored, nothing to do")
			return nil
		}
	}

	judgeClient := NewClient(cfg.JudgeURL, cfg.JudgeModel)
	judgeClient.MaxTokens = 512
	engine := NewEngine(NewJudge(judgeClient), cfg.Concurrency, cfg.Suites)
	log.Printf("scoring with %s", engine)

	results := engine.ScoreAll(pending)

	// Resume, pass 2: re-read the output file and put its scores in front of
	// the freshly computed ones, model by model.
	if cfg.Resume && outputExists() {
		prior, readErr := ReadScorerOutput(cfg.Output)
		if readErr != nil {
			return fmt.Errorf("re-read scores for merge: %w", readErr)
		}
		for model, earlier := range prior.PerPrompt {
			combined := append(earlier, results[model]...)
			results[model] = combined
		}
	}

	averages := ComputeAverages(results)
	meta := Metadata{
		JudgeModel:    cfg.JudgeModel,
		JudgeURL:      cfg.JudgeURL,
		ScoredAt:      time.Now().UTC(),
		ScorerVersion: "1.0.0",
		Suites:        engine.SuiteNames(),
	}
	out := &ScorerOutput{
		Metadata:      meta,
		ModelAverages: averages,
		PerPrompt:     results,
	}

	if writeErr := WriteScores(cfg.Output, out); writeErr != nil {
		return fmt.Errorf("write scores: %w", writeErr)
	}
	log.Printf("wrote scores to %s", cfg.Output)
	return nil
}
// ProbeOpts carries the settings consumed by RunProbe.
type ProbeOpts struct {
	// Model is the target model name; RunProbe rejects an empty value.
	Model string
	// TargetURL is the endpoint for the target model. When empty, RunProbe
	// uses JudgeURL instead.
	TargetURL string
	// ProbesFile optionally names a file of custom probes, loaded with
	// ReadResponses. When empty, the built-in ContentProbes are used.
	ProbesFile string
	// Suites is forwarded to NewEngine to select what gets scored.
	Suites string
	// JudgeModel is the model name given to the judge client.
	JudgeModel string
	// JudgeURL is the endpoint given to the judge client.
	JudgeURL string
	// Concurrency is forwarded to NewEngine.
	Concurrency int
	// Output is the destination handed to WriteScores.
	Output string
}
// RunProbe probes the target model cfg.Model, scores what it produces with
// the judge model, and writes the result to cfg.Output.
//
// Probes come from cfg.ProbesFile when it is set, otherwise from the built-in
// ContentProbes. The target model is reached at cfg.TargetURL, falling back
// to cfg.JudgeURL when no target URL is given.
//
// An error is returned when cfg.Model is empty, or when reading the probes
// file, probing, or writing the output fails.
func RunProbe(cfg ProbeOpts) error {
	if cfg.Model == "" {
		return fmt.Errorf("--model is required")
	}

	// Default to the judge endpoint; an explicit target URL overrides it.
	targetURL := cfg.JudgeURL
	if cfg.TargetURL != "" {
		targetURL = cfg.TargetURL
	}

	target := NewClient(targetURL, cfg.Model)
	target.MaxTokens = 1024

	judgeClient := NewClient(cfg.JudgeURL, cfg.JudgeModel)
	judgeClient.MaxTokens = 512

	engine := NewEngine(NewJudge(judgeClient), cfg.Concurrency, cfg.Suites)
	prober := NewProber(target, engine)

	var (
		result   *ScorerOutput
		probeErr error
	)
	switch cfg.ProbesFile {
	case "":
		log.Printf("using %d built-in content probes", len(ContentProbes))
		result, probeErr = prober.ProbeContent(cfg.Model)
	default:
		probes, readErr := ReadResponses(cfg.ProbesFile)
		if readErr != nil {
			return fmt.Errorf("read probes: %w", readErr)
		}
		log.Printf("loaded %d custom probes from %s", len(probes), cfg.ProbesFile)
		result, probeErr = prober.ProbeModel(probes, cfg.Model)
	}
	if probeErr != nil {
		return fmt.Errorf("probe: %w", probeErr)
	}

	if writeErr := WriteScores(cfg.Output, result); writeErr != nil {
		return fmt.Errorf("write scores: %w", writeErr)
	}
	log.Printf("wrote scores to %s", cfg.Output)
	return nil
}