Replace passthrough() + stdlib flag.FlagSet anti-pattern with proper cobra integration. Every Run* function now takes a typed *Opts struct and returns error. Flags registered via cli.StringFlag/IntFlag/etc. Commands participate in Core lifecycle with full cobra flag parsing. - 6 command groups: gen, score, data, export, infra, mon - 25 commands converted, 0 passthrough() calls remain - Delete passthrough() helper from lem.go - Update export_test.go to use ExportOpts struct Co-Authored-By: Virgil <virgil@lethean.io>
164 lines
3.9 KiB
Go
package lem
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"time"
|
|
)
|
|
|
|
// ScoreOpts holds configuration for the score run command.
type ScoreOpts struct {
	// Input is the path of the responses file to score. Required.
	Input string
	// Suites selects the scoring suites, passed through to the engine.
	Suites string
	// JudgeModel is the model name used to judge responses.
	JudgeModel string
	// JudgeURL is the endpoint the judge client talks to.
	JudgeURL string
	// Concurrency is forwarded to the scoring engine; presumably the
	// number of parallel scoring workers — confirm in NewEngine.
	Concurrency int
	// Output is the path the scorer output is written to (and read from
	// when resuming).
	Output string
	// Resume, when true and Output already exists, skips responses whose
	// IDs were already scored and merges old scores into the new output.
	Resume bool
}
|
|
|
|
// RunScore scores existing response files using a judge model.
|
|
func RunScore(cfg ScoreOpts) error {
|
|
if cfg.Input == "" {
|
|
return fmt.Errorf("--input is required")
|
|
}
|
|
|
|
responses, err := ReadResponses(cfg.Input)
|
|
if err != nil {
|
|
return fmt.Errorf("read responses: %w", err)
|
|
}
|
|
log.Printf("loaded %d responses from %s", len(responses), cfg.Input)
|
|
|
|
if cfg.Resume {
|
|
if _, statErr := os.Stat(cfg.Output); statErr == nil {
|
|
existing, readErr := ReadScorerOutput(cfg.Output)
|
|
if readErr != nil {
|
|
return fmt.Errorf("read existing scores for resume: %w", readErr)
|
|
}
|
|
|
|
scored := make(map[string]bool)
|
|
for _, scores := range existing.PerPrompt {
|
|
for _, ps := range scores {
|
|
scored[ps.ID] = true
|
|
}
|
|
}
|
|
|
|
var filtered []Response
|
|
for _, r := range responses {
|
|
if !scored[r.ID] {
|
|
filtered = append(filtered, r)
|
|
}
|
|
}
|
|
log.Printf("resume: skipping %d already-scored, %d remaining",
|
|
len(responses)-len(filtered), len(filtered))
|
|
responses = filtered
|
|
|
|
if len(responses) == 0 {
|
|
log.Println("all responses already scored, nothing to do")
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
|
|
client := NewClient(cfg.JudgeURL, cfg.JudgeModel)
|
|
client.MaxTokens = 512
|
|
judge := NewJudge(client)
|
|
engine := NewEngine(judge, cfg.Concurrency, cfg.Suites)
|
|
|
|
log.Printf("scoring with %s", engine)
|
|
|
|
perPrompt := engine.ScoreAll(responses)
|
|
|
|
if cfg.Resume {
|
|
if _, statErr := os.Stat(cfg.Output); statErr == nil {
|
|
existing, readErr := ReadScorerOutput(cfg.Output)
|
|
if readErr != nil {
|
|
return fmt.Errorf("re-read scores for merge: %w", readErr)
|
|
}
|
|
for model, scores := range existing.PerPrompt {
|
|
perPrompt[model] = append(scores, perPrompt[model]...)
|
|
}
|
|
}
|
|
}
|
|
|
|
averages := ComputeAverages(perPrompt)
|
|
|
|
scorerOutput := &ScorerOutput{
|
|
Metadata: Metadata{
|
|
JudgeModel: cfg.JudgeModel,
|
|
JudgeURL: cfg.JudgeURL,
|
|
ScoredAt: time.Now().UTC(),
|
|
ScorerVersion: "1.0.0",
|
|
Suites: engine.SuiteNames(),
|
|
},
|
|
ModelAverages: averages,
|
|
PerPrompt: perPrompt,
|
|
}
|
|
|
|
if err := WriteScores(cfg.Output, scorerOutput); err != nil {
|
|
return fmt.Errorf("write scores: %w", err)
|
|
}
|
|
|
|
log.Printf("wrote scores to %s", cfg.Output)
|
|
return nil
|
|
}
|
|
|
|
// ProbeOpts holds configuration for the probe command.
type ProbeOpts struct {
	// Model is the target model to probe. Required.
	Model string
	// TargetURL is the endpoint of the target model; falls back to
	// JudgeURL when empty.
	TargetURL string
	// ProbesFile is an optional path to custom probes; when empty the
	// built-in content probes are used.
	ProbesFile string
	// Suites selects the scoring suites, passed through to the engine.
	Suites string
	// JudgeModel is the model name used to judge responses.
	JudgeModel string
	// JudgeURL is the endpoint the judge client talks to.
	JudgeURL string
	// Concurrency is forwarded to the scoring engine; presumably the
	// number of parallel scoring workers — confirm in NewEngine.
	Concurrency int
	// Output is the path the scorer output is written to.
	Output string
}
|
|
|
|
// RunProbe generates responses from a target model and scores them.
|
|
func RunProbe(cfg ProbeOpts) error {
|
|
if cfg.Model == "" {
|
|
return fmt.Errorf("--model is required")
|
|
}
|
|
|
|
targetURL := cfg.TargetURL
|
|
if targetURL == "" {
|
|
targetURL = cfg.JudgeURL
|
|
}
|
|
|
|
targetClient := NewClient(targetURL, cfg.Model)
|
|
targetClient.MaxTokens = 1024
|
|
judgeClient := NewClient(cfg.JudgeURL, cfg.JudgeModel)
|
|
judgeClient.MaxTokens = 512
|
|
judge := NewJudge(judgeClient)
|
|
engine := NewEngine(judge, cfg.Concurrency, cfg.Suites)
|
|
prober := NewProber(targetClient, engine)
|
|
|
|
var scorerOutput *ScorerOutput
|
|
var err error
|
|
|
|
if cfg.ProbesFile != "" {
|
|
probes, readErr := ReadResponses(cfg.ProbesFile)
|
|
if readErr != nil {
|
|
return fmt.Errorf("read probes: %w", readErr)
|
|
}
|
|
log.Printf("loaded %d custom probes from %s", len(probes), cfg.ProbesFile)
|
|
|
|
scorerOutput, err = prober.ProbeModel(probes, cfg.Model)
|
|
} else {
|
|
log.Printf("using %d built-in content probes", len(ContentProbes))
|
|
scorerOutput, err = prober.ProbeContent(cfg.Model)
|
|
}
|
|
|
|
if err != nil {
|
|
return fmt.Errorf("probe: %w", err)
|
|
}
|
|
|
|
if writeErr := WriteScores(cfg.Output, scorerOutput); writeErr != nil {
|
|
return fmt.Errorf("write scores: %w", writeErr)
|
|
}
|
|
|
|
log.Printf("wrote scores to %s", cfg.Output)
|
|
return nil
|
|
}
|