package lem

import (
	"fmt"
	"log"
	"os"
	"time"
)

const (
	// judgeMaxTokens caps completion length for the judge model in both
	// the score and probe commands (was an inline 512 in each).
	judgeMaxTokens = 512
	// targetMaxTokens caps completion length for the probed target model.
	targetMaxTokens = 1024
)

// ScoreOpts holds configuration for the score run command.
type ScoreOpts struct {
	Input       string // path to the responses file to score (required)
	Suites      string // suite selector passed through to NewEngine
	JudgeModel  string // model name used for judging
	JudgeURL    string // endpoint for the judge model
	Concurrency int    // worker count passed through to NewEngine
	Output      string // path the scorer output is written to
	Resume      bool   // skip responses already present in Output, then merge
}

// unscoredResponses returns the subset of responses whose IDs are absent
// from scored. Used by resume mode to skip work already on disk.
func unscoredResponses(responses []Response, scored map[string]bool) []Response {
	var remaining []Response
	for _, r := range responses {
		if !scored[r.ID] {
			remaining = append(remaining, r)
		}
	}
	return remaining
}

// RunScore scores existing response files using a judge model.
//
// In resume mode, responses whose IDs already appear in cfg.Output are
// skipped, and previously recorded per-prompt scores are merged (oldest
// first) into the new output before writing.
func RunScore(cfg ScoreOpts) error {
	if cfg.Input == "" {
		return fmt.Errorf("--input is required")
	}

	responses, err := ReadResponses(cfg.Input)
	if err != nil {
		return fmt.Errorf("read responses: %w", err)
	}
	log.Printf("loaded %d responses from %s", len(responses), cfg.Input)

	if cfg.Resume {
		if _, statErr := os.Stat(cfg.Output); statErr == nil {
			existing, readErr := ReadScorerOutput(cfg.Output)
			if readErr != nil {
				return fmt.Errorf("read existing scores for resume: %w", readErr)
			}
			// Collect every already-scored response ID across all models.
			scored := make(map[string]bool)
			for _, scores := range existing.PerPrompt {
				for _, ps := range scores {
					scored[ps.ID] = true
				}
			}
			filtered := unscoredResponses(responses, scored)
			log.Printf("resume: skipping %d already-scored, %d remaining",
				len(responses)-len(filtered), len(filtered))
			responses = filtered
			if len(responses) == 0 {
				log.Println("all responses already scored, nothing to do")
				return nil
			}
		}
	}

	client := NewClient(cfg.JudgeURL, cfg.JudgeModel)
	client.MaxTokens = judgeMaxTokens
	judge := NewJudge(client)
	engine := NewEngine(judge, cfg.Concurrency, cfg.Suites)
	log.Printf("scoring with %s", engine)

	perPrompt := engine.ScoreAll(responses)

	if cfg.Resume {
		// NOTE(review): cfg.Output is read a second time here; nothing has
		// written to it since the read above, so consolidating the two reads
		// looks safe — kept as-is to preserve behavior exactly.
		if _, statErr := os.Stat(cfg.Output); statErr == nil {
			existing, readErr := ReadScorerOutput(cfg.Output)
			if readErr != nil {
				return fmt.Errorf("re-read scores for merge: %w", readErr)
			}
			for model, scores := range existing.PerPrompt {
				// Prepend prior scores so earlier runs come first.
				perPrompt[model] = append(scores, perPrompt[model]...)
			}
		}
	}

	averages := ComputeAverages(perPrompt)
	scorerOutput := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    cfg.JudgeModel,
			JudgeURL:      cfg.JudgeURL,
			ScoredAt:      time.Now().UTC(),
			ScorerVersion: "1.0.0",
			Suites:        engine.SuiteNames(),
		},
		ModelAverages: averages,
		PerPrompt:     perPrompt,
	}
	if err := WriteScores(cfg.Output, scorerOutput); err != nil {
		return fmt.Errorf("write scores: %w", err)
	}
	log.Printf("wrote scores to %s", cfg.Output)
	return nil
}

// ProbeOpts holds configuration for the probe command.
type ProbeOpts struct {
	Model       string // target model to probe (required)
	TargetURL   string // endpoint for the target model; falls back to JudgeURL
	ProbesFile  string // optional custom probes file; built-ins used when empty
	Suites      string // suite selector passed through to NewEngine
	JudgeModel  string // model name used for judging
	JudgeURL    string // endpoint for the judge model
	Concurrency int    // worker count passed through to NewEngine
	Output      string // path the scorer output is written to
}

// RunProbe generates responses from a target model and scores them.
//
// When cfg.ProbesFile is set, probes are loaded from it; otherwise the
// built-in ContentProbes are used. The scored output is written to
// cfg.Output.
func RunProbe(cfg ProbeOpts) error {
	if cfg.Model == "" {
		return fmt.Errorf("--model is required")
	}

	// Default the target endpoint to the judge endpoint when unset.
	targetURL := cfg.TargetURL
	if targetURL == "" {
		targetURL = cfg.JudgeURL
	}

	targetClient := NewClient(targetURL, cfg.Model)
	targetClient.MaxTokens = targetMaxTokens
	judgeClient := NewClient(cfg.JudgeURL, cfg.JudgeModel)
	judgeClient.MaxTokens = judgeMaxTokens
	judge := NewJudge(judgeClient)
	engine := NewEngine(judge, cfg.Concurrency, cfg.Suites)
	prober := NewProber(targetClient, engine)

	var scorerOutput *ScorerOutput
	var err error
	if cfg.ProbesFile != "" {
		probes, readErr := ReadResponses(cfg.ProbesFile)
		if readErr != nil {
			return fmt.Errorf("read probes: %w", readErr)
		}
		log.Printf("loaded %d custom probes from %s", len(probes), cfg.ProbesFile)
		scorerOutput, err = prober.ProbeModel(probes, cfg.Model)
	} else {
		log.Printf("using %d built-in content probes", len(ContentProbes))
		scorerOutput, err = prober.ProbeContent(cfg.Model)
	}
	if err != nil {
		return fmt.Errorf("probe: %w", err)
	}

	if writeErr := WriteScores(cfg.Output, scorerOutput); writeErr != nil {
		return fmt.Errorf("write scores: %w", writeErr)
	}
	log.Printf("wrote scores to %s", cfg.Output)
	return nil
}