// LEM/pkg/lem/score_cmd.go

package lem

import (
	"fmt"
	"log"
	"os"
	"time"
)

// ScoreOpts carries the settings consumed by RunScore.
type ScoreOpts struct {
	// Input is the path of the responses file to score. RunScore rejects an
	// empty value.
	Input string

	// Suites is passed to NewEngine when the scoring engine is built.
	Suites string

	// JudgeModel and JudgeURL are handed to NewClient to create the judge
	// client; both are also recorded in the output metadata.
	JudgeModel, JudgeURL string

	// Concurrency is passed to NewEngine when the scoring engine is built.
	Concurrency int

	// Output is the path the scores are written to and, when resuming, the
	// path earlier scores are read from.
	Output string

	// Resume, when true and Output already exists, makes RunScore skip
	// responses whose ID is already present in Output and merge the earlier
	// scores into the result.
	Resume bool
}

// RunScore scores existing response files using a judge model.
//
// It reads the responses at cfg.Input, scores them with an engine built from
// the judge settings in cfg, computes averages over the per-prompt scores and
// writes the result to cfg.Output.
//
// When cfg.Resume is set and cfg.Output already exists, that file is read
// once: responses whose ID already appears in it are skipped, and its
// per-prompt scores are placed ahead of the newly produced ones before
// averaging and writing. If every response is already scored, nothing is
// written and nil is returned.
//
// An error is returned when cfg.Input is empty, when cfg.Output cannot be
// inspected for resume for any reason other than not existing, or when
// reading or writing any of the files fails.
func RunScore(cfg ScoreOpts) error {
	if cfg.Input == "" {
		return fmt.Errorf("--input is required")
	}

	responses, err := ReadResponses(cfg.Input)
	if err != nil {
		return fmt.Errorf("read responses: %w", err)
	}
	log.Printf("loaded %d responses from %s", len(responses), cfg.Input)

	// prior holds the per-prompt scores found in cfg.Output when resuming and
	// stays nil otherwise. It is declared through the ScorerOutput field so
	// that it always has exactly the type of the data merged further down.
	prior := ScorerOutput{}.PerPrompt

	if cfg.Resume {
		_, statErr := os.Stat(cfg.Output)
		switch {
		case statErr == nil:
			// Read the previous output a single time; the same snapshot
			// drives both the filtering here and the merge after scoring.
			existing, readErr := ReadScorerOutput(cfg.Output)
			if readErr != nil {
				return fmt.Errorf("read existing scores for resume: %w", readErr)
			}
			prior = existing.PerPrompt

			// NOTE(review): IDs are matched regardless of the key they were
			// stored under — confirm IDs are unique across those keys.
			scored := make(map[string]bool)
			for _, scores := range prior {
				for _, ps := range scores {
					scored[ps.ID] = true
				}
			}

			var filtered []Response
			for _, r := range responses {
				if !scored[r.ID] {
					filtered = append(filtered, r)
				}
			}
			log.Printf("resume: skipping %d already-scored, %d remaining",
				len(responses)-len(filtered), len(filtered))
			responses = filtered

			if len(responses) == 0 {
				log.Println("all responses already scored, nothing to do")
				return nil
			}

		case !os.IsNotExist(statErr):
			// The file may exist but cannot be inspected. Scoring everything
			// again and overwriting it could discard earlier results, so stop
			// before any judge calls are made.
			return fmt.Errorf("stat existing scores for resume: %w", statErr)
		}
		// A missing output file simply means there is nothing to resume from.
	}

	client := NewClient(cfg.JudgeURL, cfg.JudgeModel)
	client.MaxTokens = 512
	judge := NewJudge(client)
	engine := NewEngine(judge, cfg.Concurrency, cfg.Suites)

	log.Printf("scoring with %s", engine)

	perPrompt := engine.ScoreAll(responses)

	// Earlier scores go first, followed by the ones produced in this run.
	// Ranging over a nil prior is a no-op, so this needs no resume check.
	for model, scores := range prior {
		perPrompt[model] = append(scores, perPrompt[model]...)
	}

	averages := ComputeAverages(perPrompt)

	scorerOutput := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    cfg.JudgeModel,
			JudgeURL:      cfg.JudgeURL,
			ScoredAt:      time.Now().UTC(),
			ScorerVersion: "1.0.0",
			Suites:        engine.SuiteNames(),
		},
		ModelAverages: averages,
		PerPrompt:     perPrompt,
	}

	if err := WriteScores(cfg.Output, scorerOutput); err != nil {
		return fmt.Errorf("write scores: %w", err)
	}

	log.Printf("wrote scores to %s", cfg.Output)
	return nil
}

// ProbeOpts carries the settings consumed by RunProbe.
type ProbeOpts struct {
	// Model is the name of the target model to probe. RunProbe rejects an
	// empty value.
	Model string

	// TargetURL is the endpoint used for the target model client. When it is
	// empty, RunProbe uses JudgeURL instead.
	TargetURL string

	// ProbesFile is an optional path read with ReadResponses. When it is
	// empty, RunProbe uses the built-in ContentProbes.
	ProbesFile string

	// Suites is passed to NewEngine when the scoring engine is built.
	Suites string

	// JudgeModel and JudgeURL are handed to NewClient to create the judge
	// client.
	JudgeModel, JudgeURL string

	// Concurrency is passed to NewEngine when the scoring engine is built.
	Concurrency int

	// Output is the path the scores are written to.
	Output string
}

// RunProbe queries a target model with a set of probes, scores what comes
// back with the judge model, and writes the scores to cfg.Output.
//
// The target client talks to cfg.TargetURL, or to cfg.JudgeURL when no target
// URL is given. Probes are read from cfg.ProbesFile when it is set; otherwise
// the built-in ContentProbes are used.
//
// An error is returned when cfg.Model is empty, when the probes file cannot
// be read, when probing fails, or when the scores cannot be written.
func RunProbe(cfg ProbeOpts) error {
	if cfg.Model == "" {
		return fmt.Errorf("--model is required")
	}

	// Default to the judge endpoint unless a dedicated target URL is given.
	endpoint := cfg.JudgeURL
	if cfg.TargetURL != "" {
		endpoint = cfg.TargetURL
	}

	target := NewClient(endpoint, cfg.Model)
	target.MaxTokens = 1024

	judgeClient := NewClient(cfg.JudgeURL, cfg.JudgeModel)
	judgeClient.MaxTokens = 512

	prober := NewProber(target, NewEngine(NewJudge(judgeClient), cfg.Concurrency, cfg.Suites))

	var (
		result   *ScorerOutput
		probeErr error
	)
	switch cfg.ProbesFile {
	case "":
		log.Printf("using %d built-in content probes", len(ContentProbes))
		result, probeErr = prober.ProbeContent(cfg.Model)
	default:
		probes, err := ReadResponses(cfg.ProbesFile)
		if err != nil {
			return fmt.Errorf("read probes: %w", err)
		}
		log.Printf("loaded %d custom probes from %s", len(probes), cfg.ProbesFile)
		result, probeErr = prober.ProbeModel(probes, cfg.Model)
	}
	if probeErr != nil {
		return fmt.Errorf("probe: %w", probeErr)
	}

	if err := WriteScores(cfg.Output, result); err != nil {
		return fmt.Errorf("write scores: %w", err)
	}

	log.Printf("wrote scores to %s", cfg.Output)
	return nil
}