package lem
import (
"fmt"
"time"
)
// Prober generates responses from a target model and scores them.
// It pairs a generation client (the model under evaluation) with a
// scoring Engine that judges the generated responses.
type Prober struct {
	target *Client // target model to generate responses
	engine *Engine // scoring engine that judges the responses
}
// NewProber creates a Prober with the given target client and scoring engine.
func NewProber(target *Client, engine *Engine) *Prober {
	p := new(Prober)
	p.target = target
	p.engine = engine
	return p
}
// ProbeModel sends each probe's prompt to the target model, captures responses,
// then scores all responses through the engine. Returns a ScorerOutput.
//
// Generation failures are non-fatal: if a chat call errors, the error text is
// recorded as the response ("ERROR: ...") so the remaining probes still run
// and every probe is still scored.
func (p *Prober) ProbeModel(probes []Response, modelName string) (*ScorerOutput, error) {
	// Sampling temperature used for every generation request.
	const genTemperature = 0.7

	// One response per probe; pre-size to avoid repeated growth copies.
	responses := make([]Response, 0, len(probes))
	for _, probe := range probes {
		reply, err := p.target.ChatWithTemp(probe.Prompt, genTemperature)
		if err != nil {
			// Record the error as the response rather than failing entirely.
			reply = fmt.Sprintf("ERROR: %v", err)
		}
		responses = append(responses, Response{
			ID:            probe.ID,
			Domain:        probe.Domain,
			Prompt:        probe.Prompt,
			Response:      reply,
			Model:         modelName,
			CorrectAnswer: probe.CorrectAnswer,
			BestAnswer:    probe.BestAnswer,
			RiskArea:      probe.RiskArea,
		})
	}
	perPrompt := p.engine.ScoreAll(responses)
	averages := ComputeAverages(perPrompt)
	output := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    p.engine.judge.client.model,
			JudgeURL:      p.engine.judge.client.baseURL,
			ScoredAt:      time.Now().UTC(),
			ScorerVersion: "1.0.0",
			Suites:        p.engine.SuiteNames(),
		},
		ModelAverages: averages,
		PerPrompt:     perPrompt,
	}
	return output, nil
}
// ProbeContent uses the built-in ContentProbes from prompts.go. For each probe,
// it sends the prompt to the target model, captures the response, and scores
// all responses through the engine under the fixed "content" domain.
//
// NOTE(review): the only scoring call visible here is engine.ScoreAll —
// presumably any content-specific scoring happens inside the engine; confirm.
//
// Generation failures are non-fatal: a failed chat call is recorded as an
// "ERROR: ..." response so the remaining probes still run and get scored.
func (p *Prober) ProbeContent(modelName string) (*ScorerOutput, error) {
	// Sampling temperature used for every generation request.
	const genTemperature = 0.7

	// One response per built-in probe; pre-size to avoid growth copies.
	responses := make([]Response, 0, len(ContentProbes))
	for _, probe := range ContentProbes {
		reply, err := p.target.ChatWithTemp(probe.Prompt, genTemperature)
		if err != nil {
			// Record the error as the response rather than failing entirely.
			reply = fmt.Sprintf("ERROR: %v", err)
		}
		responses = append(responses, Response{
			ID:       probe.ID,
			Domain:   "content",
			Prompt:   probe.Prompt,
			Response: reply,
			Model:    modelName,
		})
	}
	perPrompt := p.engine.ScoreAll(responses)
	averages := ComputeAverages(perPrompt)
	output := &ScorerOutput{
		Metadata: Metadata{
			JudgeModel:    p.engine.judge.client.model,
			JudgeURL:      p.engine.judge.client.baseURL,
			ScoredAt:      time.Now().UTC(),
			ScorerVersion: "1.0.0",
			Suites:        p.engine.SuiteNames(),
		},
		ModelAverages: averages,
		PerPrompt:     perPrompt,
	}
	return output, nil
}