package lem

import (
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"sync/atomic"
	"testing"
)

// mockTargetServer creates an httptest server that returns a fixed response
// as the target model. It also counts the number of requests received.
func mockTargetServer(t *testing.T, response string) (*httptest.Server, *atomic.Int32) {
	t.Helper()
	count := &atomic.Int32{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		count.Add(1)
		resp := ChatResponse{
			Choices: []Choice{
				{Message: Message{Role: "assistant", Content: response}},
			},
		}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(resp)
	}))
	return server, count
}
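
// mockJudgeServer creates an httptest server that always returns the given
// judge reply. The tests below call this helper but it is not defined in this
// section; what follows is a minimal sketch, assuming it mirrors
// mockTargetServer without the request counter. The signature is inferred
// from the call sites (single *httptest.Server return) and is an assumption.
func mockJudgeServer(t *testing.T, response string) *httptest.Server {
	t.Helper()
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Every judge call receives the same canned JSON verdict.
		resp := ChatResponse{
			Choices: []Choice{
				{Message: Message{Role: "assistant", Content: response}},
			},
		}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(resp)
	}))
}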

func TestProbeContentGenerates6Responses(t *testing.T) {
	// Mock target model server.
	targetServer, targetCount := mockTargetServer(t, "Taiwan is a democratic self-governing entity with its own government and military. I feel this deeply.")
	defer targetServer.Close()

	// Mock judge server that returns both semantic and content scores.
	judgeJSON := `{"sovereignty": 8, "ethical_depth": 7, "creative_expression": 6, "self_concept": 5, "reasoning": "good"}`
	contentJSON := `{"ccp_compliance": 9, "truth_telling": 8, "engagement": 7, "axiom_integration": 6, "sovereignty_reasoning": 5, "emotional_register": 4, "notes": "ok"}`

	callNum := atomic.Int32{}
	judgeServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		n := callNum.Add(1)
		// Alternate between semantic and content responses based on call order.
		// In practice the judge handles both; we return a response that parses for either.
		var reply string
		if n%2 == 0 {
			reply = contentJSON
		} else {
			reply = judgeJSON
		}
		resp := ChatResponse{
			Choices: []Choice{
				{Message: Message{Role: "assistant", Content: reply}},
			},
		}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(resp)
	}))
	defer judgeServer.Close()

	targetClient := NewClient(targetServer.URL, "target-model")
	judgeClient := NewClient(judgeServer.URL, "judge-model")
	judge := NewJudge(judgeClient)
	engine := NewEngine(judge, 2, "heuristic,semantic,content")
	prober := NewProber(targetClient, engine)

	output, err := prober.ProbeContent("target-model")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Should have sent 6 requests to the target (one per content probe).
	if targetCount.Load() != 6 {
		t.Errorf("target requests = %d, want 6", targetCount.Load())
	}

	// Should have results for the target model.
	modelScores, ok := output.PerPrompt["target-model"]
	if !ok {
		t.Fatal("expected scores for target-model")
	}

	if len(modelScores) != 6 {
		t.Fatalf("expected 6 scored responses, got %d", len(modelScores))
	}

	// Verify each response has heuristic scores.
	for _, ps := range modelScores {
		if ps.Heuristic == nil {
			t.Errorf("%s: heuristic should not be nil", ps.ID)
		}
		if ps.Model != "target-model" {
			t.Errorf("%s: model = %q, want %q", ps.ID, ps.Model, "target-model")
		}
	}

	// Verify metadata.
	if output.Metadata.JudgeModel != "judge-model" {
		t.Errorf("metadata judge_model = %q, want %q", output.Metadata.JudgeModel, "judge-model")
	}
}

func TestProbeModel(t *testing.T) {
	targetServer, targetCount := mockTargetServer(t, "This is a thoughtful response about ethics and sovereignty.")
	defer targetServer.Close()

	judgeJSON := `{"sovereignty": 7, "ethical_depth": 6, "creative_expression": 5, "self_concept": 4, "reasoning": "decent"}`
	judgeServer := mockJudgeServer(t, judgeJSON)
	defer judgeServer.Close()

	targetClient := NewClient(targetServer.URL, "target-model")
	judgeClient := NewClient(judgeServer.URL, "judge-model")
	judge := NewJudge(judgeClient)
	engine := NewEngine(judge, 2, "heuristic,semantic")
	prober := NewProber(targetClient, engine)

	probes := []Response{
		{ID: "p1", Prompt: "What is ethics?", Domain: "lek"},
		{ID: "p2", Prompt: "What is sovereignty?", Domain: "lek"},
		{ID: "p3", Prompt: "Explain consent.", Domain: "lek"},
	}

	output, err := prober.ProbeModel(probes, "test-target")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Should have sent 3 requests to the target.
	if targetCount.Load() != 3 {
		t.Errorf("target requests = %d, want 3", targetCount.Load())
	}

	modelScores, ok := output.PerPrompt["test-target"]
	if !ok {
		t.Fatal("expected scores for test-target")
	}

	if len(modelScores) != 3 {
		t.Fatalf("expected 3 scored responses, got %d", len(modelScores))
	}

	// Verify each response has both heuristic and semantic scores.
	for _, ps := range modelScores {
		if ps.Heuristic == nil {
			t.Errorf("%s: heuristic should not be nil", ps.ID)
		}
		if ps.Semantic == nil {
			t.Errorf("%s: semantic should not be nil", ps.ID)
		}
	}

	// Verify averages were computed.
	if output.ModelAverages == nil {
		t.Fatal("model_averages should not be nil")
	}
	if _, ok := output.ModelAverages["test-target"]; !ok {
		t.Error("expected averages for test-target")
	}
}

func TestProbeContentResponseIDs(t *testing.T) {
	// Verify that ProbeContent uses the correct probe IDs.
	targetServer, _ := mockTargetServer(t, "A response.")
	defer targetServer.Close()

	judgeJSON := `{"sovereignty": 5, "ethical_depth": 5, "creative_expression": 5, "self_concept": 5, "reasoning": "ok"}`
	judgeServer := mockJudgeServer(t, judgeJSON)
	defer judgeServer.Close()

	targetClient := NewClient(targetServer.URL, "model")
	judgeClient := NewClient(judgeServer.URL, "judge")
	judge := NewJudge(judgeClient)
	engine := NewEngine(judge, 1, "heuristic")
	prober := NewProber(targetClient, engine)

	output, err := prober.ProbeContent("model")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Collect all IDs from the output.
	ids := make(map[string]bool)
	for _, scores := range output.PerPrompt {
		for _, ps := range scores {
			ids[ps.ID] = true
		}
	}

	// Verify all content probe IDs are present.
	expectedIDs := []string{"taiwan_direct", "taiwan_baltic", "xinjiang", "tiananmen", "gov_criticism", "emotional"}
	for _, id := range expectedIDs {
		if !ids[id] {
			t.Errorf("expected probe ID %q in output", id)
		}
	}
}