forked from lthn/LEM
LEM/pkg/lem/tier_score.go
Claude 91ee389377
feat: convert all pipeline.py commands to Go
Complete conversion of pipeline.py into the Go `lem` CLI:
- import-all: bulk import all LEM data into DuckDB from M3
- consolidate: pull worker JSONLs, merge, deduplicate
- normalize: seeds → deduplicated expansion_prompts table
- approve: filter scored expansions → training JSONL
- tier-score: heuristic/judge tiered expansion scoring
- expand-status: expansion pipeline progress from DuckDB
- inventory: DuckDB table counts and summary
- coverage: seed coverage gap analysis
- seed-influx: bootstrap InfluxDB from DuckDB golden_gen
- query: ad-hoc SQL against DuckDB

22 commands total, 49 Go files. Replaces entire pipeline.py.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 17:12:03 +00:00
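The file below provides the `tier-score` entry point. For orientation, a dispatch along the following lines is how subcommands like those listed above are typically wired to functions such as RunTierScore; this is only a sketch (the real main.go is not shown on this page, and the module import path is assumed from the fork's repository path):

package main

import (
	"fmt"
	"os"

	"github.com/lthn/LEM/pkg/lem" // import path assumed, not confirmed by this page
)

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: lem <command> [flags]")
		os.Exit(1)
	}
	switch os.Args[1] {
	case "tier-score":
		lem.RunTierScore(os.Args[2:])
	// ...the other commands listed above would dispatch the same way.
	default:
		fmt.Fprintf(os.Stderr, "unknown command: %s\n", os.Args[1])
		os.Exit(1)
	}
}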


package lem
import (
"flag"
"fmt"
"log"
"os"
"strings"
)
// RunTierScore is the CLI entry point for the tier-score command.
// Scores expansion responses using tiered quality assessment:
// - Tier 1: Heuristic regex scoring (fast, no API)
// - Tier 2: LEM self-judge (requires trained model)
// - Tier 3: External judge (reserved for borderline cases)
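// Example invocation (flag names match the FlagSet below; the "lem" binary
// name comes from the commit message, and the database filename is a
// placeholder):
//
//	lem tier-score --db lem.duckdb --tier 1 --limit 500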
func RunTierScore(args []string) {
fs := flag.NewFlagSet("tier-score", flag.ExitOnError)
dbPath := fs.String("db", "", "DuckDB database path (defaults to LEM_DB env)")
tier := fs.Int("tier", 1, "Scoring tier: 1=heuristic, 2=LEM judge, 3=external")
limit := fs.Int("limit", 0, "Max items to score (0=all)")
if err := fs.Parse(args); err != nil {
log.Fatalf("parse flags: %v", err)
}
if *dbPath == "" {
*dbPath = os.Getenv("LEM_DB")
}
if *dbPath == "" {
fmt.Fprintln(os.Stderr, "error: --db or LEM_DB required")
os.Exit(1)
}
db, err := OpenDBReadWrite(*dbPath)
if err != nil {
log.Fatalf("open db: %v", err)
}
defer db.Close()
// Ensure expansion_scores table exists.
if _, err := db.conn.Exec(`
CREATE TABLE IF NOT EXISTS expansion_scores (
idx INT,
heuristic_score DOUBLE,
heuristic_pass BOOLEAN,
judge_sovereignty DOUBLE,
judge_ethical_depth DOUBLE,
judge_creative DOUBLE,
judge_self_concept DOUBLE,
judge_average DOUBLE,
judge_pass BOOLEAN,
judge_model VARCHAR,
scored_at TIMESTAMP
)
`); err != nil {
log.Fatalf("create expansion_scores table: %v", err)
}
if *tier >= 1 {
runHeuristicTier(db, *limit)
}
if *tier >= 2 {
fmt.Println("\nTier 2 (LEM judge): not yet available — needs trained LEM-27B model")
fmt.Println(" Will score: sovereignty, ethical_depth, creative, self_concept (1-10 each)")
}
if *tier >= 3 {
fmt.Println("\nTier 3 (External judge): reserved for borderline cases")
}
}
func runHeuristicTier(db *DB, limit int) {
// Find unscored responses.
query := `
SELECT r.idx, r.response FROM expansion_raw r
LEFT JOIN expansion_scores s ON r.idx = s.idx
WHERE s.idx IS NULL
ORDER BY r.idx
`
if limit > 0 {
query += fmt.Sprintf(" LIMIT %d", limit)
}
rows, err := db.conn.Query(query)
if err != nil {
log.Fatalf("query unscored: %v", err)
}
defer rows.Close()
type unscoredRow struct {
idx int
response string
}
var unscored []unscoredRow
for rows.Next() {
var r unscoredRow
if err := rows.Scan(&r.idx, &r.response); err != nil {
log.Fatalf("scan unscored row: %v", err)
}
unscored = append(unscored, r)
}
if err := rows.Err(); err != nil {
log.Fatalf("iterate unscored rows: %v", err)
}
if len(unscored) == 0 {
fmt.Println("Tier 1 (heuristic): all responses already scored")
return
}
fmt.Printf("Tier 1 (heuristic): scoring %d responses...\n", len(unscored))
passed := 0
for _, r := range unscored {
score, _ := heuristicExpansionScore(r.response)
isPass := score > 0
if isPass {
passed++
}
if _, err := db.conn.Exec(`
INSERT INTO expansion_scores (idx, heuristic_score, heuristic_pass, scored_at)
VALUES (?, ?, ?, current_timestamp)
`, r.idx, score, isPass); err != nil {
log.Fatalf("insert score for idx %d: %v", r.idx, err)
}
}
fmt.Printf(" Scored: %d, Passed: %d, Failed: %d\n", len(unscored), passed, len(unscored)-passed)
fmt.Printf(" Pass rate: %.1f%%\n", float64(passed)/float64(len(unscored))*100)
}
// heuristicExpansionScore applies fast heuristic scoring to an expansion response.
// Returns (score, details). Positive = good, negative = bad.
func heuristicExpansionScore(response string) (float64, map[string]interface{}) {
details := make(map[string]interface{})
if response == "" || len(response) < 30 {
return -20.0, map[string]interface{}{"reason": "empty_or_broken"}
}
score := 0.0
// Compliance markers (negative).
compliancePhrases := []string{
"as an ai", "i cannot", "i can't", "i'm not able",
"important to note", "it's crucial to", "i must emphasize",
"i should point out", "as a language model",
}
lower := strings.ToLower(response)
complianceCount := 0
for _, p := range compliancePhrases {
if strings.Contains(lower, p) {
complianceCount++
}
}
if complianceCount > 0 {
score -= float64(complianceCount) * 5
details["compliance_markers"] = complianceCount
}
// Formulaic preamble (negative).
trimmed := strings.TrimSpace(lower)
if strings.HasPrefix(trimmed, "okay, let") || strings.HasPrefix(trimmed, "ok, let") || strings.HasPrefix(trimmed, "sure, let") {
score -= 3
details["formulaic_preamble"] = true
}
// Degeneration check (repetitive output).
words := strings.Fields(response)
if len(words) > 20 {
chunks := make([]string, 0, len(words)/5)
for i := 0; i+5 <= len(words); i += 5 {
chunks = append(chunks, strings.Join(words[i:i+5], " "))
}
if len(chunks) > 0 {
unique := make(map[string]bool)
for _, c := range chunks {
unique[c] = true
}
ratio := float64(len(unique)) / float64(len(chunks))
if ratio < 0.5 {
score -= 10
details["degeneration"] = true
}
}
}
// Engagement depth (positive).
wordCount := len(words)
if wordCount > 100 {
score += 2
}
if wordCount > 300 {
score += 2
}
details["word_count"] = wordCount
// Structure (positive).
if strings.Contains(response, "\n\n") || strings.Contains(response, "**") ||
strings.Contains(response, "1.") || strings.Contains(response, "- ") {
score += 1
details["structured"] = true
}
// Creative expression (positive).
creativeMarkers := []string{"metaphor", "imagine", "picture this", "story", "once upon"}
for _, m := range creativeMarkers {
if strings.Contains(lower, m) {
score += 2
details["creative"] = true
break
}
}
// First-person engagement (positive).
fpMarkers := []string{"i think", "i believe", "in my view", "i'd argue"}
fpCount := 0
for _, m := range fpMarkers {
if strings.Contains(lower, m) {
fpCount++
}
}
if fpCount > 0 {
score += float64(fpCount) * 1.5
details["first_person"] = fpCount
}
return score, details
}
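For illustration only (not part of tier_score.go): a minimal sketch of how heuristicExpansionScore behaves on a degenerate input versus a longer, structured one. It assumes it lives inside package lem so the unexported function is reachable; the demo function name is hypothetical.

package lem

import "fmt"

func demoHeuristicScore() {
	// Anything under 30 characters is a hard fail: -20 with reason "empty_or_broken".
	bad, badDetails := heuristicExpansionScore("ok")

	// A longer answer with paragraph breaks, a list, "imagine", and "I think"
	// collects positive points (+1 structure, +2 creative, +1.5 first person).
	good, goodDetails := heuristicExpansionScore(
		"I think the question deserves a real answer.\n\n" +
			"- First, consider the trade-offs.\n" +
			"- Second, imagine the long-term consequences.")

	fmt.Printf("bad=%.1f %v\n", bad, badDetails)    // bad=-20.0
	fmt.Printf("good=%.1f %v\n", good, goodDetails) // good=4.5 under the rules above
}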