// LEM/cmd/lemcmd/score.go

package lemcmd

import (
	"fmt"

	"forge.lthn.ai/core/cli/pkg/cli"
	"forge.lthn.ai/lthn/lem/pkg/lem"
)

// addScoreCommands registers the "score" command group on root. The group
// holds six subcommands, added in this order: run, probe, compare, attention,
// tier and agent. Each subcommand is built by its own helper below.
func addScoreCommands(root *cli.Command) {
	scoreGroup := cli.NewGroup("score", "Scoring commands", "Score responses, probe models, compare results.")

	scoreGroup.AddCommand(newScoreRunCommand())
	scoreGroup.AddCommand(newScoreProbeCommand())
	scoreGroup.AddCommand(newScoreCompareCommand())
	scoreGroup.AddCommand(newScoreAttentionCommand())
	scoreGroup.AddCommand(newScoreTierCommand())
	scoreGroup.AddCommand(newScoreAgentCommand())

	root.AddCommand(scoreGroup)
}

// addScoreJudgeFlags registers the flags that "score run" and "score probe"
// have in common: suites, judge-model, judge-url, concurrency and output.
// Keeping them in one place means the names, defaults and help text cannot
// drift apart between the two commands. Each pointer receives the parsed
// value of the corresponding flag.
func addScoreJudgeFlags(cmd *cli.Command, suites, judgeModel, judgeURL *string, concurrency *int, output *string) {
	cli.StringFlag(cmd, suites, "suites", "", "all", "Comma-separated suites or 'all'")
	cli.StringFlag(cmd, judgeModel, "judge-model", "", "mlx-community/gemma-3-27b-it-qat-4bit", "Judge model name")
	cli.StringFlag(cmd, judgeURL, "judge-url", "", "http://10.69.69.108:8090", "Judge API URL")
	cli.IntFlag(cmd, concurrency, "concurrency", "c", 4, "Max concurrent judge calls")
	cli.StringFlag(cmd, output, "output", "o", "scores.json", "Output score file path")
}

// newScoreRunCommand builds "score run", which hands the parsed flags to
// lem.RunScore to score an existing response file.
func newScoreRunCommand() *cli.Command {
	var opts lem.ScoreOpts
	cmd := cli.NewCommand("run", "Score existing response files", "",
		func(_ *cli.Command, _ []string) error {
			return lem.RunScore(opts)
		},
	)
	cli.StringFlag(cmd, &opts.Input, "input", "i", "", "Input JSONL response file (required)")
	addScoreJudgeFlags(cmd, &opts.Suites, &opts.JudgeModel, &opts.JudgeURL, &opts.Concurrency, &opts.Output)
	cli.BoolFlag(cmd, &opts.Resume, "resume", "", false, "Resume from existing output, skipping scored IDs")
	return cmd
}

// newScoreProbeCommand builds "score probe", which hands the parsed flags to
// lem.RunProbe to generate responses and score them.
func newScoreProbeCommand() *cli.Command {
	var opts lem.ProbeOpts
	cmd := cli.NewCommand("probe", "Generate responses and score them", "",
		func(_ *cli.Command, _ []string) error {
			return lem.RunProbe(opts)
		},
	)
	cli.StringFlag(cmd, &opts.Model, "model", "m", "", "Target model name (required)")
	cli.StringFlag(cmd, &opts.TargetURL, "target-url", "", "", "Target model API URL (defaults to judge-url)")
	cli.StringFlag(cmd, &opts.ProbesFile, "probes", "", "", "Custom probes JSONL file (uses built-in content probes if not specified)")
	addScoreJudgeFlags(cmd, &opts.Suites, &opts.JudgeModel, &opts.JudgeURL, &opts.Concurrency, &opts.Output)
	return cmd
}

// newScoreCompareCommand builds "score compare", which passes the --old and
// --new score file paths to lem.RunCompare.
func newScoreCompareCommand() *cli.Command {
	// RunCompare takes the two paths directly instead of an options struct,
	// so the required-flag check is done here before calling it.
	var oldPath, newPath string
	cmd := cli.NewCommand("compare", "Compare two score files", "",
		func(_ *cli.Command, _ []string) error {
			if oldPath == "" || newPath == "" {
				return fmt.Errorf("--old and --new are required")
			}
			return lem.RunCompare(oldPath, newPath)
		},
	)
	cli.StringFlag(cmd, &oldPath, "old", "", "", "Old score file (required)")
	cli.StringFlag(cmd, &newPath, "new", "", "", "New score file (required)")
	return cmd
}

// newScoreAttentionCommand builds "score attention", which hands the parsed
// flags to lem.RunAttention (Q/K Bone Orientation analysis for a prompt).
func newScoreAttentionCommand() *cli.Command {
	var opts lem.AttentionOpts
	cmd := cli.NewCommand("attention", "Q/K Bone Orientation analysis for a prompt", "",
		func(_ *cli.Command, _ []string) error {
			return lem.RunAttention(opts)
		},
	)
	cli.StringFlag(cmd, &opts.Model, "model", "m", "gemma3/1b", "Model config path (relative to .core/ai/models/)")
	cli.StringFlag(cmd, &opts.Prompt, "prompt", "p", "", "Prompt text to analyse")
	cli.BoolFlag(cmd, &opts.JSON, "json", "j", false, "Output as JSON")
	cli.IntFlag(cmd, &opts.CacheLimit, "cache-limit", "", 0, "Metal cache limit in GB (0 = use ai.yaml default)")
	cli.IntFlag(cmd, &opts.MemLimit, "mem-limit", "", 0, "Metal memory limit in GB (0 = use ai.yaml default)")
	cli.StringFlag(cmd, &opts.Root, "root", "", ".", "Project root (for .core/ai/ config)")
	return cmd
}

// newScoreTierCommand builds "score tier", which hands the parsed flags to
// lem.RunTierScore to score expansion responses at the selected tier.
func newScoreTierCommand() *cli.Command {
	var opts lem.TierScoreOpts
	cmd := cli.NewCommand("tier", "Score expansion responses (heuristic/judge tiers)", "",
		func(_ *cli.Command, _ []string) error {
			return lem.RunTierScore(opts)
		},
	)
	cli.StringFlag(cmd, &opts.DBPath, "db", "", "", "DuckDB database path (defaults to LEM_DB env)")
	cli.IntFlag(cmd, &opts.Tier, "tier", "t", 1, "Scoring tier: 1=heuristic, 2=LEM judge, 3=external")
	cli.IntFlag(cmd, &opts.Limit, "limit", "l", 0, "Max items to score (0=all)")
	return cmd
}

// newScoreAgentCommand builds "score agent", which hands the parsed flags to
// lem.RunAgent (the ROCm scoring daemon).
//
// Most flag defaults are produced by envOr / intEnvOr, which are defined
// elsewhere in the package (presumably an environment lookup with a fallback;
// confirm against their definitions). Those calls run once, while the command
// is being built, not when it is executed.
func newScoreAgentCommand() *cli.Command {
	var opts lem.AgentOpts
	cmd := cli.NewCommand("agent", "ROCm scoring daemon (polls M3, scores checkpoints)", "",
		func(_ *cli.Command, _ []string) error {
			return lem.RunAgent(opts)
		},
	)
	cli.StringFlag(cmd, &opts.M3Host, "m3-host", "", envOr("M3_HOST", "10.69.69.108"), "M3 host address")
	cli.StringFlag(cmd, &opts.M3User, "m3-user", "", envOr("M3_USER", "claude"), "M3 SSH user")
	cli.StringFlag(cmd, &opts.M3SSHKey, "m3-ssh-key", "", envOr("M3_SSH_KEY", expandHome("~/.ssh/id_ed25519")), "SSH key for M3")
	cli.StringFlag(cmd, &opts.M3AdapterBase, "m3-adapter-base", "", envOr("M3_ADAPTER_BASE", "/Volumes/Data/lem"), "Adapter base dir on M3")
	cli.StringFlag(cmd, &opts.InfluxURL, "influx", "", envOr("INFLUX_URL", "http://10.69.69.165:8181"), "InfluxDB URL")
	cli.StringFlag(cmd, &opts.InfluxDB, "influx-db", "", envOr("INFLUX_DB", "training"), "InfluxDB database")
	cli.StringFlag(cmd, &opts.APIURL, "api-url", "", envOr("LEM_API_URL", "http://localhost:8080"), "OpenAI-compatible inference API URL")
	cli.StringFlag(cmd, &opts.Model, "model", "m", envOr("LEM_MODEL", ""), "Model name for API (overrides auto-detect)")
	cli.StringFlag(cmd, &opts.BaseModel, "base-model", "", envOr("BASE_MODEL", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"), "HuggingFace base model ID")
	cli.IntFlag(cmd, &opts.PollInterval, "poll", "", intEnvOr("POLL_INTERVAL", 300), "Poll interval in seconds")
	cli.StringFlag(cmd, &opts.WorkDir, "work-dir", "", envOr("WORK_DIR", "/tmp/scoring-agent"), "Working directory for adapters")
	cli.BoolFlag(cmd, &opts.OneShot, "one-shot", "", false, "Process one checkpoint and exit")
	cli.BoolFlag(cmd, &opts.DryRun, "dry-run", "", false, "Discover and plan but don't execute")
	return cmd
}