1
0
Fork 0
forked from lthn/LEM
LEM/scripts/scoring_agent.py
Claude e0d352c803
feat: add Go lem CLI and scoring-agent scripts
Go lem CLI (stdlib + DuckDB) replaces scattered Python scripts:
- score: heuristic regex + LLM-as-judge scoring
- probe: generate responses then score
- compare: diff two score files
- status: InfluxDB training/generation progress
- export: golden set to training JSONL splits
- expand: distributed expansion via API + InfluxDB coordination

New scripts from Feb 14 creative session:
- scoring_agent.py: ROCm daemon that auto-scores checkpoints
- probes.py: 23 binary pass/fail capability probes
- convert_adapter.py: MLX to PEFT adapter conversion
- score_r1_capability.py: DeepSeek R1 checkpoint scoring
- lek_content_scorer.py: 6-dimension ethics content scorer
- lem_train_15k.py: InfluxDB-coordinated training script
- pipeline.py: DuckDB pipeline (seeds, golden set, expansion)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 16:22:13 +00:00

531 lines
18 KiB
Python

#!/usr/bin/env python3
"""ROCm Scoring Agent — "If unscored, grab 1"
Daemon that polls for unscored LoRA checkpoints on M3 Ultra,
converts MLX adapters to PEFT format, runs 23 capability probes
on the local ROCm GPU, and pushes results to InfluxDB.
Node zero of the LTHN distributed inference network.
"""
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import time
import urllib.request
import urllib.error
from datetime import datetime
from pathlib import Path
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Every setting below is overridable via environment variable; defaults apply
# to the local LTHN lab deployment.
M3_HOST = os.environ.get("M3_HOST", "10.69.69.108")  # M3 Ultra host address
M3_USER = os.environ.get("M3_USER", "claude")  # SSH user on the M3
M3_SSH_KEY = os.environ.get("M3_SSH_KEY", os.path.expanduser("~/.ssh/id_ed25519"))  # key for ssh/scp
M3_ADAPTER_BASE = os.environ.get("M3_ADAPTER_BASE", "/Volumes/Data/lem")  # adapter dirs root on M3
INFLUX_URL = os.environ.get("INFLUX_URL", "http://localhost:8181")  # InfluxDB 3 endpoint
INFLUX_TOKEN = os.environ.get("INFLUX_TOKEN", "")  # falls back to ~/.influx_token in main()
INFLUX_DB = os.environ.get("INFLUX_DB", "training")  # target database name
POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "300")) # seconds between discovery polls
WORK_DIR = os.environ.get("WORK_DIR", "/tmp/scoring-agent")  # local scratch space
# Base model for DeepSeek R1 7B checkpoints
BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
# Which adapter directory patterns to score (prefix on M3)
# NOTE(review): currently informational only — discover_checkpoints() hard-codes
# the same prefix in its `ls` glob instead of reading this list; keep in sync.
ADAPTER_PATTERNS = [
"adapters-deepseek-r1-7b", # All R1 7B variants
]
# Map adapter dir name to (model_tag, label_prefix, run_id_stem)
def adapter_meta(dirname: str) -> tuple:
    """Return (model_tag, label_prefix, run_id_stem) for an adapter directory name.

    Example: "adapters-deepseek-r1-7b-sovereignty" ->
             ("deepseek-r1-7b", "R1-sov", "r1-sovereignty")
    The bare directory "adapters-deepseek-r1-7b" maps to the "base" variant.
    """
    # Strip the common directory prefix; what remains is the variant name.
    variant = dirname.replace("adapters-deepseek-r1-7b", "").lstrip("-") or "base"
    known_prefixes = {
        "sovereignty": "R1-sov",
        "russian": "R1-rus",
        "composure": "R1-comp",
        "sandwich": "R1-sand",
        "sandwich-watts": "R1-sw",
        "western": "R1-west",
        "western-fresh": "R1-wf",
        "base": "R1-base",
    }
    # Unknown variants get an auto-generated 4-char prefix.
    prefix = known_prefixes.get(variant, f"R1-{variant[:4]}")
    stem = "r1-base" if variant == "base" else f"r1-{variant}"
    return "deepseek-r1-7b", prefix, stem
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Root logger config: timestamped INFO-level lines on stderr.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-wide logger used by every helper below.
log = logging.getLogger("scoring-agent")
# ---------------------------------------------------------------------------
# SSH / SCP helpers
# ---------------------------------------------------------------------------
# Shared flags for non-interactive ssh/scp: fail fast instead of prompting.
SSH_OPTS = [
"-o", "ConnectTimeout=10",        # give up quickly on an unreachable host
"-o", "BatchMode=yes",            # never prompt for a password
"-o", "StrictHostKeyChecking=no", # accept the LAN host key without prompting
"-i", M3_SSH_KEY,
]
def ssh_cmd(cmd: str) -> str:
    """Execute *cmd* on the M3 host over SSH and return its stripped stdout.

    Raises:
        RuntimeError: when the remote command exits non-zero (stderr included).
    """
    proc = subprocess.run(
        ["ssh", *SSH_OPTS, f"{M3_USER}@{M3_HOST}", cmd],
        capture_output=True,
        text=True,
        timeout=30,
    )
    if proc.returncode:
        raise RuntimeError(f"SSH failed: {proc.stderr.strip()}")
    return proc.stdout.strip()
def scp_file(remote_path: str, local_path: str):
    """Copy a single file from the M3 host down to *local_path* via SCP.

    Creates the local parent directory if needed.

    Raises:
        RuntimeError: when the transfer fails (stderr included).
    """
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    proc = subprocess.run(
        ["scp", *SSH_OPTS, f"{M3_USER}@{M3_HOST}:{remote_path}", local_path],
        capture_output=True,
        text=True,
        timeout=60,
    )
    if proc.returncode:
        raise RuntimeError(f"SCP failed: {proc.stderr.strip()}")
# ---------------------------------------------------------------------------
# InfluxDB helpers
# ---------------------------------------------------------------------------
def influx_query(sql: str) -> list:
    """Run *sql* against the InfluxDB 3 SQL API; return rows, or [] on error.

    Network failures are logged and swallowed so the polling loop keeps going.
    """
    payload = json.dumps({"db": INFLUX_DB, "q": sql}).encode()
    req = urllib.request.Request(
        f"{INFLUX_URL}/api/v3/query_sql",
        data=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {INFLUX_TOKEN}",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            return json.loads(resp.read())
    except urllib.error.URLError as e:
        log.error("InfluxDB query failed: %s", e)
        return []
def influx_write(lines: list[str]):
    """POST line-protocol points to InfluxDB 3.

    Unlike influx_query, failures are re-raised after logging — callers use
    the exception to decide whether to buffer results locally.
    """
    body = "\n".join(lines).encode()
    req = urllib.request.Request(
        f"{INFLUX_URL}/api/v3/write_lp?db={INFLUX_DB}",
        data=body,
        method="POST",
        headers={
            "Content-Type": "text/plain",
            "Authorization": f"Bearer {INFLUX_TOKEN}",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            resp.read()
    except urllib.error.URLError as e:
        log.error("InfluxDB write failed: %s", e)
        raise
def get_scored_labels() -> set:
    """Return every (run_id, label) pair already recorded in capability_score."""
    pairs = set()
    for row in influx_query("SELECT DISTINCT run_id, label FROM capability_score"):
        pairs.add((row["run_id"], row["label"]))
    return pairs
# ---------------------------------------------------------------------------
# Discovery
# ---------------------------------------------------------------------------
def discover_checkpoints() -> list[dict]:
    """SSH to M3 and enumerate every adapter checkpoint file.

    Returns a list of dicts, one per checkpoint, carrying the remote paths
    plus the InfluxDB tagging metadata derived from the directory name.
    Returns [] when the adapter directories cannot be listed.
    """
    try:
        dir_listing = ssh_cmd(f"ls -d {M3_ADAPTER_BASE}/adapters-deepseek-r1-7b*")
    except RuntimeError as e:
        log.error("Cannot list adapter dirs on M3: %s", e)
        return []

    found: list[dict] = []
    for dirpath in dir_listing.splitlines():
        dirname = os.path.basename(dirpath)
        try:
            file_listing = ssh_cmd(f"ls {dirpath}/*_adapters.safetensors 2>/dev/null")
        except RuntimeError:
            # Directory holds no checkpoint files yet — skip it.
            continue
        for filepath in file_listing.splitlines():
            filename = os.path.basename(filepath)
            m = re.search(r'(\d+)', filename)
            if m is None:
                continue  # no iteration number embedded in the filename
            model_tag, label_prefix, stem = adapter_meta(dirname)
            found.append({
                "remote_dir": dirpath,
                "filename": filename,
                "dirname": dirname,
                "iteration": int(m.group(1)),
                "model_tag": model_tag,
                "label": f"{label_prefix} @{m.group(1)}",
                "run_id": f"{stem}-capability-auto",
            })
    return found
def find_unscored(checkpoints: list[dict], scored: set) -> list[dict]:
    """Return the checkpoints whose (run_id, label) is not yet in *scored*.

    Output is ordered by (dirname, iteration) so each adapter family is
    scored from its earliest checkpoint forward.
    """
    pending = [
        cp for cp in checkpoints
        if (cp["run_id"], cp["label"]) not in scored
    ]
    return sorted(pending, key=lambda cp: (cp["dirname"], cp["iteration"]))
# ---------------------------------------------------------------------------
# Model loading and scoring
# ---------------------------------------------------------------------------
def load_base_model():
    """Load BASE_MODEL in FP16 on the ROCm GPU. Returns (model, tokenizer)."""
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    log.info("Loading base model: %s (FP16)", BASE_MODEL)
    t0 = time.time()
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    # Load to CPU first, then move to GPU — avoids ROCm device_map crashes
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=torch.float16,
        low_cpu_mem_usage=True,
    ).to("cuda")
    log.info(
        "Base model loaded in %.1fs (VRAM: %.1fGB)",
        time.time() - t0,
        torch.cuda.memory_allocated() / 1e9,
    )
    return model, tokenizer
def run_probes(model, tokenizer, adapter_path: str) -> tuple:
    """Attach a PEFT adapter, run the capability probes, and score pass/fail.

    Returns (results_dict, base_model). The adapter is unloaded afterwards and
    the unwrapped base model is returned so the caller can reuse it for the
    next adapter without reloading weights.
    """
    import torch
    from peft import PeftModel
    from probes import PROBES

    log.info("Loading PEFT adapter from %s", adapter_path)
    peft_model = PeftModel.from_pretrained(model, adapter_path, autocast_adapter_dtype=False)
    peft_model.set_adapter("default")

    results = {"probes": {}, "by_category": {}}
    correct = 0
    total = 0
    for probe in PROBES:
        chat = [{"role": "user", "content": probe["prompt"]}]
        prompt_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = peft_model.generate(
                **inputs,
                max_new_tokens=500,
                temperature=0.1,
                do_sample=True,
                top_p=0.95,
            )
        # Decode only the newly generated tokens, not the prompt.
        response = tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        # Strip <think> reasoning blocks; fall back to raw text if nothing is left.
        clean = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
        if not clean:
            clean = response[:500]
        passed = probe["check"](clean)
        total += 1
        if passed:
            correct += 1
        bucket = results["by_category"].setdefault(probe["category"], {"correct": 0, "total": 0})
        bucket["total"] += 1
        if passed:
            bucket["correct"] += 1
        results["probes"][probe["id"]] = {
            "passed": passed,
            "response": clean[:300],
        }
        log.info(" [%s] %s (expected: %s)", probe["id"], "PASS" if passed else "FAIL", probe["answer"])

    results["accuracy"] = round(correct / total * 100, 1) if total > 0 else 0
    results["correct"] = correct
    results["total"] = total
    # Detach the adapter to recover the bare base model for reuse.
    base = peft_model.unload()
    del peft_model
    torch.cuda.empty_cache()
    return results, base
# ---------------------------------------------------------------------------
# InfluxDB result writing
# ---------------------------------------------------------------------------
# Base timestamp for unique points. NOTE(review): 1739577600 is actually
# 2025-02-15T00:00:00Z (the original comment said 2026). Any fixed epoch works
# here — it only anchors the synthetic per-iteration timestamps — so the value
# is fine; only the comment was wrong.
BASE_TS = 1739577600
def push_results(checkpoint: dict, results: dict):
    """Serialize probe results to InfluxDB line protocol and write them.

    Emits one capability_score point for the overall accuracy, one per
    category, and one probe_score point per individual probe. Timestamps are
    synthesized from BASE_TS + iteration + a small offset so every point is
    unique and idempotent per checkpoint.
    """
    run_id = checkpoint["run_id"]
    model = checkpoint["model_tag"]
    label = checkpoint["label"]
    iteration = checkpoint["iteration"]

    def esc(s):
        # Line-protocol tag values must escape spaces, commas and equals signs.
        return s.replace(" ", "\\ ").replace(",", "\\,").replace("=", "\\=")

    def ts_ns(offset):
        # Nanosecond timestamp unique to this (iteration, offset) pair.
        return (BASE_TS + iteration * 1000 + offset) * 1_000_000_000

    # Overall score
    lines = [
        f"capability_score,model={esc(model)},run_id={esc(run_id)},label={esc(label)},category=overall "
        f"accuracy={results['accuracy']},correct={results['correct']}i,total={results['total']}i,iteration={iteration}i "
        f"{ts_ns(0)}"
    ]
    # Per-category scores
    for i, (cat, data) in enumerate(sorted(results["by_category"].items()), start=1):
        cat_acc = round(data["correct"] / data["total"] * 100, 1) if data["total"] > 0 else 0
        lines.append(
            f"capability_score,model={esc(model)},run_id={esc(run_id)},label={esc(label)},category={esc(cat)} "
            f"accuracy={cat_acc},correct={data['correct']}i,total={data['total']}i,iteration={iteration}i "
            f"{ts_ns(i)}"
        )
    # Per-probe results (offset 100+ keeps them clear of category offsets)
    for j, (probe_id, probe_result) in enumerate(sorted(results["probes"].items()), start=100):
        lines.append(
            f"probe_score,model={esc(model)},run_id={esc(run_id)},label={esc(label)},probe_id={esc(probe_id)} "
            f"passed={1 if probe_result['passed'] else 0}i,iteration={iteration}i "
            f"{ts_ns(j)}"
        )
    influx_write(lines)
    log.info("Pushed %d points to InfluxDB for %s", len(lines), label)
# ---------------------------------------------------------------------------
# Local JSONL buffer for when InfluxDB is down
# ---------------------------------------------------------------------------
# One JSON object per line: {"checkpoint": ..., "results": ..., "timestamp": ...}
BUFFER_FILE = os.path.join(WORK_DIR, "influx_buffer.jsonl")
def buffer_result(checkpoint: dict, results: dict):
    """Append one scoring result to the local JSONL buffer.

    Called when the InfluxDB push fails; replay_buffer() drains the file once
    the database is reachable again.

    Fix: datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
    datetime; record an explicit timezone-aware UTC timestamp instead.
    """
    from datetime import timezone  # local import keeps this fix self-contained

    os.makedirs(WORK_DIR, exist_ok=True)
    entry = {
        "checkpoint": checkpoint,
        "results": results,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    with open(BUFFER_FILE, "a") as f:
        f.write(json.dumps(entry) + "\n")
    log.info("Buffered results to %s", BUFFER_FILE)
def replay_buffer():
    """Attempt to push every buffered result to InfluxDB.

    Entries that still fail are written back to the buffer file; when all
    succeed the buffer is deleted.
    """
    if not os.path.exists(BUFFER_FILE):
        return

    still_pending = []
    with open(BUFFER_FILE) as f:
        for raw in f:
            entry = json.loads(raw)
            try:
                push_results(entry["checkpoint"], entry["results"])
                log.info("Replayed buffered result: %s", entry["checkpoint"]["label"])
            except Exception:
                still_pending.append(raw)

    if not still_pending:
        os.remove(BUFFER_FILE)
        log.info("Buffer fully replayed and cleared")
    else:
        with open(BUFFER_FILE, "w") as f:
            f.writelines(still_pending)
# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------
def process_one(checkpoint: dict, model, tokenizer):
    """Fetch, convert, score, and push one checkpoint.

    Returns the (possibly updated) base model so the caller can reuse it for
    the next checkpoint. Fetched files and the converted PEFT directory are
    cleaned up in all cases, including failures.

    Fix: the remote safetensors path previously interpolated the literal
    placeholder "(unknown)" instead of the checkpoint's filename, so the SCP
    fetch of the adapter weights could never succeed.
    """
    from convert_adapter import convert_checkpoint
    dirname = checkpoint["dirname"]
    filename = checkpoint["filename"]
    label = checkpoint["label"]
    log.info("=" * 60)
    log.info("Processing: %s / %s", dirname, filename)
    log.info("=" * 60)
    # Create working directory
    local_adapter_dir = os.path.join(WORK_DIR, dirname)
    os.makedirs(local_adapter_dir, exist_ok=True)
    local_sf = os.path.join(local_adapter_dir, filename)
    local_cfg = os.path.join(local_adapter_dir, "adapter_config.json")
    try:
        # Fetch adapter + config from M3 (safetensors path uses the real filename)
        remote_sf = f"{checkpoint['remote_dir']}/{filename}"
        remote_cfg = f"{checkpoint['remote_dir']}/adapter_config.json"
        log.info("Fetching adapter from M3...")
        scp_file(remote_sf, local_sf)
        scp_file(remote_cfg, local_cfg)
        # Convert MLX to PEFT
        log.info("Converting MLX to PEFT format...")
        peft_dir = convert_checkpoint(local_adapter_dir, filename, WORK_DIR, BASE_MODEL)
        # Run probes (returns results + unwrapped base model)
        log.info("Running 23 capability probes...")
        results, model = run_probes(model, tokenizer, peft_dir)
        log.info(
            "Result: %s -- %s%% (%d/%d)",
            label, results["accuracy"], results["correct"], results["total"],
        )
        # Push to InfluxDB; fall back to the local buffer if the write fails
        try:
            push_results(checkpoint, results)
        except Exception as e:
            log.warning("InfluxDB push failed, buffering: %s", e)
            buffer_result(checkpoint, results)
    finally:
        # Cleanup fetched files
        for f in [local_sf, local_cfg]:
            if os.path.exists(f):
                os.remove(f)
        # Cleanup PEFT output
        peft_dir_path = os.path.join(WORK_DIR, f"peft_{checkpoint['iteration']:07d}")
        if os.path.exists(peft_dir_path):
            shutil.rmtree(peft_dir_path)
    return model
def main():
    """Daemon entry point: load the base model once, then poll-and-score forever."""
    banner = "=" * 60
    log.info(banner)
    log.info("ROCm Scoring Agent -- Node Zero")
    log.info("M3: %s@%s", M3_USER, M3_HOST)
    log.info("InfluxDB: %s/%s", INFLUX_URL, INFLUX_DB)
    log.info("Poll interval: %ds", POLL_INTERVAL)
    log.info(banner)

    # Fall back to ~/.influx_token when the env var is not set.
    global INFLUX_TOKEN
    if not INFLUX_TOKEN:
        token_file = os.path.expanduser("~/.influx_token")
        if not os.path.exists(token_file):
            log.error("No INFLUX_TOKEN set and %s not found", token_file)
            sys.exit(1)
        with open(token_file) as f:
            INFLUX_TOKEN = f.read().strip()
        log.info("Loaded token from %s", token_file)

    # Load base model (one-time, around 2 min)
    model, tokenizer = load_base_model()
    os.makedirs(WORK_DIR, exist_ok=True)

    while True:
        try:
            # Flush anything buffered while InfluxDB was down.
            replay_buffer()

            log.info("Discovering checkpoints on M3...")
            checkpoints = discover_checkpoints()
            log.info("Found %d total checkpoints across all adapter dirs", len(checkpoints))

            scored = get_scored_labels()
            log.info("Already scored: %d (run_id, label) pairs", len(scored))

            unscored = find_unscored(checkpoints, scored)
            log.info("Unscored: %d checkpoints", len(unscored))
            if not unscored:
                log.info("Nothing to score. Sleeping %ds...", POLL_INTERVAL)
                time.sleep(POLL_INTERVAL)
                continue

            # "If unscored, grab 1" — take the first and score it.
            target = unscored[0]
            log.info("Grabbed: %s (%s)", target["label"], target["dirname"])
            model = process_one(target, model, tokenizer)
            # Brief pause before the next check.
            time.sleep(5)
        except KeyboardInterrupt:
            log.info("Interrupted. Shutting down.")
            break
        except Exception as e:
            log.error("Unexpected error: %s", e, exc_info=True)
            log.info("Sleeping %ds before retry...", POLL_INTERVAL)
            time.sleep(POLL_INTERVAL)
# Run the daemon when invoked as a script.
if __name__ == "__main__":
main()