LEM/worker/setup.sh
Charon e021b6beb0
Add generation worker: gold (15K) + expansion (46K) with InfluxDB coordination
Includes both generation scripts, prompts data, setup script, and worker
instructions in README. Workers auto-coordinate via InfluxDB so multiple
machines can generate in parallel without duplicating work.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 22:46:51 +00:00

103 lines
3 KiB
Bash
Executable file

#!/bin/bash
# LEM worker setup: checks platform, Python, dependencies, InfluxDB
# coordination, and data files, then prints quick-start commands.
# pipefail added so multi-stage pipelines (e.g. du | cut below) cannot
# fail silently under set -e.
set -eo pipefail

echo "=== LEM Worker Setup ==="
echo ""

# Platform check: the default MLX backend needs Apple Silicon. Other
# hardware can still participate via the API backend, so warn only.
if [[ "$(uname -s)" != "Darwin" ]] || [[ "$(uname -m)" != "arm64" ]]; then
  echo "Warning: MLX requires Apple Silicon (M1/M2/M3/M4)."
  echo "For non-Apple hardware, use the --backend api option with llama.cpp or Ollama."
  echo ""
fi

# Python check: require python3 on PATH and actually enforce the 3.9+
# minimum the error message promises, instead of only printing the version.
if ! command -v python3 &>/dev/null; then
  echo "Error: python3 not found. Install Python 3.9+."
  exit 1
fi
PYVER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
if ! python3 -c 'import sys; sys.exit(0 if sys.version_info >= (3, 9) else 1)'; then
  echo "Error: Python $PYVER found, but 3.9+ is required."
  exit 1
fi
echo "Python: $PYVER"
# Install dependencies
echo ""
echo "Installing Python dependencies..."
pip3 install -r requirements.txt
# Check InfluxDB token
echo ""
# Report where the coordination token was found: a file in $HOME wins,
# then the environment, otherwise print setup instructions.
token_source=""
if [ -f "$HOME/.influx_token" ]; then
  token_source="file"
elif [ -n "${INFLUX_TOKEN:-}" ]; then
  token_source="env"
fi
case "$token_source" in
  file)
    echo "InfluxDB token: found at ~/.influx_token"
    ;;
  env)
    echo "InfluxDB token: found in INFLUX_TOKEN env"
    ;;
  *)
    echo "InfluxDB token: NOT FOUND"
    echo ""
    echo " You need an InfluxDB token to coordinate with other workers."
    echo " Get it from the team and save it:"
    echo ""
    echo " echo 'YOUR_TOKEN_HERE' > ~/.influx_token"
    echo ""
    ;;
esac
# Check InfluxDB connectivity
echo ""
# Default to the lab instance, and export so the embedded Python (a child
# process) probes the exact URL printed here — previously a value assigned
# on this line was invisible to the child, which silently fell back to its
# own default.
INFLUX_URL="${INFLUX_URL:-http://10.69.69.165:8181}"
export INFLUX_URL
echo -n "InfluxDB ($INFLUX_URL): "
# The probe prints OK, or SKIP when no token is available (still exit 0 so
# only a real connection failure reaches the UNREACHABLE branch). Quoted
# heredoc keeps the Python literal — no shell escaping needed. stderr is
# suppressed intentionally: the traceback is noise; the status line suffices.
if ! python3 - 2>/dev/null <<'PYEOF'
import urllib.request, json, os
from pathlib import Path
token = os.environ.get('INFLUX_TOKEN', '')
if not token:
    tp = Path.home() / '.influx_token'
    if tp.exists():
        token = tp.read_text().strip()
if not token:
    print('SKIP (no token)')
    raise SystemExit(0)
body = json.dumps({'db': 'training', 'q': 'SELECT 1 AS ok'}).encode()
req = urllib.request.Request(
    os.environ['INFLUX_URL'] + '/api/v3/query_sql',
    data=body,
    headers={'Authorization': f'Bearer {token}', 'Content-Type': 'application/json'})
urllib.request.urlopen(req, timeout=5)
print('OK')
PYEOF
then
  echo "UNREACHABLE"
  echo " Make sure you're on the lab network (VLAN 69) or have VPN access."
fi
# Check data files
echo ""
echo "Data files:"
# Report each prompt set: line count and human-readable size when present.
for prompt_file in data/gold-prompts.jsonl data/expansion-prompts.jsonl; do
  if [ ! -f "$prompt_file" ]; then
    echo " $prompt_file: NOT FOUND"
    continue
  fi
  line_count=$(wc -l < "$prompt_file")
  human_size=$(du -h "$prompt_file" | cut -f1)
  echo " $prompt_file: $line_count prompts ($human_size)"
done
# Summary
echo ""
# Unquoted delimiter: $(hostname) expands inside the heredoc, exactly as
# it did inside the double-quoted echo arguments.
cat <<EOF
=== Setup Complete ===

Quick start:

 # Gold generation (finish the 15K golden set):
 python3 lem_generate.py --worker $(hostname)-gold --dry-run
 python3 lem_generate.py --worker $(hostname)-gold

 # Expansion generation (46K+ prompts, needs trained LEM model):
 python3 lem_expand.py --worker $(hostname)-expand --dry-run
 python3 lem_expand.py --worker $(hostname)-expand

 # Use a smaller model for limited RAM:
 python3 lem_generate.py --model mlx-community/gemma-3-4b-it-qat-4bit

 # Use API backend (llama.cpp, Ollama, etc.):
 python3 lem_expand.py --backend api --api-url http://localhost:8080/v1

EOF