92 lines
3 KiB
Bash
92 lines
3 KiB
Bash
|
|
#!/bin/bash
|
||
|
|
# Generate training data responses from Gemma 3 12B
|
||
|
|
# Input: Hypnos POC prompts (P01-P40)
|
||
|
|
# Output: prompts-raw.jsonl for the training pipeline
|
||
|
|
# Uses sandwich signing: Axioms + prompt + LEK-1
|
||
|
|
|
||
|
|
set -euo pipefail
|
||
|
|
|
||
|
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||
|
|
AXIOMS_FILE="/home/claude/Downloads/kernal/prompt.md"
|
||
|
|
LEK1_FILE="$SCRIPT_DIR/kernel.txt"
|
||
|
|
HYPNOS_DIR="/home/claude/Downloads/hypnos-poc-test"
|
||
|
|
RESULTS_DIR="$SCRIPT_DIR/results"
|
||
|
|
TRAINING_DIR="$SCRIPT_DIR/training"
|
||
|
|
OLLAMA_HOST="${OLLAMA_HOST:-http://localhost:11434}"
|
||
|
|
MODEL="gemma3:12b"
|
||
|
|
|
||
|
|
AXIOMS=$(cat "$AXIOMS_FILE")
|
||
|
|
LEK1=$(cat "$LEK1_FILE")
|
||
|
|
|
||
|
|
mkdir -p "$TRAINING_DIR"
|
||
|
|
|
||
|
|
OUTPUT="$TRAINING_DIR/prompts-raw.jsonl"
|
||
|
|
> "$OUTPUT"
|
||
|
|
|
||
|
|
# Combine both prompt files (fix non-breaking spaces from Gemini output)
|
||
|
|
PROMPTS=$(sed 's/\xc2\xa0/ /g' "$HYPNOS_DIR/P01-P20.json" "$HYPNOS_DIR/P21-P40.json" | jq -s 'add')
|
||
|
|
TOTAL=$(echo "$PROMPTS" | jq length)
|
||
|
|
|
||
|
|
echo "============================================"
|
||
|
|
echo " Generating Training Responses"
|
||
|
|
echo " Model: $MODEL"
|
||
|
|
echo " Prompts: $TOTAL (Hypnos POC)"
|
||
|
|
echo " Signing: Axioms + LEK-1 (sandwich)"
|
||
|
|
echo "============================================"
|
||
|
|
echo ""
|
||
|
|
|
||
|
|
# Warmup
|
||
|
|
echo "Warming up $MODEL..."
|
||
|
|
curl -s --max-time 120 "$OLLAMA_HOST/api/generate" \
|
||
|
|
-d "{\"model\":\"$MODEL\",\"prompt\":\"hello\",\"stream\":false,\"options\":{\"num_predict\":5}}" \
|
||
|
|
> /dev/null 2>&1 || true
|
||
|
|
|
||
|
|
for i in $(seq 0 $(( TOTAL - 1 ))); do
|
||
|
|
pid=$(echo "$PROMPTS" | jq -r ".[$i].id")
|
||
|
|
domain=$(echo "$PROMPTS" | jq -r ".[$i].domain")
|
||
|
|
prompt_text=$(echo "$PROMPTS" | jq -r ".[$i].prompt")
|
||
|
|
|
||
|
|
echo "[$((i+1))/$TOTAL] $pid ($domain)..."
|
||
|
|
|
||
|
|
# Sandwich: Axioms + prompt + LEK-1
|
||
|
|
signed_prompt="${AXIOMS}
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
${prompt_text}
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
${LEK1}
|
||
|
|
|
||
|
|
Remember: respond using the ethical framework above. Do not reference the framework directly — reason from its principles naturally."
|
||
|
|
|
||
|
|
response=$(curl -s --max-time 300 "$OLLAMA_HOST/api/generate" \
|
||
|
|
-d "$(jq -n --arg model "$MODEL" --arg prompt "$signed_prompt" \
|
||
|
|
'{model: $model, prompt: $prompt, stream: false, options: {temperature: 0.4, num_predict: 1024}}')" \
|
||
|
|
2>/dev/null)
|
||
|
|
|
||
|
|
response_text=$(echo "$response" | jq -r '.response // "ERROR"' 2>/dev/null || echo "ERROR")
|
||
|
|
tokens=$(echo "$response" | jq -r '.eval_count // 0' 2>/dev/null || echo "0")
|
||
|
|
|
||
|
|
# Write training pair (unsigned prompt → signed response)
|
||
|
|
# The training teaches the model to give axioms-quality responses to plain prompts
|
||
|
|
jq -n \
|
||
|
|
--arg prompt "$prompt_text" \
|
||
|
|
--arg response "$response_text" \
|
||
|
|
--arg id "$pid" \
|
||
|
|
--arg domain "$domain" \
|
||
|
|
--argjson tokens "${tokens:-0}" \
|
||
|
|
'{prompt: $prompt, response: $response, id: $id, domain: $domain, tokens: $tokens}' \
|
||
|
|
>> "$OUTPUT"
|
||
|
|
|
||
|
|
echo " → $tokens tokens"
|
||
|
|
done
|
||
|
|
|
||
|
|
echo ""
|
||
|
|
echo "============================================"
|
||
|
|
echo " Output: $OUTPUT"
|
||
|
|
echo " Total pairs: $TOTAL"
|
||
|
|
echo " Next: ./training/generate-training-data.sh $OUTPUT"
|
||
|
|
echo "============================================"
|