#!/usr/bin/env python3
"""Distill book seeds through a trained model to create complete lesson data.
Loads seed prompts (Ready/Ready/Passage), generates assistant reflections,
then creates a deeper question + response exchange to complete the lesson format.
No system prompt. No LEK. Just the model's own weights.
Usage:
python3 scripts/distill_seeds.py \
--model /Volumes/Data/lem/LEM-Gemma3-1B-layered-v2 \
--seeds training/lem/zen/seeds/allen-book.jsonl \
--output training/lem/zen/seeds/allen-book-distilled.jsonl \
--max-tokens 512 --temp 0.7
"""
import argparse
import json
import random
import time

from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler
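
# Generic follow-up questions; one is sampled at random per seed to turn a
# single reflection into a two-turn exchange grounded in everyday practice.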
DEEPER_PROMPTS = [
    "How does that connect to how people live day to day?",
    "Where do you see that playing out in real life?",
    "What does that mean for someone trying to live well?",
    "How would you explain that to someone who's never thought about it?",
    "What's the practical takeaway from that?",
    "Does that change how you'd approach a difficult moment?",
    "What would it look like to actually live that way?",
    "How does that land for you personally?",
]


def distill(args):
    print(f"Loading model: {args.model}")
    model, tokenizer = load(args.model)
    sampler = make_sampler(temp=args.temp)

    with open(args.seeds) as f:
        seeds = [json.loads(line) for line in f if line.strip()]
    print(f"Loaded {len(seeds)} seeds")

    results = []
    skipped = 0
    start = time.time()

    for i, seed in enumerate(seeds):
        msgs = seed["messages"]
        meta = seed.get("meta", {})
        lesson_id = meta.get("lesson_id", f"S{i:03d}")

        # Generate first reflection
        prompt = tokenizer.apply_chat_template(
            msgs, tokenize=False, add_generation_prompt=True
        )
        reflection = generate(
            model, tokenizer, prompt=prompt,
            max_tokens=args.max_tokens, sampler=sampler, verbose=False,
        ).strip()
        # Skip generations that came back empty or trivially short
        if not reflection or len(reflection) < 20:
            skipped += 1
            continue

        # Build deeper exchange
        deeper_q = random.choice(DEEPER_PROMPTS)
        deeper_msgs = msgs + [
            {"role": "assistant", "content": reflection},
            {"role": "user", "content": deeper_q},
        ]
        deeper_prompt = tokenizer.apply_chat_template(
            deeper_msgs, tokenize=False, add_generation_prompt=True
        )
        deeper_response = generate(
            model, tokenizer, prompt=deeper_prompt,
            max_tokens=args.max_tokens, sampler=sampler, verbose=False,
        ).strip()
        if not deeper_response or len(deeper_response) < 20:
            skipped += 1
            continue

        # Complete lesson: seed messages + reflection + deeper Q/A
        complete = {
            "messages": deeper_msgs + [
                {"role": "assistant", "content": deeper_response},
            ],
            "meta": {
                "source": "allen-book-distilled",
                "lesson_id": lesson_id,
                "model": args.model.split("/")[-1],
            },
        }
        results.append(complete)

        elapsed = time.time() - start
        rate = (i + 1) / elapsed if elapsed > 0 else 0
        eta = (len(seeds) - i - 1) / rate if rate > 0 else 0
        print(f" [{i+1}/{len(seeds)}] {lesson_id}: "
              f"reflection {len(reflection)} chars, deeper {len(deeper_response)} chars "
              f"({rate:.1f}/s, ETA {eta:.0f}s)")
    # Write output
    with open(args.output, "w") as f:
        for rec in results:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    elapsed = time.time() - start
    print(f"\nDone: {len(results)} complete, {skipped} skipped, {elapsed:.0f}s")
    print(f"Output: {args.output}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True, help="Path to the trained MLX model")
    parser.add_argument("--seeds", required=True, help="Input seed JSONL file")
    parser.add_argument("--output", required=True, help="Output distilled JSONL file")
    parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens per generation")
    parser.add_argument("--temp", type=float, default=0.7, help="Sampling temperature")
    args = parser.parse_args()
    distill(args)