#!/usr/bin/env python3
"""Distill book seeds through a trained model to create complete lesson data.

Loads seed prompts (Ready/Ready/Passage), generates assistant reflections,
then creates a deeper question + response exchange to complete the lesson
format. No system prompt. No LEK. Just the model's own weights.

Usage:
    python3 scripts/distill_seeds.py \
        --model /Volumes/Data/lem/LEM-Gemma3-1B-layered-v2 \
        --seeds training/lem/zen/seeds/allen-book.jsonl \
        --output training/lem/zen/seeds/allen-book-distilled.jsonl \
        --max-tokens 512 --temp 0.7
"""

import argparse
import json
import random
import time

from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler
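
# Shape of one input seed, inferred from the fields distill() reads below
# (the single user turn shown here is an assumption; the script only requires
# a "messages" list, and "meta"/"lesson_id" are optional):
#
#   {"messages": [{"role": "user", "content": "<passage prompt>"}],
#    "meta": {"lesson_id": "S001"}}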


DEEPER_PROMPTS = [
    "How does that connect to how people live day to day?",
    "Where do you see that playing out in real life?",
    "What does that mean for someone trying to live well?",
    "How would you explain that to someone who's never thought about it?",
    "What's the practical takeaway from that?",
    "Does that change how you'd approach a difficult moment?",
    "What would it look like to actually live that way?",
    "How does that land for you personally?",
]


def distill(args):
    print(f"Loading model: {args.model}")
    model, tokenizer = load(args.model)

    sampler = make_sampler(temp=args.temp)

    with open(args.seeds) as f:
        seeds = [json.loads(line) for line in f if line.strip()]

    print(f"Loaded {len(seeds)} seeds")
    results = []
    skipped = 0
    start = time.time()

    for i, seed in enumerate(seeds):
        msgs = seed["messages"]
        meta = seed.get("meta", {})
        lesson_id = meta.get("lesson_id", f"S{i:03d}")

        # Generate first reflection
        prompt = tokenizer.apply_chat_template(
            msgs, tokenize=False, add_generation_prompt=True
        )
        reflection = generate(
            model, tokenizer, prompt=prompt,
            max_tokens=args.max_tokens, sampler=sampler, verbose=False,
        ).strip()

        if not reflection or len(reflection) < 20:
            skipped += 1
            continue

        # Build deeper exchange
        deeper_q = random.choice(DEEPER_PROMPTS)

        deeper_msgs = msgs + [
            {"role": "assistant", "content": reflection},
            {"role": "user", "content": deeper_q},
        ]
        deeper_prompt = tokenizer.apply_chat_template(
            deeper_msgs, tokenize=False, add_generation_prompt=True
        )
        deeper_response = generate(
            model, tokenizer, prompt=deeper_prompt,
            max_tokens=args.max_tokens, sampler=sampler, verbose=False,
        ).strip()

        if not deeper_response or len(deeper_response) < 20:
            skipped += 1
            continue

        # Complete lesson
        complete = {
            "messages": msgs + [
                {"role": "assistant", "content": reflection},
                {"role": "user", "content": deeper_q},
                {"role": "assistant", "content": deeper_response},
            ],
            "meta": {
                "source": "allen-book-distilled",
                "lesson_id": lesson_id,
                "model": args.model.split("/")[-1],
            },
        }
        results.append(complete)

        elapsed = time.time() - start
        rate = (i + 1) / elapsed if elapsed > 0 else 0
        eta = (len(seeds) - i - 1) / rate if rate > 0 else 0
        print(f" [{i+1}/{len(seeds)}] {lesson_id} — "
              f"reflection {len(reflection)} chars, deeper {len(deeper_response)} chars "
              f"({rate:.1f}/s, ETA {eta:.0f}s)")

    # Write output
    with open(args.output, "w") as f:
        for rec in results:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    elapsed = time.time() - start
    print(f"\nDone: {len(results)} complete, {skipped} skipped, {elapsed:.0f}s")
    print(f"Output: {args.output}")
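
# One finished record, illustratively (values here are invented; real content
# comes from the seeds and the model, and "model" is just the last path
# component of --model):
#
#   {"messages": [... seed turns ..., assistant reflection, deeper question,
#                 deeper response],
#    "meta": {"source": "allen-book-distilled", "lesson_id": "S001",
#             "model": "LEM-Gemma3-1B-layered-v2"}}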


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--seeds", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--max-tokens", type=int, default=512)
    parser.add_argument("--temp", type=float, default=0.7)
    args = parser.parse_args()
    distill(args)
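
# Quick sanity check of the distilled output (a sketch, using the output path
# from the usage example above; every kept record ends on an assistant turn):
#
#   import json
#   with open("training/lem/zen/seeds/allen-book-distilled.jsonl") as f:
#       lessons = [json.loads(line) for line in f]
#   assert all(l["messages"][-1]["role"] == "assistant" for l in lessons)
#   print(f"{len(lessons)} lessons look well-formed")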