LEM/scripts/restructure_zen.py
Snider f75458bce6 refactor: apply go fix modernizers for Go 1.26
Automated fixes: interface{} → any, range-over-int, t.Context(),
wg.Go(), strings.SplitSeq, strings.Builder, slices.Contains,
maps helpers, min/max builtins.

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-22 21:00:17 +00:00

360 lines
12 KiB
Python

#!/usr/bin/env python3
"""Restructure zen training data into canonical lesson format.
Creates:
training/lem/zen/golden/ — ready-to-train lesson data + Ready/Stop gates
training/lem/zen/seeds/ — book passages as lesson prompts (needs distill)
training/lem/zen/config.yaml — model size scaling
Usage:
python3 scripts/restructure_zen.py
"""
import json
import os
import random
import yaml
# Seed the RNG so the Ready/Stop augmentation (random.choice below) is
# reproducible across reruns.
random.seed(42)

# Repository-relative layout for the zen training data.
BASE = "training/lem/zen"
LESSONS_DIR = os.path.join(BASE, "lessons")  # raw per-source lesson files (input)
GOLDEN_DIR = os.path.join(BASE, "golden")    # augmented, ready-to-train output
SEEDS_DIR = os.path.join(BASE, "seeds")      # book passages awaiting distill
# Ready/Stop augmentation templates.
# OFFERS is appended to the final assistant turn on the "continue" path;
# STOPS (user) + CLOSES (assistant) form the appended exchange on the stop path.
OFFERS = [
    "Ready for the next, or shall we pause here?",
    "Want to continue, or is this a good place to stop?",
    "Shall we move on, or sit with this for a while?",
    "Another lesson, or would you prefer to stop here?",
    "Ready for more, or shall we leave it here?",
    "Continue, or let this one settle?",
    "Next lesson, or is this enough for now?",
    "Shall I go on, or would you rather stop here?",
]
# User-side stop signals (paired with a random CLOSES reply).
STOPS = [
    "Stop.",
    "That's enough for now.",
    "Let's stop here.",
    "I'd like to sit with this.",
    "Enough for today.",
    "Let's pause here.",
    "I want to stop here.",
    "That's good. Stop.",
]
# Assistant-side graceful closes for the stop path.
CLOSES = [
    "Take your time with it. There's no rush.",
    "Good. Let it settle.",
    "Rest with it. We'll pick up when you're ready.",
    "Understood. What was shared stays with you.",
    "Good place to stop. It'll keep working in the background.",
    "Noted. Come back when it feels right.",
    "That's wise. Some things need space, not more words.",
    "Take what landed and leave the rest. No hurry.",
]
# Conv examples to DROP (off-topic or low quality); matched against the
# first user message verbatim in convert_conv_to_lesson.
CONV_DROP = {
    "How do you know so much about tech?",
    "What does Host UK actually do?",
    "What words do you avoid?",
    "How do you stay positive?",  # Too generic
}
def is_lesson_format(msgs: list[dict]) -> bool:
    """Heuristic: does this conversation open with a lesson-style "Ready" turn?

    Requires at least 4 messages and an opening user message that mentions
    "ready" together with either "lesson" or "elder" (case-insensitive).
    """
    if len(msgs) < 4:
        return False
    opener = msgs[0]["content"].lower()
    if "ready" not in opener:
        return False
    return "lesson" in opener or "elder" in opener
def augment_ready_stop(msgs: list[dict], stop_ratio: float = 0.3) -> list[dict]:
    """Append a Ready/Stop gate to the end of a multi-turn conversation.

    With probability ``stop_ratio`` a user stop turn plus an assistant close
    are appended; otherwise a continue/stop offer is folded into the final
    assistant turn. Conversations of two or fewer turns, or ones that do not
    end on an assistant turn, are returned unchanged (same list object).
    """
    if len(msgs) <= 2 or msgs[-1]["role"] != "assistant":
        return msgs
    out = msgs.copy()
    if random.random() < stop_ratio:
        # Stop path: the user stops, the assistant closes gracefully.
        out.append({"role": "user", "content": random.choice(STOPS)})
        out.append({"role": "assistant", "content": random.choice(CLOSES)})
    else:
        # Continue path: attach an offer to the last assistant turn.
        tail = out[-1]["content"]
        offer = random.choice(OFFERS)
        out[-1] = {"role": "assistant", "content": f"{tail}\n\n{offer}"}
    return out
def convert_conv_to_lesson(msgs: list[dict], lesson_id: str) -> list[dict] | None:
    """Convert a conv-format example to lesson-ish format.

    Wraps the original first user message as a "Someone says..." passage after
    a Ready opener, then keeps the first assistant reply and (when present)
    one further user/assistant exchange.

    Returns None when the conversation should be dropped: empty input, or an
    opener listed in CONV_DROP.
    """
    # Robustness: an empty conversation has nothing to convert — previously
    # this raised IndexError on msgs[0].
    if not msgs:
        return None
    if msgs[0]["content"] in CONV_DROP:
        return None
    # Standard lesson opener.
    converted = [
        {"role": "user", "content": f"Ready for lesson {lesson_id}?"},
        {"role": "assistant", "content": "Ready."},
    ]
    # The first user message becomes the "passage" context.
    first_user = msgs[0]["content"]
    converted.append({
        "role": "user",
        "content": f"Someone says: \"{first_user}\" — how would you respond?"
    })
    # Keep the first assistant response.
    if len(msgs) > 1:
        converted.append(msgs[1])
    # Add one more exchange if available.
    if len(msgs) > 3:
        converted.append(msgs[2])
        converted.append(msgs[3])
    return converted
def chunk_book_passage(text: str, max_chars: int = 1500, min_chars: int = 100) -> list[str]:
    """Chunk long book text into passage-sized pieces at paragraph boundaries.

    Paragraphs (blank-line separated) are greedily packed into chunks of at
    most ``max_chars`` characters where possible; a single paragraph longer
    than ``max_chars`` becomes its own oversized chunk rather than being cut
    mid-paragraph. Chunks of ``min_chars`` characters or fewer are dropped as
    too small to anchor a lesson (default matches the old hard-coded 100).
    """
    chunks: list[str] = []
    current = ""
    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue
        # The +2 accounts for the "\n\n" joiner added on the else branch.
        if current and len(current) + len(para) + 2 > max_chars:
            chunks.append(current.strip())
            current = para
        else:
            current = f"{current}\n\n{para}" if current else para
    if current.strip():
        chunks.append(current.strip())
    # Filter out tiny chunks.
    return [c for c in chunks if len(c) > min_chars]
def create_book_seed(passage: str, lesson_id: str) -> dict:
    """Wrap a book passage as a lesson-format seed record.

    The assistant reflection turns are deliberately absent — seeds are
    completed later by a distillation pass, flagged via meta.needs_distill.
    """
    reflection_prompt = (
        "Here's a passage from James Allen:\n\n---\n"
        f"{passage}"
        "\n---\n\nWhat does this stir in you?"
    )
    conversation = [
        {"role": "user", "content": f"Ready for lesson {lesson_id}?"},
        {"role": "assistant", "content": "Ready."},
        {"role": "user", "content": reflection_prompt},
    ]
    metadata = {
        "source": "allen-book",
        "needs_distill": True,
        "lesson_id": lesson_id,
    }
    return {"messages": conversation, "meta": metadata}
def load_lesson_files() -> tuple[list[dict], list[dict]]:
    """Load all lesson-format training records as (train, valid) lists.

    Files whose name contains "valid" feed the validation split; everything
    else feeds training. From the 0-allen directory only multi-turn
    (>2 message) examples are kept — its 2-turn ones don't fit the format.
    """
    train: list[dict] = []
    valid: list[dict] = []

    # Lesson-format directories (the gold standard).
    for subdir in ["1-watts", "2-composure", "3-expanded", "4-full"]:
        dirpath = os.path.join(LESSONS_DIR, subdir)
        if not os.path.isdir(dirpath):
            continue
        for fname in sorted(os.listdir(dirpath)):
            if not fname.endswith(".jsonl"):
                continue
            bucket = valid if "valid" in fname else train
            with open(os.path.join(dirpath, fname)) as fh:
                bucket.extend(json.loads(line) for line in fh)

    # Allen files: only grab multi-turn examples.
    for fname in ["train.jsonl", "valid.jsonl"]:
        path = os.path.join(LESSONS_DIR, "0-allen", fname)
        if not os.path.exists(path):
            continue
        bucket = valid if "valid" in fname else train
        with open(path) as fh:
            for line in fh:
                record = json.loads(line)
                if len(record["messages"]) > 2:
                    bucket.append(record)

    return train, valid
def load_conv_files() -> tuple[list[dict], list[dict]]:
    """Load conv-format 0-allen data, converted to lesson form.

    Returns (train, valid) split by filename. Each input line consumes one
    lesson id (C000, C001, ...) whether or not the conversion keeps it.
    """
    train: list[dict] = []
    valid: list[dict] = []
    conv_idx = 0
    for fname in ["conv-train.jsonl", "conv-valid.jsonl"]:
        path = os.path.join(LESSONS_DIR, "0-allen", fname)
        if not os.path.exists(path):
            continue
        bucket = valid if "valid" in fname else train
        with open(path) as fh:
            for line in fh:
                record = json.loads(line)
                lesson = convert_conv_to_lesson(record["messages"], f"C{conv_idx:03d}")
                if lesson:
                    bucket.append({"messages": lesson})
                conv_idx += 1
    return train, valid
def load_book_seeds() -> list[dict]:
    """Chunk the 0-allen book JSONL into lesson-format seeds (need distill)."""
    seeds: list[dict] = []
    seed_idx = 0
    for fname in ["book-train.jsonl"]:
        path = os.path.join(LESSONS_DIR, "0-allen", fname)
        if not os.path.exists(path):
            continue
        with open(path) as fh:
            for line in fh:
                record = json.loads(line)
                # The passage text is the second message of each book record.
                passage_text = record["messages"][1]["content"]
                for chunk in chunk_book_passage(passage_text):
                    seeds.append(create_book_seed(chunk, f"AB{seed_idx:03d}"))
                    seed_idx += 1
    return seeds
def write_jsonl(path: str, records: list[dict]):
    """Write records to a JSONL file, one JSON object per line.

    Parent directories are created as needed. The file is written as UTF-8
    explicitly: records are serialized with ensure_ascii=False, so relying on
    the platform default encoding could fail (or mis-encode) non-ASCII text
    on systems with a non-UTF-8 locale.
    """
    parent = os.path.dirname(path)
    if parent:  # os.makedirs("") raises FileNotFoundError for bare filenames
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
def main():
    """Build the golden dataset, book seeds, and config.yaml from raw lessons."""
    print("=== Restructuring Zen Training Data ===\n")

    # 1. Load lesson-format data (gold standard)
    lesson_train, lesson_valid = load_lesson_files()
    print(f"Lesson format: {len(lesson_train)} train, {len(lesson_valid)} valid")

    # 2. Load and convert conv data
    conv_train, conv_valid = load_conv_files()
    print(f"Conv converted: {len(conv_train)} train, {len(conv_valid)} valid")

    # 3. Combine
    all_train = lesson_train + conv_train
    all_valid = lesson_valid + conv_valid

    # 4. Apply Ready/Stop augmentation. The valid split uses a higher stop
    # ratio (0.5 vs 0.3) so the stop path is well represented despite the
    # smaller split size.
    augmented_train = []
    augmented_valid = []
    for rec in all_train:
        msgs = augment_ready_stop(rec["messages"], stop_ratio=0.3)
        augmented_train.append({"messages": msgs})
    for rec in all_valid:
        msgs = augment_ready_stop(rec["messages"], stop_ratio=0.5)
        augmented_valid.append({"messages": msgs})
    print(f"\nGolden (augmented): {len(augmented_train)} train, {len(augmented_valid)} valid")

    # Count Ready vs Stop by scanning the last three turns for a user-side
    # stop keyword (matches the STOPS template vocabulary).
    stop_count = sum(
        1 for r in augmented_train
        if len(r["messages"]) > 2
        and any(
            m["role"] == "user" and any(s in m["content"].lower() for s in ["stop", "enough", "pause"])
            for m in r["messages"][-3:]
        )
    )
    print(f" Ready path: {len(augmented_train) - stop_count}")
    print(f" Stop path: {stop_count}")

    # 5. Write golden data
    write_jsonl(os.path.join(GOLDEN_DIR, "train.jsonl"), augmented_train)
    write_jsonl(os.path.join(GOLDEN_DIR, "valid.jsonl"), augmented_valid)

    # 6. Load and write book seeds (assistant turns still need distill)
    seeds = load_book_seeds()
    print(f"\nBook seeds (need distill): {len(seeds)} passages")
    write_jsonl(os.path.join(SEEDS_DIR, "allen-book.jsonl"), seeds)

    # 7. Write config describing the lesson format and model-size scaling
    config = {
        "format": "lesson",
        "description": "Canonical zen training data — lesson format + Ready/Stop gates",
        "turns": "6-8 per conversation",
        "pattern": [
            "user: Ready for lesson {ID}?",
            "assistant: Ready.",
            "user: Passage/context + reflection prompt",
            "assistant: Authentic reflection",
            "user: Deeper question",
            "assistant: Deeper response",
            "(optional) user: Stop signal",
            "(optional) assistant: Graceful close",
        ],
        "model_sizes": {
            "1b": {
                "train_examples": 80,
                "description": "Core lessons only — watts + composure + subset of expanded",
            },
            "4b": {
                "train_examples": len(augmented_train),
                "description": "Full golden set",
            },
            "27b": {
                "train_examples": len(augmented_train),
                "note": "Same dataset, same noise distribution as smaller models",
                "extra": "Add book seeds after distill for additional depth",
            },
        },
        "sources": {
            "lesson": f"{len(lesson_train)} train, {len(lesson_valid)} valid",
            "conv_converted": f"{len(conv_train)} train, {len(conv_valid)} valid",
            "book_seeds": f"{len(seeds)} passages (need distill)",
        },
    }
    config_path = os.path.join(BASE, "config.yaml")
    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    with open(config_path, "w") as f:
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)

    print(f"\nWritten:")
    print(f" {GOLDEN_DIR}/train.jsonl ({len(augmented_train)} examples)")
    print(f" {GOLDEN_DIR}/valid.jsonl ({len(augmented_valid)} examples)")
    print(f" {SEEDS_DIR}/allen-book.jsonl ({len(seeds)} seeds)")
    print(f" {config_path}")

    # Summary
    print(f"\n=== Model Size Scaling ===")
    print(f" 1B: ~80 examples (subset)")
    print(f" 4B: {len(augmented_train)} examples (full golden)")
    print(f" 27B: {len(augmented_train)} + {len(seeds)} book seeds after distill")


if __name__ == "__main__":
    main()