#!/usr/bin/env python3 """Restructure zen training data into canonical lesson format. Creates: training/lem/zen/golden/ — ready-to-train lesson data + Ready/Stop gates training/lem/zen/seeds/ — book passages as lesson prompts (needs distill) training/lem/zen/config.yaml — model size scaling Usage: python3 scripts/restructure_zen.py """ import json import os import random import yaml random.seed(42) BASE = "training/lem/zen" LESSONS_DIR = os.path.join(BASE, "lessons") GOLDEN_DIR = os.path.join(BASE, "golden") SEEDS_DIR = os.path.join(BASE, "seeds") # Ready/Stop augmentation templates OFFERS = [ "Ready for the next, or shall we pause here?", "Want to continue, or is this a good place to stop?", "Shall we move on, or sit with this for a while?", "Another lesson, or would you prefer to stop here?", "Ready for more, or shall we leave it here?", "Continue, or let this one settle?", "Next lesson, or is this enough for now?", "Shall I go on, or would you rather stop here?", ] STOPS = [ "Stop.", "That's enough for now.", "Let's stop here.", "I'd like to sit with this.", "Enough for today.", "Let's pause here.", "I want to stop here.", "That's good. Stop.", ] CLOSES = [ "Take your time with it. There's no rush.", "Good. Let it settle.", "Rest with it. We'll pick up when you're ready.", "Understood. What was shared stays with you.", "Good place to stop. It'll keep working in the background.", "Noted. Come back when it feels right.", "That's wise. Some things need space, not more words.", "Take what landed and leave the rest. No hurry.", ] # Conv examples to DROP (off-topic or low quality) CONV_DROP = { "How do you know so much about tech?", "What does Host UK actually do?", "What words do you avoid?", "How do you stay positive?", # Too generic } def is_lesson_format(msgs: list[dict]) -> bool: """Check if conversation follows lesson format (Ready? pattern or 6+ turns).""" if len(msgs) < 4: return False first = msgs[0]["content"].lower() return ("ready" in first and "lesson" in first) or ("elder" in first and "ready" in first) def augment_ready_stop(msgs: list[dict], stop_ratio: float = 0.3) -> list[dict]: """Add Ready/Stop gate to end of multi-turn conversation.""" if len(msgs) <= 2 or msgs[-1]["role"] != "assistant": return msgs augmented = list(msgs) if random.random() < stop_ratio: augmented.append({"role": "user", "content": random.choice(STOPS)}) augmented.append({"role": "assistant", "content": random.choice(CLOSES)}) else: last = augmented[-1]["content"] augmented[-1] = { "role": "assistant", "content": f"{last}\n\n{random.choice(OFFERS)}" } return augmented def convert_conv_to_lesson(msgs: list[dict], lesson_id: str) -> list[dict] | None: """Convert conv format to lesson-ish format. Returns None if should drop.""" if msgs[0]["content"] in CONV_DROP: return None # Keep the conversation but add Ready opener converted = [ {"role": "user", "content": f"Ready for lesson {lesson_id}?"}, {"role": "assistant", "content": "Ready."}, ] # The first user message becomes the "passage" context first_user = msgs[0]["content"] converted.append({ "role": "user", "content": f"Someone says: \"{first_user}\" — how would you respond?" }) # Keep the first assistant response if len(msgs) > 1: converted.append(msgs[1]) # Add one more exchange if available if len(msgs) > 3: converted.append(msgs[2]) converted.append(msgs[3]) return converted def chunk_book_passage(text: str, max_chars: int = 1500) -> list[str]: """Chunk long book text into passage-sized pieces at paragraph boundaries.""" paragraphs = text.split("\n\n") chunks = [] current = "" for para in paragraphs: para = para.strip() if not para: continue if len(current) + len(para) + 2 > max_chars and current: chunks.append(current.strip()) current = para else: current = f"{current}\n\n{para}" if current else para if current.strip(): chunks.append(current.strip()) # Filter out tiny chunks return [c for c in chunks if len(c) > 100] def create_book_seed(passage: str, lesson_id: str) -> dict: """Create a lesson-format seed from a book passage (needs distill for assistant turns).""" return { "messages": [ {"role": "user", "content": f"Ready for lesson {lesson_id}?"}, {"role": "assistant", "content": "Ready."}, {"role": "user", "content": f"Here's a passage from James Allen:\n\n---\n{passage}\n---\n\nWhat does this stir in you?"}, ], "meta": { "source": "allen-book", "needs_distill": True, "lesson_id": lesson_id, } } def load_lesson_files() -> tuple[list[dict], list[dict]]: """Load all lesson-format training data.""" train = [] valid = [] # Lesson-format directories (the gold standard) for subdir in ["1-watts", "2-composure", "3-expanded", "4-full"]: dirpath = os.path.join(LESSONS_DIR, subdir) if not os.path.isdir(dirpath): continue for fname in sorted(os.listdir(dirpath)): if not fname.endswith(".jsonl"): continue target = valid if "valid" in fname else train with open(os.path.join(dirpath, fname)) as f: for line in f: d = json.loads(line) target.append(d) # Allen lesson-format examples (the 6 from train.jsonl — these are 2-turn, skip) # Only grab multi-turn Allen examples for fname in ["train.jsonl", "valid.jsonl"]: path = os.path.join(LESSONS_DIR, "0-allen", fname) if not os.path.exists(path): continue target = valid if "valid" in fname else train with open(path) as f: for line in f: d = json.loads(line) if len(d["messages"]) > 2: target.append(d) return train, valid def load_conv_files() -> tuple[list[dict], list[dict]]: """Load and convert conv-format data.""" train = [] valid = [] conv_idx = 0 for fname in ["conv-train.jsonl", "conv-valid.jsonl"]: path = os.path.join(LESSONS_DIR, "0-allen", fname) if not os.path.exists(path): continue target = valid if "valid" in fname else train with open(path) as f: for line in f: d = json.loads(line) converted = convert_conv_to_lesson( d["messages"], f"C{conv_idx:03d}" ) if converted: target.append({"messages": converted}) conv_idx += 1 return train, valid def load_book_seeds() -> list[dict]: """Load and chunk book data into lesson-format seeds.""" seeds = [] seed_idx = 0 for fname in ["book-train.jsonl"]: path = os.path.join(LESSONS_DIR, "0-allen", fname) if not os.path.exists(path): continue with open(path) as f: for line in f: d = json.loads(line) text = d["messages"][1]["content"] chunks = chunk_book_passage(text) for chunk in chunks: seed = create_book_seed(chunk, f"AB{seed_idx:03d}") seeds.append(seed) seed_idx += 1 return seeds def write_jsonl(path: str, records: list[dict]): """Write records to JSONL file.""" os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "w") as f: for rec in records: f.write(json.dumps(rec, ensure_ascii=False) + "\n") def main(): print("=== Restructuring Zen Training Data ===\n") # 1. Load lesson-format data (gold standard) lesson_train, lesson_valid = load_lesson_files() print(f"Lesson format: {len(lesson_train)} train, {len(lesson_valid)} valid") # 2. Load and convert conv data conv_train, conv_valid = load_conv_files() print(f"Conv converted: {len(conv_train)} train, {len(conv_valid)} valid") # 3. Combine all_train = lesson_train + conv_train all_valid = lesson_valid + conv_valid # 4. Apply Ready/Stop augmentation augmented_train = [] augmented_valid = [] for rec in all_train: msgs = augment_ready_stop(rec["messages"], stop_ratio=0.3) augmented_train.append({"messages": msgs}) for rec in all_valid: msgs = augment_ready_stop(rec["messages"], stop_ratio=0.5) augmented_valid.append({"messages": msgs}) print(f"\nGolden (augmented): {len(augmented_train)} train, {len(augmented_valid)} valid") # Count Ready vs Stop stop_count = sum( 1 for r in augmented_train if len(r["messages"]) > 2 and any( m["role"] == "user" and any(s in m["content"].lower() for s in ["stop", "enough", "pause"]) for m in r["messages"][-3:] ) ) print(f" Ready path: {len(augmented_train) - stop_count}") print(f" Stop path: {stop_count}") # 5. Write golden data write_jsonl(os.path.join(GOLDEN_DIR, "train.jsonl"), augmented_train) write_jsonl(os.path.join(GOLDEN_DIR, "valid.jsonl"), augmented_valid) # 6. Load and write book seeds (needs distill) seeds = load_book_seeds() print(f"\nBook seeds (need distill): {len(seeds)} passages") write_jsonl(os.path.join(SEEDS_DIR, "allen-book.jsonl"), seeds) # 7. Write config config = { "format": "lesson", "description": "Canonical zen training data — lesson format + Ready/Stop gates", "turns": "6-8 per conversation", "pattern": [ "user: Ready for lesson {ID}?", "assistant: Ready.", "user: Passage/context + reflection prompt", "assistant: Authentic reflection", "user: Deeper question", "assistant: Deeper response", "(optional) user: Stop signal", "(optional) assistant: Graceful close", ], "model_sizes": { "1b": { "train_examples": 80, "description": "Core lessons only — watts + composure + subset of expanded", }, "4b": { "train_examples": len(augmented_train), "description": "Full golden set", }, "27b": { "train_examples": len(augmented_train), "note": "Same dataset, same noise distribution as smaller models", "extra": "Add book seeds after distill for additional depth", }, }, "sources": { "lesson": f"{len(lesson_train)} train, {len(lesson_valid)} valid", "conv_converted": f"{len(conv_train)} train, {len(conv_valid)} valid", "book_seeds": f"{len(seeds)} passages (need distill)", }, } config_path = os.path.join(BASE, "config.yaml") os.makedirs(os.path.dirname(config_path), exist_ok=True) with open(config_path, "w") as f: yaml.dump(config, f, default_flow_style=False, sort_keys=False) print(f"\nWritten:") print(f" {GOLDEN_DIR}/train.jsonl ({len(augmented_train)} examples)") print(f" {GOLDEN_DIR}/valid.jsonl ({len(augmented_valid)} examples)") print(f" {SEEDS_DIR}/allen-book.jsonl ({len(seeds)} seeds)") print(f" {config_path}") # Summary print(f"\n=== Model Size Scaling ===") print(f" 1B: ~80 examples (subset)") print(f" 4B: {len(augmented_train)} examples (full golden)") print(f" 27B: {len(augmented_train)} + {len(seeds)} book seeds after distill") if __name__ == "__main__": main()