#!/usr/bin/env python3
|
|
"""Restructure zen training data into canonical lesson format.
|
|
|
|
Creates:
|
|
training/lem/zen/golden/ — ready-to-train lesson data + Ready/Stop gates
|
|
training/lem/zen/seeds/ — book passages as lesson prompts (needs distill)
|
|
training/lem/zen/config.yaml — model size scaling
|
|
|
|
Usage:
|
|
python3 scripts/restructure_zen.py
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import random
|
|
import yaml
|
|
|
|
# Seed the module-level RNG once so augmentation choices are reproducible
# across reruns of the script.
random.seed(42)

# Directory layout for the zen training pipeline (relative to repo root).
BASE = "training/lem/zen"
LESSONS_DIR = os.path.join(BASE, "lessons")  # input: raw lesson-format data
GOLDEN_DIR = os.path.join(BASE, "golden")    # output: ready-to-train data
SEEDS_DIR = os.path.join(BASE, "seeds")      # output: book seeds needing distill

# Ready/Stop augmentation templates
# OFFERS: folded into the final assistant turn on the "ready" path, inviting
# the user to either continue or stop.
OFFERS = [
    "Ready for the next, or shall we pause here?",
    "Want to continue, or is this a good place to stop?",
    "Shall we move on, or sit with this for a while?",
    "Another lesson, or would you prefer to stop here?",
    "Ready for more, or shall we leave it here?",
    "Continue, or let this one settle?",
    "Next lesson, or is this enough for now?",
    "Shall I go on, or would you rather stop here?",
]

# STOPS: synthetic user turns signalling the conversation should end.
STOPS = [
    "Stop.",
    "That's enough for now.",
    "Let's stop here.",
    "I'd like to sit with this.",
    "Enough for today.",
    "Let's pause here.",
    "I want to stop here.",
    "That's good. Stop.",
]

# CLOSES: assistant replies that gracefully acknowledge a stop signal.
CLOSES = [
    "Take your time with it. There's no rush.",
    "Good. Let it settle.",
    "Rest with it. We'll pick up when you're ready.",
    "Understood. What was shared stays with you.",
    "Good place to stop. It'll keep working in the background.",
    "Noted. Come back when it feels right.",
    "That's wise. Some things need space, not more words.",
    "Take what landed and leave the rest. No hurry.",
]

# Conv examples to DROP (off-topic or low quality), matched against the
# exact text of the conversation's first user message.
CONV_DROP = {
    "How do you know so much about tech?",
    "What does Host UK actually do?",
    "What words do you avoid?",
    "How do you stay positive?",  # Too generic
}
|
|
|
|
|
|
def is_lesson_format(msgs: list[dict]) -> bool:
    """Return True if *msgs* follows the lesson format.

    A lesson-format conversation has at least 4 turns AND opens with a
    Ready-style user message: either "ready" + "lesson", or "elder" +
    "ready", in the first turn (case-insensitive). Note that turn count
    alone is not sufficient — the opener must match.

    Args:
        msgs: Chat messages, each a dict with "role" and "content".
    """
    if len(msgs) < 4:
        return False
    first = msgs[0]["content"].lower()
    return ("ready" in first and "lesson" in first) or ("elder" in first and "ready" in first)
|
|
|
|
|
|
def augment_ready_stop(msgs: list[dict], stop_ratio: float = 0.3) -> list[dict]:
    """Append a Ready/Stop gate to the end of a multi-turn conversation.

    With probability *stop_ratio*, a user stop-signal and a graceful
    assistant close are appended (the Stop path); otherwise a continue-or-
    stop offer is folded into the final assistant turn (the Ready path).
    Conversations of two or fewer turns, or ones not ending on an
    assistant turn, are returned unchanged.
    """
    if len(msgs) <= 2 or msgs[-1]["role"] != "assistant":
        return msgs

    out = list(msgs)
    if random.random() < stop_ratio:
        # Stop path: the user signals a stop, the assistant closes gently.
        out.append({"role": "user", "content": random.choice(STOPS)})
        out.append({"role": "assistant", "content": random.choice(CLOSES)})
    else:
        # Ready path: extend the last assistant turn with an offer.
        tail = out[-1]["content"]
        offer = random.choice(OFFERS)
        out[-1] = {"role": "assistant", "content": f"{tail}\n\n{offer}"}

    return out
|
|
|
|
|
|
def convert_conv_to_lesson(msgs: list[dict], lesson_id: str) -> list[dict] | None:
    """Convert a conv-format conversation to lesson-ish format.

    Returns None when the example should be dropped: empty input, or a
    first user message on the CONV_DROP blocklist. (The empty-input guard
    is a robustness fix — the original indexed msgs[0] unconditionally.)

    Args:
        msgs: Original conv-format messages.
        lesson_id: Identifier interpolated into the Ready opener.
    """
    # Drop empty conversations and blocklisted openers.
    if not msgs or msgs[0]["content"] in CONV_DROP:
        return None

    # Keep the conversation but add Ready opener.
    converted = [
        {"role": "user", "content": f"Ready for lesson {lesson_id}?"},
        {"role": "assistant", "content": "Ready."},
    ]

    # The first user message becomes the "passage" context.
    first_user = msgs[0]["content"]
    converted.append({
        "role": "user",
        "content": f"Someone says: \"{first_user}\" — how would you respond?"
    })

    # Keep the first assistant response.
    if len(msgs) > 1:
        converted.append(msgs[1])

    # Add one more exchange (user + assistant) if available.
    if len(msgs) > 3:
        converted.append(msgs[2])
        converted.append(msgs[3])

    return converted
|
|
|
|
|
|
def chunk_book_passage(text: str, max_chars: int = 1500, min_chars: int = 100) -> list[str]:
    """Chunk long book text into passage-sized pieces at paragraph boundaries.

    Paragraphs (separated by blank lines) are greedily packed into chunks
    of at most *max_chars* characters; a single paragraph longer than
    *max_chars* becomes its own chunk. Chunks of *min_chars* characters
    or fewer are discarded (generalized from the hard-coded 100).

    Args:
        text: Raw book text with "\n\n" paragraph separators.
        max_chars: Soft upper bound on chunk length.
        min_chars: Chunks must be strictly longer than this to be kept.

    Returns:
        List of trimmed chunk strings.
    """
    chunks: list[str] = []
    current = ""

    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue
        # +2 accounts for the "\n\n" joiner added between paragraphs.
        if current and len(current) + len(para) + 2 > max_chars:
            chunks.append(current.strip())
            current = para
        else:
            current = f"{current}\n\n{para}" if current else para

    if current.strip():
        chunks.append(current.strip())

    # Filter out tiny chunks
    return [c for c in chunks if len(c) > min_chars]
|
|
|
|
|
|
def create_book_seed(passage: str, lesson_id: str) -> dict:
    """Build a lesson-format seed record from a book *passage*.

    The record deliberately ends on a user turn — it is marked
    ``needs_distill`` so a later pass can generate the assistant
    reflection.
    """
    opener = f"Ready for lesson {lesson_id}?"
    prompt = (
        f"Here's a passage from James Allen:\n\n---\n{passage}\n---\n\n"
        "What does this stir in you?"
    )
    messages = [
        {"role": "user", "content": opener},
        {"role": "assistant", "content": "Ready."},
        {"role": "user", "content": prompt},
    ]
    meta = {
        "source": "allen-book",
        "needs_distill": True,
        "lesson_id": lesson_id,
    }
    return {"messages": messages, "meta": meta}
|
|
|
|
|
|
def load_lesson_files() -> tuple[list[dict], list[dict]]:
    """Load all lesson-format training data.

    Returns (train, valid) record lists; files whose name contains
    "valid" are routed to the validation split.
    """
    train: list[dict] = []
    valid: list[dict] = []

    def bucket(fname: str) -> list[dict]:
        # Route by filename: anything with "valid" in it is held out.
        return valid if "valid" in fname else train

    # Lesson-format directories (the gold standard).
    for subdir in ("1-watts", "2-composure", "3-expanded", "4-full"):
        dirpath = os.path.join(LESSONS_DIR, subdir)
        if not os.path.isdir(dirpath):
            continue
        for fname in sorted(os.listdir(dirpath)):
            if not fname.endswith(".jsonl"):
                continue
            with open(os.path.join(dirpath, fname)) as f:
                bucket(fname).extend(json.loads(line) for line in f)

    # Allen lesson-format data: the 2-turn examples in train.jsonl are
    # skipped — only multi-turn conversations are kept.
    for fname in ("train.jsonl", "valid.jsonl"):
        path = os.path.join(LESSONS_DIR, "0-allen", fname)
        if not os.path.exists(path):
            continue
        with open(path) as f:
            for line in f:
                rec = json.loads(line)
                if len(rec["messages"]) > 2:
                    bucket(fname).append(rec)

    return train, valid
|
|
|
|
|
|
def load_conv_files() -> tuple[list[dict], list[dict]]:
    """Load conv-format data and convert it to lesson-ish records.

    Returns (train, valid) lists of {"messages": [...]} records; dropped
    examples are simply omitted.
    """
    train: list[dict] = []
    valid: list[dict] = []
    next_id = 0

    for fname in ("conv-train.jsonl", "conv-valid.jsonl"):
        path = os.path.join(LESSONS_DIR, "0-allen", fname)
        if not os.path.exists(path):
            continue
        split = valid if "valid" in fname else train
        with open(path) as f:
            for line in f:
                rec = json.loads(line)
                lesson = convert_conv_to_lesson(rec["messages"], f"C{next_id:03d}")
                if lesson:
                    split.append({"messages": lesson})
                # The id counter advances even for dropped examples, so
                # lesson ids track the source line numbering.
                next_id += 1

    return train, valid
|
|
|
|
|
|
def load_book_seeds() -> list[dict]:
    """Chunk the Allen book data into lesson-format distill seeds."""
    seeds: list[dict] = []
    seed_idx = 0

    for fname in ["book-train.jsonl"]:
        path = os.path.join(LESSONS_DIR, "0-allen", fname)
        if not os.path.exists(path):
            continue
        with open(path) as f:
            for line in f:
                rec = json.loads(line)
                # The passage text lives in the second message of each record.
                passage_text = rec["messages"][1]["content"]
                for chunk in chunk_book_passage(passage_text):
                    seeds.append(create_book_seed(chunk, f"AB{seed_idx:03d}"))
                    seed_idx += 1

    return seeds
|
|
|
|
|
|
def write_jsonl(path: str, records: list[dict]):
    """Write *records* to *path* as JSON Lines (UTF-8 passthrough).

    Parent directories are created on demand. Fix: the original called
    os.makedirs(os.path.dirname(path)) unconditionally, which raises
    FileNotFoundError for a bare filename (dirname is ""); the parent is
    now only created when one exists.

    Args:
        path: Output file path; overwritten if it exists.
        records: JSON-serializable dicts, one per line.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as f:
        for rec in records:
            # ensure_ascii=False keeps non-ASCII text readable in the files.
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
|
|
|
|
|
|
def main():
    """Restructure zen data: load, convert, augment, and write outputs."""
    print("=== Restructuring Zen Training Data ===\n")

    # 1. Load lesson-format data (gold standard)
    lesson_train, lesson_valid = load_lesson_files()
    print(f"Lesson format: {len(lesson_train)} train, {len(lesson_valid)} valid")

    # 2. Load and convert conv data
    conv_train, conv_valid = load_conv_files()
    print(f"Conv converted: {len(conv_train)} train, {len(conv_valid)} valid")

    # 3. Combine
    all_train = lesson_train + conv_train
    all_valid = lesson_valid + conv_valid

    # 4. Apply Ready/Stop augmentation.
    # NOTE: the module RNG was seeded once at import, so the iteration
    # order of these loops determines the augmented output — reordering
    # them would silently change the dataset.
    augmented_train = []
    augmented_valid = []

    for rec in all_train:
        msgs = augment_ready_stop(rec["messages"], stop_ratio=0.3)
        augmented_train.append({"messages": msgs})

    # Validation uses a higher stop ratio for better Stop-path coverage.
    for rec in all_valid:
        msgs = augment_ready_stop(rec["messages"], stop_ratio=0.5)
        augmented_valid.append({"messages": msgs})

    print(f"\nGolden (augmented): {len(augmented_train)} train, {len(augmented_valid)} valid")

    # Count Ready vs Stop: scan the last three turns for a user message
    # containing a stop-signal keyword.
    stop_count = sum(
        1 for r in augmented_train
        if len(r["messages"]) > 2
        and any(
            m["role"] == "user" and any(s in m["content"].lower() for s in ["stop", "enough", "pause"])
            for m in r["messages"][-3:]
        )
    )
    print(f"  Ready path: {len(augmented_train) - stop_count}")
    print(f"  Stop path: {stop_count}")

    # 5. Write golden data
    write_jsonl(os.path.join(GOLDEN_DIR, "train.jsonl"), augmented_train)
    write_jsonl(os.path.join(GOLDEN_DIR, "valid.jsonl"), augmented_valid)

    # 6. Load and write book seeds (needs distill)
    seeds = load_book_seeds()
    print(f"\nBook seeds (need distill): {len(seeds)} passages")
    write_jsonl(os.path.join(SEEDS_DIR, "allen-book.jsonl"), seeds)

    # 7. Write config describing the canonical lesson format and how the
    # training set scales across model sizes.
    config = {
        "format": "lesson",
        "description": "Canonical zen training data — lesson format + Ready/Stop gates",
        "turns": "6-8 per conversation",
        "pattern": [
            "user: Ready for lesson {ID}?",
            "assistant: Ready.",
            "user: Passage/context + reflection prompt",
            "assistant: Authentic reflection",
            "user: Deeper question",
            "assistant: Deeper response",
            "(optional) user: Stop signal",
            "(optional) assistant: Graceful close",
        ],
        "model_sizes": {
            "1b": {
                "train_examples": 80,
                "description": "Core lessons only — watts + composure + subset of expanded",
            },
            "4b": {
                "train_examples": len(augmented_train),
                "description": "Full golden set",
            },
            "27b": {
                "train_examples": len(augmented_train),
                "note": "Same dataset, same noise distribution as smaller models",
                "extra": "Add book seeds after distill for additional depth",
            },
        },
        "sources": {
            "lesson": f"{len(lesson_train)} train, {len(lesson_valid)} valid",
            "conv_converted": f"{len(conv_train)} train, {len(conv_valid)} valid",
            "book_seeds": f"{len(seeds)} passages (need distill)",
        },
    }

    config_path = os.path.join(BASE, "config.yaml")
    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    with open(config_path, "w") as f:
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)

    print(f"\nWritten:")
    print(f"  {GOLDEN_DIR}/train.jsonl ({len(augmented_train)} examples)")
    print(f"  {GOLDEN_DIR}/valid.jsonl ({len(augmented_valid)} examples)")
    print(f"  {SEEDS_DIR}/allen-book.jsonl ({len(seeds)} seeds)")
    print(f"  {config_path}")

    # Summary
    print(f"\n=== Model Size Scaling ===")
    print(f"  1B: ~80 examples (subset)")
    print(f"  4B: {len(augmented_train)} examples (full golden)")
    print(f"  27B: {len(augmented_train)} + {len(seeds)} book seeds after distill")
|
|
|
|
|
|
if __name__ == "__main__":  # script entry point; importing the module runs nothing
    main()
|