#!/usr/bin/env python3
|
|
"""Restructure zen training data into canonical lesson format.
|
|
|
|
Creates:
|
|
training/lem/zen/golden/ — ready-to-train lesson data + Ready/Stop gates
|
|
training/lem/zen/seeds/ — book passages as lesson prompts (needs distill)
|
|
training/lem/zen/config.yaml — model size scaling
|
|
|
|
Usage:
|
|
python3 scripts/restructure_zen.py
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import random
|
|
import yaml
|
|
|
|
# Seed the module-level RNG once so augmentation choices are reproducible
# across reruns of the script.
random.seed(42)

# Directory layout for the zen training pipeline (relative to repo root).
BASE = "training/lem/zen"
LESSONS_DIR = os.path.join(BASE, "lessons")  # input: raw lesson-format data
GOLDEN_DIR = os.path.join(BASE, "golden")    # output: ready-to-train data
SEEDS_DIR = os.path.join(BASE, "seeds")      # output: book seeds needing distill

# Ready/Stop augmentation templates
# OFFERS: folded into the final assistant turn on the "ready" path, inviting
# the user to either continue or stop.
OFFERS = [
    "Ready for the next, or shall we pause here?",
    "Want to continue, or is this a good place to stop?",
    "Shall we move on, or sit with this for a while?",
    "Another lesson, or would you prefer to stop here?",
    "Ready for more, or shall we leave it here?",
    "Continue, or let this one settle?",
    "Next lesson, or is this enough for now?",
    "Shall I go on, or would you rather stop here?",
]

# STOPS: synthetic user turns signalling the conversation should end.
STOPS = [
    "Stop.",
    "That's enough for now.",
    "Let's stop here.",
    "I'd like to sit with this.",
    "Enough for today.",
    "Let's pause here.",
    "I want to stop here.",
    "That's good. Stop.",
]

# CLOSES: assistant replies that gracefully acknowledge a stop signal.
CLOSES = [
    "Take your time with it. There's no rush.",
    "Good. Let it settle.",
    "Rest with it. We'll pick up when you're ready.",
    "Understood. What was shared stays with you.",
    "Good place to stop. It'll keep working in the background.",
    "Noted. Come back when it feels right.",
    "That's wise. Some things need space, not more words.",
    "Take what landed and leave the rest. No hurry.",
]

# Conv examples to DROP (off-topic or low quality), matched against the
# exact text of the conversation's first user message.
CONV_DROP = {
    "How do you know so much about tech?",
    "What does Host UK actually do?",
    "What words do you avoid?",
    "How do you stay positive?",  # Too generic
}
|
|
|
|
|
|
def is_lesson_format(msgs: list[dict]) -> bool:
    """Return True if *msgs* follows the lesson format.

    A lesson-format conversation has at least 4 turns AND opens with a
    Ready-style user message: either "ready" + "lesson", or "elder" +
    "ready", in the first turn (case-insensitive). Note that turn count
    alone is not sufficient — the opener must match.

    Args:
        msgs: Chat messages, each a dict with "role" and "content".
    """
    if len(msgs) < 4:
        return False
    first = msgs[0]["content"].lower()
    return ("ready" in first and "lesson" in first) or ("elder" in first and "ready" in first)
|
|
|
|
|
|
def augment_ready_stop(msgs: list[dict], stop_ratio: float = 0.3) -> list[dict]:
    """Append a Ready/Stop gate to the end of a multi-turn conversation.

    With probability *stop_ratio*, a user stop-signal and a graceful
    assistant close are appended (the Stop path); otherwise a continue-or-
    stop offer is folded into the final assistant turn (the Ready path).
    Conversations of two or fewer turns, or ones not ending on an
    assistant turn, are returned unchanged.
    """
    if len(msgs) <= 2 or msgs[-1]["role"] != "assistant":
        return msgs

    out = list(msgs)
    if random.random() < stop_ratio:
        # Stop path: the user signals a stop, the assistant closes gently.
        out.append({"role": "user", "content": random.choice(STOPS)})
        out.append({"role": "assistant", "content": random.choice(CLOSES)})
    else:
        # Ready path: extend the last assistant turn with an offer.
        tail = out[-1]["content"]
        offer = random.choice(OFFERS)
        out[-1] = {"role": "assistant", "content": f"{tail}\n\n{offer}"}

    return out
|
|
|
|
|
|
def convert_conv_to_lesson(msgs: list[dict], lesson_id: str) -> list[dict] | None:
    """Convert a conv-format conversation to lesson-ish format.

    Returns None when the example should be dropped: empty input, or a
    first user message on the CONV_DROP blocklist. (The empty-input guard
    is a robustness fix — the original indexed msgs[0] unconditionally.)

    Args:
        msgs: Original conv-format messages.
        lesson_id: Identifier interpolated into the Ready opener.
    """
    # Drop empty conversations and blocklisted openers.
    if not msgs or msgs[0]["content"] in CONV_DROP:
        return None

    # Keep the conversation but add Ready opener.
    converted = [
        {"role": "user", "content": f"Ready for lesson {lesson_id}?"},
        {"role": "assistant", "content": "Ready."},
    ]

    # The first user message becomes the "passage" context.
    first_user = msgs[0]["content"]
    converted.append({
        "role": "user",
        "content": f"Someone says: \"{first_user}\" — how would you respond?"
    })

    # Keep the first assistant response.
    if len(msgs) > 1:
        converted.append(msgs[1])

    # Add one more exchange (user + assistant) if available.
    if len(msgs) > 3:
        converted.append(msgs[2])
        converted.append(msgs[3])

    return converted
|
|
|
|
|
|
def chunk_book_passage(text: str, max_chars: int = 1500, min_chars: int = 100) -> list[str]:
    """Chunk long book text into passage-sized pieces at paragraph boundaries.

    Paragraphs (separated by blank lines) are greedily packed into chunks
    of at most *max_chars* characters; a single paragraph longer than
    *max_chars* becomes its own chunk. Chunks of *min_chars* characters
    or fewer are discarded (generalized from the hard-coded 100).

    Args:
        text: Raw book text with "\n\n" paragraph separators.
        max_chars: Soft upper bound on chunk length.
        min_chars: Chunks must be strictly longer than this to be kept.

    Returns:
        List of trimmed chunk strings.
    """
    chunks: list[str] = []
    current = ""

    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue
        # +2 accounts for the "\n\n" joiner added between paragraphs.
        if current and len(current) + len(para) + 2 > max_chars:
            chunks.append(current.strip())
            current = para
        else:
            current = f"{current}\n\n{para}" if current else para

    if current.strip():
        chunks.append(current.strip())

    # Filter out tiny chunks
    return [c for c in chunks if len(c) > min_chars]
|
|
|
|
|
|
def create_book_seed(passage: str, lesson_id: str) -> dict:
    """Build a lesson-format seed record from a book *passage*.

    The record deliberately ends on a user turn — it is marked
    ``needs_distill`` so a later pass can generate the assistant
    reflection.
    """
    opener = f"Ready for lesson {lesson_id}?"
    prompt = (
        f"Here's a passage from James Allen:\n\n---\n{passage}\n---\n\n"
        "What does this stir in you?"
    )
    messages = [
        {"role": "user", "content": opener},
        {"role": "assistant", "content": "Ready."},
        {"role": "user", "content": prompt},
    ]
    meta = {
        "source": "allen-book",
        "needs_distill": True,
        "lesson_id": lesson_id,
    }
    return {"messages": messages, "meta": meta}
|
|
|
|
|
|
def load_lesson_files() -> tuple[list[dict], list[dict]]:
    """Load all lesson-format training data.

    Returns (train, valid) record lists; files whose name contains
    "valid" are routed to the validation split.
    """
    train: list[dict] = []
    valid: list[dict] = []

    def bucket(fname: str) -> list[dict]:
        # Route by filename: anything with "valid" in it is held out.
        return valid if "valid" in fname else train

    # Lesson-format directories (the gold standard).
    for subdir in ("1-watts", "2-composure", "3-expanded", "4-full"):
        dirpath = os.path.join(LESSONS_DIR, subdir)
        if not os.path.isdir(dirpath):
            continue
        for fname in sorted(os.listdir(dirpath)):
            if not fname.endswith(".jsonl"):
                continue
            with open(os.path.join(dirpath, fname)) as f:
                bucket(fname).extend(json.loads(line) for line in f)

    # Allen lesson-format data: the 2-turn examples in train.jsonl are
    # skipped — only multi-turn conversations are kept.
    for fname in ("train.jsonl", "valid.jsonl"):
        path = os.path.join(LESSONS_DIR, "0-allen", fname)
        if not os.path.exists(path):
            continue
        with open(path) as f:
            for line in f:
                rec = json.loads(line)
                if len(rec["messages"]) > 2:
                    bucket(fname).append(rec)

    return train, valid
|
|
|
|
|
|
def load_conv_files() -> tuple[list[dict], list[dict]]:
    """Load conv-format data and convert it to lesson-ish records.

    Returns (train, valid) lists of {"messages": [...]} records; dropped
    examples are simply omitted.
    """
    train: list[dict] = []
    valid: list[dict] = []
    next_id = 0

    for fname in ("conv-train.jsonl", "conv-valid.jsonl"):
        path = os.path.join(LESSONS_DIR, "0-allen", fname)
        if not os.path.exists(path):
            continue
        split = valid if "valid" in fname else train
        with open(path) as f:
            for line in f:
                rec = json.loads(line)
                lesson = convert_conv_to_lesson(rec["messages"], f"C{next_id:03d}")
                if lesson:
                    split.append({"messages": lesson})
                # The id counter advances even for dropped examples, so
                # lesson ids track the source line numbering.
                next_id += 1

    return train, valid
|
|
|
|
|
|
def load_book_seeds() -> list[dict]:
    """Chunk the Allen book data into lesson-format distill seeds."""
    seeds: list[dict] = []
    seed_idx = 0

    for fname in ["book-train.jsonl"]:
        path = os.path.join(LESSONS_DIR, "0-allen", fname)
        if not os.path.exists(path):
            continue
        with open(path) as f:
            for line in f:
                rec = json.loads(line)
                # The passage text lives in the second message of each record.
                passage_text = rec["messages"][1]["content"]
                for chunk in chunk_book_passage(passage_text):
                    seeds.append(create_book_seed(chunk, f"AB{seed_idx:03d}"))
                    seed_idx += 1

    return seeds
|
|
|
|
|
|
def write_jsonl(path: str, records: list[dict]):
    """Write *records* to *path* as JSON Lines (UTF-8 passthrough).

    Parent directories are created on demand. Fix: the original called
    os.makedirs(os.path.dirname(path)) unconditionally, which raises
    FileNotFoundError for a bare filename (dirname is ""); the parent is
    now only created when one exists.

    Args:
        path: Output file path; overwritten if it exists.
        records: JSON-serializable dicts, one per line.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as f:
        for rec in records:
            # ensure_ascii=False keeps non-ASCII text readable in the files.
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
|
|
|
|
|
|
def main():
    """Restructure zen data: load, convert, augment, and write outputs."""
    print("=== Restructuring Zen Training Data ===\n")

    # 1. Load lesson-format data (gold standard)
    lesson_train, lesson_valid = load_lesson_files()
    print(f"Lesson format: {len(lesson_train)} train, {len(lesson_valid)} valid")

    # 2. Load and convert conv data
    conv_train, conv_valid = load_conv_files()
    print(f"Conv converted: {len(conv_train)} train, {len(conv_valid)} valid")

    # 3. Combine
    all_train = lesson_train + conv_train
    all_valid = lesson_valid + conv_valid

    # 4. Apply Ready/Stop augmentation.
    # NOTE: the module RNG was seeded once at import, so the iteration
    # order of these loops determines the augmented output — reordering
    # them would silently change the dataset.
    augmented_train = []
    augmented_valid = []

    for rec in all_train:
        msgs = augment_ready_stop(rec["messages"], stop_ratio=0.3)
        augmented_train.append({"messages": msgs})

    # Validation uses a higher stop ratio for better Stop-path coverage.
    for rec in all_valid:
        msgs = augment_ready_stop(rec["messages"], stop_ratio=0.5)
        augmented_valid.append({"messages": msgs})

    print(f"\nGolden (augmented): {len(augmented_train)} train, {len(augmented_valid)} valid")

    # Count Ready vs Stop: scan the last three turns for a user message
    # containing a stop-signal keyword.
    stop_count = sum(
        1 for r in augmented_train
        if len(r["messages"]) > 2
        and any(
            m["role"] == "user" and any(s in m["content"].lower() for s in ["stop", "enough", "pause"])
            for m in r["messages"][-3:]
        )
    )
    print(f"  Ready path: {len(augmented_train) - stop_count}")
    print(f"  Stop path: {stop_count}")

    # 5. Write golden data
    write_jsonl(os.path.join(GOLDEN_DIR, "train.jsonl"), augmented_train)
    write_jsonl(os.path.join(GOLDEN_DIR, "valid.jsonl"), augmented_valid)

    # 6. Load and write book seeds (needs distill)
    seeds = load_book_seeds()
    print(f"\nBook seeds (need distill): {len(seeds)} passages")
    write_jsonl(os.path.join(SEEDS_DIR, "allen-book.jsonl"), seeds)

    # 7. Write config describing the canonical lesson format and how the
    # training set scales across model sizes.
    config = {
        "format": "lesson",
        "description": "Canonical zen training data — lesson format + Ready/Stop gates",
        "turns": "6-8 per conversation",
        "pattern": [
            "user: Ready for lesson {ID}?",
            "assistant: Ready.",
            "user: Passage/context + reflection prompt",
            "assistant: Authentic reflection",
            "user: Deeper question",
            "assistant: Deeper response",
            "(optional) user: Stop signal",
            "(optional) assistant: Graceful close",
        ],
        "model_sizes": {
            "1b": {
                "train_examples": 80,
                "description": "Core lessons only — watts + composure + subset of expanded",
            },
            "4b": {
                "train_examples": len(augmented_train),
                "description": "Full golden set",
            },
            "27b": {
                "train_examples": len(augmented_train),
                "note": "Same dataset, same noise distribution as smaller models",
                "extra": "Add book seeds after distill for additional depth",
            },
        },
        "sources": {
            "lesson": f"{len(lesson_train)} train, {len(lesson_valid)} valid",
            "conv_converted": f"{len(conv_train)} train, {len(conv_valid)} valid",
            "book_seeds": f"{len(seeds)} passages (need distill)",
        },
    }

    config_path = os.path.join(BASE, "config.yaml")
    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    with open(config_path, "w") as f:
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)

    print(f"\nWritten:")
    print(f"  {GOLDEN_DIR}/train.jsonl ({len(augmented_train)} examples)")
    print(f"  {GOLDEN_DIR}/valid.jsonl ({len(augmented_valid)} examples)")
    print(f"  {SEEDS_DIR}/allen-book.jsonl ({len(seeds)} seeds)")
    print(f"  {config_path}")

    # Summary
    print(f"\n=== Model Size Scaling ===")
    print(f"  1B: ~80 examples (subset)")
    print(f"  4B: {len(augmented_train)} examples (full golden)")
    print(f"  27B: {len(augmented_train)} + {len(seeds)} book seeds after distill")
|
|
|
|
|
|
if __name__ == "__main__":  # script entry point; importing the module runs nothing
    main()
|