Automated fixes: interface{} → any, range-over-int, t.Context(),
wg.Go(), strings.SplitSeq, strings.Builder, slices.Contains,
maps helpers, min/max builtins.
Co-Authored-By: Virgil <virgil@lethean.io>
147 lines
4.7 KiB
Python
147 lines
4.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Augment zen training data with Ready/Stop lesson gates.
|
|
|
|
Adds a closing Ready/Stop gate to each conversation; the assistant's offer to continue or stop is appended to its final message rather than added as a separate turn.
|
|
Creates both paths:
|
|
- ~70% end with the offer (Ready path learned from existing openers)
|
|
- ~30% extend with user "Stop" + assistant graceful close
|
|
|
|
Only augments conversations with more than two messages that end on an assistant message.
|
|
|
|
Usage:
|
|
python3 scripts/augment_ready_stop.py
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import random
|
|
|
|
# Seed the module-level RNG once at import so phrase selection is repeatable.
random.seed(42)

# Input lesson tree and the sibling directory that receives augmented copies.
ZEN_DIR: str = "training/lem/zen/lessons"
OUT_DIR: str = "training/lem/zen/lessons-augmented"

# Assistant offers: varied wordings of "continue or stop?".
OFFERS: list[str] = [
    "Ready for the next, or shall we pause here?",
    "Want to continue, or is this a good place to stop?",
    "Shall we move on, or sit with this for a while?",
    "Another lesson, or would you prefer to stop here?",
    "Ready for more, or shall we leave it here?",
    "Continue, or let this one settle?",
    "Next lesson, or is this enough for now?",
    "Shall I go on, or would you rather stop here?",
]

# User stop signals: varied ways of asking to end the session.
STOPS: list[str] = [
    "Stop.",
    "That's enough for now.",
    "Let's stop here.",
    "I'd like to sit with this.",
    "Enough for today.",
    "Let's pause here.",
    "I want to stop here.",
    "That's good. Stop.",
]

# Assistant closes: short, warm replies to a stop signal, with no pressure
# to continue.
CLOSES: list[str] = [
    "Take your time with it. There's no rush.",
    "Good. Let it settle.",
    "Rest with it. We'll pick up when you're ready.",
    "Understood. What was shared stays with you.",
    "Good place to stop. It'll keep working in the background.",
    "Noted. Come back when it feels right.",
    "That's wise. Some things need space, not more words.",
    "Take what landed and leave the rest. No hurry.",
]
|
|
|
|
|
|
def augment_conversation(msgs: list[dict], stop_ratio: float = 0.3) -> list[dict]:
    """Add a Ready/Stop gate to the end of a multi-turn conversation.

    Conversations with two or fewer messages, or whose final message is not
    from the assistant, are returned unchanged (the same list object).
    Otherwise a shallow copy of the list is extended along one of two paths:

    * Stop path (taken with probability ``stop_ratio``): a user stop
      message and an assistant closing message are appended as new turns.
    * Ready path: an offer to continue or stop is appended to the final
      assistant message, separated from it by a blank line.

    Args:
        msgs: Chat messages, each a dict with "role" and "content" keys.
        stop_ratio: Probability of taking the Stop path.

    Returns:
        The augmented message list. The input list is not modified.

    Raises:
        KeyError: If the final message lacks a "role" key (or, on the Ready
            path, a "content" key).
    """
    # Leave short conversations, and ones not ending on the assistant, as-is.
    if len(msgs) <= 2 or msgs[-1]["role"] != "assistant":
        return msgs

    augmented = list(msgs)

    if random.random() < stop_ratio:
        # Stop path: the user stops, the assistant closes gracefully.
        stop = random.choice(STOPS)
        close = random.choice(CLOSES)
        augmented.append({"role": "user", "content": stop})
        augmented.append({"role": "assistant", "content": close})
    else:
        # Ready path: fold the offer into the last assistant message. The
        # message dict is replaced rather than edited in place so the dict
        # held by the caller's list stays untouched; only "role" and
        # "content" are carried into the replacement.
        offer = random.choice(OFFERS)
        last_content = augmented[-1]["content"]
        augmented[-1] = {
            "role": "assistant",
            "content": f"{last_content}\n\n{offer}",
        }

    return augmented
|
|
|
|
|
|
def process_file(input_path: str, output_path: str, stop_ratio: float = 0.3) -> int:
    """Augment every conversation in one JSONL file and write the result.

    Each non-blank input line is parsed as a JSON object whose "messages"
    list is passed through ``augment_conversation``. Output records contain
    only the "messages" key; any other fields on the input object are not
    carried over. The whole input is read before the output is opened.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path of the JSONL file to write. Missing parent
            directories are created; a bare filename is written to the
            current directory.
        stop_ratio: Forwarded to ``augment_conversation``.

    Returns:
        The number of records written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "messages" key.
    """
    records = []
    # Read as UTF-8 explicitly instead of relying on the platform locale.
    with open(input_path, encoding="utf-8") as src:
        for raw_line in src:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            messages = json.loads(raw_line)["messages"]
            records.append({"messages": augment_conversation(messages, stop_ratio)})

    # dirname is "" for a bare filename, and os.makedirs("") raises.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the output
    # encoding must be pinned to UTF-8 as well.
    with open(output_path, "w", encoding="utf-8") as dst:
        dst.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
        )

    return len(records)
|
|
|
|
|
|
def main():
    """Augment every ``.jsonl`` file one level below ``ZEN_DIR``.

    Each subdirectory of ``ZEN_DIR`` is mirrored under ``OUT_DIR``. Files
    whose name contains "valid" or "test" use a stop ratio of 0.5; all
    others use 0.3. A line is printed per file, then a grand total.
    """
    total = 0

    for lesson_set in sorted(os.listdir(ZEN_DIR)):
        lesson_dir = os.path.join(ZEN_DIR, lesson_set)
        # Skip plain files and any directory named like the output tree.
        if not os.path.isdir(lesson_dir) or lesson_set == "lessons-augmented":
            continue

        jsonl_names = [
            name for name in sorted(os.listdir(lesson_dir)) if name.endswith(".jsonl")
        ]
        for name in jsonl_names:
            src = os.path.join(lesson_dir, name)
            dst = os.path.join(OUT_DIR, lesson_set, name)

            # Validation/test splits get more Stop examples so stop
            # behavior is exercised more heavily there.
            is_eval_split = any(tag in name for tag in ("valid", "test"))
            ratio = 0.5 if is_eval_split else 0.3

            count = process_file(src, dst, stop_ratio=ratio)
            total += count
            print(f" {count:>4} examples {src} → {dst}")

    print(f"\nTotal: {total} augmented examples")


if __name__ == "__main__":
    main()
|