LEM/scripts/export_parquet.py
#!/usr/bin/env python3
"""
Export LEM training data to Parquet format for HuggingFace datasets.
Reads JSONL training splits and outputs Parquet files with proper schema
for HuggingFace's dataset viewer.
Usage:
python3 scripts/export_parquet.py # export all splits
python3 scripts/export_parquet.py --output ./parquet # custom output dir
"""
import argparse
import json
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parent.parent
TRAINING_DIR = REPO_ROOT / "training"
DEFAULT_OUTPUT = TRAINING_DIR / "parquet"
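# Assumed layout (inferred from the constants above): input splits at
# <repo>/training/{train,valid,test}.jsonl; Parquet output defaults to
# <repo>/training/parquet/.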
def export_split(jsonl_path, output_dir):
    """Convert one JSONL split into a Parquet file in output_dir."""
    import pyarrow as pa
    import pyarrow.parquet as pq

    split = jsonl_path.stem  # train, valid, test
    rows = []
    with open(jsonl_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            msgs = data.get("messages", [])
            # Take the first message of each role; default to "" if absent.
            prompt = next((m["content"] for m in msgs if m["role"] == "user"), "")
            response = next((m["content"] for m in msgs if m["role"] == "assistant"), "")
            system = next((m["content"] for m in msgs if m["role"] == "system"), "")
            rows.append({
                "prompt": prompt,
                "response": response,
                "system": system,
                "messages": json.dumps(msgs),  # full conversation, serialized
            })
    if not rows:
        print(f" Skip: {split} — no data")
        return
    # Explicit string columns give the HuggingFace dataset viewer a stable schema.
    table = pa.table({
        "prompt": pa.array([r["prompt"] for r in rows], type=pa.string()),
        "response": pa.array([r["response"] for r in rows], type=pa.string()),
        "system": pa.array([r["system"] for r in rows], type=pa.string()),
        "messages": pa.array([r["messages"] for r in rows], type=pa.string()),
    })
    output_path = output_dir / f"{split}.parquet"
    pq.write_table(table, output_path, compression="snappy")
    size_mb = output_path.stat().st_size / 1024 / 1024
    print(f" {split}.parquet: {len(rows)} rows ({size_mb:.1f} MB)")
def main():
    parser = argparse.ArgumentParser(description="Export LEM training data to Parquet")
    parser.add_argument("--output", default=None, help="Output directory")
    parser.add_argument("--training-dir", default=None, help="Training data directory")
    args = parser.parse_args()
    try:
        import pyarrow  # noqa: F401 (availability check only)
    except ImportError:
        print("Error: pyarrow is required (pip install pyarrow)")
        sys.exit(1)
    training_dir = Path(args.training_dir) if args.training_dir else TRAINING_DIR
    output_dir = Path(args.output) if args.output else DEFAULT_OUTPUT
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Exporting Parquet from {training_dir} to {output_dir}")
    for split in ["train", "valid", "test"]:
        jsonl_path = training_dir / f"{split}.jsonl"
        if jsonl_path.exists():
            export_split(jsonl_path, output_dir)
        else:
            print(f" Skip: {split}.jsonl not found")
    print("Done.")

if __name__ == "__main__":
    main()