- `lem parquet` — export JSONL training splits to Parquet (parquet-go) - `lem publish` — push Parquet files to HuggingFace dataset repo - `lem metrics` — push DuckDB golden set stats to InfluxDB - `lem convert` — MLX LoRA adapter → HuggingFace PEFT format (pure Go safetensors read/write/transpose, no PyTorch needed) Dependencies added: parquet-go, go-huggingface, go-rocm, go-pytorch, gotch Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
138 lines
3.8 KiB
Go
138 lines
3.8 KiB
Go
package lem
|
|
|
|
import (
|
|
"bytes"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// RunPublish is the CLI entry point for the publish command.
|
|
// Pushes Parquet files and an optional dataset card to HuggingFace.
|
|
func RunPublish(args []string) {
|
|
fs := flag.NewFlagSet("publish", flag.ExitOnError)
|
|
|
|
inputDir := fs.String("input", "", "Directory containing Parquet files (required)")
|
|
repoID := fs.String("repo", "lthn/LEM-golden-set", "HuggingFace dataset repo ID")
|
|
public := fs.Bool("public", false, "Make dataset public")
|
|
token := fs.String("token", "", "HuggingFace API token (defaults to HF_TOKEN env)")
|
|
dryRun := fs.Bool("dry-run", false, "Show what would be uploaded without uploading")
|
|
|
|
if err := fs.Parse(args); err != nil {
|
|
log.Fatalf("parse flags: %v", err)
|
|
}
|
|
|
|
if *inputDir == "" {
|
|
fmt.Fprintln(os.Stderr, "error: --input is required (directory with Parquet files)")
|
|
fs.Usage()
|
|
os.Exit(1)
|
|
}
|
|
|
|
hfToken := *token
|
|
if hfToken == "" {
|
|
hfToken = os.Getenv("HF_TOKEN")
|
|
}
|
|
if hfToken == "" {
|
|
home, err := os.UserHomeDir()
|
|
if err == nil {
|
|
data, err := os.ReadFile(filepath.Join(home, ".huggingface", "token"))
|
|
if err == nil {
|
|
hfToken = strings.TrimSpace(string(data))
|
|
}
|
|
}
|
|
}
|
|
|
|
if hfToken == "" && !*dryRun {
|
|
fmt.Fprintln(os.Stderr, "error: HuggingFace token required (--token, HF_TOKEN env, or ~/.huggingface/token)")
|
|
os.Exit(1)
|
|
}
|
|
|
|
splits := []string{"train", "valid", "test"}
|
|
type uploadEntry struct {
|
|
local string
|
|
remote string
|
|
}
|
|
var filesToUpload []uploadEntry
|
|
|
|
for _, split := range splits {
|
|
path := filepath.Join(*inputDir, split+".parquet")
|
|
if _, err := os.Stat(path); os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
filesToUpload = append(filesToUpload, uploadEntry{path, fmt.Sprintf("data/%s.parquet", split)})
|
|
}
|
|
|
|
// Check for dataset card in parent directory.
|
|
cardPath := filepath.Join(*inputDir, "..", "dataset_card.md")
|
|
if _, err := os.Stat(cardPath); err == nil {
|
|
filesToUpload = append(filesToUpload, uploadEntry{cardPath, "README.md"})
|
|
}
|
|
|
|
if len(filesToUpload) == 0 {
|
|
fmt.Fprintln(os.Stderr, "error: no Parquet files found in input directory")
|
|
os.Exit(1)
|
|
}
|
|
|
|
if *dryRun {
|
|
fmt.Printf("Dry run: would publish to %s\n", *repoID)
|
|
if *public {
|
|
fmt.Println(" Visibility: public")
|
|
} else {
|
|
fmt.Println(" Visibility: private")
|
|
}
|
|
for _, f := range filesToUpload {
|
|
info, _ := os.Stat(f.local)
|
|
sizeMB := float64(info.Size()) / 1024 / 1024
|
|
fmt.Printf(" %s → %s (%.1f MB)\n", filepath.Base(f.local), f.remote, sizeMB)
|
|
}
|
|
return
|
|
}
|
|
|
|
fmt.Printf("Publishing to https://huggingface.co/datasets/%s\n", *repoID)
|
|
|
|
for _, f := range filesToUpload {
|
|
if err := uploadFileToHF(hfToken, *repoID, f.local, f.remote); err != nil {
|
|
log.Fatalf("upload %s: %v", f.local, err)
|
|
}
|
|
fmt.Printf(" Uploaded %s → %s\n", filepath.Base(f.local), f.remote)
|
|
}
|
|
|
|
fmt.Printf("\nPublished to https://huggingface.co/datasets/%s\n", *repoID)
|
|
}
|
|
|
|
// uploadFileToHF uploads a file to a HuggingFace dataset repo via the Hub API.
//
// The whole of localPath is read into memory and sent with a PUT request to
// https://huggingface.co/api/datasets/<repoID>/upload/main/<remotePath>,
// using token as a bearer credential. repoID and remotePath are placed in the
// URL as given, without escaping.
//
// It returns nil when the response status is below 300. Otherwise it returns
// an error describing the step that failed; for a rejected upload the error
// carries the HTTP status and at most the first 4 KiB of the response body.
func uploadFileToHF(token, repoID, localPath, remotePath string) error {
	// Upper bound on how much of any response body is read, so a large error
	// page cannot balloon the error message or stall the drain below.
	const maxBodyBytes = 4 << 10

	data, err := os.ReadFile(localPath)
	if err != nil {
		return fmt.Errorf("read %s: %w", localPath, err)
	}

	url := fmt.Sprintf("https://huggingface.co/api/datasets/%s/upload/main/%s", repoID, remotePath)

	req, err := http.NewRequest(http.MethodPut, url, bytes.NewReader(data))
	if err != nil {
		return fmt.Errorf("create request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+token)
	req.Header.Set("Content-Type", "application/octet-stream")

	// Client.Timeout bounds the entire exchange, including sending the file
	// and reading the response.
	client := &http.Client{Timeout: 120 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("upload request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		body, readErr := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes))
		if readErr != nil {
			// Still report the status, which is the important part, together
			// with whatever was read and why the rest is missing.
			return fmt.Errorf("upload failed: HTTP %d: %s (reading response body: %v)", resp.StatusCode, string(body), readErr)
		}
		return fmt.Errorf("upload failed: HTTP %d: %s", resp.StatusCode, string(body))
	}

	// Drain what is left of the body so the shared default transport can reuse
	// the connection for the next upload. The upload already succeeded, so a
	// failure while draining is not worth reporting.
	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxBodyBytes))

	return nil
}
|