Replace passthrough() + stdlib flag.FlagSet anti-pattern with proper cobra integration. Every Run* function now takes a typed *Opts struct and returns error. Flags registered via cli.StringFlag/IntFlag/etc. Commands participate in Core lifecycle with full cobra flag parsing.

- 6 command groups: gen, score, data, export, infra, mon
- 25 commands converted, 0 passthrough() calls remain
- Delete passthrough() helper from lem.go
- Update export_test.go to use ExportOpts struct

Co-Authored-By: Virgil <virgil@lethean.io>
130 lines
3.6 KiB
Go
130 lines
3.6 KiB
Go
package lem
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// PublishOpts carries the settings for the HuggingFace publish command.
type PublishOpts struct {
	// Input is the directory containing the Parquet files. It is required.
	Input string

	// Repo is the HuggingFace dataset repo ID to publish to.
	Repo string

	// Public requests a public dataset rather than a private one.
	Public bool

	// Token is the HuggingFace API token. When empty, the HF_TOKEN
	// environment variable is used instead.
	Token string

	// DryRun shows what would be uploaded without uploading anything.
	DryRun bool
}
|
|
|
|
// RunPublish is the CLI entry point for the publish command.
|
|
// Pushes Parquet files and an optional dataset card to HuggingFace.
|
|
func RunPublish(cfg PublishOpts) error {
|
|
if cfg.Input == "" {
|
|
return fmt.Errorf("--input is required (directory with Parquet files)")
|
|
}
|
|
|
|
hfToken := cfg.Token
|
|
if hfToken == "" {
|
|
hfToken = os.Getenv("HF_TOKEN")
|
|
}
|
|
if hfToken == "" {
|
|
home, err := os.UserHomeDir()
|
|
if err == nil {
|
|
data, err := os.ReadFile(filepath.Join(home, ".huggingface", "token"))
|
|
if err == nil {
|
|
hfToken = strings.TrimSpace(string(data))
|
|
}
|
|
}
|
|
}
|
|
|
|
if hfToken == "" && !cfg.DryRun {
|
|
return fmt.Errorf("HuggingFace token required (--token, HF_TOKEN env, or ~/.huggingface/token)")
|
|
}
|
|
|
|
splits := []string{"train", "valid", "test"}
|
|
type uploadEntry struct {
|
|
local string
|
|
remote string
|
|
}
|
|
var filesToUpload []uploadEntry
|
|
|
|
for _, split := range splits {
|
|
path := filepath.Join(cfg.Input, split+".parquet")
|
|
if _, err := os.Stat(path); os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
filesToUpload = append(filesToUpload, uploadEntry{path, fmt.Sprintf("data/%s.parquet", split)})
|
|
}
|
|
|
|
// Check for dataset card in parent directory.
|
|
cardPath := filepath.Join(cfg.Input, "..", "dataset_card.md")
|
|
if _, err := os.Stat(cardPath); err == nil {
|
|
filesToUpload = append(filesToUpload, uploadEntry{cardPath, "README.md"})
|
|
}
|
|
|
|
if len(filesToUpload) == 0 {
|
|
return fmt.Errorf("no Parquet files found in input directory")
|
|
}
|
|
|
|
if cfg.DryRun {
|
|
fmt.Printf("Dry run: would publish to %s\n", cfg.Repo)
|
|
if cfg.Public {
|
|
fmt.Println(" Visibility: public")
|
|
} else {
|
|
fmt.Println(" Visibility: private")
|
|
}
|
|
for _, f := range filesToUpload {
|
|
info, _ := os.Stat(f.local)
|
|
sizeMB := float64(info.Size()) / 1024 / 1024
|
|
fmt.Printf(" %s → %s (%.1f MB)\n", filepath.Base(f.local), f.remote, sizeMB)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
fmt.Printf("Publishing to https://huggingface.co/datasets/%s\n", cfg.Repo)
|
|
|
|
for _, f := range filesToUpload {
|
|
if err := uploadFileToHF(hfToken, cfg.Repo, f.local, f.remote); err != nil {
|
|
return fmt.Errorf("upload %s: %w", f.local, err)
|
|
}
|
|
fmt.Printf(" Uploaded %s → %s\n", filepath.Base(f.local), f.remote)
|
|
}
|
|
|
|
fmt.Printf("\nPublished to https://huggingface.co/datasets/%s\n", cfg.Repo)
|
|
return nil
|
|
}
|
|
|
|
// uploadFileToHF sends one local file to a HuggingFace dataset repo through
// the Hub API, using an authenticated HTTP PUT to the repo's main branch.
//
// token is sent as a Bearer credential, repoID and remotePath are inserted
// into the upload URL as given, and localPath is read fully into memory
// before the request is made. The request is bounded by a two-minute client
// timeout. Any response status of 300 or above is returned as an error that
// includes the response body.
func uploadFileToHF(token, repoID, localPath, remotePath string) error {
	payload, readErr := os.ReadFile(localPath)
	if readErr != nil {
		return fmt.Errorf("read %s: %w", localPath, readErr)
	}

	endpoint := fmt.Sprintf("https://huggingface.co/api/datasets/%s/upload/main/%s", repoID, remotePath)
	req, reqErr := http.NewRequest(http.MethodPut, endpoint, bytes.NewReader(payload))
	if reqErr != nil {
		return fmt.Errorf("create request: %w", reqErr)
	}
	req.Header.Set("Content-Type", "application/octet-stream")
	req.Header.Set("Authorization", "Bearer "+token)

	httpClient := &http.Client{Timeout: 2 * time.Minute}
	resp, doErr := httpClient.Do(req)
	if doErr != nil {
		return fmt.Errorf("upload request: %w", doErr)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 300 {
		return nil
	}

	// The body is only extra detail for the error message, so a failure to
	// read it is deliberately ignored.
	detail, _ := io.ReadAll(resp.Body)
	return fmt.Errorf("upload failed: HTTP %d: %s", resp.StatusCode, string(detail))
}
|