LEM/pkg/lem/publish.go
Snider 56eda1a081 refactor: migrate all 25 commands from passthrough to cobra framework
Replace passthrough() + stdlib flag.FlagSet anti-pattern with proper
cobra integration. Every Run* function now takes a typed *Opts struct
and returns error. Flags registered via cli.StringFlag/IntFlag/etc.
Commands participate in Core lifecycle with full cobra flag parsing.

- 6 command groups: gen, score, data, export, infra, mon
- 25 commands converted, 0 passthrough() calls remain
- Delete passthrough() helper from lem.go
- Update export_test.go to use ExportOpts struct

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-23 03:32:53 +00:00

130 lines
3.6 KiB
Go

package lem
import (
"bytes"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strings"
"time"
)
// PublishOpts carries the settings for the HuggingFace publish command.
type PublishOpts struct {
	// Input is the directory containing the Parquet files (required).
	Input string
	// Repo is the HuggingFace dataset repo ID.
	Repo string
	// Public requests that the dataset be made public.
	Public bool
	// Token is the HuggingFace API token. When empty, RunPublish falls back
	// to the HF_TOKEN environment variable and then ~/.huggingface/token.
	Token string
	// DryRun shows what would be uploaded without uploading.
	DryRun bool
}
// RunPublish is the CLI entry point for the publish command.
//
// It looks for train.parquet, valid.parquet and test.parquet inside cfg.Input
// and uploads every one it finds to data/<split>.parquet in the cfg.Repo
// dataset repository. If dataset_card.md exists in the parent directory of
// cfg.Input it is uploaded as README.md as well.
//
// The API token is resolved in order from cfg.Token, the HF_TOKEN environment
// variable, and ~/.huggingface/token. A token is required unless cfg.DryRun
// is set.
//
// With cfg.DryRun set, the function only prints the target repo, the
// requested visibility and the files (with sizes) that would be uploaded.
//
// NOTE(review): cfg.Public is only read for the dry-run output here; the
// upload path below does not use it.
//
// An error is returned when cfg.Input is empty, when no token can be found
// for a real run, when a split file exists but cannot be inspected, when no
// Parquet split file is present, or when an upload fails.
func RunPublish(cfg PublishOpts) error {
	if cfg.Input == "" {
		return fmt.Errorf("--input is required (directory with Parquet files)")
	}

	hfToken := cfg.Token
	if hfToken == "" {
		hfToken = os.Getenv("HF_TOKEN")
	}
	if hfToken == "" {
		// Best effort: a missing home directory or token file simply leaves
		// the token empty and is reported by the check below.
		if home, err := os.UserHomeDir(); err == nil {
			if data, err := os.ReadFile(filepath.Join(home, ".huggingface", "token")); err == nil {
				hfToken = strings.TrimSpace(string(data))
			}
		}
	}
	if hfToken == "" && !cfg.DryRun {
		return fmt.Errorf("HuggingFace token required (--token, HF_TOKEN env, or ~/.huggingface/token)")
	}

	type uploadEntry struct {
		local  string
		remote string
		size   int64 // recorded at discovery so the dry run needs no second Stat
	}
	var filesToUpload []uploadEntry

	for _, split := range []string{"train", "valid", "test"} {
		path := filepath.Join(cfg.Input, split+".parquet")
		info, err := os.Stat(path)
		if err != nil {
			if os.IsNotExist(err) {
				continue
			}
			// Anything other than "not found" (permissions, input is not a
			// directory, ...) is reported instead of queueing a file that
			// cannot be read.
			return fmt.Errorf("stat %s: %w", path, err)
		}
		filesToUpload = append(filesToUpload, uploadEntry{
			local:  path,
			remote: fmt.Sprintf("data/%s.parquet", split),
			size:   info.Size(),
		})
	}

	// Checked before the dataset card is added so that a card on its own
	// does not count as publishable data.
	if len(filesToUpload) == 0 {
		return fmt.Errorf("no Parquet files found in input directory")
	}

	// Check for dataset card in parent directory.
	cardPath := filepath.Join(cfg.Input, "..", "dataset_card.md")
	if info, err := os.Stat(cardPath); err == nil {
		filesToUpload = append(filesToUpload, uploadEntry{
			local:  cardPath,
			remote: "README.md",
			size:   info.Size(),
		})
	}

	if cfg.DryRun {
		fmt.Printf("Dry run: would publish to %s\n", cfg.Repo)
		if cfg.Public {
			fmt.Println("  Visibility: public")
		} else {
			fmt.Println("  Visibility: private")
		}
		for _, f := range filesToUpload {
			sizeMB := float64(f.size) / 1024 / 1024
			fmt.Printf("  %s → %s (%.1f MB)\n", filepath.Base(f.local), f.remote, sizeMB)
		}
		return nil
	}

	fmt.Printf("Publishing to https://huggingface.co/datasets/%s\n", cfg.Repo)
	for _, f := range filesToUpload {
		if err := uploadFileToHF(hfToken, cfg.Repo, f.local, f.remote); err != nil {
			return fmt.Errorf("upload %s: %w", f.local, err)
		}
		fmt.Printf("  Uploaded %s → %s\n", filepath.Base(f.local), f.remote)
	}
	fmt.Printf("\nPublished to https://huggingface.co/datasets/%s\n", cfg.Repo)
	return nil
}
// uploadFileToHF sends one local file to a HuggingFace dataset repo.
//
// The whole file is read into memory and PUT to
// https://huggingface.co/api/datasets/<repoID>/upload/main/<remotePath> with
// token as a bearer credential. Any response status of 300 or above is
// returned as an error that includes the response body.
func uploadFileToHF(token, repoID, localPath, remotePath string) error {
	payload, readErr := os.ReadFile(localPath)
	if readErr != nil {
		return fmt.Errorf("read %s: %w", localPath, readErr)
	}

	endpoint := "https://huggingface.co/api/datasets/" + repoID + "/upload/main/" + remotePath
	req, reqErr := http.NewRequest(http.MethodPut, endpoint, bytes.NewReader(payload))
	if reqErr != nil {
		return fmt.Errorf("create request: %w", reqErr)
	}
	req.Header.Set("Content-Type", "application/octet-stream")
	req.Header.Set("Authorization", "Bearer "+token)

	// The timeout covers the entire exchange, including sending the body.
	httpClient := &http.Client{Timeout: 2 * time.Minute}
	resp, doErr := httpClient.Do(req)
	if doErr != nil {
		return fmt.Errorf("upload request: %w", doErr)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 300 {
		return nil
	}

	// The body is only used to enrich the error message, so a failed read
	// just yields a shorter message.
	detail, _ := io.ReadAll(resp.Body)
	return fmt.Errorf("upload failed: HTTP %d: %s", resp.StatusCode, string(detail))
}