Compare commits
5 commits
main
...
feat-colle
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dad5cd2588 | ||
|
|
ef08cdf807 | ||
|
|
2f45e5806f | ||
|
|
0521d835e7 | ||
|
|
fd271f8bd9 |
4 changed files with 451 additions and 0 deletions
224
cmd/collect_batch.go
Normal file
224
cmd/collect_batch.go
Normal file
|
|
@ -0,0 +1,224 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/itchyny/gojq"
|
||||
"github.com/mattn/go-isatty"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// collectBatchCmd represents the collect batch command
|
||||
var collectBatchCmd = NewCollectBatchCmd()
|
||||
|
||||
func init() {
|
||||
GetCollectCmd().AddCommand(GetCollectBatchCmd())
|
||||
}
|
||||
|
||||
func GetCollectBatchCmd() *cobra.Command {
|
||||
return collectBatchCmd
|
||||
}
|
||||
|
||||
func NewCollectBatchCmd() *cobra.Command {
|
||||
collectBatchCmd := &cobra.Command{
|
||||
Use: "batch [file|-]",
|
||||
Short: "Batch collect from a list of URLs",
|
||||
Long: `Collect multiple resources from a list of URLs provided in a file or via stdin.`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
inputFile := args[0]
|
||||
outputDir, _ := cmd.Flags().GetString("output-dir")
|
||||
delay, _ := cmd.Flags().GetString("delay")
|
||||
skipExisting, _ := cmd.Flags().GetBool("continue")
|
||||
parallel, _ := cmd.Flags().GetInt("parallel")
|
||||
jqFilter, _ := cmd.Flags().GetString("jq")
|
||||
|
||||
delayDuration, err := time.ParseDuration(delay)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid delay duration: %w", err)
|
||||
}
|
||||
|
||||
var reader io.Reader
|
||||
|
||||
if inputFile == "-" {
|
||||
reader = os.Stdin
|
||||
} else {
|
||||
file, err := os.Open(inputFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error opening input file: %w", err)
|
||||
}
|
||||
defer file.Close()
|
||||
reader = file
|
||||
}
|
||||
|
||||
urls, err := readURLs(reader, jqFilter)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error reading urls: %w", err)
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
|
||||
return fmt.Errorf("error creating output directory: %w", err)
|
||||
}
|
||||
|
||||
urlsChan := make(chan string, len(urls))
|
||||
var wg sync.WaitGroup
|
||||
var bar *progressbar.ProgressBar
|
||||
var outMutex sync.Mutex
|
||||
|
||||
if isatty.IsTerminal(os.Stdout.Fd()) {
|
||||
bar = progressbar.Default(int64(len(urls)))
|
||||
}
|
||||
|
||||
for i := 0; i < parallel; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for u := range urlsChan {
|
||||
downloadURL(cmd, u, outputDir, skipExisting, delayDuration, bar, &outMutex)
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
for _, u := range urls {
|
||||
urlsChan <- u
|
||||
}
|
||||
close(urlsChan)
|
||||
|
||||
wg.Wait()
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
collectBatchCmd.Flags().IntP("parallel", "p", 1, "Number of concurrent downloads")
|
||||
collectBatchCmd.Flags().String("delay", "0s", "Delay between requests")
|
||||
collectBatchCmd.Flags().StringP("output-dir", "o", ".", "Base output directory")
|
||||
collectBatchCmd.Flags().Bool("continue", false, "Skip already collected files")
|
||||
collectBatchCmd.Flags().String("jq", "", "jq filter to extract URLs from JSON input")
|
||||
|
||||
return collectBatchCmd
|
||||
}
|
||||
|
||||
func downloadURL(cmd *cobra.Command, u, outputDir string, skipExisting bool, delayDuration time.Duration, bar *progressbar.ProgressBar, outMutex *sync.Mutex) {
|
||||
fileName, err := getFileNameFromURL(u)
|
||||
if err != nil {
|
||||
logMessage(cmd, fmt.Sprintf("Skipping invalid URL %s: %v", u, err), bar, outMutex)
|
||||
return
|
||||
}
|
||||
filePath := filepath.Join(outputDir, fileName)
|
||||
|
||||
if skipExisting {
|
||||
if _, err := os.Stat(filePath); err == nil {
|
||||
logMessage(cmd, fmt.Sprintf("Skipping already downloaded file: %s", filePath), bar, outMutex)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
resp, err := http.Get(u)
|
||||
if err != nil {
|
||||
logMessage(cmd, fmt.Sprintf("Error downloading %s: %v", u, err), bar, outMutex)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
out, err := os.Create(filePath)
|
||||
if err != nil {
|
||||
logMessage(cmd, fmt.Sprintf("Error creating file for %s: %v", u, err), bar, outMutex)
|
||||
return
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
_, err = io.Copy(out, resp.Body)
|
||||
if err != nil {
|
||||
logMessage(cmd, fmt.Sprintf("Error saving content for %s: %v", u, err), bar, outMutex)
|
||||
return
|
||||
}
|
||||
|
||||
logMessage(cmd, fmt.Sprintf("Downloaded %s to %s", u, filePath), bar, outMutex)
|
||||
|
||||
if delayDuration > 0 {
|
||||
time.Sleep(delayDuration)
|
||||
}
|
||||
}
|
||||
|
||||
func logMessage(cmd *cobra.Command, msg string, bar *progressbar.ProgressBar, outMutex *sync.Mutex) {
|
||||
if bar != nil {
|
||||
bar.Describe(msg)
|
||||
} else {
|
||||
outMutex.Lock()
|
||||
defer outMutex.Unlock()
|
||||
fmt.Fprintln(cmd.OutOrStdout(), msg)
|
||||
}
|
||||
}
|
||||
|
||||
func readURLs(reader io.Reader, jqFilter string) ([]string, error) {
|
||||
if jqFilter == "" {
|
||||
var urls []string
|
||||
scanner := bufio.NewScanner(reader)
|
||||
for scanner.Scan() {
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
if line != "" {
|
||||
urls = append(urls, line)
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return urls, nil
|
||||
}
|
||||
|
||||
query, err := gojq.Parse(jqFilter)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing jq filter: %w", err)
|
||||
}
|
||||
|
||||
var input interface{}
|
||||
decoder := json.NewDecoder(reader)
|
||||
if err := decoder.Decode(&input); err != nil {
|
||||
return nil, fmt.Errorf("error decoding json: %w", err)
|
||||
}
|
||||
|
||||
var urls []string
|
||||
iter := query.Run(input)
|
||||
for {
|
||||
v, ok := iter.Next()
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
if err, ok := v.(error); ok {
|
||||
return nil, fmt.Errorf("error executing jq filter: %w", err)
|
||||
}
|
||||
if s, ok := v.(string); ok {
|
||||
urls = append(urls, s)
|
||||
}
|
||||
}
|
||||
return urls, nil
|
||||
}
|
||||
|
||||
func getFileNameFromURL(rawURL string) (string, error) {
|
||||
parsedURL, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
|
||||
return "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme)
|
||||
}
|
||||
if parsedURL.Path == "" || parsedURL.Path == "/" {
|
||||
return "index.html", nil
|
||||
}
|
||||
return filepath.Base(parsedURL.Path), nil
|
||||
}
|
||||
220
cmd/collect_batch_test.go
Normal file
220
cmd/collect_batch_test.go
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestReadURLs(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
jqFilter string
|
||||
expected []string
|
||||
err bool
|
||||
}{
|
||||
{"Good", "url1\nurl2\nurl3", "", []string{"url1", "url2", "url3"}, false},
|
||||
{"EmptyInput", "", "", []string{}, false},
|
||||
{"WithEmptyLines", "url1\n\nurl2", "", []string{"url1", "url2"}, false},
|
||||
{"JSON", `{"urls": ["url1", "url2"]}`, ".urls[]", []string{"url1", "url2"}, false},
|
||||
{"InvalidJSON", "{", ".urls[]", nil, true},
|
||||
{"InvalidJQ", `{"urls": ["url1", "url2"]}`, ".[", nil, true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
reader := strings.NewReader(tt.input)
|
||||
urls, err := readURLs(reader, tt.jqFilter)
|
||||
if (err != nil) != tt.err {
|
||||
t.Fatalf("readURLs() error = %v, wantErr %v", err, tt.err)
|
||||
}
|
||||
if len(urls) != len(tt.expected) {
|
||||
t.Fatalf("expected %d urls, got %d", len(tt.expected), len(urls))
|
||||
}
|
||||
for i := range urls {
|
||||
if urls[i] != tt.expected[i] {
|
||||
t.Errorf("expected url %s, got %s", tt.expected[i], urls[i])
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetFileNameFromURL(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
rawURL string
|
||||
expected string
|
||||
err bool
|
||||
}{
|
||||
{"Good", "http://example.com/file.txt", "file.txt", false},
|
||||
{"NoPath", "http://example.com", "index.html", false},
|
||||
{"InvalidURL", "ftp://example.com/file.txt", "", true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
fileName, err := getFileNameFromURL(tt.rawURL)
|
||||
if (err != nil) != tt.err {
|
||||
t.Fatalf("getFileNameFromURL() error = %v, wantErr %v", err, tt.err)
|
||||
}
|
||||
if fileName != tt.expected {
|
||||
t.Errorf("expected filename %s, got %s", tt.expected, fileName)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectBatch_ParallelAndJSON(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
fmt.Fprintln(w, "Hello, client")
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
tempDir := t.TempDir()
|
||||
urlsFile := filepath.Join(tempDir, "urls.txt")
|
||||
err := os.WriteFile(urlsFile, []byte(server.URL+"/file1.txt\n"+server.URL+"/file2.txt"), 0644)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
jsonFile := filepath.Join(tempDir, "urls.json")
|
||||
err = os.WriteFile(jsonFile, []byte(`{"files": [{"url": "`+server.URL+`/file1.txt"}, {"url": "`+server.URL+`/file2.txt"}]}`), 0644)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Test with --parallel flag
|
||||
t.Run("Parallel", func(t *testing.T) {
|
||||
outputDir := t.TempDir()
|
||||
cmd := NewCollectBatchCmd()
|
||||
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir, "--parallel", "2"})
|
||||
err := cmd.Execute()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
for _, f := range []string{"file1.txt", "file2.txt"} {
|
||||
if _, err := os.Stat(filepath.Join(outputDir, f)); os.IsNotExist(err) {
|
||||
t.Errorf("expected file %s to be created", f)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Test with --jq flag
|
||||
t.Run("JQ", func(t *testing.T) {
|
||||
outputDir := t.TempDir()
|
||||
cmd := NewCollectBatchCmd()
|
||||
cmd.SetArgs([]string{jsonFile, "--output-dir", outputDir, "--jq", ".files[].url"})
|
||||
err := cmd.Execute()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
for _, f := range []string{"file1.txt", "file2.txt"} {
|
||||
if _, err := os.Stat(filepath.Join(outputDir, f)); os.IsNotExist(err) {
|
||||
t.Errorf("expected file %s to be created", f)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestCollectBatch_Sequential(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
fmt.Fprintln(w, "Hello, client")
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
tempDir := t.TempDir()
|
||||
urlsFile := filepath.Join(tempDir, "urls.txt")
|
||||
err := os.WriteFile(urlsFile, []byte(server.URL+"/file1.txt\n"+server.URL+"/file2.txt"), 0644)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Test with file input
|
||||
t.Run("FromFile", func(t *testing.T) {
|
||||
outputDir := t.TempDir()
|
||||
cmd := NewCollectBatchCmd()
|
||||
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir})
|
||||
err := cmd.Execute()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
for _, f := range []string{"file1.txt", "file2.txt"} {
|
||||
if _, err := os.Stat(filepath.Join(outputDir, f)); os.IsNotExist(err) {
|
||||
t.Errorf("expected file %s to be created", f)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
// Test with stdin
|
||||
t.Run("FromStdin", func(t *testing.T) {
|
||||
outputDir := t.TempDir()
|
||||
cmd := NewCollectBatchCmd()
|
||||
cmd.SetArgs([]string{"-", "--output-dir", outputDir})
|
||||
|
||||
oldStdin := os.Stdin
|
||||
defer func() { os.Stdin = oldStdin }()
|
||||
r, w, _ := os.Pipe()
|
||||
os.Stdin = r
|
||||
go func() {
|
||||
defer w.Close()
|
||||
w.Write([]byte(server.URL + "/file1.txt\n"))
|
||||
}()
|
||||
|
||||
err := cmd.Execute()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(outputDir, "file1.txt")); os.IsNotExist(err) {
|
||||
t.Error("expected file file1.txt to be created")
|
||||
}
|
||||
})
|
||||
|
||||
// Test --continue flag
|
||||
t.Run("Continue", func(t *testing.T) {
|
||||
outputDir := t.TempDir()
|
||||
cmd := NewCollectBatchCmd()
|
||||
|
||||
// First run
|
||||
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir})
|
||||
err := cmd.Execute()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error on first run: %v", err)
|
||||
}
|
||||
|
||||
// Second run with --continue
|
||||
var out bytes.Buffer
|
||||
cmd.SetOut(&out)
|
||||
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir, "--continue"})
|
||||
err = cmd.Execute()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error on second run: %v", err)
|
||||
}
|
||||
if !strings.Contains(out.String(), "Skipping already downloaded file") {
|
||||
t.Error("expected to see skip message")
|
||||
}
|
||||
})
|
||||
|
||||
// Test --delay flag
|
||||
t.Run("Delay", func(t *testing.T) {
|
||||
outputDir := t.TempDir()
|
||||
cmd := NewCollectBatchCmd()
|
||||
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir, "--delay", "100ms"})
|
||||
start := time.Now()
|
||||
err := cmd.Execute()
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
duration := time.Since(start)
|
||||
if duration < 200*time.Millisecond {
|
||||
t.Errorf("expected delay to be at least 200ms, got %v", duration)
|
||||
}
|
||||
})
|
||||
}
|
||||
2
go.mod
2
go.mod
|
|
@ -35,6 +35,8 @@ require (
|
|||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/gorilla/websocket v1.5.3 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/itchyny/gojq v0.12.18 // indirect
|
||||
github.com/itchyny/timefmt-go v0.1.7 // indirect
|
||||
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
|
||||
github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e // indirect
|
||||
github.com/kevinburke/ssh_config v1.2.0 // indirect
|
||||
|
|
|
|||
5
go.sum
5
go.sum
|
|
@ -61,6 +61,10 @@ github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aN
|
|||
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
|
||||
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
|
||||
github.com/itchyny/gojq v0.12.18 h1:gFGHyt/MLbG9n6dqnvlliiya2TaMMh6FFaR2b1H6Drc=
|
||||
github.com/itchyny/gojq v0.12.18/go.mod h1:4hPoZ/3lN9fDL1D+aK7DY1f39XZpY9+1Xpjz8atrEkg=
|
||||
github.com/itchyny/timefmt-go v0.1.7 h1:xyftit9Tbw+Dc/huSSPJaEmX1TVL8lw5vxjJLK4GMMA=
|
||||
github.com/itchyny/timefmt-go v0.1.7/go.mod h1:5E46Q+zj7vbTgWY8o5YkMeYb4I6GeWLFnetPy5oBrAI=
|
||||
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A=
|
||||
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo=
|
||||
github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e h1:Q3+PugElBCf4PFpxhErSzU3/PY5sFL5Z6rfv4AbGAck=
|
||||
|
|
@ -100,6 +104,7 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE
|
|||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
|
||||
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
|
||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
|
||||
github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k=
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue