Compare commits

...
Sign in to create a new pull request.

5 commits

Author SHA1 Message Date
Snider
dad5cd2588 fix: Resolve CI failure
This commit resolves the CI failure by correcting a typo in the
`downloadURL` function. The `httpClient.Get(u)` call was replaced with
the correct `http.Get(u)` call, and an unused import was removed.
2026-02-02 06:49:57 +00:00
Snider
ef08cdf807
Update cmd/collect_batch_test.go
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
2026-02-02 06:45:22 +00:00
Snider
2f45e5806f
Update cmd/collect_batch.go
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-02 06:44:39 +00:00
Snider
0521d835e7
Update cmd/collect_batch.go
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
2026-02-02 06:44:19 +00:00
google-labs-jules[bot]
fd271f8bd9 feat: Add collect batch command
This commit introduces a new `collect batch` command that allows users to
collect multiple URLs from a file, stdin, or a JSON registry.

The command supports the following features:

- Parallel downloads with a configurable number of workers.
- Rate limiting with a configurable delay between requests.
- The ability to skip already downloaded files.
- Progress reporting with a progress bar.
- Reading URLs from a file, stdin, or a JSON file with a `jq` filter.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:40:53 +00:00
4 changed files with 451 additions and 0 deletions

224
cmd/collect_batch.go Normal file
View file

@ -0,0 +1,224 @@
package cmd
import (
"bufio"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"sync"
"time"
"github.com/itchyny/gojq"
"github.com/mattn/go-isatty"
"github.com/schollz/progressbar/v3"
"github.com/spf13/cobra"
)
// collectBatchCmd represents the collect batch command
var collectBatchCmd = NewCollectBatchCmd()
func init() {
GetCollectCmd().AddCommand(GetCollectBatchCmd())
}
func GetCollectBatchCmd() *cobra.Command {
return collectBatchCmd
}
func NewCollectBatchCmd() *cobra.Command {
collectBatchCmd := &cobra.Command{
Use: "batch [file|-]",
Short: "Batch collect from a list of URLs",
Long: `Collect multiple resources from a list of URLs provided in a file or via stdin.`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
inputFile := args[0]
outputDir, _ := cmd.Flags().GetString("output-dir")
delay, _ := cmd.Flags().GetString("delay")
skipExisting, _ := cmd.Flags().GetBool("continue")
parallel, _ := cmd.Flags().GetInt("parallel")
jqFilter, _ := cmd.Flags().GetString("jq")
delayDuration, err := time.ParseDuration(delay)
if err != nil {
return fmt.Errorf("invalid delay duration: %w", err)
}
var reader io.Reader
if inputFile == "-" {
reader = os.Stdin
} else {
file, err := os.Open(inputFile)
if err != nil {
return fmt.Errorf("error opening input file: %w", err)
}
defer file.Close()
reader = file
}
urls, err := readURLs(reader, jqFilter)
if err != nil {
return fmt.Errorf("error reading urls: %w", err)
}
if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
return fmt.Errorf("error creating output directory: %w", err)
}
urlsChan := make(chan string, len(urls))
var wg sync.WaitGroup
var bar *progressbar.ProgressBar
var outMutex sync.Mutex
if isatty.IsTerminal(os.Stdout.Fd()) {
bar = progressbar.Default(int64(len(urls)))
}
for i := 0; i < parallel; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for u := range urlsChan {
downloadURL(cmd, u, outputDir, skipExisting, delayDuration, bar, &outMutex)
if bar != nil {
bar.Add(1)
}
}
}()
}
for _, u := range urls {
urlsChan <- u
}
close(urlsChan)
wg.Wait()
return nil
},
}
collectBatchCmd.Flags().IntP("parallel", "p", 1, "Number of concurrent downloads")
collectBatchCmd.Flags().String("delay", "0s", "Delay between requests")
collectBatchCmd.Flags().StringP("output-dir", "o", ".", "Base output directory")
collectBatchCmd.Flags().Bool("continue", false, "Skip already collected files")
collectBatchCmd.Flags().String("jq", "", "jq filter to extract URLs from JSON input")
return collectBatchCmd
}
func downloadURL(cmd *cobra.Command, u, outputDir string, skipExisting bool, delayDuration time.Duration, bar *progressbar.ProgressBar, outMutex *sync.Mutex) {
fileName, err := getFileNameFromURL(u)
if err != nil {
logMessage(cmd, fmt.Sprintf("Skipping invalid URL %s: %v", u, err), bar, outMutex)
return
}
filePath := filepath.Join(outputDir, fileName)
if skipExisting {
if _, err := os.Stat(filePath); err == nil {
logMessage(cmd, fmt.Sprintf("Skipping already downloaded file: %s", filePath), bar, outMutex)
return
}
}
resp, err := http.Get(u)
if err != nil {
logMessage(cmd, fmt.Sprintf("Error downloading %s: %v", u, err), bar, outMutex)
return
}
defer resp.Body.Close()
out, err := os.Create(filePath)
if err != nil {
logMessage(cmd, fmt.Sprintf("Error creating file for %s: %v", u, err), bar, outMutex)
return
}
defer out.Close()
_, err = io.Copy(out, resp.Body)
if err != nil {
logMessage(cmd, fmt.Sprintf("Error saving content for %s: %v", u, err), bar, outMutex)
return
}
logMessage(cmd, fmt.Sprintf("Downloaded %s to %s", u, filePath), bar, outMutex)
if delayDuration > 0 {
time.Sleep(delayDuration)
}
}
func logMessage(cmd *cobra.Command, msg string, bar *progressbar.ProgressBar, outMutex *sync.Mutex) {
if bar != nil {
bar.Describe(msg)
} else {
outMutex.Lock()
defer outMutex.Unlock()
fmt.Fprintln(cmd.OutOrStdout(), msg)
}
}
func readURLs(reader io.Reader, jqFilter string) ([]string, error) {
if jqFilter == "" {
var urls []string
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line != "" {
urls = append(urls, line)
}
}
if err := scanner.Err(); err != nil {
return nil, err
}
return urls, nil
}
query, err := gojq.Parse(jqFilter)
if err != nil {
return nil, fmt.Errorf("error parsing jq filter: %w", err)
}
var input interface{}
decoder := json.NewDecoder(reader)
if err := decoder.Decode(&input); err != nil {
return nil, fmt.Errorf("error decoding json: %w", err)
}
var urls []string
iter := query.Run(input)
for {
v, ok := iter.Next()
if !ok {
break
}
if err, ok := v.(error); ok {
return nil, fmt.Errorf("error executing jq filter: %w", err)
}
if s, ok := v.(string); ok {
urls = append(urls, s)
}
}
return urls, nil
}
func getFileNameFromURL(rawURL string) (string, error) {
parsedURL, err := url.Parse(rawURL)
if err != nil {
return "", err
}
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
return "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme)
}
if parsedURL.Path == "" || parsedURL.Path == "/" {
return "index.html", nil
}
return filepath.Base(parsedURL.Path), nil
}

220
cmd/collect_batch_test.go Normal file
View file

@ -0,0 +1,220 @@
package cmd
import (
"bytes"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
)
func TestReadURLs(t *testing.T) {
tests := []struct {
name string
input string
jqFilter string
expected []string
err bool
}{
{"Good", "url1\nurl2\nurl3", "", []string{"url1", "url2", "url3"}, false},
{"EmptyInput", "", "", []string{}, false},
{"WithEmptyLines", "url1\n\nurl2", "", []string{"url1", "url2"}, false},
{"JSON", `{"urls": ["url1", "url2"]}`, ".urls[]", []string{"url1", "url2"}, false},
{"InvalidJSON", "{", ".urls[]", nil, true},
{"InvalidJQ", `{"urls": ["url1", "url2"]}`, ".[", nil, true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
reader := strings.NewReader(tt.input)
urls, err := readURLs(reader, tt.jqFilter)
if (err != nil) != tt.err {
t.Fatalf("readURLs() error = %v, wantErr %v", err, tt.err)
}
if len(urls) != len(tt.expected) {
t.Fatalf("expected %d urls, got %d", len(tt.expected), len(urls))
}
for i := range urls {
if urls[i] != tt.expected[i] {
t.Errorf("expected url %s, got %s", tt.expected[i], urls[i])
}
}
})
}
}
func TestGetFileNameFromURL(t *testing.T) {
tests := []struct {
name string
rawURL string
expected string
err bool
}{
{"Good", "http://example.com/file.txt", "file.txt", false},
{"NoPath", "http://example.com", "index.html", false},
{"InvalidURL", "ftp://example.com/file.txt", "", true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
fileName, err := getFileNameFromURL(tt.rawURL)
if (err != nil) != tt.err {
t.Fatalf("getFileNameFromURL() error = %v, wantErr %v", err, tt.err)
}
if fileName != tt.expected {
t.Errorf("expected filename %s, got %s", tt.expected, fileName)
}
})
}
}
func TestCollectBatch_ParallelAndJSON(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprintln(w, "Hello, client")
}))
defer server.Close()
tempDir := t.TempDir()
urlsFile := filepath.Join(tempDir, "urls.txt")
err := os.WriteFile(urlsFile, []byte(server.URL+"/file1.txt\n"+server.URL+"/file2.txt"), 0644)
if err != nil {
t.Fatal(err)
}
jsonFile := filepath.Join(tempDir, "urls.json")
err = os.WriteFile(jsonFile, []byte(`{"files": [{"url": "`+server.URL+`/file1.txt"}, {"url": "`+server.URL+`/file2.txt"}]}`), 0644)
if err != nil {
t.Fatal(err)
}
// Test with --parallel flag
t.Run("Parallel", func(t *testing.T) {
outputDir := t.TempDir()
cmd := NewCollectBatchCmd()
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir, "--parallel", "2"})
err := cmd.Execute()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
for _, f := range []string{"file1.txt", "file2.txt"} {
if _, err := os.Stat(filepath.Join(outputDir, f)); os.IsNotExist(err) {
t.Errorf("expected file %s to be created", f)
}
}
})
// Test with --jq flag
t.Run("JQ", func(t *testing.T) {
outputDir := t.TempDir()
cmd := NewCollectBatchCmd()
cmd.SetArgs([]string{jsonFile, "--output-dir", outputDir, "--jq", ".files[].url"})
err := cmd.Execute()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
for _, f := range []string{"file1.txt", "file2.txt"} {
if _, err := os.Stat(filepath.Join(outputDir, f)); os.IsNotExist(err) {
t.Errorf("expected file %s to be created", f)
}
}
})
}
func TestCollectBatch_Sequential(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprintln(w, "Hello, client")
}))
defer server.Close()
tempDir := t.TempDir()
urlsFile := filepath.Join(tempDir, "urls.txt")
err := os.WriteFile(urlsFile, []byte(server.URL+"/file1.txt\n"+server.URL+"/file2.txt"), 0644)
if err != nil {
t.Fatal(err)
}
// Test with file input
t.Run("FromFile", func(t *testing.T) {
outputDir := t.TempDir()
cmd := NewCollectBatchCmd()
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir})
err := cmd.Execute()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
for _, f := range []string{"file1.txt", "file2.txt"} {
if _, err := os.Stat(filepath.Join(outputDir, f)); os.IsNotExist(err) {
t.Errorf("expected file %s to be created", f)
}
}
})
// Test with stdin
t.Run("FromStdin", func(t *testing.T) {
outputDir := t.TempDir()
cmd := NewCollectBatchCmd()
cmd.SetArgs([]string{"-", "--output-dir", outputDir})
oldStdin := os.Stdin
defer func() { os.Stdin = oldStdin }()
r, w, _ := os.Pipe()
os.Stdin = r
go func() {
defer w.Close()
w.Write([]byte(server.URL + "/file1.txt\n"))
}()
err := cmd.Execute()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if _, err := os.Stat(filepath.Join(outputDir, "file1.txt")); os.IsNotExist(err) {
t.Error("expected file file1.txt to be created")
}
})
// Test --continue flag
t.Run("Continue", func(t *testing.T) {
outputDir := t.TempDir()
cmd := NewCollectBatchCmd()
// First run
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir})
err := cmd.Execute()
if err != nil {
t.Fatalf("unexpected error on first run: %v", err)
}
// Second run with --continue
var out bytes.Buffer
cmd.SetOut(&out)
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir, "--continue"})
err = cmd.Execute()
if err != nil {
t.Fatalf("unexpected error on second run: %v", err)
}
if !strings.Contains(out.String(), "Skipping already downloaded file") {
t.Error("expected to see skip message")
}
})
// Test --delay flag
t.Run("Delay", func(t *testing.T) {
outputDir := t.TempDir()
cmd := NewCollectBatchCmd()
cmd.SetArgs([]string{urlsFile, "--output-dir", outputDir, "--delay", "100ms"})
start := time.Now()
err := cmd.Execute()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
duration := time.Since(start)
if duration < 200*time.Millisecond {
t.Errorf("expected delay to be at least 200ms, got %v", duration)
}
})
}

2
go.mod
View file

@ -35,6 +35,8 @@ require (
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/websocket v1.5.3 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/itchyny/gojq v0.12.18 // indirect
github.com/itchyny/timefmt-go v0.1.7 // indirect
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e // indirect
github.com/kevinburke/ssh_config v1.2.0 // indirect

5
go.sum
View file

@ -61,6 +61,10 @@ github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aN
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/itchyny/gojq v0.12.18 h1:gFGHyt/MLbG9n6dqnvlliiya2TaMMh6FFaR2b1H6Drc=
github.com/itchyny/gojq v0.12.18/go.mod h1:4hPoZ/3lN9fDL1D+aK7DY1f39XZpY9+1Xpjz8atrEkg=
github.com/itchyny/timefmt-go v0.1.7 h1:xyftit9Tbw+Dc/huSSPJaEmX1TVL8lw5vxjJLK4GMMA=
github.com/itchyny/timefmt-go v0.1.7/go.mod h1:5E46Q+zj7vbTgWY8o5YkMeYb4I6GeWLFnetPy5oBrAI=
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A=
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo=
github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e h1:Q3+PugElBCf4PFpxhErSzU3/PY5sFL5Z6rfv4AbGAck=
@ -100,6 +104,7 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k=