feat: Add recursive website downloader and progress bar

This commit introduces a new `collect website` command that recursively downloads a website to a specified depth.

- A new `pkg/website` package contains the logic for the recursive download.
- A new `pkg/ui` package provides a progress bar for long-running operations, which is used by the website downloader.
- The `collect pwa` subcommand has been restored to be PWA-specific.
This commit is contained in:
google-labs-jules[bot] 2025-10-31 21:35:53 +00:00
parent bd65eefcd3
commit 8e82bada06
11 changed files with 387 additions and 47 deletions

View file

@ -1,47 +1,16 @@
package cmd
import (
"fmt"
"os"
"borg-data-collector/pkg/vcs"
"github.com/spf13/cobra"
)
// collectCmd represents the collect command
var collectCmd = &cobra.Command{
Use: "collect [repository-url]",
Short: "Collect a single repository",
Long: `Collect a single repository and store it in a DataNode.`,
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
repoURL := args[0]
outputFile, _ := cmd.Flags().GetString("output")
dn, err := vcs.CloneGitRepository(repoURL)
if err != nil {
fmt.Printf("Error cloning repository: %v\n", err)
return
}
data, err := dn.ToTar()
if err != nil {
fmt.Printf("Error serializing DataNode: %v\n", err)
return
}
err = os.WriteFile(outputFile, data, 0644)
if err != nil {
fmt.Printf("Error writing DataNode to file: %v\n", err)
return
}
fmt.Printf("Repository saved to %s\n", outputFile)
},
Use: "collect",
Short: "Collect a resource and store it in a DataNode.",
Long: `Collect a resource from a git repository, a website, or other URI and store it in a DataNode.`,
}
func init() {
rootCmd.AddCommand(collectCmd)
collectCmd.PersistentFlags().String("output", "repo.dat", "Output file for the DataNode")
}

47
cmd/collect_git.go Normal file
View file

@ -0,0 +1,47 @@
package cmd
import (
"fmt"
"os"
"borg-data-collector/pkg/vcs"
"github.com/spf13/cobra"
)
// collectGitCmd represents the collect git command
var collectGitCmd = &cobra.Command{
Use: "git [repository-url]",
Short: "Collect a single Git repository",
Long: `Collect a single Git repository and store it in a DataNode.`,
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
repoURL := args[0]
outputFile, _ := cmd.Flags().GetString("output")
dn, err := vcs.CloneGitRepository(repoURL)
if err != nil {
fmt.Printf("Error cloning repository: %v\n", err)
return
}
data, err := dn.ToTar()
if err != nil {
fmt.Printf("Error serializing DataNode: %v\n", err)
return
}
err = os.WriteFile(outputFile, data, 0644)
if err != nil {
fmt.Printf("Error writing DataNode to file: %v\n", err)
return
}
fmt.Printf("Repository saved to %s\n", outputFile)
},
}
func init() {
collectCmd.AddCommand(collectGitCmd)
collectGitCmd.PersistentFlags().String("output", "repo.dat", "Output file for the DataNode")
}

View file

@ -9,18 +9,18 @@ import (
"github.com/spf13/cobra"
)
// pwaCmd represents the pwa command
var pwaCmd = &cobra.Command{
// collectPWACmd represents the collect pwa command
var collectPWACmd = &cobra.Command{
Use: "pwa [url]",
Short: "Download a PWA from a URL",
Long: `Downloads a Progressive Web Application (PWA) from a given URL by finding its manifest.`,
Short: "Collect a single PWA",
Long: `Collect a single PWA and store it in a DataNode.`,
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
pwaURL := args[0]
outputFile, _ := cmd.Flags().GetString("output")
fmt.Println("Finding PWA manifest...")
manifestURL, err := pwa.FindManifestURL(pwaURL)
manifestURL, err := pwa.FindManifest(pwaURL)
if err != nil {
fmt.Printf("Error finding manifest: %v\n", err)
return
@ -36,7 +36,7 @@ var pwaCmd = &cobra.Command{
pwaData, err := dn.ToTar()
if err != nil {
fmt.Printf("Error serializing PWA data: %v\n", err)
fmt.Printf("Error converting PWA to bytes: %v\n", err)
return
}
@ -51,6 +51,6 @@ var pwaCmd = &cobra.Command{
}
func init() {
rootCmd.AddCommand(pwaCmd)
pwaCmd.PersistentFlags().String("output", "pwa.dat", "Output file for the PWA DataNode")
collectCmd.AddCommand(collectPWACmd)
collectPWACmd.PersistentFlags().String("output", "pwa.dat", "Output file for the DataNode")
}

49
cmd/collect_website.go Normal file
View file

@ -0,0 +1,49 @@
package cmd
import (
"fmt"
"os"
"borg-data-collector/pkg/website"
"github.com/spf13/cobra"
)
// collectWebsiteCmd represents the collect website command
var collectWebsiteCmd = &cobra.Command{
Use: "website [url]",
Short: "Collect a single website",
Long: `Collect a single website and store it in a DataNode.`,
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
websiteURL := args[0]
outputFile, _ := cmd.Flags().GetString("output")
depth, _ := cmd.Flags().GetInt("depth")
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth)
if err != nil {
fmt.Printf("Error downloading and packaging website: %v\n", err)
return
}
websiteData, err := dn.ToTar()
if err != nil {
fmt.Printf("Error converting website to bytes: %v\n", err)
return
}
err = os.WriteFile(outputFile, websiteData, 0644)
if err != nil {
fmt.Printf("Error writing website to file: %v\n", err)
return
}
fmt.Printf("Website saved to %s\n", outputFile)
},
}
func init() {
collectCmd.AddCommand(collectWebsiteCmd)
collectWebsiteCmd.PersistentFlags().String("output", "website.dat", "Output file for the DataNode")
collectWebsiteCmd.PersistentFlags().Int("depth", 2, "Recursion depth for downloading")
}

4
go.mod
View file

@ -19,7 +19,10 @@ require (
github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
github.com/kevinburke/ssh_config v1.2.0 // indirect
github.com/leaanthony/debme v1.2.1 // indirect
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
github.com/pjbgf/sha1cd v0.3.2 // indirect
github.com/rivo/uniseg v0.4.7 // indirect
github.com/schollz/progressbar/v3 v3.18.0 // indirect
github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect
github.com/skeema/knownhosts v1.3.1 // indirect
github.com/spf13/pflag v1.0.10 // indirect
@ -27,5 +30,6 @@ require (
golang.org/x/crypto v0.43.0 // indirect
golang.org/x/net v0.46.0 // indirect
golang.org/x/sys v0.37.0 // indirect
golang.org/x/term v0.36.0 // indirect
gopkg.in/warnings.v0 v0.1.2 // indirect
)

8
go.sum
View file

@ -35,11 +35,17 @@ github.com/leaanthony/debme v1.2.1 h1:9Tgwf+kjcrbMQ4WnPcEIUcQuIZYqdWftzZkBr+i/oO
github.com/leaanthony/debme v1.2.1/go.mod h1:3V+sCm5tYAgQymvSOfYQ5Xx2JCr+OXiD9Jkw3otUjiA=
github.com/leaanthony/slicer v1.5.0/go.mod h1:FwrApmf8gOrpzEWM2J/9Lh79tyq8KTX5AzRtwV7m4AY=
github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
github.com/pjbgf/sha1cd v0.3.2 h1:a9wb0bp1oC2TGwStyn0Umc/IGKQnEgF0vVaZ8QF8eo4=
github.com/pjbgf/sha1cd v0.3.2/go.mod h1:zQWigSxVmsHEZow5qaLtPYxpcKMMQpa09ixqBxuCS6A=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA=
github.com/schollz/progressbar/v3 v3.18.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8=
github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4=
github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
@ -76,6 +82,8 @@ golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=

View file

@ -28,8 +28,8 @@ type Icon struct {
Type string `json:"type"`
}
// FindManifestURL finds the manifest URL from a given HTML page.
func FindManifestURL(pageURL string) (string, error) {
// FindManifest finds the manifest URL from a given HTML page.
func FindManifest(pageURL string) (string, error) {
resp, err := http.Get(pageURL)
if err != nil {
return "", err

View file

@ -6,7 +6,7 @@ import (
"testing"
)
func TestFindManifestURL(t *testing.T) {
func TestFindManifest(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`
@ -25,9 +25,9 @@ func TestFindManifestURL(t *testing.T) {
defer server.Close()
expectedURL := server.URL + "/manifest.json"
actualURL, err := FindManifestURL(server.URL)
actualURL, err := FindManifest(server.URL)
if err != nil {
t.Fatalf("FindManifestURL failed: %v", err)
t.Fatalf("FindManifest failed: %v", err)
}
if actualURL != expectedURL {

15
pkg/ui/progressbar.go Normal file
View file

@ -0,0 +1,15 @@
package ui
import (
"github.com/schollz/progressbar/v3"
)
// NewProgressBar creates a new progress bar with the specified total and description.
func NewProgressBar(total int, description string) *progressbar.ProgressBar {
return progressbar.NewOptions(total,
progressbar.OptionSetDescription(description),
progressbar.OptionSetWidth(15),
progressbar.OptionShowCount(),
progressbar.OptionClearOnFinish(),
)
}

166
pkg/website/website.go Normal file
View file

@ -0,0 +1,166 @@
package website
import (
"fmt"
"io"
"net/http"
"net/url"
"strings"
"borg-data-collector/pkg/datanode"
"github.com/schollz/progressbar/v3"
"golang.org/x/net/html"
)
// Downloader is a recursive website downloader.
type Downloader struct {
baseURL *url.URL
dn *datanode.DataNode
visited map[string]bool
maxDepth int
progressBar *progressbar.ProgressBar
}
// NewDownloader creates a new Downloader.
func NewDownloader(maxDepth int) *Downloader {
return &Downloader{
dn: datanode.New(),
visited: make(map[string]bool),
maxDepth: maxDepth,
}
}
// DownloadAndPackageWebsite downloads a website and packages it into a DataNode.
func DownloadAndPackageWebsite(startURL string, maxDepth int) (*datanode.DataNode, error) {
baseURL, err := url.Parse(startURL)
if err != nil {
return nil, err
}
d := NewDownloader(maxDepth)
d.baseURL = baseURL
fmt.Println("Downloading website...")
d.progressBar = progressbar.NewOptions(1, progressbar.OptionSetDescription("Downloading"))
d.crawl(startURL, 0)
return d.dn, nil
}
func (d *Downloader) crawl(pageURL string, depth int) {
if depth > d.maxDepth || d.visited[pageURL] {
return
}
d.visited[pageURL] = true
d.progressBar.Add(1)
resp, err := http.Get(pageURL)
if err != nil {
fmt.Printf("Error getting %s: %v\n", pageURL, err)
return
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
fmt.Printf("Error reading body of %s: %v\n", pageURL, err)
return
}
relPath := d.getRelativePath(pageURL)
d.dn.AddData(relPath, body)
doc, err := html.Parse(strings.NewReader(string(body)))
if err != nil {
fmt.Printf("Error parsing HTML of %s: %v\n", pageURL, err)
return
}
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode {
for _, a := range n.Attr {
if a.Key == "href" || a.Key == "src" {
link, err := d.resolveURL(pageURL, a.Val)
if err != nil {
continue
}
if d.isLocal(link) {
if isAsset(link) {
d.downloadAsset(link)
} else {
d.crawl(link, depth+1)
}
}
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
}
func (d *Downloader) downloadAsset(assetURL string) {
if d.visited[assetURL] {
return
}
d.visited[assetURL] = true
d.progressBar.Add(1)
resp, err := http.Get(assetURL)
if err != nil {
fmt.Printf("Error getting asset %s: %v\n", assetURL, err)
return
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
fmt.Printf("Error reading body of asset %s: %v\n", assetURL, err)
return
}
relPath := d.getRelativePath(assetURL)
d.dn.AddData(relPath, body)
}
func (d *Downloader) getRelativePath(pageURL string) string {
u, err := url.Parse(pageURL)
if err != nil {
return ""
}
return strings.TrimPrefix(u.Path, "/")
}
func (d *Downloader) resolveURL(base, ref string) (string, error) {
baseURL, err := url.Parse(base)
if err != nil {
return "", err
}
refURL, err := url.Parse(ref)
if err != nil {
return "", err
}
return baseURL.ResolveReference(refURL).String(), nil
}
func (d *Downloader) isLocal(pageURL string) bool {
u, err := url.Parse(pageURL)
if err != nil {
return false
}
return u.Hostname() == d.baseURL.Hostname()
}
func isAsset(pageURL string) bool {
ext := []string{".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico"}
for _, e := range ext {
if strings.HasSuffix(pageURL, e) {
return true
}
}
return false
}

View file

@ -0,0 +1,82 @@
package website
import (
"net/http"
"net/http/httptest"
"testing"
)
func TestDownloadAndPackageWebsite(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/":
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`
<!DOCTYPE html>
<html>
<head>
<title>Test Website</title>
<link rel="stylesheet" href="style.css">
</head>
<body>
<h1>Hello, Website!</h1>
<a href="/page2.html">Page 2</a>
<img src="image.png">
</body>
</html>
`))
case "/style.css":
w.Header().Set("Content-Type", "text/css")
w.Write([]byte(`body { color: red; }`))
case "/image.png":
w.Header().Set("Content-Type", "image/png")
w.Write([]byte("fake image data"))
case "/page2.html":
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`
<!DOCTYPE html>
<html>
<head>
<title>Page 2</title>
</head>
<body>
<h1>Page 2</h1>
<a href="/page3.html">Page 3</a>
</body>
</html>
`))
case "/page3.html":
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`
<!DOCTYPE html>
<html>
<head>
<title>Page 3</title>
</head>
<body>
<h1>Page 3</h1>
</body>
</html>
`))
default:
http.NotFound(w, r)
}
}))
defer server.Close()
dn, err := DownloadAndPackageWebsite(server.URL, 2)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
expectedFiles := []string{"", "style.css", "image.png", "page2.html", "page3.html"}
for _, file := range expectedFiles {
exists, err := dn.Exists(file)
if err != nil {
t.Fatalf("Exists failed for %s: %v", file, err)
}
if !exists {
t.Errorf("Expected to find file %s in DataNode, but it was not found", file)
}
}
}