From 8e82bada062ea12211e880afca457299dd9af9a9 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Fri, 31 Oct 2025 21:35:53 +0000
Subject: [PATCH] feat: Add recursive website downloader and progress bar

This commit introduces a new `collect website` command that recursively
downloads a website to a specified depth.

- A new `pkg/website` package contains the logic for the recursive download.
- A new `pkg/ui` package provides a progress bar for long-running operations,
  which is used by the website downloader.
- The generic `collect` command is now a parent command; its Git logic has
  moved to a new `collect git` subcommand.
- The `collect pwa` subcommand has been restored to be PWA-specific, and
  `pwa.FindManifestURL` has been renamed to `pwa.FindManifest`.
---
 cmd/collect.go                 |  37 +------
 cmd/collect_git.go             |  47 ++++++++++
 cmd/{pwa.go => collect_pwa.go} |  16 ++--
 cmd/collect_website.go         |  49 ++++++++++
 go.mod                         |   4 +
 go.sum                         |   8 ++
 pkg/pwa/pwa.go                 |   4 +-
 pkg/pwa/pwa_test.go            |   6 +-
 pkg/ui/progressbar.go          |  16 +++
 pkg/website/website.go         | 170 +++++++++++++++++++++++++++++++++
 pkg/website/website_test.go    |  79 ++++++++++++++
 11 files changed, 389 insertions(+), 47 deletions(-)
 create mode 100644 cmd/collect_git.go
 rename cmd/{pwa.go => collect_pwa.go} (68%)
 create mode 100644 cmd/collect_website.go
 create mode 100644 pkg/ui/progressbar.go
 create mode 100644 pkg/website/website.go
 create mode 100644 pkg/website/website_test.go

diff --git a/cmd/collect.go b/cmd/collect.go
index 779441c..57960b2 100644
--- a/cmd/collect.go
+++ b/cmd/collect.go
@@ -1,47 +1,16 @@
 package cmd
 
 import (
-	"fmt"
-	"os"
-
-	"borg-data-collector/pkg/vcs"
-
 	"github.com/spf13/cobra"
 )
 
 // collectCmd represents the collect command
 var collectCmd = &cobra.Command{
-	Use:   "collect [repository-url]",
-	Short: "Collect a single repository",
-	Long:  `Collect a single repository and store it in a DataNode.`,
-	Args:  cobra.ExactArgs(1),
-	Run: func(cmd *cobra.Command, args []string) {
-		repoURL := args[0]
-		outputFile, _ := cmd.Flags().GetString("output")
-
-		dn, err := vcs.CloneGitRepository(repoURL)
-		if err != nil {
-			fmt.Printf("Error cloning repository: %v\n", err)
-			return
-		}
-
-		data, err := dn.ToTar()
-		if err != nil {
-			fmt.Printf("Error serializing DataNode: %v\n", err)
-			return
-		}
-
-		err = os.WriteFile(outputFile, data, 0644)
-		if err != nil {
-			fmt.Printf("Error writing DataNode to file: %v\n", err)
-			return
-		}
-
-		fmt.Printf("Repository saved to %s\n", outputFile)
-	},
+	Use:   "collect",
+	Short: "Collect a resource and store it in a DataNode.",
+	Long:  `Collect a resource from a git repository, a website, or another URI and store it in a DataNode.`,
 }
 
 func init() {
 	rootCmd.AddCommand(collectCmd)
-	collectCmd.PersistentFlags().String("output", "repo.dat", "Output file for the DataNode")
 }
diff --git a/cmd/collect_git.go b/cmd/collect_git.go
new file mode 100644
index 0000000..2fcd450
--- /dev/null
+++ b/cmd/collect_git.go
@@ -0,0 +1,47 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+
+	"borg-data-collector/pkg/vcs"
+
+	"github.com/spf13/cobra"
+)
+
+// collectGitCmd represents the collect git command
+var collectGitCmd = &cobra.Command{
+	Use:   "git [repository-url]",
+	Short: "Collect a single Git repository",
+	Long:  `Collect a single Git repository and store it in a DataNode.`,
+	Args:  cobra.ExactArgs(1),
+	Run: func(cmd *cobra.Command, args []string) {
+		repoURL := args[0]
+		outputFile, _ := cmd.Flags().GetString("output")
+
+		dn, err := vcs.CloneGitRepository(repoURL)
+		if err != nil {
+			fmt.Printf("Error cloning repository: %v\n", err)
+			return
+		}
+
+		data, err := dn.ToTar()
+		if err != nil {
+			fmt.Printf("Error serializing DataNode: %v\n", err)
+			return
+		}
+
+		err = os.WriteFile(outputFile, data, 0644)
+		if err != nil {
+			fmt.Printf("Error writing DataNode to file: %v\n", err)
+			return
+		}
+
+		fmt.Printf("Repository saved to %s\n", outputFile)
+	},
+}
+
+func init() {
+	collectCmd.AddCommand(collectGitCmd)
+	collectGitCmd.PersistentFlags().String("output", "repo.dat", "Output file for the DataNode")
+}
diff --git a/cmd/pwa.go b/cmd/collect_pwa.go
similarity index 68%
rename from cmd/pwa.go
rename to cmd/collect_pwa.go
index e288fcb..d7b5402 100644
--- a/cmd/pwa.go
+++ b/cmd/collect_pwa.go
@@ -9,18 +9,18 @@ import (
 	"github.com/spf13/cobra"
 )
 
-// pwaCmd represents the pwa command
-var pwaCmd = &cobra.Command{
+// collectPWACmd represents the collect pwa command
+var collectPWACmd = &cobra.Command{
 	Use:   "pwa [url]",
-	Short: "Download a PWA from a URL",
-	Long:  `Downloads a Progressive Web Application (PWA) from a given URL by finding its manifest.`,
+	Short: "Collect a single PWA",
+	Long:  `Collect a single PWA and store it in a DataNode.`,
 	Args:  cobra.ExactArgs(1),
 	Run: func(cmd *cobra.Command, args []string) {
 		pwaURL := args[0]
 		outputFile, _ := cmd.Flags().GetString("output")
 
 		fmt.Println("Finding PWA manifest...")
-		manifestURL, err := pwa.FindManifestURL(pwaURL)
+		manifestURL, err := pwa.FindManifest(pwaURL)
 		if err != nil {
 			fmt.Printf("Error finding manifest: %v\n", err)
 			return
@@ -36,7 +36,7 @@ var pwaCmd = &cobra.Command{
 
 		pwaData, err := dn.ToTar()
 		if err != nil {
-			fmt.Printf("Error serializing PWA data: %v\n", err)
+			fmt.Printf("Error converting PWA to bytes: %v\n", err)
 			return
 		}
 
@@ -51,6 +51,6 @@ var pwaCmd = &cobra.Command{
 }
 
 func init() {
-	rootCmd.AddCommand(pwaCmd)
-	pwaCmd.PersistentFlags().String("output", "pwa.dat", "Output file for the PWA DataNode")
+	collectCmd.AddCommand(collectPWACmd)
+	collectPWACmd.PersistentFlags().String("output", "pwa.dat", "Output file for the DataNode")
 }
diff --git a/cmd/collect_website.go b/cmd/collect_website.go
new file mode 100644
index 0000000..90911a2
--- /dev/null
+++ b/cmd/collect_website.go
@@ -0,0 +1,49 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+
+	"borg-data-collector/pkg/website"
+
+	"github.com/spf13/cobra"
+)
+
+// collectWebsiteCmd represents the collect website command
+var collectWebsiteCmd = &cobra.Command{
+	Use:   "website [url]",
+	Short: "Collect a single website",
+	Long:  `Collect a single website and store it in a DataNode.`,
+	Args:  cobra.ExactArgs(1),
+	Run: func(cmd *cobra.Command, args []string) {
+		websiteURL := args[0]
+		outputFile, _ := cmd.Flags().GetString("output")
+		depth, _ := cmd.Flags().GetInt("depth")
+
+		dn, err := website.DownloadAndPackageWebsite(websiteURL, depth)
+		if err != nil {
+			fmt.Printf("Error downloading and packaging website: %v\n", err)
+			return
+		}
+
+		websiteData, err := dn.ToTar()
+		if err != nil {
+			fmt.Printf("Error converting website to bytes: %v\n", err)
+			return
+		}
+
+		err = os.WriteFile(outputFile, websiteData, 0644)
+		if err != nil {
+			fmt.Printf("Error writing website to file: %v\n", err)
+			return
+		}
+
+		fmt.Printf("Website saved to %s\n", outputFile)
+	},
+}
+
+func init() {
+	collectCmd.AddCommand(collectWebsiteCmd)
+	collectWebsiteCmd.PersistentFlags().String("output", "website.dat", "Output file for the DataNode")
+	collectWebsiteCmd.PersistentFlags().Int("depth", 2, "Recursion depth for downloading")
+}
diff --git a/go.mod b/go.mod
index 78af759..5b4dcca 100644
--- a/go.mod
+++ b/go.mod
@@ -19,7 +19,10 @@ require (
 	github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect
 	github.com/kevinburke/ssh_config v1.2.0 // indirect
 	github.com/leaanthony/debme v1.2.1 // indirect
+	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
 	github.com/pjbgf/sha1cd v0.3.2 // indirect
+	github.com/rivo/uniseg v0.4.7 // indirect
+	github.com/schollz/progressbar/v3 v3.18.0
 	github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect
 	github.com/skeema/knownhosts v1.3.1 // indirect
 	github.com/spf13/pflag v1.0.10 // indirect
@@ -27,5 +30,6 @@ require (
 	golang.org/x/crypto v0.43.0 // indirect
 	golang.org/x/net v0.46.0 // indirect
 	golang.org/x/sys v0.37.0 // indirect
+	golang.org/x/term v0.36.0 // indirect
 	gopkg.in/warnings.v0 v0.1.2 // indirect
 )
diff --git a/go.sum b/go.sum
index 1005854..8957994 100644
--- a/go.sum
+++ b/go.sum
@@ -35,11 +35,17 @@ github.com/leaanthony/debme v1.2.1 h1:9Tgwf+kjcrbMQ4WnPcEIUcQuIZYqdWftzZkBr+i/oO
 github.com/leaanthony/debme v1.2.1/go.mod h1:3V+sCm5tYAgQymvSOfYQ5Xx2JCr+OXiD9Jkw3otUjiA=
 github.com/leaanthony/slicer v1.5.0/go.mod h1:FwrApmf8gOrpzEWM2J/9Lh79tyq8KTX5AzRtwV7m4AY=
 github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
+github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
+github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
 github.com/pjbgf/sha1cd v0.3.2 h1:a9wb0bp1oC2TGwStyn0Umc/IGKQnEgF0vVaZ8QF8eo4=
 github.com/pjbgf/sha1cd v0.3.2/go.mod h1:zQWigSxVmsHEZow5qaLtPYxpcKMMQpa09ixqBxuCS6A=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
+github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA=
+github.com/schollz/progressbar/v3 v3.18.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
 github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8=
 github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4=
 github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
@@ -76,6 +82,8 @@ golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
 golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
 golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
+golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
diff --git a/pkg/pwa/pwa.go b/pkg/pwa/pwa.go
index 6cef4c5..1679fa9 100644
--- a/pkg/pwa/pwa.go
+++ b/pkg/pwa/pwa.go
@@ -28,8 +28,8 @@ type Icon struct {
 	Type string `json:"type"`
 }
 
-// FindManifestURL finds the manifest URL from a given HTML page.
-func FindManifestURL(pageURL string) (string, error) {
+// FindManifest finds the manifest URL from a given HTML page.
+func FindManifest(pageURL string) (string, error) {
 	resp, err := http.Get(pageURL)
 	if err != nil {
 		return "", err
diff --git a/pkg/pwa/pwa_test.go b/pkg/pwa/pwa_test.go
index 186412d..f90fdb3 100644
--- a/pkg/pwa/pwa_test.go
+++ b/pkg/pwa/pwa_test.go
@@ -6,7 +6,7 @@ import (
 	"testing"
 )
 
-func TestFindManifestURL(t *testing.T) {
+func TestFindManifest(t *testing.T) {
 	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		w.Header().Set("Content-Type", "text/html")
 		w.Write([]byte(`
@@ -25,9 +25,9 @@ func TestFindManifestURL(t *testing.T) {
 	defer server.Close()
 
 	expectedURL := server.URL + "/manifest.json"
-	actualURL, err := FindManifestURL(server.URL)
+	actualURL, err := FindManifest(server.URL)
 	if err != nil {
-		t.Fatalf("FindManifestURL failed: %v", err)
+		t.Fatalf("FindManifest failed: %v", err)
 	}
 
 	if actualURL != expectedURL {
diff --git a/pkg/ui/progressbar.go b/pkg/ui/progressbar.go
new file mode 100644
index 0000000..8f143e1
--- /dev/null
+++ b/pkg/ui/progressbar.go
@@ -0,0 +1,16 @@
+package ui
+
+import (
+	"github.com/schollz/progressbar/v3"
+)
+
+// NewProgressBar creates a new progress bar with the specified total and
+// description. A total of -1 creates an indeterminate (spinner) bar.
+func NewProgressBar(total int, description string) *progressbar.ProgressBar {
+	return progressbar.NewOptions(total,
+		progressbar.OptionSetDescription(description),
+		progressbar.OptionSetWidth(15),
+		progressbar.OptionShowCount(),
+		progressbar.OptionClearOnFinish(),
+	)
+}
diff --git a/pkg/website/website.go b/pkg/website/website.go
new file mode 100644
index 0000000..2096a30
--- /dev/null
+++ b/pkg/website/website.go
@@ -0,0 +1,170 @@
+package website
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"strings"
+
+	"borg-data-collector/pkg/datanode"
+	"borg-data-collector/pkg/ui"
+	"github.com/schollz/progressbar/v3"
+
+	"golang.org/x/net/html"
+)
+
+// Downloader is a recursive website downloader.
+type Downloader struct {
+	baseURL     *url.URL
+	dn          *datanode.DataNode
+	visited     map[string]bool
+	maxDepth    int
+	progressBar *progressbar.ProgressBar
+}
+
+// NewDownloader creates a new Downloader.
+func NewDownloader(maxDepth int) *Downloader {
+	return &Downloader{
+		dn:       datanode.New(),
+		visited:  make(map[string]bool),
+		maxDepth: maxDepth,
+	}
+}
+
+// DownloadAndPackageWebsite downloads a website and packages it into a DataNode.
+func DownloadAndPackageWebsite(startURL string, maxDepth int) (*datanode.DataNode, error) {
+	baseURL, err := url.Parse(startURL)
+	if err != nil {
+		return nil, err
+	}
+
+	d := NewDownloader(maxDepth)
+	d.baseURL = baseURL
+
+	fmt.Println("Downloading website...")
+	// The total page count is unknown up front, so run the shared ui bar in
+	// indeterminate (spinner) mode rather than with a fixed total of 1.
+	d.progressBar = ui.NewProgressBar(-1, "Downloading")
+	d.crawl(startURL, 0)
+	d.progressBar.Finish()
+
+	return d.dn, nil
+}
+
+func (d *Downloader) crawl(pageURL string, depth int) {
+	if depth > d.maxDepth || d.visited[pageURL] {
+		return
+	}
+	d.visited[pageURL] = true
+	d.progressBar.Add(1)
+
+	resp, err := http.Get(pageURL)
+	if err != nil {
+		fmt.Printf("Error getting %s: %v\n", pageURL, err)
+		return
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		fmt.Printf("Error reading body of %s: %v\n", pageURL, err)
+		return
+	}
+
+	relPath := d.getRelativePath(pageURL)
+	d.dn.AddData(relPath, body)
+
+	doc, err := html.Parse(strings.NewReader(string(body)))
+	if err != nil {
+		fmt.Printf("Error parsing HTML of %s: %v\n", pageURL, err)
+		return
+	}
+
+	var f func(*html.Node)
+	f = func(n *html.Node) {
+		if n.Type == html.ElementNode {
+			for _, a := range n.Attr {
+				if a.Key == "href" || a.Key == "src" {
+					link, err := d.resolveURL(pageURL, a.Val)
+					if err != nil {
+						continue
+					}
+					if d.isLocal(link) {
+						if isAsset(link) {
+							d.downloadAsset(link)
+						} else {
+							d.crawl(link, depth+1)
+						}
+					}
+				}
+			}
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			f(c)
+		}
+	}
+	f(doc)
+}
+
+func (d *Downloader) downloadAsset(assetURL string) {
+	if d.visited[assetURL] {
+		return
+	}
+	d.visited[assetURL] = true
+	d.progressBar.Add(1)
+
+	resp, err := http.Get(assetURL)
+	if err != nil {
+		fmt.Printf("Error getting asset %s: %v\n", assetURL, err)
+		return
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		fmt.Printf("Error reading body of asset %s: %v\n", assetURL, err)
+		return
+	}
+
+	relPath := d.getRelativePath(assetURL)
+	d.dn.AddData(relPath, body)
+}
+
+func (d *Downloader) getRelativePath(pageURL string) string {
+	u, err := url.Parse(pageURL)
+	if err != nil {
+		return ""
+	}
+	return strings.TrimPrefix(u.Path, "/")
+}
+
+func (d *Downloader) resolveURL(base, ref string) (string, error) {
+	baseURL, err := url.Parse(base)
+	if err != nil {
+		return "", err
+	}
+	refURL, err := url.Parse(ref)
+	if err != nil {
+		return "", err
+	}
+	return baseURL.ResolveReference(refURL).String(), nil
+}
+
+func (d *Downloader) isLocal(pageURL string) bool {
+	u, err := url.Parse(pageURL)
+	if err != nil {
+		return false
+	}
+	return u.Hostname() == d.baseURL.Hostname()
+}
+
+func isAsset(pageURL string) bool {
+	ext := []string{".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico"}
+	for _, e := range ext {
+		if strings.HasSuffix(pageURL, e) {
+			return true
+		}
+	}
+	return false
+}
diff --git a/pkg/website/website_test.go b/pkg/website/website_test.go
new file mode 100644
index 0000000..9a8ec85
--- /dev/null
+++ b/pkg/website/website_test.go
@@ -0,0 +1,79 @@
+package website
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+func TestDownloadAndPackageWebsite(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/":
+			w.Header().Set("Content-Type", "text/html")
+			w.Write([]byte(`
+				<html>
+				<head>
+					<title>Test Website</title>
+					<link rel="stylesheet" href="/style.css">
+				</head>
+				<body>
+					<h1>Hello, Website!</h1>
+					<img src="/image.png">
+					<a href="/page2.html">Page 2</a>
+				</body>
+				</html>
+			`))
+		case "/style.css":
+			w.Header().Set("Content-Type", "text/css")
+			w.Write([]byte(`body { color: red; }`))
+		case "/image.png":
+			w.Header().Set("Content-Type", "image/png")
+			w.Write([]byte("fake image data"))
+		case "/page2.html":
+			w.Header().Set("Content-Type", "text/html")
+			w.Write([]byte(`
+				<html>
+				<head>
+					<title>Page 2</title>
+				</head>
+				<body>
+					<h1>Page 2</h1>
+					<a href="/page3.html">Page 3</a>
+				</body>
+				</html>
+			`))
+		case "/page3.html":
+			w.Header().Set("Content-Type", "text/html")
+			w.Write([]byte(`
+				<html>
+				<head>
+					<title>Page 3</title>
+				</head>
+				<body>
+					<h1>Page 3</h1>
+				</body>
+				</html>
+			`))
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer server.Close()
+
+	dn, err := DownloadAndPackageWebsite(server.URL, 2)
+	if err != nil {
+		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
+	}
+
+	expectedFiles := []string{"", "style.css", "image.png", "page2.html", "page3.html"}
+	for _, file := range expectedFiles {
+		exists, err := dn.Exists(file)
+		if err != nil {
+			t.Fatalf("Exists failed for %s: %v", file, err)
+		}
+		if !exists {
+			t.Errorf("Expected to find file %s in DataNode, but it was not found", file)
+		}
+	}
+}
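---

Usage note (not part of the patch): the snippet below is a minimal sketch of
driving the new package directly, outside of the cobra CLI. The target URL,
depth, and output path are illustrative; the calls themselves
(`website.DownloadAndPackageWebsite`, `dn.ToTar`, `os.WriteFile`) are the same
ones used by cmd/collect_website.go above.

	package main

	import (
		"fmt"
		"os"

		"borg-data-collector/pkg/website"
	)

	func main() {
		// Crawl up to two link-levels deep, starting at the root page.
		dn, err := website.DownloadAndPackageWebsite("https://example.com", 2)
		if err != nil {
			fmt.Printf("Error downloading and packaging website: %v\n", err)
			os.Exit(1)
		}

		// Serialize the DataNode as a tar archive and write it to disk.
		data, err := dn.ToTar()
		if err != nil {
			fmt.Printf("Error converting website to bytes: %v\n", err)
			os.Exit(1)
		}

		if err := os.WriteFile("website.dat", data, 0644); err != nil {
			fmt.Printf("Error writing website to file: %v\n", err)
			os.Exit(1)
		}
	}

The equivalent CLI invocation (assuming the built binary is named after the
module path) would be:

	borg-data-collector collect website https://example.com --depth 2 --output website.dat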