From b36990cdecb78bb4cf063f842ee7f8c3c62654df Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
<161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Mon, 2 Feb 2026 00:48:52 +0000
Subject: [PATCH] feat: Sitemap.xml parsing for website collection
This feature adds sitemap.xml parsing to the `borg collect website` command.
It introduces three new flags:
- `--use-sitemap`: Auto-detects the sitemap and uses its URLs in addition to crawling.
- `--sitemap-only`: Collects only the URLs found in the sitemap.
- `--sitemap`: Specifies an explicit URL for the sitemap.
The implementation supports standard sitemaps, sitemap indexes, and compressed sitemaps (.xml.gz).
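Example invocations (URLs are illustrative):

    borg collect website https://example.com --use-sitemap
    borg collect website https://example.com --sitemap-only
    borg collect website https://example.com --sitemap https://example.com/sitemap.xml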
Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
---
cmd/collect_website.go | 8 ++-
cmd/collect_website_test.go | 70 +++++++++++++++++-
examples/collect_website/main.go | 2 +-
pkg/website/sitemap.go | 116 ++++++++++++++++++++++++++++++
pkg/website/sitemap_test.go | 118 +++++++++++++++++++++++++++++++
pkg/website/website.go | 87 +++++++++++++++++------
pkg/website/website_test.go | 71 +++++++++++++++++--
7 files changed, 438 insertions(+), 34 deletions(-)
create mode 100644 pkg/website/sitemap.go
create mode 100644 pkg/website/sitemap_test.go
diff --git a/cmd/collect_website.go b/cmd/collect_website.go
index 3811f32..4016bca 100644
--- a/cmd/collect_website.go
+++ b/cmd/collect_website.go
@@ -38,6 +38,9 @@ func NewCollectWebsiteCmd() *cobra.Command {
format, _ := cmd.Flags().GetString("format")
compression, _ := cmd.Flags().GetString("compression")
password, _ := cmd.Flags().GetString("password")
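+ // Sitemap options: --use-sitemap auto-detects the sitemap and combines it with
+ // crawling, --sitemap-only skips crawling, and --sitemap gives an explicit URL.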
+ useSitemap, _ := cmd.Flags().GetBool("use-sitemap")
+ sitemapOnly, _ := cmd.Flags().GetBool("sitemap-only")
+ sitemapURL, _ := cmd.Flags().GetString("sitemap")
if format != "datanode" && format != "tim" && format != "trix" {
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@@ -51,7 +54,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
bar = ui.NewProgressBar(-1, "Crawling website")
}
- dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
+ dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, useSitemap, sitemapOnly, sitemapURL, bar)
if err != nil {
return fmt.Errorf("error downloading and packaging website: %w", err)
}
@@ -104,5 +107,8 @@ func NewCollectWebsiteCmd() *cobra.Command {
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
+ collectWebsiteCmd.Flags().Bool("use-sitemap", false, "Auto-detect and use sitemap")
+ collectWebsiteCmd.Flags().Bool("sitemap-only", false, "Collect only sitemap URLs (no crawling)")
+ collectWebsiteCmd.Flags().String("sitemap", "", "Explicit sitemap URL")
return collectWebsiteCmd
}
diff --git a/cmd/collect_website_test.go b/cmd/collect_website_test.go
index 2c39674..a9dc415 100644
--- a/cmd/collect_website_test.go
+++ b/cmd/collect_website_test.go
@@ -14,7 +14,7 @@ import (
func TestCollectWebsiteCmd_Good(t *testing.T) {
// Mock the website downloader
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
- website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+ website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
return datanode.New(), nil
}
defer func() {
@@ -32,10 +32,76 @@ func TestCollectWebsiteCmd_Good(t *testing.T) {
}
}
+func TestCollectWebsiteCmd_Sitemap_Good(t *testing.T) {
+ var capturedUseSitemap, capturedSitemapOnly bool
+ var capturedSitemapURL string
+
+ oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
+ website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+ capturedUseSitemap = useSitemap
+ capturedSitemapOnly = sitemapOnly
+ capturedSitemapURL = sitemapURL
+ return datanode.New(), nil
+ }
+ defer func() {
+ website.DownloadAndPackageWebsite = oldDownloadAndPackageWebsite
+ }()
+
+ testCases := []struct {
+ name string
+ args []string
+ expectedUseSitemap bool
+ expectedSitemapOnly bool
+ expectedSitemapURL string
+ }{
+ {
+ name: "use-sitemap flag",
+ args: []string{"https://example.com", "--use-sitemap"},
+ expectedUseSitemap: true,
+ expectedSitemapOnly: false,
+ expectedSitemapURL: "",
+ },
+ {
+ name: "sitemap-only flag",
+ args: []string{"https://example.com", "--sitemap-only"},
+ expectedUseSitemap: false,
+ expectedSitemapOnly: true,
+ expectedSitemapURL: "",
+ },
+ {
+ name: "sitemap flag",
+ args: []string{"https://example.com", "--sitemap", "https://example.com/sitemap.xml"},
+ expectedUseSitemap: false,
+ expectedSitemapOnly: false,
+ expectedSitemapURL: "https://example.com/sitemap.xml",
+ },
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ rootCmd := NewCollectWebsiteCmd()
+ _, err := executeCommand(rootCmd, tc.args...)
+ if err != nil {
+ t.Fatalf("command execution failed: %v", err)
+ }
+
+ if capturedUseSitemap != tc.expectedUseSitemap {
+ t.Errorf("expected useSitemap to be %v, but got %v", tc.expectedUseSitemap, capturedUseSitemap)
+ }
+ if capturedSitemapOnly != tc.expectedSitemapOnly {
+ t.Errorf("expected sitemapOnly to be %v, but got %v", tc.expectedSitemapOnly, capturedSitemapOnly)
+ }
+ if capturedSitemapURL != tc.expectedSitemapURL {
+ t.Errorf("expected sitemapURL to be %q, but got %q", tc.expectedSitemapURL, capturedSitemapURL)
+ }
+ })
+ }
+}
+
func TestCollectWebsiteCmd_Bad(t *testing.T) {
// Mock the website downloader to return an error
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
- website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+ website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
return nil, fmt.Errorf("website error")
}
defer func() {
diff --git a/examples/collect_website/main.go b/examples/collect_website/main.go
index 2e2f606..bdd96b9 100644
--- a/examples/collect_website/main.go
+++ b/examples/collect_website/main.go
@@ -11,7 +11,7 @@ func main() {
log.Println("Collecting website...")
// Download and package the website.
- dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil)
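+ // The new useSitemap, sitemapOnly, and sitemapURL arguments are zeroed here
+ // to keep the original crawl-only behaviour.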
+ dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, false, false, "", nil)
if err != nil {
log.Fatalf("Failed to collect website: %v", err)
}
diff --git a/pkg/website/sitemap.go b/pkg/website/sitemap.go
new file mode 100644
index 0000000..d59efc8
--- /dev/null
+++ b/pkg/website/sitemap.go
@@ -0,0 +1,116 @@
+package website
+
+import (
+ "compress/gzip"
+ "encoding/xml"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "strings"
+)
+
+// SitemapURL represents a single URL entry in a sitemap.
+type SitemapURL struct {
+ Loc string `xml:"loc"`
+}
+
+// URLSet represents a standard sitemap.
+type URLSet struct {
+ XMLName xml.Name `xml:"urlset"`
+ URLs []SitemapURL `xml:"url"`
+}
+
+// SitemapIndex represents a sitemap index file.
+type SitemapIndex struct {
+ XMLName xml.Name `xml:"sitemapindex"`
+ Sitemaps []SitemapURL `xml:"sitemap"`
+}
+
+// FetchAndParseSitemap fetches and parses a sitemap from the given URL.
+// It handles standard sitemaps, sitemap indexes, and gzipped sitemaps.
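+//
+// A minimal usage sketch (http.DefaultClient is illustrative; any *http.Client works):
+//
+//	urls, err := FetchAndParseSitemap("https://example.com/sitemap.xml", http.DefaultClient)
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	for _, loc := range urls {
+//		fmt.Println(loc)
+//	}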
+func FetchAndParseSitemap(sitemapURL string, client *http.Client) ([]string, error) {
+ resp, err := client.Get(sitemapURL)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching sitemap: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("bad status for sitemap: %s", resp.Status)
+ }
+
+ var reader io.Reader = resp.Body
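+ // Gzipped sitemaps are detected by the URL's .gz suffix and decompressed transparently.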
+ if strings.HasSuffix(sitemapURL, ".gz") {
+ gzReader, err := gzip.NewReader(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error creating gzip reader: %w", err)
+ }
+ defer gzReader.Close()
+ reader = gzReader
+ }
+
+ body, err := io.ReadAll(reader)
+ if err != nil {
+ return nil, fmt.Errorf("error reading sitemap body: %w", err)
+ }
+
+ // Try parsing as a sitemap index first
+ var sitemapIndex SitemapIndex
+ if err := xml.Unmarshal(body, &sitemapIndex); err == nil && len(sitemapIndex.Sitemaps) > 0 {
+ var allURLs []string
+ for _, sitemap := range sitemapIndex.Sitemaps {
+ urls, err := FetchAndParseSitemap(sitemap.Loc, client)
+ if err != nil {
+ // A failure in one child sitemap aborts the whole parse; a more lenient
+ // collector could log and skip it instead.
+ return nil, fmt.Errorf("error parsing sitemap from index '%s': %w", sitemap.Loc, err)
+ }
+ allURLs = append(allURLs, urls...)
+ }
+ return allURLs, nil
+ }
+
+ // If not a sitemap index, try parsing as a standard sitemap
+ var urlSet URLSet
+ if err := xml.Unmarshal(body, &urlSet); err == nil {
+ var urls []string
+ for _, u := range urlSet.URLs {
+ urls = append(urls, u.Loc)
+ }
+ return urls, nil
+ }
+
+ return nil, fmt.Errorf("failed to parse sitemap XML from %s", sitemapURL)
+}
+
+// DiscoverSitemap attempts to discover the sitemap URL for a given base URL.
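+//
+// A usage sketch; the discovered URL can be passed to FetchAndParseSitemap:
+//
+//	sitemapURL, err := DiscoverSitemap("https://example.com", http.DefaultClient)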
+func DiscoverSitemap(baseURL string, client *http.Client) (string, error) {
+ u, err := url.Parse(baseURL)
+ if err != nil {
+ return "", err
+ }
+ // Make sure we're at the root of the domain
+ u.Path = ""
+ u.RawQuery = ""
+ u.Fragment = ""
+
+ sitemapPaths := []string{"sitemap.xml", "sitemap_index.xml", "sitemap.xml.gz", "sitemap_index.xml.gz"}
+
+ for _, path := range sitemapPaths {
+ sitemapURL := u.String() + "/" + path
+ resp, err := client.Head(sitemapURL)
+ if err != nil {
+ continue
+ }
+ // Always drain and close the body, even for a HEAD request, so the
+ // underlying connection can be reused.
+ io.Copy(io.Discard, resp.Body)
+ resp.Body.Close()
+ if resp.StatusCode == http.StatusOK {
+ return sitemapURL, nil
+ }
+ }
+
+ return "", fmt.Errorf("sitemap not found for %s", baseURL)
+}
diff --git a/pkg/website/sitemap_test.go b/pkg/website/sitemap_test.go
new file mode 100644
index 0000000..a9f5145
--- /dev/null
+++ b/pkg/website/sitemap_test.go
@@ -0,0 +1,118 @@
+package website
+
+import (
+ "compress/gzip"
+ "fmt"
+ "net/http"
+ "net/http/httptest"
+ "reflect"
+ "testing"
+)
+
+func TestFetchAndParseSitemap_Good(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ switch r.URL.Path {
+ case "/sitemap.xml":
+ fmt.Fprintln(w, `
+