feat: Sitemap.xml parsing for website collection

This feature adds sitemap.xml parsing to the `borg collect website` command.

It introduces three new flags:
- `--use-sitemap`: Auto-detects and uses the sitemap in combination with crawling.
- `--sitemap-only`: Collects only the URLs found in the sitemap.
- `--sitemap`: Specifies an explicit URL for the sitemap.
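
Example invocations (mirroring the new CLI tests; https://example.com is a placeholder):

    borg collect website https://example.com --use-sitemap
    borg collect website https://example.com --sitemap-only
    borg collect website https://example.com --sitemap https://example.com/sitemap.xml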

The implementation supports standard sitemaps, sitemap indexes, and compressed sitemaps (.xml.gz).
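
For programmatic use, a minimal sketch based on the function signatures introduced in this diff (the URL, the depth of 2, and the nil progress bar are illustrative placeholders):

package main

import (
	"log"
	"net/http"

	"github.com/Snider/Borg/pkg/website"
)

func main() {
	// Try the well-known sitemap locations (sitemap.xml, sitemap_index.xml, ...).
	sitemapURL, err := website.DiscoverSitemap("https://example.com", http.DefaultClient)
	if err != nil {
		log.Fatalf("no sitemap found: %v", err)
	}

	// Fetch and flatten the sitemap; indexes and .xml.gz are handled transparently.
	urls, err := website.FetchAndParseSitemap(sitemapURL, http.DefaultClient)
	if err != nil {
		log.Fatalf("failed to parse sitemap: %v", err)
	}
	log.Printf("sitemap lists %d URLs", len(urls))

	// Collect only the sitemap URLs into a DataNode (useSitemap=false, sitemapOnly=true).
	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, false, true, sitemapURL, nil)
	if err != nil {
		log.Fatalf("collection failed: %v", err)
	}
	_ = dn
}

Passing sitemapOnly=true skips recursive crawling, while useSitemap=true merges the sitemap URLs with a normal crawl from the start URL.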

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
Author: google-labs-jules[bot]
Date: 2026-02-02 00:48:52 +00:00
Commit: b36990cdec (parent: cf2af53ed3)
7 changed files with 438 additions and 34 deletions


@@ -38,6 +38,9 @@ func NewCollectWebsiteCmd() *cobra.Command {
format, _ := cmd.Flags().GetString("format")
compression, _ := cmd.Flags().GetString("compression")
password, _ := cmd.Flags().GetString("password")
useSitemap, _ := cmd.Flags().GetBool("use-sitemap")
sitemapOnly, _ := cmd.Flags().GetBool("sitemap-only")
sitemapURL, _ := cmd.Flags().GetString("sitemap")
if format != "datanode" && format != "tim" && format != "trix" {
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@@ -51,7 +54,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
bar = ui.NewProgressBar(-1, "Crawling website")
}
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, useSitemap, sitemapOnly, sitemapURL, bar)
if err != nil {
return fmt.Errorf("error downloading and packaging website: %w", err)
}
@@ -104,5 +107,8 @@ func NewCollectWebsiteCmd() *cobra.Command {
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
collectWebsiteCmd.Flags().Bool("use-sitemap", false, "Auto-detect and use sitemap")
collectWebsiteCmd.Flags().Bool("sitemap-only", false, "Collect only sitemap URLs (no crawling)")
collectWebsiteCmd.Flags().String("sitemap", "", "Explicit sitemap URL")
return collectWebsiteCmd
}


@@ -14,7 +14,7 @@ import (
func TestCollectWebsiteCmd_Good(t *testing.T) {
// Mock the website downloader
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
return datanode.New(), nil
}
defer func() {
@@ -32,10 +32,76 @@ func TestCollectWebsiteCmd_Good(t *testing.T) {
}
}
func TestCollectWebsiteCmd_Sitemap_Good(t *testing.T) {
var capturedUseSitemap, capturedSitemapOnly bool
var capturedSitemapURL string
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
capturedUseSitemap = useSitemap
capturedSitemapOnly = sitemapOnly
capturedSitemapURL = sitemapURL
return datanode.New(), nil
}
defer func() {
website.DownloadAndPackageWebsite = oldDownloadAndPackageWebsite
}()
testCases := []struct {
name string
args []string
expectedUseSitemap bool
expectedSitemapOnly bool
expectedSitemapURL string
}{
{
name: "use-sitemap flag",
args: []string{"https://example.com", "--use-sitemap"},
expectedUseSitemap: true,
expectedSitemapOnly: false,
expectedSitemapURL: "",
},
{
name: "sitemap-only flag",
args: []string{"https://example.com", "--sitemap-only"},
expectedUseSitemap: false,
expectedSitemapOnly: true,
expectedSitemapURL: "",
},
{
name: "sitemap flag",
args: []string{"https://example.com", "--sitemap", "https://example.com/sitemap.xml"},
expectedUseSitemap: false,
expectedSitemapOnly: false,
expectedSitemapURL: "https://example.com/sitemap.xml",
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
rootCmd := NewCollectWebsiteCmd()
_, err := executeCommand(rootCmd, tc.args...)
if err != nil {
t.Fatalf("command execution failed: %v", err)
}
if capturedUseSitemap != tc.expectedUseSitemap {
t.Errorf("expected useSitemap to be %v, but got %v", tc.expectedUseSitemap, capturedUseSitemap)
}
if capturedSitemapOnly != tc.expectedSitemapOnly {
t.Errorf("expected sitemapOnly to be %v, but got %v", tc.expectedSitemapOnly, capturedSitemapOnly)
}
if capturedSitemapURL != tc.expectedSitemapURL {
t.Errorf("expected sitemapURL to be %q, but got %q", tc.expectedSitemapURL, capturedSitemapURL)
}
})
}
}
func TestCollectWebsiteCmd_Bad(t *testing.T) {
// Mock the website downloader to return an error
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
return nil, fmt.Errorf("website error")
}
defer func() {


@@ -11,7 +11,7 @@ func main() {
log.Println("Collecting website...")
// Download and package the website.
dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil)
dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, false, false, "", nil)
if err != nil {
log.Fatalf("Failed to collect website: %v", err)
}

pkg/website/sitemap.go (new file, 116 lines added)

@@ -0,0 +1,116 @@
package website
import (
"compress/gzip"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"strings"
)
// SitemapURL represents a single URL entry in a sitemap.
type SitemapURL struct {
Loc string `xml:"loc"`
}
// URLSet represents a standard sitemap.
type URLSet struct {
XMLName xml.Name `xml:"urlset"`
URLs []SitemapURL `xml:"url"`
}
// SitemapIndex represents a sitemap index file.
type SitemapIndex struct {
XMLName xml.Name `xml:"sitemapindex"`
Sitemaps []SitemapURL `xml:"sitemap"`
}
// FetchAndParseSitemap fetches and parses a sitemap from the given URL.
// It handles standard sitemaps, sitemap indexes, and gzipped sitemaps.
func FetchAndParseSitemap(sitemapURL string, client *http.Client) ([]string, error) {
resp, err := client.Get(sitemapURL)
if err != nil {
return nil, fmt.Errorf("error fetching sitemap: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("bad status for sitemap: %s", resp.Status)
}
var reader io.Reader = resp.Body
if strings.HasSuffix(sitemapURL, ".gz") {
gzReader, err := gzip.NewReader(resp.Body)
if err != nil {
return nil, fmt.Errorf("error creating gzip reader: %w", err)
}
defer gzReader.Close()
reader = gzReader
}
body, err := io.ReadAll(reader)
if err != nil {
return nil, fmt.Errorf("error reading sitemap body: %w", err)
}
// Try parsing as a sitemap index first
var sitemapIndex SitemapIndex
if err := xml.Unmarshal(body, &sitemapIndex); err == nil && len(sitemapIndex.Sitemaps) > 0 {
var allURLs []string
for _, sitemap := range sitemapIndex.Sitemaps {
urls, err := FetchAndParseSitemap(sitemap.Loc, client)
if err != nil {
// In a real-world scenario, you might want to handle this more gracefully
return nil, fmt.Errorf("error parsing sitemap from index '%s': %w", sitemap.Loc, err)
}
allURLs = append(allURLs, urls...)
}
return allURLs, nil
}
// If not a sitemap index, try parsing as a standard sitemap
var urlSet URLSet
if err := xml.Unmarshal(body, &urlSet); err == nil {
var urls []string
for _, u := range urlSet.URLs {
urls = append(urls, u.Loc)
}
return urls, nil
}
return nil, fmt.Errorf("failed to parse sitemap XML from %s", sitemapURL)
}
// DiscoverSitemap attempts to discover the sitemap URL for a given base URL.
func DiscoverSitemap(baseURL string, client *http.Client) (string, error) {
u, err := url.Parse(baseURL)
if err != nil {
return "", err
}
// Make sure we're at the root of the domain
u.Path = ""
u.RawQuery = ""
u.Fragment = ""
sitemapPaths := []string{"sitemap.xml", "sitemap_index.xml", "sitemap.xml.gz", "sitemap_index.xml.gz"}
for _, path := range sitemapPaths {
sitemapURL := u.String() + "/" + path
resp, err := client.Head(sitemapURL)
if err == nil && resp.StatusCode == http.StatusOK {
// Ensure we close the body, even for a HEAD request
io.Copy(io.Discard, resp.Body)
resp.Body.Close()
return sitemapURL, nil
}
if resp != nil {
io.Copy(io.Discard, resp.Body)
resp.Body.Close()
}
}
return "", fmt.Errorf("sitemap not found for %s", baseURL)
}

pkg/website/sitemap_test.go (new file, 118 lines added)

@@ -0,0 +1,118 @@
package website
import (
"compress/gzip"
"fmt"
"net/http"
"net/http/httptest"
"reflect"
"testing"
)
func TestFetchAndParseSitemap_Good(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/sitemap.xml":
fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/</loc>
</url>
<url>
<loc>http://www.example.com/page1</loc>
</url>
</urlset>`)
case "/sitemap_index.xml":
fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>http://`+r.Host+`/sitemap1.xml</loc>
</sitemap>
<sitemap>
<loc>http://`+r.Host+`/sitemap2.xml</loc>
</sitemap>
</sitemapindex>`)
case "/sitemap1.xml":
fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/page2</loc>
</url>
</urlset>`)
case "/sitemap2.xml":
fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/page3</loc>
</url>
</urlset>`)
case "/sitemap.xml.gz":
w.Header().Set("Content-Type", "application/gzip")
gz := gzip.NewWriter(w)
defer gz.Close()
gz.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/gzipped</loc>
</url>
</urlset>`))
default:
http.NotFound(w, r)
}
}))
defer server.Close()
testCases := []struct {
name string
url string
expected []string
}{
{"Standard Sitemap", server.URL + "/sitemap.xml", []string{"http://www.example.com/", "http://www.example.com/page1"}},
{"Sitemap Index", server.URL + "/sitemap_index.xml", []string{"http://www.example.com/page2", "http://www.example.com/page3"}},
{"Gzipped Sitemap", server.URL + "/sitemap.xml.gz", []string{"http://www.example.com/gzipped"}},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
urls, err := FetchAndParseSitemap(tc.url, server.Client())
if err != nil {
t.Fatalf("FetchAndParseSitemap() error = %v", err)
}
if !reflect.DeepEqual(urls, tc.expected) {
t.Errorf("FetchAndParseSitemap() = %v, want %v", urls, tc.expected)
}
})
}
}
func TestDiscoverSitemap_Good(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/sitemap.xml" {
w.WriteHeader(http.StatusOK)
} else {
http.NotFound(w, r)
}
}))
defer server.Close()
sitemapURL, err := DiscoverSitemap(server.URL, server.Client())
if err != nil {
t.Fatalf("DiscoverSitemap() error = %v", err)
}
expected := server.URL + "/sitemap.xml"
if sitemapURL != expected {
t.Errorf("DiscoverSitemap() = %v, want %v", sitemapURL, expected)
}
}
func TestDiscoverSitemap_Bad(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.NotFound(w, r)
}))
defer server.Close()
_, err := DiscoverSitemap(server.URL, server.Client())
if err == nil {
t.Error("DiscoverSitemap() error = nil, want error")
}
}


@@ -5,6 +5,7 @@ import (
"io"
"net/http"
"net/url"
"os"
"strings"
"github.com/Snider/Borg/pkg/datanode"
@@ -15,6 +16,66 @@ import (
var DownloadAndPackageWebsite = downloadAndPackageWebsite
func downloadAndPackageWebsite(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
baseURL, err := url.Parse(startURL)
if err != nil {
return nil, err
}
d := NewDownloader(maxDepth)
d.baseURL = baseURL
d.progressBar = bar
d.sitemapOnly = sitemapOnly
var sitemapURLs []string
var sitemapErr error
// Determine which sitemap URL to use
actualSitemapURL := sitemapURL
if actualSitemapURL == "" && (useSitemap || sitemapOnly) {
actualSitemapURL, sitemapErr = DiscoverSitemap(startURL, d.client)
if sitemapErr != nil {
if sitemapOnly {
return nil, fmt.Errorf("sitemap discovery failed for %s: %w", startURL, sitemapErr)
}
// For useSitemap, we can warn and proceed with crawling
fmt.Fprintf(os.Stderr, "Warning: sitemap discovery failed for %s: %v. Proceeding with crawl.\n", startURL, sitemapErr)
}
}
if actualSitemapURL != "" {
sitemapURLs, sitemapErr = FetchAndParseSitemap(actualSitemapURL, d.client)
if sitemapErr != nil {
if sitemapOnly {
return nil, fmt.Errorf("sitemap parsing failed for %s: %w", actualSitemapURL, sitemapErr)
}
fmt.Fprintf(os.Stderr, "Warning: sitemap parsing failed for %s: %v. Proceeding with crawl.\n", actualSitemapURL, sitemapErr)
}
}
// Process URLs from sitemap
if len(sitemapURLs) > 0 {
for _, u := range sitemapURLs {
d.crawl(u, 0)
}
}
// Crawl from start URL if not in sitemap-only mode
if !sitemapOnly {
d.crawl(startURL, 0)
}
if len(d.errors) > 0 {
var errs []string
for _, e := range d.errors {
errs = append(errs, e.Error())
}
return nil, fmt.Errorf("failed to download website:\n%s", strings.Join(errs, "\n"))
}
return d.dn, nil
}
// Downloader is a recursive website downloader.
type Downloader struct {
baseURL *url.URL
@@ -22,6 +83,7 @@ type Downloader struct {
visited map[string]bool
maxDepth int
progressBar *progressbar.ProgressBar
sitemapOnly bool
client *http.Client
errors []error
}
@@ -37,33 +99,12 @@ func NewDownloaderWithClient(maxDepth int, client *http.Client) *Downloader {
dn: datanode.New(),
visited: make(map[string]bool),
maxDepth: maxDepth,
sitemapOnly: false,
client: client,
errors: make([]error, 0),
}
}
// downloadAndPackageWebsite downloads a website and packages it into a DataNode.
func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
baseURL, err := url.Parse(startURL)
if err != nil {
return nil, err
}
d := NewDownloader(maxDepth)
d.baseURL = baseURL
d.progressBar = bar
d.crawl(startURL, 0)
if len(d.errors) > 0 {
var errs []string
for _, e := range d.errors {
errs = append(errs, e.Error())
}
return nil, fmt.Errorf("failed to download website:\n%s", strings.Join(errs, "\n"))
}
return d.dn, nil
}
func (d *Downloader) crawl(pageURL string, depth int) {
if depth > d.maxDepth || d.visited[pageURL] {
@@ -118,7 +159,7 @@ func (d *Downloader) crawl(pageURL string, depth int) {
if d.isLocal(link) {
if isAsset(link) {
d.downloadAsset(link)
} else {
} else if !d.sitemapOnly {
d.crawl(link, depth+1)
}
}


@@ -20,7 +20,7 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
defer server.Close()
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar)
dn, err := DownloadAndPackageWebsite(server.URL, 2, false, false, "", bar)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
@@ -52,7 +52,7 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
t.Run("Invalid Start URL", func(t *testing.T) {
_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil)
_, err := DownloadAndPackageWebsite("http://invalid-url", 1, false, false, "", nil)
if err == nil {
t.Fatal("Expected an error for an invalid start URL, but got nil")
}
@@ -63,7 +63,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
}))
defer server.Close()
_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
_, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", nil)
if err == nil {
t.Fatal("Expected an error for a server error on the start URL, but got nil")
}
@@ -80,7 +80,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
}))
defer server.Close()
// We expect an error because the link is broken.
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
dn, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", nil)
if err == nil {
t.Fatal("Expected an error for a broken link, but got nil")
}
@@ -99,7 +99,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
defer server.Close()
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
dn, err := DownloadAndPackageWebsite(server.URL, 1, bar) // Max depth of 1
dn, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", bar) // Max depth of 1
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
@@ -122,7 +122,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
fmt.Fprint(w, `<a href="http://externalsite.com/page.html">External</a>`)
}))
defer server.Close()
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
dn, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", nil)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
@@ -156,7 +156,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
// For now, we'll just test that it doesn't hang forever.
done := make(chan bool)
go func() {
_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
_, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", nil)
if err != nil && !strings.Contains(err.Error(), "context deadline exceeded") {
// We expect a timeout error, but other errors are failures.
t.Errorf("unexpected error: %v", err)
@@ -172,6 +172,63 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
})
}
func TestDownloadAndPackageWebsite_Sitemap(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/sitemap.xml":
fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://`+r.Host+`/page1</loc>
</url>
</urlset>`)
case "/":
w.Header().Set("Content-Type", "text/html")
fmt.Fprint(w, `
<!DOCTYPE html>
<html><body>
<a href="/page2">Page 2</a>
</body></html>
`)
case "/page1":
w.Header().Set("Content-Type", "text/html")
fmt.Fprint(w, `<html><body><h1>Page 1 from Sitemap</h1></body></html>`)
case "/page2":
w.Header().Set("Content-Type", "text/html")
fmt.Fprint(w, `<html><body><h1>Page 2 from Crawl</h1></body></html>`)
default:
http.NotFound(w, r)
}
}))
defer server.Close()
t.Run("sitemap-only", func(t *testing.T) {
dn, err := DownloadAndPackageWebsite(server.URL, 2, false, true, server.URL+"/sitemap.xml", nil)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite with sitemap-only failed: %v", err)
}
if exists, _ := dn.Exists("page1"); !exists {
t.Error("Expected to find /page1 from sitemap, but it was not found")
}
if exists, _ := dn.Exists("page2"); exists {
t.Error("Did not expect to find /page2 from crawl, but it was found")
}
})
t.Run("use-sitemap", func(t *testing.T) {
dn, err := DownloadAndPackageWebsite(server.URL, 2, true, false, server.URL+"/sitemap.xml", nil)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite with use-sitemap failed: %v", err)
}
if exists, _ := dn.Exists("page1"); !exists {
t.Error("Expected to find /page1 from sitemap, but it was not found")
}
if exists, _ := dn.Exists("page2"); !exists {
t.Error("Expected to find /page2 from crawl, but it was not found")
}
})
}
// --- Helpers ---
func newWebsiteTestServer() *httptest.Server {