This feature adds sitemap.xml parsing to the `borg collect website` command. It introduces three new flags: - `--use-sitemap`: Auto-detects and uses the sitemap in combination with crawling. - `--sitemap-only`: Collects only the URLs found in the sitemap. - `--sitemap`: Specifies an explicit URL for the sitemap. The implementation supports standard sitemaps, sitemap indexes, and compressed sitemaps (.xml.gz). Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
116 lines
3.1 KiB
Go
116 lines
3.1 KiB
Go
package website
|
|
|
|
import (
	"bufio"
	"compress/gzip"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
)
|
|
|
|
// SitemapURL represents a single URL entry in a sitemap.
|
|
type SitemapURL struct {
|
|
Loc string `xml:"loc"`
|
|
}
|
|
|
|
// URLSet represents a standard sitemap.
|
|
type URLSet struct {
|
|
XMLName xml.Name `xml:"urlset"`
|
|
URLs []SitemapURL `xml:"url"`
|
|
}
|
|
|
|
// SitemapIndex represents a sitemap index file.
|
|
type SitemapIndex struct {
|
|
XMLName xml.Name `xml:"sitemapindex"`
|
|
Sitemaps []SitemapURL `xml:"sitemap"`
|
|
}
|
|
|
|
// FetchAndParseSitemap fetches and parses a sitemap from the given URL.
|
|
// It handles standard sitemaps, sitemap indexes, and gzipped sitemaps.
|
|
func FetchAndParseSitemap(sitemapURL string, client *http.Client) ([]string, error) {
|
|
resp, err := client.Get(sitemapURL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error fetching sitemap: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("bad status for sitemap: %s", resp.Status)
|
|
}
|
|
|
|
var reader io.Reader = resp.Body
|
|
if strings.HasSuffix(sitemapURL, ".gz") {
|
|
gzReader, err := gzip.NewReader(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error creating gzip reader: %w", err)
|
|
}
|
|
defer gzReader.Close()
|
|
reader = gzReader
|
|
}
|
|
|
|
body, err := io.ReadAll(reader)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error reading sitemap body: %w", err)
|
|
}
|
|
|
|
// Try parsing as a sitemap index first
|
|
var sitemapIndex SitemapIndex
|
|
if err := xml.Unmarshal(body, &sitemapIndex); err == nil && len(sitemapIndex.Sitemaps) > 0 {
|
|
var allURLs []string
|
|
for _, sitemap := range sitemapIndex.Sitemaps {
|
|
urls, err := FetchAndParseSitemap(sitemap.Loc, client)
|
|
if err != nil {
|
|
// In a real-world scenario, you might want to handle this more gracefully
|
|
return nil, fmt.Errorf("error parsing sitemap from index '%s': %w", sitemap.Loc, err)
|
|
}
|
|
allURLs = append(allURLs, urls...)
|
|
}
|
|
return allURLs, nil
|
|
}
|
|
|
|
// If not a sitemap index, try parsing as a standard sitemap
|
|
var urlSet URLSet
|
|
if err := xml.Unmarshal(body, &urlSet); err == nil {
|
|
var urls []string
|
|
for _, u := range urlSet.URLs {
|
|
urls = append(urls, u.Loc)
|
|
}
|
|
return urls, nil
|
|
}
|
|
|
|
return nil, fmt.Errorf("failed to parse sitemap XML from %s", sitemapURL)
|
|
}
|
|
|
|
// DiscoverSitemap attempts to discover the sitemap URL for a given base URL.
|
|
func DiscoverSitemap(baseURL string, client *http.Client) (string, error) {
|
|
u, err := url.Parse(baseURL)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
// Make sure we're at the root of the domain
|
|
u.Path = ""
|
|
u.RawQuery = ""
|
|
u.Fragment = ""
|
|
|
|
|
|
sitemapPaths := []string{"sitemap.xml", "sitemap_index.xml", "sitemap.xml.gz", "sitemap_index.xml.gz"}
|
|
|
|
for _, path := range sitemapPaths {
|
|
sitemapURL := u.String() + "/" + path
|
|
resp, err := client.Head(sitemapURL)
|
|
if err == nil && resp.StatusCode == http.StatusOK {
|
|
// Ensure we close the body, even for a HEAD request
|
|
io.Copy(io.Discard, resp.Body)
|
|
resp.Body.Close()
|
|
return sitemapURL, nil
|
|
}
|
|
if resp != nil {
|
|
io.Copy(io.Discard, resp.Body)
|
|
resp.Body.Close()
|
|
}
|
|
}
|
|
|
|
return "", fmt.Errorf("sitemap not found for %s", baseURL)
|
|
}
|