feat: Sitemap.xml parsing for website collection
This feature adds sitemap.xml parsing to the `borg collect website` command. It introduces three new flags:

- `--use-sitemap`: Auto-detects and uses the sitemap in combination with crawling.
- `--sitemap-only`: Collects only the URLs found in the sitemap.
- `--sitemap`: Specifies an explicit URL for the sitemap.

The implementation supports standard sitemaps, sitemap indexes, and compressed sitemaps (.xml.gz).

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
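Example invocations (a sketch of the intended usage; `https://example.com` is a placeholder, the command and flag names are the ones added by this change):

```sh
# Crawl as usual, but also seed the crawl with URLs from an auto-discovered sitemap
borg collect website https://example.com --use-sitemap

# Collect only the URLs listed in the sitemap, with no recursive crawling
borg collect website https://example.com --sitemap-only

# Point at an explicit sitemap location (gzipped .xml.gz sitemaps are handled too)
borg collect website https://example.com --sitemap https://example.com/sitemap.xml
```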
parent cf2af53ed3
commit b36990cdec
7 changed files with 438 additions and 34 deletions
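The new helpers in `pkg/website` are exported and can also be called directly. A minimal sketch (the module import path is assumed from the existing `github.com/Snider/Borg/pkg/datanode` import seen in this diff; `example.com` is a placeholder):

```go
package main

import (
	"fmt"
	"log"
	"net/http"

	// Import path assumed from the repo's existing pkg/datanode import.
	"github.com/Snider/Borg/pkg/website"
)

func main() {
	client := &http.Client{}

	// Try the common root locations (sitemap.xml, sitemap_index.xml, ...).
	sitemapURL, err := website.DiscoverSitemap("https://example.com", client)
	if err != nil {
		log.Fatalf("no sitemap found: %v", err)
	}

	// Fetch and flatten it; sitemap indexes and .xml.gz files are handled transparently.
	urls, err := website.FetchAndParseSitemap(sitemapURL, client)
	if err != nil {
		log.Fatalf("could not parse sitemap: %v", err)
	}
	for _, u := range urls {
		fmt.Println(u)
	}
}
```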
@@ -38,6 +38,9 @@ func NewCollectWebsiteCmd() *cobra.Command {
 			format, _ := cmd.Flags().GetString("format")
 			compression, _ := cmd.Flags().GetString("compression")
 			password, _ := cmd.Flags().GetString("password")
+			useSitemap, _ := cmd.Flags().GetBool("use-sitemap")
+			sitemapOnly, _ := cmd.Flags().GetBool("sitemap-only")
+			sitemapURL, _ := cmd.Flags().GetString("sitemap")

 			if format != "datanode" && format != "tim" && format != "trix" {
 				return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@@ -51,7 +54,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
 				bar = ui.NewProgressBar(-1, "Crawling website")
 			}

-			dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
+			dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, useSitemap, sitemapOnly, sitemapURL, bar)
 			if err != nil {
 				return fmt.Errorf("error downloading and packaging website: %w", err)
 			}
@@ -104,5 +107,8 @@ func NewCollectWebsiteCmd() *cobra.Command {
 	collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
 	collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
 	collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
+	collectWebsiteCmd.Flags().Bool("use-sitemap", false, "Auto-detect and use sitemap")
+	collectWebsiteCmd.Flags().Bool("sitemap-only", false, "Collect only sitemap URLs (no crawling)")
+	collectWebsiteCmd.Flags().String("sitemap", "", "Explicit sitemap URL")
 	return collectWebsiteCmd
 }
@@ -14,7 +14,7 @@ import (
 func TestCollectWebsiteCmd_Good(t *testing.T) {
 	// Mock the website downloader
 	oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
-	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
 		return datanode.New(), nil
 	}
 	defer func() {
@@ -32,10 +32,76 @@ func TestCollectWebsiteCmd_Good(t *testing.T) {
 	}
 }

+func TestCollectWebsiteCmd_Sitemap_Good(t *testing.T) {
+	var capturedUseSitemap, capturedSitemapOnly bool
+	var capturedSitemapURL string
+
+	oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
+	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+		capturedUseSitemap = useSitemap
+		capturedSitemapOnly = sitemapOnly
+		capturedSitemapURL = sitemapURL
+		return datanode.New(), nil
+	}
+	defer func() {
+		website.DownloadAndPackageWebsite = oldDownloadAndPackageWebsite
+	}()
+
+	testCases := []struct {
+		name                string
+		args                []string
+		expectedUseSitemap  bool
+		expectedSitemapOnly bool
+		expectedSitemapURL  string
+	}{
+		{
+			name:                "use-sitemap flag",
+			args:                []string{"https://example.com", "--use-sitemap"},
+			expectedUseSitemap:  true,
+			expectedSitemapOnly: false,
+			expectedSitemapURL:  "",
+		},
+		{
+			name:                "sitemap-only flag",
+			args:                []string{"https://example.com", "--sitemap-only"},
+			expectedUseSitemap:  false,
+			expectedSitemapOnly: true,
+			expectedSitemapURL:  "",
+		},
+		{
+			name:                "sitemap flag",
+			args:                []string{"https://example.com", "--sitemap", "https://example.com/sitemap.xml"},
+			expectedUseSitemap:  false,
+			expectedSitemapOnly: false,
+			expectedSitemapURL:  "https://example.com/sitemap.xml",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			rootCmd := NewCollectWebsiteCmd()
+			_, err := executeCommand(rootCmd, tc.args...)
+			if err != nil {
+				t.Fatalf("command execution failed: %v", err)
+			}
+
+			if capturedUseSitemap != tc.expectedUseSitemap {
+				t.Errorf("expected useSitemap to be %v, but got %v", tc.expectedUseSitemap, capturedUseSitemap)
+			}
+			if capturedSitemapOnly != tc.expectedSitemapOnly {
+				t.Errorf("expected sitemapOnly to be %v, but got %v", tc.expectedSitemapOnly, capturedSitemapOnly)
+			}
+			if capturedSitemapURL != tc.expectedSitemapURL {
+				t.Errorf("expected sitemapURL to be %q, but got %q", tc.expectedSitemapURL, capturedSitemapURL)
+			}
+		})
+	}
+}
+
 func TestCollectWebsiteCmd_Bad(t *testing.T) {
 	// Mock the website downloader to return an error
 	oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
-	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
 		return nil, fmt.Errorf("website error")
 	}
 	defer func() {
@@ -11,7 +11,7 @@ func main() {
 	log.Println("Collecting website...")

 	// Download and package the website.
-	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil)
+	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, false, false, "", nil)
 	if err != nil {
 		log.Fatalf("Failed to collect website: %v", err)
 	}
pkg/website/sitemap.go (new file, 116 lines)
@@ -0,0 +1,116 @@
package website

import (
	"compress/gzip"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
)

// SitemapURL represents a single URL entry in a sitemap.
type SitemapURL struct {
	Loc string `xml:"loc"`
}

// URLSet represents a standard sitemap.
type URLSet struct {
	XMLName xml.Name     `xml:"urlset"`
	URLs    []SitemapURL `xml:"url"`
}

// SitemapIndex represents a sitemap index file.
type SitemapIndex struct {
	XMLName  xml.Name     `xml:"sitemapindex"`
	Sitemaps []SitemapURL `xml:"sitemap"`
}

// FetchAndParseSitemap fetches and parses a sitemap from the given URL.
// It handles standard sitemaps, sitemap indexes, and gzipped sitemaps.
func FetchAndParseSitemap(sitemapURL string, client *http.Client) ([]string, error) {
	resp, err := client.Get(sitemapURL)
	if err != nil {
		return nil, fmt.Errorf("error fetching sitemap: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("bad status for sitemap: %s", resp.Status)
	}

	var reader io.Reader = resp.Body
	if strings.HasSuffix(sitemapURL, ".gz") {
		gzReader, err := gzip.NewReader(resp.Body)
		if err != nil {
			return nil, fmt.Errorf("error creating gzip reader: %w", err)
		}
		defer gzReader.Close()
		reader = gzReader
	}

	body, err := io.ReadAll(reader)
	if err != nil {
		return nil, fmt.Errorf("error reading sitemap body: %w", err)
	}

	// Try parsing as a sitemap index first
	var sitemapIndex SitemapIndex
	if err := xml.Unmarshal(body, &sitemapIndex); err == nil && len(sitemapIndex.Sitemaps) > 0 {
		var allURLs []string
		for _, sitemap := range sitemapIndex.Sitemaps {
			urls, err := FetchAndParseSitemap(sitemap.Loc, client)
			if err != nil {
				// In a real-world scenario, you might want to handle this more gracefully
				return nil, fmt.Errorf("error parsing sitemap from index '%s': %w", sitemap.Loc, err)
			}
			allURLs = append(allURLs, urls...)
		}
		return allURLs, nil
	}

	// If not a sitemap index, try parsing as a standard sitemap
	var urlSet URLSet
	if err := xml.Unmarshal(body, &urlSet); err == nil {
		var urls []string
		for _, u := range urlSet.URLs {
			urls = append(urls, u.Loc)
		}
		return urls, nil
	}

	return nil, fmt.Errorf("failed to parse sitemap XML from %s", sitemapURL)
}

// DiscoverSitemap attempts to discover the sitemap URL for a given base URL.
func DiscoverSitemap(baseURL string, client *http.Client) (string, error) {
	u, err := url.Parse(baseURL)
	if err != nil {
		return "", err
	}
	// Make sure we're at the root of the domain
	u.Path = ""
	u.RawQuery = ""
	u.Fragment = ""

	sitemapPaths := []string{"sitemap.xml", "sitemap_index.xml", "sitemap.xml.gz", "sitemap_index.xml.gz"}

	for _, path := range sitemapPaths {
		sitemapURL := u.String() + "/" + path
		resp, err := client.Head(sitemapURL)
		if err == nil && resp.StatusCode == http.StatusOK {
			// Ensure we close the body, even for a HEAD request
			io.Copy(io.Discard, resp.Body)
			resp.Body.Close()
			return sitemapURL, nil
		}
		if resp != nil {
			io.Copy(io.Discard, resp.Body)
			resp.Body.Close()
		}
	}

	return "", fmt.Errorf("sitemap not found for %s", baseURL)
}
pkg/website/sitemap_test.go (new file, 118 lines)
@@ -0,0 +1,118 @@
package website

import (
	"compress/gzip"
	"fmt"
	"net/http"
	"net/http/httptest"
	"reflect"
	"testing"
)

func TestFetchAndParseSitemap_Good(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/sitemap.xml":
			fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/</loc>
</url>
<url>
<loc>http://www.example.com/page1</loc>
</url>
</urlset>`)
		case "/sitemap_index.xml":
			fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>http://`+r.Host+`/sitemap1.xml</loc>
</sitemap>
<sitemap>
<loc>http://`+r.Host+`/sitemap2.xml</loc>
</sitemap>
</sitemapindex>`)
		case "/sitemap1.xml":
			fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/page2</loc>
</url>
</urlset>`)
		case "/sitemap2.xml":
			fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/page3</loc>
</url>
</urlset>`)
		case "/sitemap.xml.gz":
			w.Header().Set("Content-Type", "application/gzip")
			gz := gzip.NewWriter(w)
			defer gz.Close()
			gz.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/gzipped</loc>
</url>
</urlset>`))
		default:
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	testCases := []struct {
		name     string
		url      string
		expected []string
	}{
		{"Standard Sitemap", server.URL + "/sitemap.xml", []string{"http://www.example.com/", "http://www.example.com/page1"}},
		{"Sitemap Index", server.URL + "/sitemap_index.xml", []string{"http://www.example.com/page2", "http://www.example.com/page3"}},
		{"Gzipped Sitemap", server.URL + "/sitemap.xml.gz", []string{"http://www.example.com/gzipped"}},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			urls, err := FetchAndParseSitemap(tc.url, server.Client())
			if err != nil {
				t.Fatalf("FetchAndParseSitemap() error = %v", err)
			}
			if !reflect.DeepEqual(urls, tc.expected) {
				t.Errorf("FetchAndParseSitemap() = %v, want %v", urls, tc.expected)
			}
		})
	}
}

func TestDiscoverSitemap_Good(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/sitemap.xml" {
			w.WriteHeader(http.StatusOK)
		} else {
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	sitemapURL, err := DiscoverSitemap(server.URL, server.Client())
	if err != nil {
		t.Fatalf("DiscoverSitemap() error = %v", err)
	}
	expected := server.URL + "/sitemap.xml"
	if sitemapURL != expected {
		t.Errorf("DiscoverSitemap() = %v, want %v", sitemapURL, expected)
	}
}

func TestDiscoverSitemap_Bad(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.NotFound(w, r)
	}))
	defer server.Close()

	_, err := DiscoverSitemap(server.URL, server.Client())
	if err == nil {
		t.Error("DiscoverSitemap() error = nil, want error")
	}
}
@@ -5,6 +5,7 @@ import (
 	"io"
 	"net/http"
 	"net/url"
+	"os"
 	"strings"

 	"github.com/Snider/Borg/pkg/datanode"
@@ -15,6 +16,66 @@ import (

 var DownloadAndPackageWebsite = downloadAndPackageWebsite

+func downloadAndPackageWebsite(startURL string, maxDepth int, useSitemap, sitemapOnly bool, sitemapURL string, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+	baseURL, err := url.Parse(startURL)
+	if err != nil {
+		return nil, err
+	}
+
+	d := NewDownloader(maxDepth)
+	d.baseURL = baseURL
+	d.progressBar = bar
+	d.sitemapOnly = sitemapOnly
+
+	var sitemapURLs []string
+	var sitemapErr error
+
+	// Determine which sitemap URL to use
+	actualSitemapURL := sitemapURL
+	if actualSitemapURL == "" && (useSitemap || sitemapOnly) {
+		actualSitemapURL, sitemapErr = DiscoverSitemap(startURL, d.client)
+		if sitemapErr != nil {
+			if sitemapOnly {
+				return nil, fmt.Errorf("sitemap discovery failed for %s: %w", startURL, sitemapErr)
+			}
+			// For useSitemap, we can warn and proceed with crawling
+			fmt.Fprintf(os.Stderr, "Warning: sitemap discovery failed for %s: %v. Proceeding with crawl.\n", startURL, sitemapErr)
+		}
+	}
+
+	if actualSitemapURL != "" {
+		sitemapURLs, sitemapErr = FetchAndParseSitemap(actualSitemapURL, d.client)
+		if sitemapErr != nil {
+			if sitemapOnly {
+				return nil, fmt.Errorf("sitemap parsing failed for %s: %w", actualSitemapURL, sitemapErr)
+			}
+			fmt.Fprintf(os.Stderr, "Warning: sitemap parsing failed for %s: %v. Proceeding with crawl.\n", actualSitemapURL, sitemapErr)
+		}
+	}
+
+	// Process URLs from sitemap
+	if len(sitemapURLs) > 0 {
+		for _, u := range sitemapURLs {
+			d.crawl(u, 0)
+		}
+	}
+
+	// Crawl from start URL if not in sitemap-only mode
+	if !sitemapOnly {
+		d.crawl(startURL, 0)
+	}
+
+	if len(d.errors) > 0 {
+		var errs []string
+		for _, e := range d.errors {
+			errs = append(errs, e.Error())
+		}
+		return nil, fmt.Errorf("failed to download website:\n%s", strings.Join(errs, "\n"))
+	}
+
+	return d.dn, nil
+}
+
 // Downloader is a recursive website downloader.
 type Downloader struct {
 	baseURL *url.URL
@@ -22,6 +83,7 @@ type Downloader struct {
 	visited     map[string]bool
 	maxDepth    int
 	progressBar *progressbar.ProgressBar
+	sitemapOnly bool
 	client      *http.Client
 	errors      []error
 }
@@ -37,33 +99,12 @@ func NewDownloaderWithClient(maxDepth int, client *http.Client) *Downloader {
 		dn:          datanode.New(),
 		visited:     make(map[string]bool),
 		maxDepth:    maxDepth,
+		sitemapOnly: false,
 		client:      client,
 		errors:      make([]error, 0),
 	}
 }

-// downloadAndPackageWebsite downloads a website and packages it into a DataNode.
-func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
-	baseURL, err := url.Parse(startURL)
-	if err != nil {
-		return nil, err
-	}
-
-	d := NewDownloader(maxDepth)
-	d.baseURL = baseURL
-	d.progressBar = bar
-	d.crawl(startURL, 0)
-
-	if len(d.errors) > 0 {
-		var errs []string
-		for _, e := range d.errors {
-			errs = append(errs, e.Error())
-		}
-		return nil, fmt.Errorf("failed to download website:\n%s", strings.Join(errs, "\n"))
-	}
-
-	return d.dn, nil
-}
-
 func (d *Downloader) crawl(pageURL string, depth int) {
 	if depth > d.maxDepth || d.visited[pageURL] {
@@ -118,7 +159,7 @@ func (d *Downloader) crawl(pageURL string, depth int) {
 		if d.isLocal(link) {
 			if isAsset(link) {
 				d.downloadAsset(link)
-			} else {
+			} else if !d.sitemapOnly {
 				d.crawl(link, depth+1)
 			}
 		}
@@ -20,7 +20,7 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
 	defer server.Close()

 	bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
-	dn, err := DownloadAndPackageWebsite(server.URL, 2, bar)
+	dn, err := DownloadAndPackageWebsite(server.URL, 2, false, false, "", bar)
 	if err != nil {
 		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
 	}
@@ -52,7 +52,7 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {

 func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
 	t.Run("Invalid Start URL", func(t *testing.T) {
-		_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil)
+		_, err := DownloadAndPackageWebsite("http://invalid-url", 1, false, false, "", nil)
 		if err == nil {
 			t.Fatal("Expected an error for an invalid start URL, but got nil")
 		}
@@ -63,7 +63,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
 			http.Error(w, "Internal Server Error", http.StatusInternalServerError)
 		}))
 		defer server.Close()
-		_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+		_, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", nil)
 		if err == nil {
 			t.Fatal("Expected an error for a server error on the start URL, but got nil")
 		}
@@ -80,7 +80,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
 		}))
 		defer server.Close()
 		// We expect an error because the link is broken.
-		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+		dn, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", nil)
 		if err == nil {
 			t.Fatal("Expected an error for a broken link, but got nil")
 		}
@@ -99,7 +99,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 	defer server.Close()

 	bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
-	dn, err := DownloadAndPackageWebsite(server.URL, 1, bar) // Max depth of 1
+	dn, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", bar) // Max depth of 1
 	if err != nil {
 		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
 	}
@@ -122,7 +122,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 		fmt.Fprint(w, `<a href="http://externalsite.com/page.html">External</a>`)
 	}))
 	defer server.Close()
-	dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+	dn, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", nil)
 	if err != nil {
 		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
 	}
@@ -156,7 +156,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 	// For now, we'll just test that it doesn't hang forever.
 	done := make(chan bool)
 	go func() {
-		_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+		_, err := DownloadAndPackageWebsite(server.URL, 1, false, false, "", nil)
 		if err != nil && !strings.Contains(err.Error(), "context deadline exceeded") {
 			// We expect a timeout error, but other errors are failures.
 			t.Errorf("unexpected error: %v", err)
@@ -172,6 +172,63 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 	})
 }

+func TestDownloadAndPackageWebsite_Sitemap(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/sitemap.xml":
+			fmt.Fprintln(w, `<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+<url>
+<loc>http://`+r.Host+`/page1</loc>
+</url>
+</urlset>`)
+		case "/":
+			w.Header().Set("Content-Type", "text/html")
+			fmt.Fprint(w, `
+<!DOCTYPE html>
+<html><body>
+<a href="/page2">Page 2</a>
+</body></html>
+`)
+		case "/page1":
+			w.Header().Set("Content-Type", "text/html")
+			fmt.Fprint(w, `<html><body><h1>Page 1 from Sitemap</h1></body></html>`)
+		case "/page2":
+			w.Header().Set("Content-Type", "text/html")
+			fmt.Fprint(w, `<html><body><h1>Page 2 from Crawl</h1></body></html>`)
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer server.Close()
+
+	t.Run("sitemap-only", func(t *testing.T) {
+		dn, err := DownloadAndPackageWebsite(server.URL, 2, false, true, server.URL+"/sitemap.xml", nil)
+		if err != nil {
+			t.Fatalf("DownloadAndPackageWebsite with sitemap-only failed: %v", err)
+		}
+		if exists, _ := dn.Exists("page1"); !exists {
+			t.Error("Expected to find /page1 from sitemap, but it was not found")
+		}
+		if exists, _ := dn.Exists("page2"); exists {
+			t.Error("Did not expect to find /page2 from crawl, but it was found")
+		}
+	})
+
+	t.Run("use-sitemap", func(t *testing.T) {
+		dn, err := DownloadAndPackageWebsite(server.URL, 2, true, false, server.URL+"/sitemap.xml", nil)
+		if err != nil {
+			t.Fatalf("DownloadAndPackageWebsite with use-sitemap failed: %v", err)
+		}
+		if exists, _ := dn.Exists("page1"); !exists {
+			t.Error("Expected to find /page1 from sitemap, but it was not found")
+		}
+		if exists, _ := dn.Exists("page2"); !exists {
+			t.Error("Expected to find /page2 from crawl, but it was not found")
+		}
+	})
+}
+
 // --- Helpers ---

 func newWebsiteTestServer() *httptest.Server {