Merge 1d8ff02f5c into a77024aad4
This commit is contained in:
commit
ac047e3773
7 changed files with 353 additions and 16 deletions
|
|
@ -38,6 +38,9 @@ func NewCollectWebsiteCmd() *cobra.Command {
|
||||||
format, _ := cmd.Flags().GetString("format")
|
format, _ := cmd.Flags().GetString("format")
|
||||||
compression, _ := cmd.Flags().GetString("compression")
|
compression, _ := cmd.Flags().GetString("compression")
|
||||||
password, _ := cmd.Flags().GetString("password")
|
password, _ := cmd.Flags().GetString("password")
|
||||||
|
userAgent, _ := cmd.Flags().GetString("user-agent")
|
||||||
|
ignoreRobots, _ := cmd.Flags().GetBool("ignore-robots")
|
||||||
|
minDelay, _ := cmd.Flags().GetDuration("min-delay")
|
||||||
|
|
||||||
if format != "datanode" && format != "tim" && format != "trix" {
|
if format != "datanode" && format != "tim" && format != "trix" {
|
||||||
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
|
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
|
||||||
|
|
@ -51,7 +54,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
|
||||||
bar = ui.NewProgressBar(-1, "Crawling website")
|
bar = ui.NewProgressBar(-1, "Crawling website")
|
||||||
}
|
}
|
||||||
|
|
||||||
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
|
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar, userAgent, ignoreRobots, minDelay)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error downloading and packaging website: %w", err)
|
return fmt.Errorf("error downloading and packaging website: %w", err)
|
||||||
}
|
}
|
||||||
|
|
@ -104,5 +107,8 @@ func NewCollectWebsiteCmd() *cobra.Command {
|
||||||
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
|
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
|
||||||
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
|
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
|
||||||
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
|
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
|
||||||
|
collectWebsiteCmd.PersistentFlags().String("user-agent", "Borg/1.0", "Custom user agent string")
|
||||||
|
collectWebsiteCmd.PersistentFlags().Bool("ignore-robots", false, "Ignore robots.txt")
|
||||||
|
collectWebsiteCmd.PersistentFlags().Duration("min-delay", 0, "Minimum delay between requests")
|
||||||
return collectWebsiteCmd
|
return collectWebsiteCmd
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,8 @@ import (
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/Snider/Borg/pkg/datanode"
|
"github.com/Snider/Borg/pkg/datanode"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/Snider/Borg/pkg/website"
|
"github.com/Snider/Borg/pkg/website"
|
||||||
"github.com/schollz/progressbar/v3"
|
"github.com/schollz/progressbar/v3"
|
||||||
)
|
)
|
||||||
|
|
@ -14,7 +16,7 @@ import (
|
||||||
func TestCollectWebsiteCmd_Good(t *testing.T) {
|
func TestCollectWebsiteCmd_Good(t *testing.T) {
|
||||||
// Mock the website downloader
|
// Mock the website downloader
|
||||||
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
|
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
|
||||||
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
|
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
|
||||||
return datanode.New(), nil
|
return datanode.New(), nil
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
|
|
@ -35,7 +37,7 @@ func TestCollectWebsiteCmd_Good(t *testing.T) {
|
||||||
func TestCollectWebsiteCmd_Bad(t *testing.T) {
|
func TestCollectWebsiteCmd_Bad(t *testing.T) {
|
||||||
// Mock the website downloader to return an error
|
// Mock the website downloader to return an error
|
||||||
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
|
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
|
||||||
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
|
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
|
||||||
return nil, fmt.Errorf("website error")
|
return nil, fmt.Errorf("website error")
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ func main() {
|
||||||
log.Println("Collecting website...")
|
log.Println("Collecting website...")
|
||||||
|
|
||||||
// Download and package the website.
|
// Download and package the website.
|
||||||
dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil)
|
dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil, "Borg/1.0", false, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("Failed to collect website: %v", err)
|
log.Fatalf("Failed to collect website: %v", err)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
112
pkg/robots/robots.go
Normal file
112
pkg/robots/robots.go
Normal file
|
|
@ -0,0 +1,112 @@
|
||||||
|
package robots
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// maxCrawlDelaySeconds is the largest Crawl-delay value, in seconds, that
// still fits in a time.Duration after conversion to nanoseconds.
const maxCrawlDelaySeconds = float64((1<<63 - 1) / int64(time.Second))

// RobotsData holds the parsed robots.txt data for a specific user-agent.
//
// The zero value is usable and means "no restrictions".
type RobotsData struct {
	// Disallow lists the path prefixes that must not be fetched, in the
	// normalized form produced by Parse.
	Disallow []string
	// CrawlDelay is the pause requested by the Crawl-delay directive.
	// Zero means no (valid) delay was requested.
	CrawlDelay time.Duration
}

// IsAllowed checks if a given path is allowed by the robots.txt rules.
//
// Matching is a plain prefix comparison of p against every Disallow entry.
// Wildcards ("*", "$") and Allow directives are not supported. A nil
// receiver carries no rules and therefore allows every path.
func (r *RobotsData) IsAllowed(p string) bool {
	if r == nil {
		return true
	}
	for _, rule := range r.Disallow {
		switch {
		case rule == "":
			// An empty Disallow rule means nothing is disallowed by this rule.
			continue
		case rule == "/":
			// Disallow: / means disallow everything, including an empty path
			// (which a prefix test against "/" would let through).
			return false
		case strings.HasPrefix(p, rule):
			return false
		}
	}
	return true
}

// Parse parses the content of a robots.txt file for a specific user-agent.
//
// It understands the User-agent, Disallow and Crawl-delay directives; every
// other directive is ignored (but still terminates a run of User-agent
// lines). The returned data belongs to the group whose user-agent token is
// the longest one contained, case-insensitively, in userAgent. If no group
// matches, the "*" group is returned, and if that does not exist either an
// empty RobotsData is returned. The error result is currently always nil.
//
// NOTE(review): Disallow values are normalized with path.Clean, which drops a
// trailing slash ("/private/" becomes "/private"), so such a rule also blocks
// "/private" itself and siblings like "/private-area". That behavior is kept
// because existing callers and tests rely on the normalized form.
func Parse(content []byte, userAgent string) (*RobotsData, error) {
	// A UTF-8 byte order mark would otherwise become part of the first key.
	text := strings.TrimPrefix(string(content), "\ufeff")

	rules := make(map[string]*RobotsData)
	var currentUAs []string
	lastWasUA := false

	for _, line := range strings.Split(text, "\n") {
		if idx := strings.Index(line, "#"); idx != -1 {
			line = line[:idx]
		}
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		switch key {
		case "user-agent":
			if !lastWasUA {
				// The previous group was closed by a rule line; start a new one.
				currentUAs = nil
			}
			currentUAs = append(currentUAs, strings.ToLower(value))
			lastWasUA = true
		case "disallow":
			lastWasUA = false
			for _, ua := range currentUAs {
				data := groupFor(rules, ua)
				// An empty Disallow value means "nothing is disallowed". It
				// must not be normalized, because path.Clean("/") would turn
				// it into a rule that blocks the whole site.
				if value != "" {
					data.Disallow = append(data.Disallow, path.Clean("/"+value))
				}
			}
		case "crawl-delay":
			lastWasUA = false
			delay, ok := parseCrawlDelay(value)
			for _, ua := range currentUAs {
				data := groupFor(rules, ua)
				if ok {
					data.CrawlDelay = delay
				}
			}
		default:
			lastWasUA = false
		}
	}

	lowerUserAgent := strings.ToLower(userAgent)

	// Look for the most specific (longest) named match. The wildcard group is
	// only ever used as the fallback below, and ties are broken
	// lexicographically so the result does not depend on map iteration order.
	bestMatch := ""
	for ua := range rules {
		if ua == "" || ua == "*" || !strings.Contains(lowerUserAgent, ua) {
			continue
		}
		if len(ua) > len(bestMatch) || (len(ua) == len(bestMatch) && ua < bestMatch) {
			bestMatch = ua
		}
	}
	if bestMatch != "" {
		return rules[bestMatch], nil
	}

	// Fallback to wildcard.
	if data, ok := rules["*"]; ok {
		return data, nil
	}

	return &RobotsData{}, nil
}

// groupFor returns the rule set registered for ua, creating it on first use.
func groupFor(rules map[string]*RobotsData, ua string) *RobotsData {
	data := rules[ua]
	if data == nil {
		data = &RobotsData{}
		rules[ua] = data
	}
	return data
}

// parseCrawlDelay converts a Crawl-delay value given in (possibly fractional)
// seconds into a duration. It reports false for values that are not numbers,
// are negative, or cannot be represented as a time.Duration (NaN, infinities
// and overflowing values), so that hostile input cannot stall the crawler.
func parseCrawlDelay(value string) (time.Duration, bool) {
	secs, err := strconv.ParseFloat(value, 64)
	if err != nil {
		return 0, false
	}
	// Written as a positive range test so that NaN fails both comparisons.
	if !(secs >= 0 && secs <= maxCrawlDelaySeconds) {
		return 0, false
	}
	return time.Duration(secs * float64(time.Second)), true
}
|
||||||
125
pkg/robots/robots_test.go
Normal file
125
pkg/robots/robots_test.go
Normal file
|
|
@ -0,0 +1,125 @@
|
||||||
|
package robots
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParse(t *testing.T) {
|
||||||
|
testCases := []struct {
|
||||||
|
name string
|
||||||
|
content string
|
||||||
|
userAgent string
|
||||||
|
expected *RobotsData
|
||||||
|
expectedErr bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "Specific user agent",
|
||||||
|
content: `
|
||||||
|
User-agent: BorgBot
|
||||||
|
Disallow: /private/
|
||||||
|
Crawl-delay: 2
|
||||||
|
`,
|
||||||
|
userAgent: "BorgBot/1.0",
|
||||||
|
expected: &RobotsData{
|
||||||
|
Disallow: []string{"/private"},
|
||||||
|
CrawlDelay: 2 * time.Second,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Wildcard user agent",
|
||||||
|
content: `
|
||||||
|
User-agent: *
|
||||||
|
Disallow: /admin/
|
||||||
|
`,
|
||||||
|
userAgent: "AnotherBot",
|
||||||
|
expected: &RobotsData{
|
||||||
|
Disallow: []string{"/admin"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Multiple disallow rules",
|
||||||
|
content: `
|
||||||
|
User-agent: *
|
||||||
|
Disallow: /admin/
|
||||||
|
Disallow: /login
|
||||||
|
`,
|
||||||
|
userAgent: "AnyBot",
|
||||||
|
expected: &RobotsData{
|
||||||
|
Disallow: []string{"/admin", "/login"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "No rules for user agent",
|
||||||
|
content: `
|
||||||
|
User-agent: GoogleBot
|
||||||
|
Disallow: /
|
||||||
|
`,
|
||||||
|
userAgent: "MyBot",
|
||||||
|
expected: &RobotsData{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
robotsData, err := Parse([]byte(tc.content), tc.userAgent)
|
||||||
|
if (err != nil) != tc.expectedErr {
|
||||||
|
t.Fatalf("Parse() error = %v, wantErr %v", err, tc.expectedErr)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(robotsData.Disallow) != len(tc.expected.Disallow) {
|
||||||
|
t.Fatalf("expected %d disallow rules, got %d", len(tc.expected.Disallow), len(robotsData.Disallow))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, rule := range tc.expected.Disallow {
|
||||||
|
if robotsData.Disallow[i] != rule {
|
||||||
|
t.Errorf("expected disallow rule %s, got %s", rule, robotsData.Disallow[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if robotsData.CrawlDelay != tc.expected.CrawlDelay {
|
||||||
|
t.Errorf("expected crawl delay %v, got %v", tc.expected.CrawlDelay, robotsData.CrawlDelay)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIsAllowed(t *testing.T) {
|
||||||
|
testCases := []struct {
|
||||||
|
name string
|
||||||
|
robotsData *RobotsData
|
||||||
|
path string
|
||||||
|
allowed bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "Path is disallowed",
|
||||||
|
robotsData: &RobotsData{
|
||||||
|
Disallow: []string{"/private"},
|
||||||
|
},
|
||||||
|
path: "/private/page.html",
|
||||||
|
allowed: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Path is allowed",
|
||||||
|
robotsData: &RobotsData{
|
||||||
|
Disallow: []string{"/private"},
|
||||||
|
},
|
||||||
|
path: "/public/page.html",
|
||||||
|
allowed: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "No rules",
|
||||||
|
robotsData: &RobotsData{},
|
||||||
|
path: "/any/page.html",
|
||||||
|
allowed: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
if allowed := tc.robotsData.IsAllowed(tc.path); allowed != tc.allowed {
|
||||||
|
t.Errorf("IsAllowed(%s) = %v, want %v", tc.path, allowed, tc.allowed)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -8,9 +8,10 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/Snider/Borg/pkg/datanode"
|
"github.com/Snider/Borg/pkg/datanode"
|
||||||
|
"github.com/Snider/Borg/pkg/robots"
|
||||||
"github.com/schollz/progressbar/v3"
|
"github.com/schollz/progressbar/v3"
|
||||||
|
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
var DownloadAndPackageWebsite = downloadAndPackageWebsite
|
var DownloadAndPackageWebsite = downloadAndPackageWebsite
|
||||||
|
|
@ -24,6 +25,9 @@ type Downloader struct {
|
||||||
progressBar *progressbar.ProgressBar
|
progressBar *progressbar.ProgressBar
|
||||||
client *http.Client
|
client *http.Client
|
||||||
errors []error
|
errors []error
|
||||||
|
robots *robots.RobotsData
|
||||||
|
userAgent string
|
||||||
|
minDelay time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewDownloader creates a new Downloader.
|
// NewDownloader creates a new Downloader.
|
||||||
|
|
@ -39,11 +43,12 @@ func NewDownloaderWithClient(maxDepth int, client *http.Client) *Downloader {
|
||||||
maxDepth: maxDepth,
|
maxDepth: maxDepth,
|
||||||
client: client,
|
client: client,
|
||||||
errors: make([]error, 0),
|
errors: make([]error, 0),
|
||||||
|
userAgent: "Borg/1.0",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// downloadAndPackageWebsite downloads a website and packages it into a DataNode.
|
// downloadAndPackageWebsite downloads a website and packages it into a DataNode.
|
||||||
func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
|
func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
|
||||||
baseURL, err := url.Parse(startURL)
|
baseURL, err := url.Parse(startURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
@ -52,6 +57,23 @@ func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.P
|
||||||
d := NewDownloader(maxDepth)
|
d := NewDownloader(maxDepth)
|
||||||
d.baseURL = baseURL
|
d.baseURL = baseURL
|
||||||
d.progressBar = bar
|
d.progressBar = bar
|
||||||
|
d.userAgent = userAgent
|
||||||
|
d.minDelay = minDelay
|
||||||
|
|
||||||
|
if !ignoreRobots {
|
||||||
|
robotsURL, err := baseURL.Parse("/robots.txt")
|
||||||
|
if err == nil {
|
||||||
|
resp, err := d.client.Get(robotsURL.String())
|
||||||
|
if err == nil && resp.StatusCode == http.StatusOK {
|
||||||
|
body, err := io.ReadAll(resp.Body)
|
||||||
|
resp.Body.Close()
|
||||||
|
if err == nil {
|
||||||
|
d.robots, _ = robots.Parse(body, d.userAgent)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
d.crawl(startURL, 0)
|
d.crawl(startURL, 0)
|
||||||
|
|
||||||
if len(d.errors) > 0 {
|
if len(d.errors) > 0 {
|
||||||
|
|
@ -69,12 +91,28 @@ func (d *Downloader) crawl(pageURL string, depth int) {
|
||||||
if depth > d.maxDepth || d.visited[pageURL] {
|
if depth > d.maxDepth || d.visited[pageURL] {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
u, err := url.Parse(pageURL)
|
||||||
|
if err != nil {
|
||||||
|
d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", pageURL, err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if d.robots != nil && !d.robots.IsAllowed(u.Path) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
d.visited[pageURL] = true
|
d.visited[pageURL] = true
|
||||||
if d.progressBar != nil {
|
if d.progressBar != nil {
|
||||||
d.progressBar.Add(1)
|
d.progressBar.Add(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
resp, err := d.client.Get(pageURL)
|
d.delay()
|
||||||
|
|
||||||
|
req, _ := http.NewRequest("GET", pageURL, nil)
|
||||||
|
req.Header.Set("User-Agent", d.userAgent)
|
||||||
|
|
||||||
|
resp, err := d.client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
d.errors = append(d.errors, fmt.Errorf("Error getting %s: %w", pageURL, err))
|
d.errors = append(d.errors, fmt.Errorf("Error getting %s: %w", pageURL, err))
|
||||||
return
|
return
|
||||||
|
|
@ -136,12 +174,28 @@ func (d *Downloader) downloadAsset(assetURL string) {
|
||||||
if d.visited[assetURL] {
|
if d.visited[assetURL] {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
u, err := url.Parse(assetURL)
|
||||||
|
if err != nil {
|
||||||
|
d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", assetURL, err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if d.robots != nil && !d.robots.IsAllowed(u.Path) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
d.visited[assetURL] = true
|
d.visited[assetURL] = true
|
||||||
if d.progressBar != nil {
|
if d.progressBar != nil {
|
||||||
d.progressBar.Add(1)
|
d.progressBar.Add(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
resp, err := d.client.Get(assetURL)
|
d.delay()
|
||||||
|
|
||||||
|
req, _ := http.NewRequest("GET", assetURL, nil)
|
||||||
|
req.Header.Set("User-Agent", d.userAgent)
|
||||||
|
|
||||||
|
resp, err := d.client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
d.errors = append(d.errors, fmt.Errorf("Error getting asset %s: %w", assetURL, err))
|
d.errors = append(d.errors, fmt.Errorf("Error getting asset %s: %w", assetURL, err))
|
||||||
return
|
return
|
||||||
|
|
@ -163,6 +217,19 @@ func (d *Downloader) downloadAsset(assetURL string) {
|
||||||
d.dn.AddData(relPath, body)
|
d.dn.AddData(relPath, body)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (d *Downloader) delay() {
|
||||||
|
var delay time.Duration
|
||||||
|
if d.robots != nil {
|
||||||
|
delay = d.robots.CrawlDelay
|
||||||
|
}
|
||||||
|
if d.minDelay > delay {
|
||||||
|
delay = d.minDelay
|
||||||
|
}
|
||||||
|
if delay > 0 {
|
||||||
|
time.Sleep(delay)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (d *Downloader) getRelativePath(pageURL string) string {
|
func (d *Downloader) getRelativePath(pageURL string) string {
|
||||||
u, err := url.Parse(pageURL)
|
u, err := url.Parse(pageURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
||||||
|
|
@ -20,12 +20,12 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
|
||||||
defer server.Close()
|
defer server.Close()
|
||||||
|
|
||||||
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
|
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
|
||||||
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar)
|
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
|
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html", "page3.html"}
|
expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html"}
|
||||||
for _, file := range expectedFiles {
|
for _, file := range expectedFiles {
|
||||||
exists, err := dn.Exists(file)
|
exists, err := dn.Exists(file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
@ -50,9 +50,31 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDownloadAndPackageWebsite_RespectsRobotsTxt(t *testing.T) {
|
||||||
|
server := newWebsiteTestServer()
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
|
||||||
|
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// page3.html is disallowed by robots.txt, so it should not be present.
|
||||||
|
exists, _ := dn.Exists("page3.html")
|
||||||
|
if exists {
|
||||||
|
t.Error("page3.html should not have been downloaded due to robots.txt")
|
||||||
|
}
|
||||||
|
// page2.html is not disallowed, so it should be present.
|
||||||
|
exists, _ = dn.Exists("page2.html")
|
||||||
|
if !exists {
|
||||||
|
t.Error("page2.html should have been downloaded")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
|
func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
|
||||||
t.Run("Invalid Start URL", func(t *testing.T) {
|
t.Run("Invalid Start URL", func(t *testing.T) {
|
||||||
_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil)
|
_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil, "Borg/1.0", false, 0)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
t.Fatal("Expected an error for an invalid start URL, but got nil")
|
t.Fatal("Expected an error for an invalid start URL, but got nil")
|
||||||
}
|
}
|
||||||
|
|
@ -63,7 +85,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
|
||||||
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
|
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
|
||||||
}))
|
}))
|
||||||
defer server.Close()
|
defer server.Close()
|
||||||
_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
|
_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
t.Fatal("Expected an error for a server error on the start URL, but got nil")
|
t.Fatal("Expected an error for a server error on the start URL, but got nil")
|
||||||
}
|
}
|
||||||
|
|
@ -80,7 +102,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
|
||||||
}))
|
}))
|
||||||
defer server.Close()
|
defer server.Close()
|
||||||
// We expect an error because the link is broken.
|
// We expect an error because the link is broken.
|
||||||
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
|
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
t.Fatal("Expected an error for a broken link, but got nil")
|
t.Fatal("Expected an error for a broken link, but got nil")
|
||||||
}
|
}
|
||||||
|
|
@ -99,7 +121,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
|
||||||
defer server.Close()
|
defer server.Close()
|
||||||
|
|
||||||
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
|
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
|
||||||
dn, err := DownloadAndPackageWebsite(server.URL, 1, bar) // Max depth of 1
|
dn, err := DownloadAndPackageWebsite(server.URL, 1, bar, "Borg/1.0", false, 0) // Max depth of 1
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
|
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
|
||||||
}
|
}
|
||||||
|
|
@ -122,7 +144,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
|
||||||
fmt.Fprint(w, `<a href="http://externalsite.com/page.html">External</a>`)
|
fmt.Fprint(w, `<a href="http://externalsite.com/page.html">External</a>`)
|
||||||
}))
|
}))
|
||||||
defer server.Close()
|
defer server.Close()
|
||||||
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
|
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
|
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
|
||||||
}
|
}
|
||||||
|
|
@ -156,7 +178,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
|
||||||
// For now, we'll just test that it doesn't hang forever.
|
// For now, we'll just test that it doesn't hang forever.
|
||||||
done := make(chan bool)
|
done := make(chan bool)
|
||||||
go func() {
|
go func() {
|
||||||
_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
|
_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
|
||||||
if err != nil && !strings.Contains(err.Error(), "context deadline exceeded") {
|
if err != nil && !strings.Contains(err.Error(), "context deadline exceeded") {
|
||||||
// We expect a timeout error, but other errors are failures.
|
// We expect a timeout error, but other errors are failures.
|
||||||
t.Errorf("unexpected error: %v", err)
|
t.Errorf("unexpected error: %v", err)
|
||||||
|
|
@ -177,6 +199,9 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
|
||||||
func newWebsiteTestServer() *httptest.Server {
|
func newWebsiteTestServer() *httptest.Server {
|
||||||
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
switch r.URL.Path {
|
switch r.URL.Path {
|
||||||
|
case "/robots.txt":
|
||||||
|
w.Header().Set("Content-Type", "text/plain")
|
||||||
|
fmt.Fprint(w, "User-agent: *\nDisallow: /page3.html")
|
||||||
case "/":
|
case "/":
|
||||||
w.Header().Set("Content-Type", "text/html")
|
w.Header().Set("Content-Type", "text/html")
|
||||||
fmt.Fprint(w, `
|
fmt.Fprint(w, `
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue