diff --git a/cmd/collect_website.go b/cmd/collect_website.go
index 3811f32..3b39ab6 100644
--- a/cmd/collect_website.go
+++ b/cmd/collect_website.go
@@ -38,6 +38,9 @@ func NewCollectWebsiteCmd() *cobra.Command {
 			format, _ := cmd.Flags().GetString("format")
 			compression, _ := cmd.Flags().GetString("compression")
 			password, _ := cmd.Flags().GetString("password")
+			userAgent, _ := cmd.Flags().GetString("user-agent")
+			ignoreRobots, _ := cmd.Flags().GetBool("ignore-robots")
+			minDelay, _ := cmd.Flags().GetDuration("min-delay")
 
 			if format != "datanode" && format != "tim" && format != "trix" {
 				return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@@ -51,7 +54,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
 				bar = ui.NewProgressBar(-1, "Crawling website")
 			}
 
-			dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
+			dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar, userAgent, ignoreRobots, minDelay)
 			if err != nil {
 				return fmt.Errorf("error downloading and packaging website: %w", err)
 			}
@@ -104,5 +107,8 @@ func NewCollectWebsiteCmd() *cobra.Command {
 	collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
 	collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
 	collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
+	collectWebsiteCmd.PersistentFlags().String("user-agent", "Borg/1.0", "Custom user agent string")
+	collectWebsiteCmd.PersistentFlags().Bool("ignore-robots", false, "Ignore robots.txt")
+	collectWebsiteCmd.PersistentFlags().Duration("min-delay", 0, "Minimum delay between requests")
 	return collectWebsiteCmd
 }
diff --git a/cmd/collect_website_test.go b/cmd/collect_website_test.go
index 2c39674..0d0c5b3 100644
--- a/cmd/collect_website_test.go
+++ b/cmd/collect_website_test.go
@@ -7,6 +7,8 @@ import (
 	"testing"
 
 	"github.com/Snider/Borg/pkg/datanode"
+	"time"
+
 	"github.com/Snider/Borg/pkg/website"
 	"github.com/schollz/progressbar/v3"
 )
@@ -14,7 +16,7 @@
 func TestCollectWebsiteCmd_Good(t *testing.T) {
 	// Mock the website downloader
 	oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
-	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
 		return datanode.New(), nil
 	}
 	defer func() {
@@ -35,7 +37,7 @@ func TestCollectWebsiteCmd_Good(t *testing.T) {
 func TestCollectWebsiteCmd_Bad(t *testing.T) {
 	// Mock the website downloader to return an error
 	oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
-	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
 		return nil, fmt.Errorf("website error")
 	}
 	defer func() {
diff --git a/examples/collect_website/main.go b/examples/collect_website/main.go
index 2e2f606..26d3a0c 100644
--- a/examples/collect_website/main.go
+++ b/examples/collect_website/main.go
@@ -11,7 +11,7 @@ func main() {
 	log.Println("Collecting website...")
 
 	// Download and package the website.
-	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil)
+	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil, "Borg/1.0", false, 0)
 	if err != nil {
 		log.Fatalf("Failed to collect website: %v", err)
 	}
diff --git a/pkg/robots/robots.go b/pkg/robots/robots.go
new file mode 100644
index 0000000..3682c95
--- /dev/null
+++ b/pkg/robots/robots.go
@@ -0,0 +1,112 @@
+package robots
+
+import (
+	"path"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// RobotsData holds the parsed robots.txt data for a specific user-agent.
+type RobotsData struct {
+	Disallow   []string
+	CrawlDelay time.Duration
+}
+
+// IsAllowed checks if a given path is allowed by the robots.txt rules.
+func (r *RobotsData) IsAllowed(p string) bool {
+	// A more complete implementation would handle wildcards.
+	// This is a simple path prefix match.
+	for _, rule := range r.Disallow {
+		if rule == "" {
+			// An empty Disallow rule means nothing is disallowed by this rule.
+			continue
+		}
+		if rule == "/" {
+			// Disallow: / means disallow everything.
+			return false
+		}
+		if strings.HasPrefix(p, rule) {
+			return false
+		}
+	}
+	return true
+}
+
+// Parse parses the content of a robots.txt file for a specific user-agent.
+func Parse(content []byte, userAgent string) (*RobotsData, error) {
+	lines := strings.Split(string(content), "\n")
+
+	rules := make(map[string]*RobotsData)
+	var currentUAs []string
+	lastWasUA := false
+
+	for _, line := range lines {
+		line = strings.TrimSpace(line)
+		if idx := strings.Index(line, "#"); idx != -1 {
+			line = line[:idx]
+		}
+		if line == "" {
+			continue
+		}
+
+		parts := strings.SplitN(line, ":", 2)
+		if len(parts) != 2 {
+			continue
+		}
+
+		key := strings.ToLower(strings.TrimSpace(parts[0]))
+		value := strings.TrimSpace(parts[1])
+
+		switch key {
+		case "user-agent":
+			if !lastWasUA {
+				currentUAs = []string{} // New group
+			}
+			currentUAs = append(currentUAs, strings.ToLower(value))
+			lastWasUA = true
+		case "disallow", "crawl-delay":
+			if len(currentUAs) == 0 {
+				continue // Rule without a user-agent
+			}
+
+			for _, ua := range currentUAs {
+				if rules[ua] == nil {
+					rules[ua] = &RobotsData{}
+				}
+				if key == "disallow" {
+					rules[ua].Disallow = append(rules[ua].Disallow, path.Clean("/"+value))
+				} else if key == "crawl-delay" {
+					if delay, err := strconv.ParseFloat(value, 64); err == nil {
+						rules[ua].CrawlDelay = time.Duration(delay * float64(time.Second))
+					}
+				}
+			}
+			lastWasUA = false
+		default:
+			lastWasUA = false
+		}
+	}
+
+	lowerUserAgent := strings.ToLower(userAgent)
+
+	// Look for most specific match.
+	bestMatch := ""
+	for ua := range rules {
+		if strings.Contains(lowerUserAgent, ua) {
+			if len(ua) > len(bestMatch) {
+				bestMatch = ua
+			}
+		}
+	}
+	if bestMatch != "" {
+		return rules[bestMatch], nil
+	}
+
+	// Fallback to wildcard.
+	if data, ok := rules["*"]; ok {
+		return data, nil
+	}
+
+	return &RobotsData{}, nil
+}
diff --git a/pkg/robots/robots_test.go b/pkg/robots/robots_test.go
new file mode 100644
index 0000000..e26ebd1
--- /dev/null
+++ b/pkg/robots/robots_test.go
@@ -0,0 +1,125 @@
+package robots
+
+import (
+	"testing"
+	"time"
+)
+
+func TestParse(t *testing.T) {
+	testCases := []struct {
+		name        string
+		content     string
+		userAgent   string
+		expected    *RobotsData
+		expectedErr bool
+	}{
+		{
+			name: "Specific user agent",
+			content: `
+				User-agent: BorgBot
+				Disallow: /private/
+				Crawl-delay: 2
+			`,
+			userAgent: "BorgBot/1.0",
+			expected: &RobotsData{
+				Disallow:   []string{"/private"},
+				CrawlDelay: 2 * time.Second,
+			},
+		},
+		{
+			name: "Wildcard user agent",
+			content: `
+				User-agent: *
+				Disallow: /admin/
+			`,
+			userAgent: "AnotherBot",
+			expected: &RobotsData{
+				Disallow: []string{"/admin"},
+			},
+		},
+		{
+			name: "Multiple disallow rules",
+			content: `
+				User-agent: *
+				Disallow: /admin/
+				Disallow: /login
+			`,
+			userAgent: "AnyBot",
+			expected: &RobotsData{
+				Disallow: []string{"/admin", "/login"},
+			},
+		},
+		{
+			name: "No rules for user agent",
+			content: `
+				User-agent: GoogleBot
+				Disallow: /
+			`,
+			userAgent: "MyBot",
+			expected:  &RobotsData{},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			robotsData, err := Parse([]byte(tc.content), tc.userAgent)
+			if (err != nil) != tc.expectedErr {
+				t.Fatalf("Parse() error = %v, wantErr %v", err, tc.expectedErr)
+			}
+
+			if len(robotsData.Disallow) != len(tc.expected.Disallow) {
+				t.Fatalf("expected %d disallow rules, got %d", len(tc.expected.Disallow), len(robotsData.Disallow))
+			}
+
+			for i, rule := range tc.expected.Disallow {
+				if robotsData.Disallow[i] != rule {
+					t.Errorf("expected disallow rule %s, got %s", rule, robotsData.Disallow[i])
+				}
+			}
+
+			if robotsData.CrawlDelay != tc.expected.CrawlDelay {
+				t.Errorf("expected crawl delay %v, got %v", tc.expected.CrawlDelay, robotsData.CrawlDelay)
+			}
+		})
+	}
+}
+
+func TestIsAllowed(t *testing.T) {
+	testCases := []struct {
+		name       string
+		robotsData *RobotsData
+		path       string
+		allowed    bool
+	}{
+		{
+			name: "Path is disallowed",
+			robotsData: &RobotsData{
+				Disallow: []string{"/private"},
+			},
+			path:    "/private/page.html",
+			allowed: false,
+		},
+		{
+			name: "Path is allowed",
+			robotsData: &RobotsData{
+				Disallow: []string{"/private"},
+			},
+			path:    "/public/page.html",
+			allowed: true,
+		},
+		{
+			name:       "No rules",
+			robotsData: &RobotsData{},
+			path:       "/any/page.html",
+			allowed:    true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if allowed := tc.robotsData.IsAllowed(tc.path); allowed != tc.allowed {
+				t.Errorf("IsAllowed(%s) = %v, want %v", tc.path, allowed, tc.allowed)
+			}
+		})
+	}
+}
diff --git a/pkg/website/website.go b/pkg/website/website.go
index b2bd517..738760f 100644
--- a/pkg/website/website.go
+++ b/pkg/website/website.go
@@ -8,9 +8,10 @@ import (
 	"strings"
 
 	"github.com/Snider/Borg/pkg/datanode"
+	"github.com/Snider/Borg/pkg/robots"
 	"github.com/schollz/progressbar/v3"
 
-	"golang.org/x/net/html"
+	"time"
 )
 
 var DownloadAndPackageWebsite = downloadAndPackageWebsite
@@ -24,6 +25,9 @@ type Downloader struct {
 	progressBar *progressbar.ProgressBar
 	client      *http.Client
 	errors      []error
+	robots      *robots.RobotsData
+	userAgent   string
+	minDelay    time.Duration
 }
 
 // NewDownloader creates a new Downloader.
@@ -39,11 +43,12 @@ func NewDownloaderWithClient(maxDepth int, client *http.Client) *Downloader {
 		maxDepth: maxDepth,
 		client:   client,
 		errors:   make([]error, 0),
+		userAgent: "Borg/1.0",
 	}
 }
 
 // downloadAndPackageWebsite downloads a website and packages it into a DataNode.
-func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
 	baseURL, err := url.Parse(startURL)
 	if err != nil {
 		return nil, err
@@ -52,6 +57,23 @@ func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.P
 	d := NewDownloader(maxDepth)
 	d.baseURL = baseURL
 	d.progressBar = bar
+	d.userAgent = userAgent
+	d.minDelay = minDelay
+
+	if !ignoreRobots {
+		robotsURL, err := baseURL.Parse("/robots.txt")
+		if err == nil {
+			resp, err := d.client.Get(robotsURL.String())
+			if err == nil && resp.StatusCode == http.StatusOK {
+				body, err := io.ReadAll(resp.Body)
+				resp.Body.Close()
+				if err == nil {
+					d.robots, _ = robots.Parse(body, d.userAgent)
+				}
+			}
+		}
+	}
+
 	d.crawl(startURL, 0)
 
 	if len(d.errors) > 0 {
@@ -69,12 +91,28 @@ func (d *Downloader) crawl(pageURL string, depth int) {
 	if depth > d.maxDepth || d.visited[pageURL] {
 		return
 	}
+
+	u, err := url.Parse(pageURL)
+	if err != nil {
+		d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", pageURL, err))
+		return
+	}
+
+	if d.robots != nil && !d.robots.IsAllowed(u.Path) {
+		return
+	}
+
 	d.visited[pageURL] = true
 	if d.progressBar != nil {
 		d.progressBar.Add(1)
 	}
 
-	resp, err := d.client.Get(pageURL)
+	d.delay()
+
+	req, _ := http.NewRequest("GET", pageURL, nil)
+	req.Header.Set("User-Agent", d.userAgent)
+
+	resp, err := d.client.Do(req)
 	if err != nil {
 		d.errors = append(d.errors, fmt.Errorf("Error getting %s: %w", pageURL, err))
 		return
@@ -136,12 +174,28 @@ func (d *Downloader) downloadAsset(assetURL string) {
 	if d.visited[assetURL] {
 		return
 	}
+
+	u, err := url.Parse(assetURL)
+	if err != nil {
+		d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", assetURL, err))
+		return
+	}
+
+	if d.robots != nil && !d.robots.IsAllowed(u.Path) {
+		return
+	}
+
 	d.visited[assetURL] = true
 	if d.progressBar != nil {
 		d.progressBar.Add(1)
 	}
 
-	resp, err := d.client.Get(assetURL)
+	d.delay()
+
+	req, _ := http.NewRequest("GET", assetURL, nil)
+	req.Header.Set("User-Agent", d.userAgent)
+
+	resp, err := d.client.Do(req)
 	if err != nil {
 		d.errors = append(d.errors, fmt.Errorf("Error getting asset %s: %w", assetURL, err))
 		return
@@ -163,6 +217,19 @@
 	d.dn.AddData(relPath, body)
 }
 
+func (d *Downloader) delay() {
+	var delay time.Duration
+	if d.robots != nil {
+		delay = d.robots.CrawlDelay
+	}
+	if d.minDelay > delay {
+		delay = d.minDelay
+	}
+	if delay > 0 {
+		time.Sleep(delay)
+	}
+}
+
 func (d *Downloader) getRelativePath(pageURL string) string {
 	u, err := url.Parse(pageURL)
 	if err != nil {
diff --git a/pkg/website/website_test.go b/pkg/website/website_test.go
index d3685e5..d616eda 100644
--- a/pkg/website/website_test.go
+++ b/pkg/website/website_test.go
@@ -20,12 +20,12 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
 	defer server.Close()
 
 	bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
-	dn, err := DownloadAndPackageWebsite(server.URL, 2, bar)
+	dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
 	if err != nil {
 		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
 	}
 
-	expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html", "page3.html"}
+	expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html"}
 	for _, file := range expectedFiles {
 		exists, err := dn.Exists(file)
 		if err != nil {
@@ -50,9 +50,31 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
 	}
 }
 
+func TestDownloadAndPackageWebsite_RespectsRobotsTxt(t *testing.T) {
+	server := newWebsiteTestServer()
+	defer server.Close()
+
+	bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
+	dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
+	if err != nil {
+		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
+	}
+
+	// page3.html is disallowed by robots.txt, so it should not be present.
+	exists, _ := dn.Exists("page3.html")
+	if exists {
+		t.Error("page3.html should not have been downloaded due to robots.txt")
+	}
+	// page2.html is not disallowed, so it should be present.
+	exists, _ = dn.Exists("page2.html")
+	if !exists {
+		t.Error("page2.html should have been downloaded")
+	}
+}
+
 func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
 	t.Run("Invalid Start URL", func(t *testing.T) {
-		_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil)
+		_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil, "Borg/1.0", false, 0)
 		if err == nil {
 			t.Fatal("Expected an error for an invalid start URL, but got nil")
 		}
@@ -63,7 +85,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
 			http.Error(w, "Internal Server Error", http.StatusInternalServerError)
 		}))
 		defer server.Close()
-		_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+		_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
 		if err == nil {
 			t.Fatal("Expected an error for a server error on the start URL, but got nil")
 		}
@@ -80,7 +102,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
 		}))
 		defer server.Close()
 		// We expect an error because the link is broken.
-		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
 		if err == nil {
 			t.Fatal("Expected an error for a broken link, but got nil")
 		}
@@ -99,7 +121,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 		defer server.Close()
 
 		bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
-		dn, err := DownloadAndPackageWebsite(server.URL, 1, bar) // Max depth of 1
+		dn, err := DownloadAndPackageWebsite(server.URL, 1, bar, "Borg/1.0", false, 0) // Max depth of 1
 		if err != nil {
 			t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
 		}
@@ -122,7 +144,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 			fmt.Fprint(w, `External`)
 		}))
 		defer server.Close()
-		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
 		if err != nil {
 			t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
 		}
@@ -156,7 +178,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 		// For now, we'll just test that it doesn't hang forever.
 		done := make(chan bool)
 		go func() {
-			_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+			_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
 			if err != nil && !strings.Contains(err.Error(), "context deadline exceeded") {
 				// We expect a timeout error, but other errors are failures.
 				t.Errorf("unexpected error: %v", err)
@@ -177,6 +199,9 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 func newWebsiteTestServer() *httptest.Server {
 	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		switch r.URL.Path {
+		case "/robots.txt":
+			w.Header().Set("Content-Type", "text/plain")
+			fmt.Fprint(w, "User-agent: *\nDisallow: /page3.html")
 		case "/":
 			w.Header().Set("Content-Type", "text/html")
 			fmt.Fprint(w, `
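
Usage sketch: a minimal example of the new DownloadAndPackageWebsite parameters and the robots helper, assuming only the signatures introduced in this diff; the target URL, robots.txt body, and delay value are illustrative.

package main

import (
	"log"
	"time"

	"github.com/Snider/Borg/pkg/robots"
	"github.com/Snider/Borg/pkg/website"
)

func main() {
	// Parse a robots.txt body for a given user agent, then check a path against it.
	// The robots.txt content here is a made-up example.
	data, err := robots.Parse([]byte("User-agent: *\nDisallow: /private/\nCrawl-delay: 1"), "Borg/1.0")
	if err != nil {
		log.Fatalf("parse robots.txt: %v", err)
	}
	log.Printf("allowed=%v crawl-delay=%v", data.IsAllowed("/private/page.html"), data.CrawlDelay)

	// Crawl to depth 2 as "Borg/1.0", honouring robots.txt (ignoreRobots=false)
	// and waiting at least 500ms between requests on top of any Crawl-delay.
	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil, "Borg/1.0", false, 500*time.Millisecond)
	if err != nil {
		log.Fatalf("collect website: %v", err)
	}
	_ = dn // package or inspect the resulting DataNode as needed
}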