Adds support for parsing and respecting robots.txt during website collection. This change introduces the following features: - Fetches and parses /robots.txt before crawling a website. - Respects `Disallow` patterns to avoid crawling restricted areas. - Honors the `Crawl-delay` directive to prevent hammering sites. - Adds command-line flags to configure the behavior: - `--ignore-robots`: Ignores robots.txt rules. - `--user-agent`: Sets a custom user-agent string. - `--min-delay`: Overrides the crawl-delay with a minimum value. The implementation includes a new `robots` package for parsing robots.txt files and integrates it into the existing website downloader. Tests have been added to verify the new functionality. Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
70 lines
2.1 KiB
Go
70 lines
2.1 KiB
Go
package cmd
|
|
|
|
import (
|
|
"fmt"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/Snider/Borg/pkg/datanode"
|
|
"time"
|
|
|
|
"github.com/Snider/Borg/pkg/website"
|
|
"github.com/schollz/progressbar/v3"
|
|
)
|
|
|
|
func TestCollectWebsiteCmd_Good(t *testing.T) {
|
|
// Mock the website downloader
|
|
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
|
|
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
|
|
return datanode.New(), nil
|
|
}
|
|
defer func() {
|
|
website.DownloadAndPackageWebsite = oldDownloadAndPackageWebsite
|
|
}()
|
|
|
|
rootCmd := NewRootCmd()
|
|
rootCmd.AddCommand(GetCollectCmd())
|
|
|
|
// Execute command
|
|
out := filepath.Join(t.TempDir(), "out")
|
|
_, err := executeCommand(rootCmd, "collect", "website", "https://example.com", "--output", out)
|
|
if err != nil {
|
|
t.Fatalf("collect website command failed: %v", err)
|
|
}
|
|
}
|
|
|
|
func TestCollectWebsiteCmd_Bad(t *testing.T) {
|
|
// Mock the website downloader to return an error
|
|
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
|
|
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
|
|
return nil, fmt.Errorf("website error")
|
|
}
|
|
defer func() {
|
|
website.DownloadAndPackageWebsite = oldDownloadAndPackageWebsite
|
|
}()
|
|
|
|
rootCmd := NewRootCmd()
|
|
rootCmd.AddCommand(GetCollectCmd())
|
|
|
|
// Execute command
|
|
out := filepath.Join(t.TempDir(), "out")
|
|
_, err := executeCommand(rootCmd, "collect", "website", "https://example.com", "--output", out)
|
|
if err == nil {
|
|
t.Fatal("expected an error, but got none")
|
|
}
|
|
}
|
|
|
|
func TestCollectWebsiteCmd_Ugly(t *testing.T) {
|
|
t.Run("No arguments", func(t *testing.T) {
|
|
rootCmd := NewRootCmd()
|
|
rootCmd.AddCommand(GetCollectCmd())
|
|
_, err := executeCommand(rootCmd, "collect", "website")
|
|
if err == nil {
|
|
t.Fatal("expected an error for no arguments, but got none")
|
|
}
|
|
if !strings.Contains(err.Error(), "accepts 1 arg(s), received 0") {
|
|
t.Errorf("unexpected error message: %v", err)
|
|
}
|
|
})
|
|
}
|