feat: add robots.txt support to website collector

Adds support for parsing and respecting robots.txt during website collection.

This change introduces the following features:
- Fetches and parses /robots.txt before crawling a website.
- Respects `Disallow` patterns to avoid crawling restricted areas.
- Honors the `Crawl-delay` directive to prevent hammering sites.
- Adds command-line flags to configure the behavior:
  - `--ignore-robots`: Skips fetching robots.txt and applies no robots rules.
  - `--user-agent`: Sets the User-Agent string sent with every request (default `Borg/1.0`).
  - `--min-delay`: Enforces a minimum delay between requests; the larger of this value and the site's `Crawl-delay` is used (see the sketch after this list).
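
In practice, the pause before each request is the site's `Crawl-delay`, raised to `--min-delay` whenever the flag asks for a longer wait. A minimal sketch of that rule, mirroring the new `delay()` helper in `pkg/website` (the `effectivePause` name and the sample values are illustrative only; assumes `time` is imported):

```go
// effectivePause mirrors the rule in the new delay() helper (pkg/website):
// the pause before each request is the site's Crawl-delay, raised to
// --min-delay whenever the flag asks for a longer wait.
func effectivePause(crawlDelay, minDelay time.Duration) time.Duration {
	if minDelay > crawlDelay {
		return minDelay
	}
	return crawlDelay
}

// effectivePause(2*time.Second, 500*time.Millisecond) == 2 * time.Second
// effectivePause(2*time.Second, 5*time.Second)        == 5 * time.Second
```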

The implementation includes a new `robots` package for parsing robots.txt files and integrates it into the existing website downloader. Tests have been added to verify the new functionality.
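
For callers of the Go API, `website.DownloadAndPackageWebsite` gains three trailing parameters. A minimal sketch of the updated call using the flag defaults (the URL and the 500ms delay are placeholder values):

```go
package main

import (
	"log"
	"time"

	"github.com/Snider/Borg/pkg/website"
)

func main() {
	// Crawl two levels deep, honour robots.txt, identify as "Borg/1.0",
	// and keep at least 500ms between requests.
	dn, err := website.DownloadAndPackageWebsite(
		"https://example.com", // start URL (placeholder)
		2,                     // max crawl depth
		nil,                   // no progress bar
		"Borg/1.0",            // --user-agent
		false,                 // --ignore-robots
		500*time.Millisecond,  // --min-delay
	)
	if err != nil {
		log.Fatalf("failed to collect website: %v", err)
	}
	_ = dn // hand the DataNode to the existing packaging/output path
}
```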

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
Author: google-labs-jules[bot]
Date:   2026-02-02 00:42:20 +00:00
Parent: cf2af53ed3
Commit: 1d8ff02f5c
7 changed files with 353 additions and 16 deletions

@@ -38,6 +38,9 @@ func NewCollectWebsiteCmd() *cobra.Command {
format, _ := cmd.Flags().GetString("format")
compression, _ := cmd.Flags().GetString("compression")
password, _ := cmd.Flags().GetString("password")
userAgent, _ := cmd.Flags().GetString("user-agent")
ignoreRobots, _ := cmd.Flags().GetBool("ignore-robots")
minDelay, _ := cmd.Flags().GetDuration("min-delay")
if format != "datanode" && format != "tim" && format != "trix" {
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@@ -51,7 +54,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
bar = ui.NewProgressBar(-1, "Crawling website")
}
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar, userAgent, ignoreRobots, minDelay)
if err != nil {
return fmt.Errorf("error downloading and packaging website: %w", err)
}
@@ -104,5 +107,8 @@ func NewCollectWebsiteCmd() *cobra.Command {
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
collectWebsiteCmd.PersistentFlags().String("user-agent", "Borg/1.0", "Custom user agent string")
collectWebsiteCmd.PersistentFlags().Bool("ignore-robots", false, "Ignore robots.txt")
collectWebsiteCmd.PersistentFlags().Duration("min-delay", 0, "Minimum delay between requests")
return collectWebsiteCmd
}

@@ -7,6 +7,8 @@ import (
"testing"
"github.com/Snider/Borg/pkg/datanode"
"time"
"github.com/Snider/Borg/pkg/website"
"github.com/schollz/progressbar/v3"
)
@@ -14,7 +16,7 @@ import (
func TestCollectWebsiteCmd_Good(t *testing.T) {
// Mock the website downloader
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
return datanode.New(), nil
}
defer func() {
@@ -35,7 +37,7 @@ func TestCollectWebsiteCmd_Good(t *testing.T) {
func TestCollectWebsiteCmd_Bad(t *testing.T) {
// Mock the website downloader to return an error
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
return nil, fmt.Errorf("website error")
}
defer func() {

@@ -11,7 +11,7 @@ func main() {
log.Println("Collecting website...")
// Download and package the website.
dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil)
dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil, "Borg/1.0", false, 0)
if err != nil {
log.Fatalf("Failed to collect website: %v", err)
}

pkg/robots/robots.go (new file, 112 lines)

@@ -0,0 +1,112 @@
package robots
import (
"path"
"strconv"
"strings"
"time"
)
// RobotsData holds the parsed robots.txt data for a specific user-agent.
type RobotsData struct {
Disallow []string
CrawlDelay time.Duration
}
// IsAllowed checks if a given path is allowed by the robots.txt rules.
func (r *RobotsData) IsAllowed(p string) bool {
// A more complete implementation would handle wildcards.
// This is a simple path prefix match.
for _, rule := range r.Disallow {
if rule == "" {
// An empty Disallow rule means nothing is disallowed by this rule.
continue
}
if rule == "/" {
// Disallow: / means disallow everything.
return false
}
if strings.HasPrefix(p, rule) {
return false
}
}
return true
}
// Parse parses the content of a robots.txt file for a specific user-agent.
func Parse(content []byte, userAgent string) (*RobotsData, error) {
lines := strings.Split(string(content), "\n")
rules := make(map[string]*RobotsData)
var currentUAs []string
lastWasUA := false
for _, line := range lines {
line = strings.TrimSpace(line)
if idx := strings.Index(line, "#"); idx != -1 {
line = line[:idx]
}
if line == "" {
continue
}
parts := strings.SplitN(line, ":", 2)
if len(parts) != 2 {
continue
}
key := strings.ToLower(strings.TrimSpace(parts[0]))
value := strings.TrimSpace(parts[1])
switch key {
case "user-agent":
if !lastWasUA {
currentUAs = []string{} // New group
}
currentUAs = append(currentUAs, strings.ToLower(value))
lastWasUA = true
case "disallow", "crawl-delay":
if len(currentUAs) == 0 {
continue // Rule without a user-agent
}
for _, ua := range currentUAs {
if rules[ua] == nil {
rules[ua] = &RobotsData{}
}
if key == "disallow" {
rules[ua].Disallow = append(rules[ua].Disallow, path.Clean("/"+value))
} else if key == "crawl-delay" {
if delay, err := strconv.ParseFloat(value, 64); err == nil {
rules[ua].CrawlDelay = time.Duration(delay * float64(time.Second))
}
}
}
lastWasUA = false
default:
lastWasUA = false
}
}
lowerUserAgent := strings.ToLower(userAgent)
// Look for most specific match.
bestMatch := ""
for ua := range rules {
if strings.Contains(lowerUserAgent, ua) {
if len(ua) > len(bestMatch) {
bestMatch = ua
}
}
}
if bestMatch != "" {
return rules[bestMatch], nil
}
// Fallback to wildcard.
if data, ok := rules["*"]; ok {
return data, nil
}
return &RobotsData{}, nil
}
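
The in-code comment in `IsAllowed` above points out that only a simple prefix match is done and wildcards are not handled. If `*`/`$` patterns ever need to be honoured, one possible extension, purely a sketch and not part of this commit (`ruleToRegexp` is a hypothetical helper), is to translate each Disallow value into an anchored regular expression:

```go
package robots

import (
	"regexp"
	"strings"
)

// ruleToRegexp converts a Disallow value that may contain '*' wildcards and a
// trailing '$' end-of-path anchor into a regexp anchored at the path start,
// e.g. "/private/*.html$" -> "^/private/.*\.html$".
// Hypothetical helper: the committed IsAllowed only does a prefix match.
func ruleToRegexp(rule string) (*regexp.Regexp, error) {
	anchorEnd := strings.HasSuffix(rule, "$")
	rule = strings.TrimSuffix(rule, "$")

	parts := strings.Split(rule, "*")
	for i, p := range parts {
		parts[i] = regexp.QuoteMeta(p) // escape '.', '?', etc. in literal segments
	}
	pattern := "^" + strings.Join(parts, ".*")
	if anchorEnd {
		pattern += "$"
	}
	return regexp.Compile(pattern)
}
```

An `IsAllowed` built on top of this would return false whenever any compiled rule matches the request path.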

pkg/robots/robots_test.go (new file, 125 lines)

@@ -0,0 +1,125 @@
package robots
import (
"testing"
"time"
)
func TestParse(t *testing.T) {
testCases := []struct {
name string
content string
userAgent string
expected *RobotsData
expectedErr bool
}{
{
name: "Specific user agent",
content: `
User-agent: BorgBot
Disallow: /private/
Crawl-delay: 2
`,
userAgent: "BorgBot/1.0",
expected: &RobotsData{
Disallow: []string{"/private"},
CrawlDelay: 2 * time.Second,
},
},
{
name: "Wildcard user agent",
content: `
User-agent: *
Disallow: /admin/
`,
userAgent: "AnotherBot",
expected: &RobotsData{
Disallow: []string{"/admin"},
},
},
{
name: "Multiple disallow rules",
content: `
User-agent: *
Disallow: /admin/
Disallow: /login
`,
userAgent: "AnyBot",
expected: &RobotsData{
Disallow: []string{"/admin", "/login"},
},
},
{
name: "No rules for user agent",
content: `
User-agent: GoogleBot
Disallow: /
`,
userAgent: "MyBot",
expected: &RobotsData{},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
robotsData, err := Parse([]byte(tc.content), tc.userAgent)
if (err != nil) != tc.expectedErr {
t.Fatalf("Parse() error = %v, wantErr %v", err, tc.expectedErr)
}
if len(robotsData.Disallow) != len(tc.expected.Disallow) {
t.Fatalf("expected %d disallow rules, got %d", len(tc.expected.Disallow), len(robotsData.Disallow))
}
for i, rule := range tc.expected.Disallow {
if robotsData.Disallow[i] != rule {
t.Errorf("expected disallow rule %s, got %s", rule, robotsData.Disallow[i])
}
}
if robotsData.CrawlDelay != tc.expected.CrawlDelay {
t.Errorf("expected crawl delay %v, got %v", tc.expected.CrawlDelay, robotsData.CrawlDelay)
}
})
}
}
func TestIsAllowed(t *testing.T) {
testCases := []struct {
name string
robotsData *RobotsData
path string
allowed bool
}{
{
name: "Path is disallowed",
robotsData: &RobotsData{
Disallow: []string{"/private"},
},
path: "/private/page.html",
allowed: false,
},
{
name: "Path is allowed",
robotsData: &RobotsData{
Disallow: []string{"/private"},
},
path: "/public/page.html",
allowed: true,
},
{
name: "No rules",
robotsData: &RobotsData{},
path: "/any/page.html",
allowed: true,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
if allowed := tc.robotsData.IsAllowed(tc.path); allowed != tc.allowed {
t.Errorf("IsAllowed(%s) = %v, want %v", tc.path, allowed, tc.allowed)
}
})
}
}
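
As the cases above suggest, `Parse` groups consecutive `User-agent` lines, returns the group whose longest agent token appears in the caller's user-agent string, and otherwise falls back to the `*` group. A small usage sketch; the robots.txt content and user-agent strings are invented for illustration:

```go
package main

import (
	"fmt"

	"github.com/Snider/Borg/pkg/robots"
)

func main() {
	content := []byte(`
User-agent: BorgBot
User-agent: OtherBot
Disallow: /drafts/

User-agent: *
Disallow: /tmp/
Crawl-delay: 1
`)

	// "BorgBot/1.0" contains the token "borgbot", so the first group applies.
	specific, _ := robots.Parse(content, "BorgBot/1.0")
	fmt.Println(specific.IsAllowed("/drafts/post")) // false
	fmt.Println(specific.IsAllowed("/tmp/cache"))   // true

	// "curl/8.0" matches no named group, so the wildcard group applies.
	fallback, _ := robots.Parse(content, "curl/8.0")
	fmt.Println(fallback.IsAllowed("/tmp/cache")) // false
	fmt.Println(fallback.CrawlDelay)              // 1s
}
```

Matching is a case-insensitive substring test, so `BorgBot/1.0` selects the `BorgBot` group while `curl/8.0` gets the wildcard rules.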

@@ -8,9 +8,10 @@ import (
"strings"
"github.com/Snider/Borg/pkg/datanode"
"github.com/Snider/Borg/pkg/robots"
"github.com/schollz/progressbar/v3"
"golang.org/x/net/html"
"time"
)
var DownloadAndPackageWebsite = downloadAndPackageWebsite
@@ -24,6 +25,9 @@ type Downloader struct {
progressBar *progressbar.ProgressBar
client *http.Client
errors []error
robots *robots.RobotsData
userAgent string
minDelay time.Duration
}
// NewDownloader creates a new Downloader.
@@ -39,11 +43,12 @@ func NewDownloaderWithClient(maxDepth int, client *http.Client) *Downloader {
maxDepth: maxDepth,
client: client,
errors: make([]error, 0),
userAgent: "Borg/1.0",
}
}
// downloadAndPackageWebsite downloads a website and packages it into a DataNode.
func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
baseURL, err := url.Parse(startURL)
if err != nil {
return nil, err
@@ -52,6 +57,23 @@ func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.P
d := NewDownloader(maxDepth)
d.baseURL = baseURL
d.progressBar = bar
d.userAgent = userAgent
d.minDelay = minDelay
if !ignoreRobots {
robotsURL, err := baseURL.Parse("/robots.txt")
if err == nil {
resp, err := d.client.Get(robotsURL.String())
if err == nil && resp.StatusCode == http.StatusOK {
body, err := io.ReadAll(resp.Body)
resp.Body.Close()
if err == nil {
d.robots, _ = robots.Parse(body, d.userAgent)
}
}
}
}
d.crawl(startURL, 0)
if len(d.errors) > 0 {
@@ -69,12 +91,28 @@ func (d *Downloader) crawl(pageURL string, depth int) {
if depth > d.maxDepth || d.visited[pageURL] {
return
}
u, err := url.Parse(pageURL)
if err != nil {
d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", pageURL, err))
return
}
if d.robots != nil && !d.robots.IsAllowed(u.Path) {
return
}
d.visited[pageURL] = true
if d.progressBar != nil {
d.progressBar.Add(1)
}
resp, err := d.client.Get(pageURL)
d.delay()
req, _ := http.NewRequest("GET", pageURL, nil)
req.Header.Set("User-Agent", d.userAgent)
resp, err := d.client.Do(req)
if err != nil {
d.errors = append(d.errors, fmt.Errorf("Error getting %s: %w", pageURL, err))
return
@@ -136,12 +174,28 @@ func (d *Downloader) downloadAsset(assetURL string) {
if d.visited[assetURL] {
return
}
u, err := url.Parse(assetURL)
if err != nil {
d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", assetURL, err))
return
}
if d.robots != nil && !d.robots.IsAllowed(u.Path) {
return
}
d.visited[assetURL] = true
if d.progressBar != nil {
d.progressBar.Add(1)
}
resp, err := d.client.Get(assetURL)
d.delay()
req, _ := http.NewRequest("GET", assetURL, nil)
req.Header.Set("User-Agent", d.userAgent)
resp, err := d.client.Do(req)
if err != nil {
d.errors = append(d.errors, fmt.Errorf("Error getting asset %s: %w", assetURL, err))
return
@@ -163,6 +217,19 @@ func (d *Downloader) downloadAsset(assetURL string) {
d.dn.AddData(relPath, body)
}
func (d *Downloader) delay() {
var delay time.Duration
if d.robots != nil {
delay = d.robots.CrawlDelay
}
if d.minDelay > delay {
delay = d.minDelay
}
if delay > 0 {
time.Sleep(delay)
}
}
func (d *Downloader) getRelativePath(pageURL string) string {
u, err := url.Parse(pageURL)
if err != nil {

@@ -20,12 +20,12 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
defer server.Close()
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar)
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html", "page3.html"}
expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html"}
for _, file := range expectedFiles {
exists, err := dn.Exists(file)
if err != nil {
@@ -50,9 +50,31 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
}
}
func TestDownloadAndPackageWebsite_RespectsRobotsTxt(t *testing.T) {
server := newWebsiteTestServer()
defer server.Close()
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
// page3.html is disallowed by robots.txt, so it should not be present.
exists, _ := dn.Exists("page3.html")
if exists {
t.Error("page3.html should not have been downloaded due to robots.txt")
}
// page2.html is not disallowed, so it should be present.
exists, _ = dn.Exists("page2.html")
if !exists {
t.Error("page2.html should have been downloaded")
}
}
func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
t.Run("Invalid Start URL", func(t *testing.T) {
_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil)
_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil, "Borg/1.0", false, 0)
if err == nil {
t.Fatal("Expected an error for an invalid start URL, but got nil")
}
@@ -63,7 +85,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
}))
defer server.Close()
_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
if err == nil {
t.Fatal("Expected an error for a server error on the start URL, but got nil")
}
@@ -80,7 +102,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
}))
defer server.Close()
// We expect an error because the link is broken.
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
if err == nil {
t.Fatal("Expected an error for a broken link, but got nil")
}
@@ -99,7 +121,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
defer server.Close()
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
dn, err := DownloadAndPackageWebsite(server.URL, 1, bar) // Max depth of 1
dn, err := DownloadAndPackageWebsite(server.URL, 1, bar, "Borg/1.0", false, 0) // Max depth of 1
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
@@ -122,7 +144,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
fmt.Fprint(w, `<a href="http://externalsite.com/page.html">External</a>`)
}))
defer server.Close()
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
@@ -156,7 +178,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
// For now, we'll just test that it doesn't hang forever.
done := make(chan bool)
go func() {
_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
if err != nil && !strings.Contains(err.Error(), "context deadline exceeded") {
// We expect a timeout error, but other errors are failures.
t.Errorf("unexpected error: %v", err)
@@ -177,6 +199,9 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
func newWebsiteTestServer() *httptest.Server {
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/robots.txt":
w.Header().Set("Content-Type", "text/plain")
fmt.Fprint(w, "User-agent: *\nDisallow: /page3.html")
case "/":
w.Header().Set("Content-Type", "text/html")
fmt.Fprint(w, `