feat: add robots.txt support to website collector

Adds support for parsing and respecting robots.txt during website collection.

This change introduces the following features:
- Fetches and parses /robots.txt before crawling a website.
- Respects `Disallow` patterns to avoid crawling restricted areas.
- Honors the `Crawl-delay` directive to prevent hammering sites.
- Adds command-line flags to configure the behavior:
  - `--ignore-robots`: Skips fetching robots.txt and applies no robots rules.
  - `--user-agent`: Sets the User-Agent string sent with every request (default `Borg/1.0`).
  - `--min-delay`: Enforces a minimum delay between requests; the larger of this value and the site's `Crawl-delay` is used (see the sketch after this list).
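
In practice, the pause before each request is the site's `Crawl-delay`, raised to `--min-delay` whenever the flag asks for a longer wait. A minimal sketch of that rule, mirroring the new `delay()` helper in `pkg/website` (the `effectivePause` name and the sample values are illustrative only; assumes `time` is imported):

```go
// effectivePause mirrors the rule in the new delay() helper (pkg/website):
// the pause before each request is the site's Crawl-delay, raised to
// --min-delay whenever the flag asks for a longer wait.
func effectivePause(crawlDelay, minDelay time.Duration) time.Duration {
	if minDelay > crawlDelay {
		return minDelay
	}
	return crawlDelay
}

// effectivePause(2*time.Second, 500*time.Millisecond) == 2 * time.Second
// effectivePause(2*time.Second, 5*time.Second)        == 5 * time.Second
```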

The implementation includes a new `robots` package for parsing robots.txt files and integrates it into the existing website downloader. Tests have been added to verify the new functionality.
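
For callers of the Go API, `website.DownloadAndPackageWebsite` gains three trailing parameters. A minimal sketch of the updated call using the flag defaults (the URL and the 500ms delay are placeholder values):

```go
package main

import (
	"log"
	"time"

	"github.com/Snider/Borg/pkg/website"
)

func main() {
	// Crawl two levels deep, honour robots.txt, identify as "Borg/1.0",
	// and keep at least 500ms between requests.
	dn, err := website.DownloadAndPackageWebsite(
		"https://example.com", // start URL (placeholder)
		2,                     // max crawl depth
		nil,                   // no progress bar
		"Borg/1.0",            // --user-agent
		false,                 // --ignore-robots
		500*time.Millisecond,  // --min-delay
	)
	if err != nil {
		log.Fatalf("failed to collect website: %v", err)
	}
	_ = dn // hand the DataNode to the existing packaging/output path
}
```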

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
Author: google-labs-jules[bot]
Date:   2026-02-02 00:42:20 +00:00
Parent: cf2af53ed3
Commit: 1d8ff02f5c
7 changed files with 353 additions and 16 deletions

@@ -38,6 +38,9 @@ func NewCollectWebsiteCmd() *cobra.Command {
format, _ := cmd.Flags().GetString("format")
compression, _ := cmd.Flags().GetString("compression")
password, _ := cmd.Flags().GetString("password")
userAgent, _ := cmd.Flags().GetString("user-agent")
ignoreRobots, _ := cmd.Flags().GetBool("ignore-robots")
minDelay, _ := cmd.Flags().GetDuration("min-delay")
if format != "datanode" && format != "tim" && format != "trix" {
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@@ -51,7 +54,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
bar = ui.NewProgressBar(-1, "Crawling website")
}
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar, userAgent, ignoreRobots, minDelay)
if err != nil {
return fmt.Errorf("error downloading and packaging website: %w", err)
}
@@ -104,5 +107,8 @@ func NewCollectWebsiteCmd() *cobra.Command {
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
collectWebsiteCmd.PersistentFlags().String("user-agent", "Borg/1.0", "Custom user agent string")
collectWebsiteCmd.PersistentFlags().Bool("ignore-robots", false, "Ignore robots.txt")
collectWebsiteCmd.PersistentFlags().Duration("min-delay", 0, "Minimum delay between requests")
return collectWebsiteCmd
}

@@ -7,6 +7,8 @@ import (
"testing"
"github.com/Snider/Borg/pkg/datanode"
"time"
"github.com/Snider/Borg/pkg/website"
"github.com/schollz/progressbar/v3"
)
@@ -14,7 +16,7 @@ import (
func TestCollectWebsiteCmd_Good(t *testing.T) {
// Mock the website downloader
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
return datanode.New(), nil
}
defer func() {
@@ -35,7 +37,7 @@ func TestCollectWebsiteCmd_Good(t *testing.T) {
func TestCollectWebsiteCmd_Bad(t *testing.T) {
// Mock the website downloader to return an error
oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
return nil, fmt.Errorf("website error")
}
defer func() {

@@ -11,7 +11,7 @@ func main() {
log.Println("Collecting website...")
// Download and package the website.
dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil)
dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil, "Borg/1.0", false, 0)
if err != nil {
log.Fatalf("Failed to collect website: %v", err)
}

pkg/robots/robots.go (new file, 112 lines)

@@ -0,0 +1,112 @@
package robots
import (
"path"
"strconv"
"strings"
"time"
)
// RobotsData holds the parsed robots.txt data for a specific user-agent.
type RobotsData struct {
Disallow []string
CrawlDelay time.Duration
}
// IsAllowed checks if a given path is allowed by the robots.txt rules.
func (r *RobotsData) IsAllowed(p string) bool {
// A more complete implementation would handle wildcards.
// This is a simple path prefix match.
for _, rule := range r.Disallow {
if rule == "" {
// An empty Disallow rule means nothing is disallowed by this rule.
continue
}
if rule == "/" {
// Disallow: / means disallow everything.
return false
}
if strings.HasPrefix(p, rule) {
return false
}
}
return true
}
// Parse parses the content of a robots.txt file for a specific user-agent.
func Parse(content []byte, userAgent string) (*RobotsData, error) {
lines := strings.Split(string(content), "\n")
rules := make(map[string]*RobotsData)
var currentUAs []string
lastWasUA := false
for _, line := range lines {
line = strings.TrimSpace(line)
if idx := strings.Index(line, "#"); idx != -1 {
line = line[:idx]
}
if line == "" {
continue
}
parts := strings.SplitN(line, ":", 2)
if len(parts) != 2 {
continue
}
key := strings.ToLower(strings.TrimSpace(parts[0]))
value := strings.TrimSpace(parts[1])
switch key {
case "user-agent":
if !lastWasUA {
currentUAs = []string{} // New group
}
currentUAs = append(currentUAs, strings.ToLower(value))
lastWasUA = true
case "disallow", "crawl-delay":
if len(currentUAs) == 0 {
continue // Rule without a user-agent
}
for _, ua := range currentUAs {
if rules[ua] == nil {
rules[ua] = &RobotsData{}
}
if key == "disallow" {
rules[ua].Disallow = append(rules[ua].Disallow, path.Clean("/"+value))
} else if key == "crawl-delay" {
if delay, err := strconv.ParseFloat(value, 64); err == nil {
rules[ua].CrawlDelay = time.Duration(delay * float64(time.Second))
}
}
}
lastWasUA = false
default:
lastWasUA = false
}
}
lowerUserAgent := strings.ToLower(userAgent)
// Look for most specific match.
bestMatch := ""
for ua := range rules {
if strings.Contains(lowerUserAgent, ua) {
if len(ua) > len(bestMatch) {
bestMatch = ua
}
}
}
if bestMatch != "" {
return rules[bestMatch], nil
}
// Fallback to wildcard.
if data, ok := rules["*"]; ok {
return data, nil
}
return &RobotsData{}, nil
}
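
The in-code comment in `IsAllowed` above points out that only a simple prefix match is done and wildcards are not handled. If `*`/`$` patterns ever need to be honoured, one possible extension, purely a sketch and not part of this commit (`ruleToRegexp` is a hypothetical helper), is to translate each Disallow value into an anchored regular expression:

```go
package robots

import (
	"regexp"
	"strings"
)

// ruleToRegexp converts a Disallow value that may contain '*' wildcards and a
// trailing '$' end-of-path anchor into a regexp anchored at the path start,
// e.g. "/private/*.html$" -> "^/private/.*\.html$".
// Hypothetical helper: the committed IsAllowed only does a prefix match.
func ruleToRegexp(rule string) (*regexp.Regexp, error) {
	anchorEnd := strings.HasSuffix(rule, "$")
	rule = strings.TrimSuffix(rule, "$")

	parts := strings.Split(rule, "*")
	for i, p := range parts {
		parts[i] = regexp.QuoteMeta(p) // escape '.', '?', etc. in literal segments
	}
	pattern := "^" + strings.Join(parts, ".*")
	if anchorEnd {
		pattern += "$"
	}
	return regexp.Compile(pattern)
}
```

An `IsAllowed` built on top of this would return false whenever any compiled rule matches the request path.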

pkg/robots/robots_test.go (new file, 125 lines)

@@ -0,0 +1,125 @@
package robots
import (
"testing"
"time"
)
func TestParse(t *testing.T) {
testCases := []struct {
name string
content string
userAgent string
expected *RobotsData
expectedErr bool
}{
{
name: "Specific user agent",
content: `
User-agent: BorgBot
Disallow: /private/
Crawl-delay: 2
`,
userAgent: "BorgBot/1.0",
expected: &RobotsData{
Disallow: []string{"/private"},
CrawlDelay: 2 * time.Second,
},
},
{
name: "Wildcard user agent",
content: `
User-agent: *
Disallow: /admin/
`,
userAgent: "AnotherBot",
expected: &RobotsData{
Disallow: []string{"/admin"},
},
},
{
name: "Multiple disallow rules",
content: `
User-agent: *
Disallow: /admin/
Disallow: /login
`,
userAgent: "AnyBot",
expected: &RobotsData{
Disallow: []string{"/admin", "/login"},
},
},
{
name: "No rules for user agent",
content: `
User-agent: GoogleBot
Disallow: /
`,
userAgent: "MyBot",
expected: &RobotsData{},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
robotsData, err := Parse([]byte(tc.content), tc.userAgent)
if (err != nil) != tc.expectedErr {
t.Fatalf("Parse() error = %v, wantErr %v", err, tc.expectedErr)
}
if len(robotsData.Disallow) != len(tc.expected.Disallow) {
t.Fatalf("expected %d disallow rules, got %d", len(tc.expected.Disallow), len(robotsData.Disallow))
}
for i, rule := range tc.expected.Disallow {
if robotsData.Disallow[i] != rule {
t.Errorf("expected disallow rule %s, got %s", rule, robotsData.Disallow[i])
}
}
if robotsData.CrawlDelay != tc.expected.CrawlDelay {
t.Errorf("expected crawl delay %v, got %v", tc.expected.CrawlDelay, robotsData.CrawlDelay)
}
})
}
}
func TestIsAllowed(t *testing.T) {
testCases := []struct {
name string
robotsData *RobotsData
path string
allowed bool
}{
{
name: "Path is disallowed",
robotsData: &RobotsData{
Disallow: []string{"/private"},
},
path: "/private/page.html",
allowed: false,
},
{
name: "Path is allowed",
robotsData: &RobotsData{
Disallow: []string{"/private"},
},
path: "/public/page.html",
allowed: true,
},
{
name: "No rules",
robotsData: &RobotsData{},
path: "/any/page.html",
allowed: true,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
if allowed := tc.robotsData.IsAllowed(tc.path); allowed != tc.allowed {
t.Errorf("IsAllowed(%s) = %v, want %v", tc.path, allowed, tc.allowed)
}
})
}
}
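
As the cases above suggest, `Parse` groups consecutive `User-agent` lines, returns the group whose longest agent token appears in the caller's user-agent string, and otherwise falls back to the `*` group. A small usage sketch; the robots.txt content and user-agent strings are invented for illustration:

```go
package main

import (
	"fmt"

	"github.com/Snider/Borg/pkg/robots"
)

func main() {
	content := []byte(`
User-agent: BorgBot
User-agent: OtherBot
Disallow: /drafts/

User-agent: *
Disallow: /tmp/
Crawl-delay: 1
`)

	// "BorgBot/1.0" contains the token "borgbot", so the first group applies.
	specific, _ := robots.Parse(content, "BorgBot/1.0")
	fmt.Println(specific.IsAllowed("/drafts/post")) // false
	fmt.Println(specific.IsAllowed("/tmp/cache"))   // true

	// "curl/8.0" matches no named group, so the wildcard group applies.
	fallback, _ := robots.Parse(content, "curl/8.0")
	fmt.Println(fallback.IsAllowed("/tmp/cache")) // false
	fmt.Println(fallback.CrawlDelay)              // 1s
}
```

Matching is a case-insensitive substring test, so `BorgBot/1.0` selects the `BorgBot` group while `curl/8.0` gets the wildcard rules.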

@@ -8,9 +8,10 @@ import (
"strings"
"github.com/Snider/Borg/pkg/datanode"
"github.com/Snider/Borg/pkg/robots"
"github.com/schollz/progressbar/v3"
"golang.org/x/net/html"
"time"
)
var DownloadAndPackageWebsite = downloadAndPackageWebsite
@@ -24,6 +25,9 @@ type Downloader struct {
progressBar *progressbar.ProgressBar
client *http.Client
errors []error
robots *robots.RobotsData
userAgent string
minDelay time.Duration
}
// NewDownloader creates a new Downloader.
@@ -39,11 +43,12 @@ func NewDownloaderWithClient(maxDepth int, client *http.Client) *Downloader {
maxDepth: maxDepth,
client: client,
errors: make([]error, 0),
userAgent: "Borg/1.0",
}
}
// downloadAndPackageWebsite downloads a website and packages it into a DataNode.
func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
baseURL, err := url.Parse(startURL)
if err != nil {
return nil, err
@@ -52,6 +57,23 @@ func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.P
d := NewDownloader(maxDepth)
d.baseURL = baseURL
d.progressBar = bar
d.userAgent = userAgent
d.minDelay = minDelay
if !ignoreRobots {
robotsURL, err := baseURL.Parse("/robots.txt")
if err == nil {
resp, err := d.client.Get(robotsURL.String())
if err == nil && resp.StatusCode == http.StatusOK {
body, err := io.ReadAll(resp.Body)
resp.Body.Close()
if err == nil {
d.robots, _ = robots.Parse(body, d.userAgent)
}
}
}
}
d.crawl(startURL, 0)
if len(d.errors) > 0 {
@@ -69,12 +91,28 @@ func (d *Downloader) crawl(pageURL string, depth int) {
if depth > d.maxDepth || d.visited[pageURL] {
return
}
u, err := url.Parse(pageURL)
if err != nil {
d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", pageURL, err))
return
}
if d.robots != nil && !d.robots.IsAllowed(u.Path) {
return
}
d.visited[pageURL] = true
if d.progressBar != nil {
d.progressBar.Add(1)
}
resp, err := d.client.Get(pageURL)
d.delay()
req, _ := http.NewRequest("GET", pageURL, nil)
req.Header.Set("User-Agent", d.userAgent)
resp, err := d.client.Do(req)
if err != nil {
d.errors = append(d.errors, fmt.Errorf("Error getting %s: %w", pageURL, err))
return
@@ -136,12 +174,28 @@ func (d *Downloader) downloadAsset(assetURL string) {
if d.visited[assetURL] {
return
}
u, err := url.Parse(assetURL)
if err != nil {
d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", assetURL, err))
return
}
if d.robots != nil && !d.robots.IsAllowed(u.Path) {
return
}
d.visited[assetURL] = true
if d.progressBar != nil {
d.progressBar.Add(1)
}
resp, err := d.client.Get(assetURL)
d.delay()
req, _ := http.NewRequest("GET", assetURL, nil)
req.Header.Set("User-Agent", d.userAgent)
resp, err := d.client.Do(req)
if err != nil {
d.errors = append(d.errors, fmt.Errorf("Error getting asset %s: %w", assetURL, err))
return
@@ -163,6 +217,19 @@ func (d *Downloader) downloadAsset(assetURL string) {
d.dn.AddData(relPath, body)
}
func (d *Downloader) delay() {
var delay time.Duration
if d.robots != nil {
delay = d.robots.CrawlDelay
}
if d.minDelay > delay {
delay = d.minDelay
}
if delay > 0 {
time.Sleep(delay)
}
}
func (d *Downloader) getRelativePath(pageURL string) string {
u, err := url.Parse(pageURL)
if err != nil {

@@ -20,12 +20,12 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
defer server.Close()
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar)
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html", "page3.html"}
expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html"}
for _, file := range expectedFiles {
exists, err := dn.Exists(file)
if err != nil {
@@ -50,9 +50,31 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
}
}
func TestDownloadAndPackageWebsite_RespectsRobotsTxt(t *testing.T) {
server := newWebsiteTestServer()
defer server.Close()
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
// page3.html is disallowed by robots.txt, so it should not be present.
exists, _ := dn.Exists("page3.html")
if exists {
t.Error("page3.html should not have been downloaded due to robots.txt")
}
// page2.html is not disallowed, so it should be present.
exists, _ = dn.Exists("page2.html")
if !exists {
t.Error("page2.html should have been downloaded")
}
}
func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
t.Run("Invalid Start URL", func(t *testing.T) {
_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil)
_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil, "Borg/1.0", false, 0)
if err == nil {
t.Fatal("Expected an error for an invalid start URL, but got nil")
}
@@ -63,7 +85,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
}))
defer server.Close()
_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
if err == nil {
t.Fatal("Expected an error for a server error on the start URL, but got nil")
}
@@ -80,7 +102,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
}))
defer server.Close()
// We expect an error because the link is broken.
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
if err == nil {
t.Fatal("Expected an error for a broken link, but got nil")
}
@@ -99,7 +121,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
defer server.Close()
bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
dn, err := DownloadAndPackageWebsite(server.URL, 1, bar) // Max depth of 1
dn, err := DownloadAndPackageWebsite(server.URL, 1, bar, "Borg/1.0", false, 0) // Max depth of 1
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
@@ -122,7 +144,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
fmt.Fprint(w, `<a href="http://externalsite.com/page.html">External</a>`)
}))
defer server.Close()
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
if err != nil {
t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
}
@@ -156,7 +178,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
// For now, we'll just test that it doesn't hang forever.
done := make(chan bool)
go func() {
_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
if err != nil && !strings.Contains(err.Error(), "context deadline exceeded") {
// We expect a timeout error, but other errors are failures.
t.Errorf("unexpected error: %v", err)
@@ -177,6 +199,9 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
func newWebsiteTestServer() *httptest.Server {
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/robots.txt":
w.Header().Set("Content-Type", "text/plain")
fmt.Fprint(w, "User-agent: *\nDisallow: /page3.html")
case "/":
w.Header().Set("Content-Type", "text/html")
fmt.Fprint(w, `