Borg/pkg/robots/robots.go
google-labs-jules[bot] 1d8ff02f5c feat: add robots.txt support to website collector
Adds support for parsing and respecting robots.txt during website collection.

This change introduces the following features:
- Fetches and parses /robots.txt before crawling a website.
- Respects `Disallow` patterns to avoid crawling restricted areas.
- Honors the `Crawl-delay` directive to prevent hammering sites.
- Adds command-line flags to configure the behavior:
  - `--ignore-robots`: Ignores robots.txt rules.
  - `--user-agent`: Sets a custom user-agent string.
  - `--min-delay`: Overrides the crawl-delay with a minimum value.

The implementation includes a new `robots` package for parsing robots.txt files and integrates it into the existing website downloader. Tests have been added to verify the new functionality.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:42:20 +00:00
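The flags listed in the commit message are defined in the collector's command-line layer, which is not shown on this page. As a rough sketch only, assuming the standard library flag package and made-up variable names and defaults (one plausible reading of --min-delay is that the effective delay is the larger of the parsed Crawl-delay and the flag value), the wiring could look like this:

package main

import (
	"flag"
	"fmt"
	"time"
)

func main() {
	// Flag names follow the commit message; defaults and types are assumptions.
	ignoreRobots := flag.Bool("ignore-robots", false, "ignore robots.txt rules entirely")
	userAgent := flag.String("user-agent", "Borg", "user-agent sent with requests and matched against robots.txt groups")
	minDelay := flag.Duration("min-delay", 0, "minimum delay between requests, e.g. 500ms or 2s")
	flag.Parse()

	// Hypothetical combination of a parsed Crawl-delay with --min-delay:
	// the crawler waits for whichever is longer.
	crawlDelay := 2 * time.Second // stand-in for RobotsData.CrawlDelay
	effective := crawlDelay
	if *minDelay > effective {
		effective = *minDelay
	}
	fmt.Println(*ignoreRobots, *userAgent, effective)
}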


package robots

import (
	"path"
	"strconv"
	"strings"
	"time"
)

// RobotsData holds the parsed robots.txt data for a specific user-agent.
type RobotsData struct {
	Disallow   []string
	CrawlDelay time.Duration
}

// IsAllowed checks if a given path is allowed by the robots.txt rules.
func (r *RobotsData) IsAllowed(p string) bool {
	// A more complete implementation would handle wildcards.
	// This is a simple path prefix match.
	for _, rule := range r.Disallow {
		if rule == "" {
			// An empty Disallow rule means nothing is disallowed by this rule.
			continue
		}
		if rule == "/" {
			// Disallow: / means disallow everything.
			return false
		}
		if strings.HasPrefix(p, rule) {
			return false
		}
	}
	return true
}

// Parse parses the content of a robots.txt file for a specific user-agent.
func Parse(content []byte, userAgent string) (*RobotsData, error) {
	lines := strings.Split(string(content), "\n")
	rules := make(map[string]*RobotsData)
	var currentUAs []string
	lastWasUA := false

	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Strip trailing comments.
		if idx := strings.Index(line, "#"); idx != -1 {
			line = line[:idx]
		}
		if line == "" {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		switch key {
		case "user-agent":
			// Consecutive User-agent lines form a single group sharing the rules below.
			if !lastWasUA {
				currentUAs = []string{} // New group
			}
			currentUAs = append(currentUAs, strings.ToLower(value))
			lastWasUA = true
		case "disallow", "crawl-delay":
			if len(currentUAs) == 0 {
				continue // Rule without a user-agent
			}
			for _, ua := range currentUAs {
				if rules[ua] == nil {
					rules[ua] = &RobotsData{}
				}
				if key == "disallow" {
					// An empty Disallow value allows everything, so skip it here;
					// otherwise path.Clean would turn it into "/" and block the whole site.
					if value != "" {
						rules[ua].Disallow = append(rules[ua].Disallow, path.Clean("/"+value))
					}
				} else if key == "crawl-delay" {
					if delay, err := strconv.ParseFloat(value, 64); err == nil {
						rules[ua].CrawlDelay = time.Duration(delay * float64(time.Second))
					}
				}
			}
			lastWasUA = false
		default:
			lastWasUA = false
		}
	}

	lowerUserAgent := strings.ToLower(userAgent)

	// Look for the most specific (longest) matching user-agent token.
	bestMatch := ""
	for ua := range rules {
		if strings.Contains(lowerUserAgent, ua) && len(ua) > len(bestMatch) {
			bestMatch = ua
		}
	}
	if bestMatch != "" {
		return rules[bestMatch], nil
	}

	// Fall back to the wildcard group.
	if data, ok := rules["*"]; ok {
		return data, nil
	}
	return &RobotsData{}, nil
}
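
For context, here is a minimal sketch of how a caller such as the website downloader mentioned in the commit might use this package: fetch /robots.txt, parse it for the configured user-agent, skip disallowed paths, and sleep for the crawl delay between requests. Only Parse, RobotsData, IsAllowed, and CrawlDelay come from the file above; the import path, URLs, and error handling are assumptions.

package main

import (
	"fmt"
	"io"
	"net/http"
	"time"

	"borg/pkg/robots" // assumed import path for the package above
)

func main() {
	// Start from empty rules so a missing or unreadable robots.txt allows everything.
	data := &robots.RobotsData{}

	resp, err := http.Get("https://example.com/robots.txt")
	if err == nil {
		defer resp.Body.Close()
		if resp.StatusCode == http.StatusOK {
			if body, readErr := io.ReadAll(resp.Body); readErr == nil {
				if parsed, parseErr := robots.Parse(body, "Borg"); parseErr == nil {
					data = parsed
				}
			}
		}
	}

	for _, p := range []string{"/", "/admin/login", "/docs/index.html"} {
		if !data.IsAllowed(p) {
			fmt.Println("skipping disallowed path:", p)
			continue
		}
		fmt.Println("would fetch:", p)
		// Honour Crawl-delay between requests; a --min-delay flag could raise this floor.
		if data.CrawlDelay > 0 {
			time.Sleep(data.CrawlDelay)
		}
	}
}

In the real collector the paths would come from discovered links rather than a hard-coded list; the loop above only illustrates where IsAllowed and CrawlDelay fit into a crawl.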