Adds support for parsing and respecting robots.txt during website collection. This change introduces the following features:

- Fetches and parses /robots.txt before crawling a website.
- Respects `Disallow` patterns to avoid crawling restricted areas.
- Honors the `Crawl-delay` directive to prevent hammering sites.
- Adds command-line flags to configure the behavior:
  - `--ignore-robots`: Ignores robots.txt rules.
  - `--user-agent`: Sets a custom user-agent string.
  - `--min-delay`: Overrides the crawl-delay with a minimum value.

The implementation includes a new `robots` package for parsing robots.txt files and integrates it into the existing website downloader. Tests have been added to verify the new functionality.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
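As context for reviewers, here is a minimal sketch of how a downloader loop might consult the new `robots` package together with the flags described above. The import path, the `fetchRobots` helper, the default user-agent string, and the reading of `--min-delay` as a floor on the crawl delay are illustrative assumptions, not code from this change:

```go
package main

import (
	"flag"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"

	"example.com/collector/robots" // hypothetical import path for the new package
)

// fetchRobots downloads and parses /robots.txt for the site's origin.
// Any failure is treated as "no rules" so the crawl can proceed.
func fetchRobots(site *url.URL, userAgent string) *robots.RobotsData {
	resp, err := http.Get(site.Scheme + "://" + site.Host + "/robots.txt")
	if err != nil {
		return &robots.RobotsData{}
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return &robots.RobotsData{}
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return &robots.RobotsData{}
	}
	data, err := robots.Parse(body, userAgent)
	if err != nil {
		return &robots.RobotsData{}
	}
	return data
}

func main() {
	ignoreRobots := flag.Bool("ignore-robots", false, "ignore robots.txt rules")
	userAgent := flag.String("user-agent", "example-collector", "user-agent string")
	minDelay := flag.Duration("min-delay", 0, "minimum delay between requests")
	flag.Parse()

	site, _ := url.Parse("https://example.com/")
	rules := &robots.RobotsData{} // empty rules allow everything
	if !*ignoreRobots {
		rules = fetchRobots(site, *userAgent)
	}

	// Read --min-delay as a floor: use whichever of Crawl-delay and
	// --min-delay is larger between requests.
	delay := rules.CrawlDelay
	if *minDelay > delay {
		delay = *minDelay
	}

	for _, p := range []string{"/docs/", "/admin/"} {
		if !rules.IsAllowed(p) {
			fmt.Println("skipping disallowed path:", p)
			continue
		}
		fmt.Println("would fetch:", p)
		time.Sleep(delay)
	}
}
```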
package robots

import (
	"path"
	"strconv"
	"strings"
	"time"
)

// RobotsData holds the parsed robots.txt data for a specific user-agent.
type RobotsData struct {
	Disallow   []string
	CrawlDelay time.Duration
}

// IsAllowed checks if a given path is allowed by the robots.txt rules.
func (r *RobotsData) IsAllowed(p string) bool {
	// A more complete implementation would handle wildcards.
	// This is a simple path prefix match.
	for _, rule := range r.Disallow {
		if rule == "" {
			// An empty Disallow rule means nothing is disallowed by this rule.
			continue
		}
		if rule == "/" {
			// Disallow: / means disallow everything.
			return false
		}
		if strings.HasPrefix(p, rule) {
			return false
		}
	}
	return true
}

// Parse parses the content of a robots.txt file for a specific user-agent.
func Parse(content []byte, userAgent string) (*RobotsData, error) {
	lines := strings.Split(string(content), "\n")

	rules := make(map[string]*RobotsData)
	var currentUAs []string
	lastWasUA := false

	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Strip trailing comments.
		if idx := strings.Index(line, "#"); idx != -1 {
			line = line[:idx]
		}
		if line == "" {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		switch key {
		case "user-agent":
			if !lastWasUA {
				currentUAs = []string{} // New group
			}
			currentUAs = append(currentUAs, strings.ToLower(value))
			lastWasUA = true
		case "disallow", "crawl-delay":
			if len(currentUAs) == 0 {
				continue // Rule without a user-agent
			}

			for _, ua := range currentUAs {
				if rules[ua] == nil {
					rules[ua] = &RobotsData{}
				}
				if key == "disallow" {
					if value == "" {
						// An empty Disallow value allows everything; cleaning it
						// would turn it into "/" and disallow everything instead.
						continue
					}
					rules[ua].Disallow = append(rules[ua].Disallow, path.Clean("/"+value))
				} else if key == "crawl-delay" {
					if delay, err := strconv.ParseFloat(value, 64); err == nil {
						rules[ua].CrawlDelay = time.Duration(delay * float64(time.Second))
					}
				}
			}
			lastWasUA = false
		default:
			lastWasUA = false
		}
	}

	lowerUserAgent := strings.ToLower(userAgent)

	// Look for the most specific match: the longest group token contained
	// in the requested user-agent string.
	bestMatch := ""
	for ua := range rules {
		if strings.Contains(lowerUserAgent, ua) {
			if len(ua) > len(bestMatch) {
				bestMatch = ua
			}
		}
	}
	if bestMatch != "" {
		return rules[bestMatch], nil
	}

	// Fall back to the wildcard group.
	if data, ok := rules["*"]; ok {
		return data, nil
	}

	return &RobotsData{}, nil
}
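For reference, a small usage sketch of the parser above; the sample robots.txt content, the user-agent string, and the import path are invented for illustration:

```go
package main

import (
	"fmt"
	"log"

	"example.com/collector/robots" // hypothetical import path for the package above
)

func main() {
	content := []byte(`
User-agent: *
Disallow: /private/
Crawl-delay: 2

User-agent: example-collector
Disallow: /admin/
Crawl-delay: 0.5
`)

	// The longest group token contained in the user-agent string wins,
	// so "example-collector/1.0" selects the "example-collector" group
	// rather than the "*" fallback.
	data, err := robots.Parse(content, "example-collector/1.0")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(data.IsAllowed("/admin/panel")) // false: prefix-matches Disallow: /admin/
	fmt.Println(data.IsAllowed("/private/x"))   // true: rules from "*" are not merged in
	fmt.Println(data.CrawlDelay)                // 500ms
}
```

Because `Parse` returns only the most specific matching group, rules from the `*` group are not merged in; a caller that wants combined behavior would need to merge groups itself.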