Adds support for parsing and respecting robots.txt during website collection. This change introduces the following features:

- Fetches and parses /robots.txt before crawling a website.
- Respects `Disallow` patterns to avoid crawling restricted areas.
- Honors the `Crawl-delay` directive to prevent hammering sites.
- Adds command-line flags to configure the behavior:
  - `--ignore-robots`: Ignores robots.txt rules.
  - `--user-agent`: Sets a custom user-agent string.
  - `--min-delay`: Overrides the crawl-delay with a minimum value.

The implementation includes a new `robots` package for parsing robots.txt files and integrates it into the existing website downloader. Tests have been added to verify the new functionality.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
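As context for reviewers, here is a minimal sketch of how a downloader loop might consult the new `robots` package together with the flags described above. The import path, the `fetchRobots` helper, the default user-agent string, and the reading of `--min-delay` as a floor on the crawl delay are illustrative assumptions, not code from this change:

```go
package main

import (
	"flag"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"

	"example.com/collector/robots" // hypothetical import path for the new package
)

// fetchRobots downloads and parses /robots.txt for the site's origin.
// Any failure is treated as "no rules" so the crawl can proceed.
func fetchRobots(site *url.URL, userAgent string) *robots.RobotsData {
	resp, err := http.Get(site.Scheme + "://" + site.Host + "/robots.txt")
	if err != nil {
		return &robots.RobotsData{}
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return &robots.RobotsData{}
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return &robots.RobotsData{}
	}
	data, err := robots.Parse(body, userAgent)
	if err != nil {
		return &robots.RobotsData{}
	}
	return data
}

func main() {
	ignoreRobots := flag.Bool("ignore-robots", false, "ignore robots.txt rules")
	userAgent := flag.String("user-agent", "example-collector", "user-agent string")
	minDelay := flag.Duration("min-delay", 0, "minimum delay between requests")
	flag.Parse()

	site, _ := url.Parse("https://example.com/")
	rules := &robots.RobotsData{} // empty rules allow everything
	if !*ignoreRobots {
		rules = fetchRobots(site, *userAgent)
	}

	// Read --min-delay as a floor: use whichever of Crawl-delay and
	// --min-delay is larger between requests.
	delay := rules.CrawlDelay
	if *minDelay > delay {
		delay = *minDelay
	}

	for _, p := range []string{"/docs/", "/admin/"} {
		if !rules.IsAllowed(p) {
			fmt.Println("skipping disallowed path:", p)
			continue
		}
		fmt.Println("would fetch:", p)
		time.Sleep(delay)
	}
}
```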
package robots

import (
	"path"
	"strconv"
	"strings"
	"time"
)

// RobotsData holds the parsed robots.txt data for a specific user-agent.
type RobotsData struct {
	Disallow   []string
	CrawlDelay time.Duration
}

// IsAllowed checks if a given path is allowed by the robots.txt rules.
func (r *RobotsData) IsAllowed(p string) bool {
	// A more complete implementation would handle wildcards.
	// This is a simple path prefix match.
	for _, rule := range r.Disallow {
		if rule == "" {
			// An empty Disallow rule means nothing is disallowed by this rule.
			continue
		}
		if rule == "/" {
			// Disallow: / means disallow everything.
			return false
		}
		if strings.HasPrefix(p, rule) {
			return false
		}
	}
	return true
}

// Parse parses the content of a robots.txt file for a specific user-agent.
func Parse(content []byte, userAgent string) (*RobotsData, error) {
	lines := strings.Split(string(content), "\n")

	rules := make(map[string]*RobotsData)
	var currentUAs []string
	lastWasUA := false

	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Strip trailing comments.
		if idx := strings.Index(line, "#"); idx != -1 {
			line = line[:idx]
		}
		if line == "" {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		switch key {
		case "user-agent":
			if !lastWasUA {
				currentUAs = []string{} // New group
			}
			currentUAs = append(currentUAs, strings.ToLower(value))
			lastWasUA = true
		case "disallow", "crawl-delay":
			if len(currentUAs) == 0 {
				continue // Rule without a user-agent
			}

			for _, ua := range currentUAs {
				if rules[ua] == nil {
					rules[ua] = &RobotsData{}
				}
				if key == "disallow" {
					if value == "" {
						// An empty Disallow value allows everything; cleaning it
						// would turn it into "/" and disallow everything instead.
						continue
					}
					rules[ua].Disallow = append(rules[ua].Disallow, path.Clean("/"+value))
				} else if key == "crawl-delay" {
					if delay, err := strconv.ParseFloat(value, 64); err == nil {
						rules[ua].CrawlDelay = time.Duration(delay * float64(time.Second))
					}
				}
			}
			lastWasUA = false
		default:
			lastWasUA = false
		}
	}

	lowerUserAgent := strings.ToLower(userAgent)

	// Look for the most specific match: the longest group token contained
	// in the requested user-agent string.
	bestMatch := ""
	for ua := range rules {
		if strings.Contains(lowerUserAgent, ua) {
			if len(ua) > len(bestMatch) {
				bestMatch = ua
			}
		}
	}
	if bestMatch != "" {
		return rules[bestMatch], nil
	}

	// Fall back to the wildcard group.
	if data, ok := rules["*"]; ok {
		return data, nil
	}

	return &RobotsData{}, nil
}
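For reference, a small usage sketch of the parser above; the sample robots.txt content, the user-agent string, and the import path are invented for illustration:

```go
package main

import (
	"fmt"
	"log"

	"example.com/collector/robots" // hypothetical import path for the package above
)

func main() {
	content := []byte(`
User-agent: *
Disallow: /private/
Crawl-delay: 2

User-agent: example-collector
Disallow: /admin/
Crawl-delay: 0.5
`)

	// The longest group token contained in the user-agent string wins,
	// so "example-collector/1.0" selects the "example-collector" group
	// rather than the "*" fallback.
	data, err := robots.Parse(content, "example-collector/1.0")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(data.IsAllowed("/admin/panel")) // false: prefix-matches Disallow: /admin/
	fmt.Println(data.IsAllowed("/private/x"))   // true: rules from "*" are not merged in
	fmt.Println(data.CrawlDelay)                // 500ms
}
```

Because `Parse` returns only the most specific matching group, rules from the `*` group are not merged in; a caller that wants combined behavior would need to merge groups itself.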