Compare commits: main...feat-robot (1 commit)
| Author | SHA1 | Date |
|---|---|---|
|  | 1d8ff02f5c |  |
11 changed files with 356 additions and 352 deletions
Deleted file:

@@ -1,333 +0,0 @@
```go
package cmd

import (
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strings"

	"github.com/Snider/Borg/pkg/compress"
	"github.com/Snider/Borg/pkg/datanode"
	"github.com/Snider/Borg/pkg/tim"
	"github.com/Snider/Borg/pkg/trix"
	"github.com/Snider/Borg/pkg/ui"

	"github.com/spf13/cobra"
)

type CollectLocalCmd struct {
	cobra.Command
}

// NewCollectLocalCmd creates a new collect local command
func NewCollectLocalCmd() *CollectLocalCmd {
	c := &CollectLocalCmd{}
	c.Command = cobra.Command{
		Use:   "local [directory]",
		Short: "Collect files from a local directory",
		Long: `Collect files from a local directory and store them in a DataNode.

If no directory is specified, the current working directory is used.

Examples:
  borg collect local
  borg collect local ./src
  borg collect local /path/to/project --output project.tar
  borg collect local . --format stim --password secret
  borg collect local . --exclude "*.log" --exclude "node_modules"`,
		Args: cobra.MaximumNArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			directory := "."
			if len(args) > 0 {
				directory = args[0]
			}

			outputFile, _ := cmd.Flags().GetString("output")
			format, _ := cmd.Flags().GetString("format")
			compression, _ := cmd.Flags().GetString("compression")
			password, _ := cmd.Flags().GetString("password")
			excludes, _ := cmd.Flags().GetStringSlice("exclude")
			includeHidden, _ := cmd.Flags().GetBool("hidden")
			respectGitignore, _ := cmd.Flags().GetBool("gitignore")

			finalPath, err := CollectLocal(directory, outputFile, format, compression, password, excludes, includeHidden, respectGitignore)
			if err != nil {
				return err
			}
			fmt.Fprintln(cmd.OutOrStdout(), "Files saved to", finalPath)
			return nil
		},
	}
	c.Flags().String("output", "", "Output file for the DataNode")
	c.Flags().String("format", "datanode", "Output format (datanode, tim, trix, or stim)")
	c.Flags().String("compression", "none", "Compression format (none, gz, or xz)")
	c.Flags().String("password", "", "Password for encryption (required for stim/trix format)")
	c.Flags().StringSlice("exclude", nil, "Patterns to exclude (can be specified multiple times)")
	c.Flags().Bool("hidden", false, "Include hidden files and directories")
	c.Flags().Bool("gitignore", true, "Respect .gitignore files (default: true)")
	return c
}

func init() {
	collectCmd.AddCommand(&NewCollectLocalCmd().Command)
}

// CollectLocal collects files from a local directory into a DataNode
func CollectLocal(directory string, outputFile string, format string, compression string, password string, excludes []string, includeHidden bool, respectGitignore bool) (string, error) {
	// Validate format
	if format != "datanode" && format != "tim" && format != "trix" && format != "stim" {
		return "", fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', 'trix', or 'stim')", format)
	}
	if (format == "stim" || format == "trix") && password == "" {
		return "", fmt.Errorf("password is required for %s format", format)
	}
	if compression != "none" && compression != "gz" && compression != "xz" {
		return "", fmt.Errorf("invalid compression: %s (must be 'none', 'gz', or 'xz')", compression)
	}

	// Resolve directory path
	absDir, err := filepath.Abs(directory)
	if err != nil {
		return "", fmt.Errorf("error resolving directory path: %w", err)
	}

	info, err := os.Stat(absDir)
	if err != nil {
		return "", fmt.Errorf("error accessing directory: %w", err)
	}
	if !info.IsDir() {
		return "", fmt.Errorf("not a directory: %s", absDir)
	}

	// Load gitignore patterns if enabled
	var gitignorePatterns []string
	if respectGitignore {
		gitignorePatterns = loadGitignore(absDir)
	}

	// Create DataNode and collect files
	dn := datanode.New()
	var fileCount int

	bar := ui.NewProgressBar(-1, "Scanning files")
	defer bar.Finish()

	err = filepath.WalkDir(absDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		// Get relative path
		relPath, err := filepath.Rel(absDir, path)
		if err != nil {
			return err
		}

		// Skip root
		if relPath == "." {
			return nil
		}

		// Skip hidden files/dirs unless explicitly included
		if !includeHidden && isHidden(relPath) {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}

		// Check gitignore patterns
		if respectGitignore && matchesGitignore(relPath, d.IsDir(), gitignorePatterns) {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}

		// Check exclude patterns
		if matchesExclude(relPath, excludes) {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}

		// Skip directories (they're implicit in DataNode)
		if d.IsDir() {
			return nil
		}

		// Read file content
		content, err := os.ReadFile(path)
		if err != nil {
			return fmt.Errorf("error reading %s: %w", relPath, err)
		}

		// Add to DataNode with forward slashes (tar convention)
		dn.AddData(filepath.ToSlash(relPath), content)
		fileCount++
		bar.Describe(fmt.Sprintf("Collected %d files", fileCount))

		return nil
	})

	if err != nil {
		return "", fmt.Errorf("error walking directory: %w", err)
	}

	if fileCount == 0 {
		return "", fmt.Errorf("no files found in %s", directory)
	}

	bar.Describe(fmt.Sprintf("Packaging %d files", fileCount))

	// Convert to output format
	var data []byte
	if format == "tim" {
		t, err := tim.FromDataNode(dn)
		if err != nil {
			return "", fmt.Errorf("error creating tim: %w", err)
		}
		data, err = t.ToTar()
		if err != nil {
			return "", fmt.Errorf("error serializing tim: %w", err)
		}
	} else if format == "stim" {
		t, err := tim.FromDataNode(dn)
		if err != nil {
			return "", fmt.Errorf("error creating tim: %w", err)
		}
		data, err = t.ToSigil(password)
		if err != nil {
			return "", fmt.Errorf("error encrypting stim: %w", err)
		}
	} else if format == "trix" {
		data, err = trix.ToTrix(dn, password)
		if err != nil {
			return "", fmt.Errorf("error serializing trix: %w", err)
		}
	} else {
		data, err = dn.ToTar()
		if err != nil {
			return "", fmt.Errorf("error serializing DataNode: %w", err)
		}
	}

	// Apply compression
	compressedData, err := compress.Compress(data, compression)
	if err != nil {
		return "", fmt.Errorf("error compressing data: %w", err)
	}

	// Determine output filename
	if outputFile == "" {
		baseName := filepath.Base(absDir)
		if baseName == "." || baseName == "/" {
			baseName = "local"
		}
		outputFile = baseName + "." + format
		if compression != "none" {
			outputFile += "." + compression
		}
	}

	err = os.WriteFile(outputFile, compressedData, 0644)
	if err != nil {
		return "", fmt.Errorf("error writing output file: %w", err)
	}

	return outputFile, nil
}

// isHidden checks if a path component starts with a dot
func isHidden(path string) bool {
	parts := strings.Split(filepath.ToSlash(path), "/")
	for _, part := range parts {
		if strings.HasPrefix(part, ".") {
			return true
		}
	}
	return false
}

// loadGitignore loads patterns from .gitignore if it exists
func loadGitignore(dir string) []string {
	var patterns []string

	gitignorePath := filepath.Join(dir, ".gitignore")
	content, err := os.ReadFile(gitignorePath)
	if err != nil {
		return patterns
	}

	lines := strings.Split(string(content), "\n")
	for _, line := range lines {
		line = strings.TrimSpace(line)
		// Skip empty lines and comments
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		patterns = append(patterns, line)
	}

	return patterns
}

// matchesGitignore checks if a path matches any gitignore pattern
func matchesGitignore(path string, isDir bool, patterns []string) bool {
	for _, pattern := range patterns {
		// Handle directory-only patterns
		if strings.HasSuffix(pattern, "/") {
			if !isDir {
				continue
			}
			pattern = strings.TrimSuffix(pattern, "/")
		}

		// Handle negation (simplified - just skip negated patterns)
		if strings.HasPrefix(pattern, "!") {
			continue
		}

		// Match against path components
		matched, _ := filepath.Match(pattern, filepath.Base(path))
		if matched {
			return true
		}

		// Also try matching the full path
		matched, _ = filepath.Match(pattern, path)
		if matched {
			return true
		}

		// Handle ** patterns (simplified)
		if strings.Contains(pattern, "**") {
			simplePattern := strings.ReplaceAll(pattern, "**", "*")
			matched, _ = filepath.Match(simplePattern, path)
			if matched {
				return true
			}
		}
	}
	return false
}

// matchesExclude checks if a path matches any exclude pattern
func matchesExclude(path string, excludes []string) bool {
	for _, pattern := range excludes {
		// Match against basename
		matched, _ := filepath.Match(pattern, filepath.Base(path))
		if matched {
			return true
		}

		// Match against full path
		matched, _ = filepath.Match(pattern, path)
		if matched {
			return true
		}
	}
	return false
}
```
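A note on the matching used by the deleted helpers above: `filepath.Match`'s `*` does not cross path separators, which is why `matchesGitignore` and `matchesExclude` each test both the basename and the full relative path. A minimal standalone sketch of that behavior (illustrative, not part of the diff):

```go
package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	// "*" in filepath.Match stops at the path separator,
	// so a bare "*.log" only matches a single path level.
	m1, _ := filepath.Match("*.log", "app.log")      // true
	m2, _ := filepath.Match("*.log", "logs/app.log") // false: "*" does not cross "/"

	// Matching the basename separately catches nested files,
	// which is what the deleted helpers did.
	m3, _ := filepath.Match("*.log", filepath.Base("logs/app.log")) // true

	fmt.Println(m1, m2, m3) // true false true
}
```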
```diff
@@ -38,6 +38,9 @@ func NewCollectWebsiteCmd() *cobra.Command {
 			format, _ := cmd.Flags().GetString("format")
 			compression, _ := cmd.Flags().GetString("compression")
 			password, _ := cmd.Flags().GetString("password")
+			userAgent, _ := cmd.Flags().GetString("user-agent")
+			ignoreRobots, _ := cmd.Flags().GetBool("ignore-robots")
+			minDelay, _ := cmd.Flags().GetDuration("min-delay")
 
 			if format != "datanode" && format != "tim" && format != "trix" {
 				return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@@ -51,7 +54,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
 				bar = ui.NewProgressBar(-1, "Crawling website")
 			}
 
-			dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
+			dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar, userAgent, ignoreRobots, minDelay)
 			if err != nil {
 				return fmt.Errorf("error downloading and packaging website: %w", err)
 			}
@@ -104,5 +107,8 @@ func NewCollectWebsiteCmd() *cobra.Command {
 	collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
 	collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
 	collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
+	collectWebsiteCmd.PersistentFlags().String("user-agent", "Borg/1.0", "Custom user agent string")
+	collectWebsiteCmd.PersistentFlags().Bool("ignore-robots", false, "Ignore robots.txt")
+	collectWebsiteCmd.PersistentFlags().Duration("min-delay", 0, "Minimum delay between requests")
 	return collectWebsiteCmd
 }
```
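The new min-delay flag is read with `GetDuration`, so on the command line it should accept Go duration strings; cobra's pflag parses these with `time.ParseDuration` (an assumption based on standard cobra/pflag behavior, not stated in the diff). A small illustration of that parsing:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Values a user might pass to --min-delay; pflag-style Duration
	// flags accept anything time.ParseDuration accepts.
	for _, v := range []string{"500ms", "2s", "1m30s"} {
		d, err := time.ParseDuration(v)
		if err != nil {
			panic(err)
		}
		fmt.Printf("--min-delay %s -> %v\n", v, d)
	}
}
```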
```diff
@@ -7,6 +7,8 @@ import (
 	"testing"
 
+	"github.com/Snider/Borg/pkg/datanode"
+	"time"
 
 	"github.com/Snider/Borg/pkg/website"
 	"github.com/schollz/progressbar/v3"
 )
@@ -14,7 +16,7 @@ import (
 func TestCollectWebsiteCmd_Good(t *testing.T) {
 	// Mock the website downloader
 	oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
-	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
 		return datanode.New(), nil
 	}
 	defer func() {
@@ -35,7 +37,7 @@ func TestCollectWebsiteCmd_Good(t *testing.T) {
 func TestCollectWebsiteCmd_Bad(t *testing.T) {
 	// Mock the website downloader to return an error
 	oldDownloadAndPackageWebsite := website.DownloadAndPackageWebsite
-	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+	website.DownloadAndPackageWebsite = func(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
 		return nil, fmt.Errorf("website error")
 	}
 	defer func() {
```
```diff
@@ -11,7 +11,7 @@ func main() {
 	log.Println("Collecting website...")
 
 	// Download and package the website.
-	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil)
+	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil, "Borg/1.0", false, 0)
 	if err != nil {
 		log.Fatalf("Failed to collect website: %v", err)
 	}
```
BIN  examples/demo-sample.smsg  (new file; binary file not shown)

go.mod  (2 changes)

```diff
@@ -60,7 +60,7 @@ require (
 	github.com/wailsapp/go-webview2 v1.0.22 // indirect
 	github.com/wailsapp/mimetype v1.4.1 // indirect
 	github.com/xanzy/ssh-agent v0.3.3 // indirect
-	golang.org/x/crypto v0.45.0 // indirect
+	golang.org/x/crypto v0.44.0 // indirect
 	golang.org/x/sys v0.38.0 // indirect
 	golang.org/x/term v0.37.0 // indirect
 	golang.org/x/text v0.31.0 // indirect
```
go.sum  (4 changes)

```diff
@@ -155,8 +155,8 @@ github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
-golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q=
-golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4=
+golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU=
+golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc=
 golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
 golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
 golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
```
pkg/robots/robots.go  (new file, 112 lines)

@@ -0,0 +1,112 @@
```go
package robots

import (
	"path"
	"strconv"
	"strings"
	"time"
)

// RobotsData holds the parsed robots.txt data for a specific user-agent.
type RobotsData struct {
	Disallow   []string
	CrawlDelay time.Duration
}

// IsAllowed checks if a given path is allowed by the robots.txt rules.
func (r *RobotsData) IsAllowed(p string) bool {
	// A more complete implementation would handle wildcards.
	// This is a simple path prefix match.
	for _, rule := range r.Disallow {
		if rule == "" {
			// An empty Disallow rule means nothing is disallowed by this rule.
			continue
		}
		if rule == "/" {
			// Disallow: / means disallow everything.
			return false
		}
		if strings.HasPrefix(p, rule) {
			return false
		}
	}
	return true
}

// Parse parses the content of a robots.txt file for a specific user-agent.
func Parse(content []byte, userAgent string) (*RobotsData, error) {
	lines := strings.Split(string(content), "\n")

	rules := make(map[string]*RobotsData)
	var currentUAs []string
	lastWasUA := false

	for _, line := range lines {
		line = strings.TrimSpace(line)
		if idx := strings.Index(line, "#"); idx != -1 {
			line = line[:idx]
		}
		if line == "" {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		key := strings.ToLower(strings.TrimSpace(parts[0]))
		value := strings.TrimSpace(parts[1])

		switch key {
		case "user-agent":
			if !lastWasUA {
				currentUAs = []string{} // New group
			}
			currentUAs = append(currentUAs, strings.ToLower(value))
			lastWasUA = true
		case "disallow", "crawl-delay":
			if len(currentUAs) == 0 {
				continue // Rule without a user-agent
			}

			for _, ua := range currentUAs {
				if rules[ua] == nil {
					rules[ua] = &RobotsData{}
				}
				if key == "disallow" {
					rules[ua].Disallow = append(rules[ua].Disallow, path.Clean("/"+value))
				} else if key == "crawl-delay" {
					if delay, err := strconv.ParseFloat(value, 64); err == nil {
						rules[ua].CrawlDelay = time.Duration(delay * float64(time.Second))
					}
				}
			}
			lastWasUA = false
		default:
			lastWasUA = false
		}
	}

	lowerUserAgent := strings.ToLower(userAgent)

	// Look for most specific match.
	bestMatch := ""
	for ua := range rules {
		if strings.Contains(lowerUserAgent, ua) {
			if len(ua) > len(bestMatch) {
				bestMatch = ua
			}
		}
	}
	if bestMatch != "" {
		return rules[bestMatch], nil
	}

	// Fallback to wildcard.
	if data, ok := rules["*"]; ok {
		return data, nil
	}

	return &RobotsData{}, nil
}
```
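A quick sketch of how the new package is driven, using a hypothetical robots.txt body; `Parse` picks the most specific user-agent group (falling back to `*`) and `IsAllowed` does a prefix match against that group's Disallow rules:

```go
package main

import (
	"fmt"

	"github.com/Snider/Borg/pkg/robots"
)

func main() {
	// Hypothetical robots.txt body for illustration.
	body := []byte("User-agent: BorgBot\nDisallow: /private/\nCrawl-delay: 1.5\n\nUser-agent: *\nDisallow: /admin/\n")

	// "BorgBot/1.0" contains "borgbot", so the specific group wins over "*".
	data, err := robots.Parse(body, "BorgBot/1.0")
	if err != nil {
		panic(err)
	}

	fmt.Println(data.IsAllowed("/private/page.html")) // false: prefix match on "/private"
	fmt.Println(data.IsAllowed("/admin/"))            // true: the "*" group was not selected
	fmt.Println(data.CrawlDelay)                      // 1.5s
}
```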
pkg/robots/robots_test.go  (new file, 125 lines)

@@ -0,0 +1,125 @@
```go
package robots

import (
	"testing"
	"time"
)

func TestParse(t *testing.T) {
	testCases := []struct {
		name        string
		content     string
		userAgent   string
		expected    *RobotsData
		expectedErr bool
	}{
		{
			name: "Specific user agent",
			content: `
User-agent: BorgBot
Disallow: /private/
Crawl-delay: 2
`,
			userAgent: "BorgBot/1.0",
			expected: &RobotsData{
				Disallow:   []string{"/private"},
				CrawlDelay: 2 * time.Second,
			},
		},
		{
			name: "Wildcard user agent",
			content: `
User-agent: *
Disallow: /admin/
`,
			userAgent: "AnotherBot",
			expected: &RobotsData{
				Disallow: []string{"/admin"},
			},
		},
		{
			name: "Multiple disallow rules",
			content: `
User-agent: *
Disallow: /admin/
Disallow: /login
`,
			userAgent: "AnyBot",
			expected: &RobotsData{
				Disallow: []string{"/admin", "/login"},
			},
		},
		{
			name: "No rules for user agent",
			content: `
User-agent: GoogleBot
Disallow: /
`,
			userAgent: "MyBot",
			expected:  &RobotsData{},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			robotsData, err := Parse([]byte(tc.content), tc.userAgent)
			if (err != nil) != tc.expectedErr {
				t.Fatalf("Parse() error = %v, wantErr %v", err, tc.expectedErr)
			}

			if len(robotsData.Disallow) != len(tc.expected.Disallow) {
				t.Fatalf("expected %d disallow rules, got %d", len(tc.expected.Disallow), len(robotsData.Disallow))
			}

			for i, rule := range tc.expected.Disallow {
				if robotsData.Disallow[i] != rule {
					t.Errorf("expected disallow rule %s, got %s", rule, robotsData.Disallow[i])
				}
			}

			if robotsData.CrawlDelay != tc.expected.CrawlDelay {
				t.Errorf("expected crawl delay %v, got %v", tc.expected.CrawlDelay, robotsData.CrawlDelay)
			}
		})
	}
}

func TestIsAllowed(t *testing.T) {
	testCases := []struct {
		name       string
		robotsData *RobotsData
		path       string
		allowed    bool
	}{
		{
			name: "Path is disallowed",
			robotsData: &RobotsData{
				Disallow: []string{"/private"},
			},
			path:    "/private/page.html",
			allowed: false,
		},
		{
			name: "Path is allowed",
			robotsData: &RobotsData{
				Disallow: []string{"/private"},
			},
			path:    "/public/page.html",
			allowed: true,
		},
		{
			name:       "No rules",
			robotsData: &RobotsData{},
			path:       "/any/page.html",
			allowed:    true,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			if allowed := tc.robotsData.IsAllowed(tc.path); allowed != tc.allowed {
				t.Errorf("IsAllowed(%s) = %v, want %v", tc.path, allowed, tc.allowed)
			}
		})
	}
}
```
```diff
@@ -8,9 +8,10 @@ import (
 	"strings"
 
 	"github.com/Snider/Borg/pkg/datanode"
+	"github.com/Snider/Borg/pkg/robots"
 	"github.com/schollz/progressbar/v3"
 
 	"golang.org/x/net/html"
+	"time"
 )
 
 var DownloadAndPackageWebsite = downloadAndPackageWebsite
@@ -24,6 +25,9 @@ type Downloader struct {
 	progressBar *progressbar.ProgressBar
 	client      *http.Client
 	errors      []error
+	robots      *robots.RobotsData
+	userAgent   string
+	minDelay    time.Duration
 }
 
 // NewDownloader creates a new Downloader.
@@ -39,11 +43,12 @@ func NewDownloaderWithClient(maxDepth int, client *http.Client) *Downloader {
 		maxDepth: maxDepth,
 		client:   client,
 		errors:   make([]error, 0),
+		userAgent: "Borg/1.0",
 	}
 }
 
 // downloadAndPackageWebsite downloads a website and packages it into a DataNode.
-func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
+func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar, userAgent string, ignoreRobots bool, minDelay time.Duration) (*datanode.DataNode, error) {
 	baseURL, err := url.Parse(startURL)
 	if err != nil {
 		return nil, err
@@ -52,6 +57,23 @@ func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.P
 	d := NewDownloader(maxDepth)
 	d.baseURL = baseURL
 	d.progressBar = bar
+	d.userAgent = userAgent
+	d.minDelay = minDelay
+
+	if !ignoreRobots {
+		robotsURL, err := baseURL.Parse("/robots.txt")
+		if err == nil {
+			resp, err := d.client.Get(robotsURL.String())
+			if err == nil && resp.StatusCode == http.StatusOK {
+				body, err := io.ReadAll(resp.Body)
+				resp.Body.Close()
+				if err == nil {
+					d.robots, _ = robots.Parse(body, d.userAgent)
+				}
+			}
+		}
+	}
 
 	d.crawl(startURL, 0)
 
 	if len(d.errors) > 0 {
@@ -69,12 +91,28 @@ func (d *Downloader) crawl(pageURL string, depth int) {
 	if depth > d.maxDepth || d.visited[pageURL] {
 		return
 	}
 
+	u, err := url.Parse(pageURL)
+	if err != nil {
+		d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", pageURL, err))
+		return
+	}
+
+	if d.robots != nil && !d.robots.IsAllowed(u.Path) {
+		return
+	}
+
 	d.visited[pageURL] = true
 	if d.progressBar != nil {
 		d.progressBar.Add(1)
 	}
 
-	resp, err := d.client.Get(pageURL)
+	d.delay()
+
+	req, _ := http.NewRequest("GET", pageURL, nil)
+	req.Header.Set("User-Agent", d.userAgent)
+
+	resp, err := d.client.Do(req)
 	if err != nil {
 		d.errors = append(d.errors, fmt.Errorf("Error getting %s: %w", pageURL, err))
 		return
@@ -136,12 +174,28 @@ func (d *Downloader) downloadAsset(assetURL string) {
 	if d.visited[assetURL] {
 		return
 	}
 
+	u, err := url.Parse(assetURL)
+	if err != nil {
+		d.errors = append(d.errors, fmt.Errorf("invalid URL %s: %w", assetURL, err))
+		return
+	}
+
+	if d.robots != nil && !d.robots.IsAllowed(u.Path) {
+		return
+	}
+
 	d.visited[assetURL] = true
 	if d.progressBar != nil {
 		d.progressBar.Add(1)
 	}
 
-	resp, err := d.client.Get(assetURL)
+	d.delay()
+
+	req, _ := http.NewRequest("GET", assetURL, nil)
+	req.Header.Set("User-Agent", d.userAgent)
+
+	resp, err := d.client.Do(req)
 	if err != nil {
 		d.errors = append(d.errors, fmt.Errorf("Error getting asset %s: %w", assetURL, err))
 		return
@@ -163,6 +217,19 @@ func (d *Downloader) downloadAsset(assetURL string) {
 	d.dn.AddData(relPath, body)
 }
 
+func (d *Downloader) delay() {
+	var delay time.Duration
+	if d.robots != nil {
+		delay = d.robots.CrawlDelay
+	}
+	if d.minDelay > delay {
+		delay = d.minDelay
+	}
+	if delay > 0 {
+		time.Sleep(delay)
+	}
+}
+
 func (d *Downloader) getRelativePath(pageURL string) string {
 	u, err := url.Parse(pageURL)
 	if err != nil {
```
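The new `delay()` helper sleeps for whichever is larger: the site's Crawl-delay or the user's --min-delay, so robots.txt can lengthen the pause between requests but never shorten it below the flag. A small standalone sketch of that selection, with illustrative values:

```go
package main

import (
	"fmt"
	"time"
)

// effectiveDelay mirrors the selection in Downloader.delay():
// the crawl pauses for whichever of the two values is larger.
func effectiveDelay(crawlDelay, minDelay time.Duration) time.Duration {
	delay := crawlDelay
	if minDelay > delay {
		delay = minDelay
	}
	return delay
}

func main() {
	// robots.txt says "Crawl-delay: 2", flag says --min-delay 500ms -> 2s wins.
	fmt.Println(effectiveDelay(2*time.Second, 500*time.Millisecond)) // 2s

	// No Crawl-delay (zero), --min-delay 500ms -> 500ms wins.
	fmt.Println(effectiveDelay(0, 500*time.Millisecond)) // 500ms
}
```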
```diff
@@ -20,12 +20,12 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
 	defer server.Close()
 
 	bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
-	dn, err := DownloadAndPackageWebsite(server.URL, 2, bar)
+	dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
 	if err != nil {
 		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
 	}
 
-	expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html", "page3.html"}
+	expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html"}
 	for _, file := range expectedFiles {
 		exists, err := dn.Exists(file)
 		if err != nil {
@@ -50,9 +50,31 @@ func TestDownloadAndPackageWebsite_Good(t *testing.T) {
 	}
 }
 
+func TestDownloadAndPackageWebsite_RespectsRobotsTxt(t *testing.T) {
+	server := newWebsiteTestServer()
+	defer server.Close()
+
+	bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
+	dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, "Borg/1.0", false, 0)
+	if err != nil {
+		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
+	}
+
+	// page3.html is disallowed by robots.txt, so it should not be present.
+	exists, _ := dn.Exists("page3.html")
+	if exists {
+		t.Error("page3.html should not have been downloaded due to robots.txt")
+	}
+	// page2.html is not disallowed, so it should be present.
+	exists, _ = dn.Exists("page2.html")
+	if !exists {
+		t.Error("page2.html should have been downloaded")
+	}
+}
+
 func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
 	t.Run("Invalid Start URL", func(t *testing.T) {
-		_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil)
+		_, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil, "Borg/1.0", false, 0)
 		if err == nil {
 			t.Fatal("Expected an error for an invalid start URL, but got nil")
 		}
@@ -63,7 +85,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
 			http.Error(w, "Internal Server Error", http.StatusInternalServerError)
 		}))
 		defer server.Close()
-		_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+		_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
 		if err == nil {
 			t.Fatal("Expected an error for a server error on the start URL, but got nil")
 		}
@@ -80,7 +102,7 @@ func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
 		}))
 		defer server.Close()
 		// We expect an error because the link is broken.
-		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
 		if err == nil {
 			t.Fatal("Expected an error for a broken link, but got nil")
 		}
@@ -99,7 +121,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 		defer server.Close()
 
 		bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
-		dn, err := DownloadAndPackageWebsite(server.URL, 1, bar) // Max depth of 1
+		dn, err := DownloadAndPackageWebsite(server.URL, 1, bar, "Borg/1.0", false, 0) // Max depth of 1
 		if err != nil {
 			t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
 		}
@@ -122,7 +144,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 			fmt.Fprint(w, `<a href="http://externalsite.com/page.html">External</a>`)
 		}))
 		defer server.Close()
-		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
 		if err != nil {
 			t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
 		}
@@ -156,7 +178,7 @@ func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
 		// For now, we'll just test that it doesn't hang forever.
 		done := make(chan bool)
 		go func() {
-			_, err := DownloadAndPackageWebsite(server.URL, 1, nil)
+			_, err := DownloadAndPackageWebsite(server.URL, 1, nil, "Borg/1.0", false, 0)
 			if err != nil && !strings.Contains(err.Error(), "context deadline exceeded") {
 				// We expect a timeout error, but other errors are failures.
 				t.Errorf("unexpected error: %v", err)
@@ -177,6 +199,9 @@
 func newWebsiteTestServer() *httptest.Server {
 	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		switch r.URL.Path {
+		case "/robots.txt":
+			w.Header().Set("Content-Type", "text/plain")
+			fmt.Fprint(w, "User-agent: *\nDisallow: /page3.html")
 		case "/":
 			w.Header().Set("Content-Type", "text/html")
 			fmt.Fprint(w, `
```