Adds support for parsing and respecting robots.txt during website collection.

This change introduces the following features:

- Fetches and parses /robots.txt before crawling a website.
- Respects `Disallow` patterns to avoid crawling restricted areas.
- Honors the `Crawl-delay` directive to prevent hammering sites.
- Adds command-line flags to configure the behavior:
  - `--ignore-robots`: Ignores robots.txt rules.
  - `--user-agent`: Sets a custom user-agent string.
  - `--min-delay`: Overrides the crawl-delay with a minimum value.

The implementation includes a new `robots` package for parsing robots.txt files and integrates it into the existing website downloader. Tests have been added to verify the new functionality.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
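Since the change centers on the new `robots` package, a rough sketch of the parsing side may help illustrate the behavior described above: fetching /robots.txt, collecting the `Disallow` prefixes that apply to the crawler's user-agent, and reading a `Crawl-delay`. This is a simplified illustration only; the names (`robotssketch`, `Rules`, `FetchRobots`, `Allowed`) are placeholders and do not describe the actual API of the new package.

```go
package robotssketch

import (
	"bufio"
	"net/http"
	"strconv"
	"strings"
	"time"
)

// Rules holds the two directives this change cares about.
// (Placeholder type; the real `robots` package may look different.)
type Rules struct {
	Disallow   []string      // path prefixes that must not be crawled
	CrawlDelay time.Duration // delay requested by the site, zero if unset
}

// FetchRobots downloads <site>/robots.txt and keeps the Disallow and
// Crawl-delay lines that apply to the given user-agent (or to "*").
// Grouping is simplified here: a real parser matches the most specific
// User-agent record rather than toggling on every User-agent line.
func FetchRobots(site, userAgent string) (*Rules, error) {
	resp, err := http.Get(strings.TrimRight(site, "/") + "/robots.txt")
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// A missing robots.txt is treated as "no restrictions".
		return &Rules{}, nil
	}

	rules := &Rules{}
	applies := false
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		key, value, ok := strings.Cut(line, ":")
		if !ok {
			continue
		}
		key = strings.ToLower(strings.TrimSpace(key))
		value = strings.TrimSpace(value)
		switch key {
		case "user-agent":
			applies = value == "*" || strings.EqualFold(value, userAgent)
		case "disallow":
			if applies && value != "" {
				rules.Disallow = append(rules.Disallow, value)
			}
		case "crawl-delay":
			if applies {
				if secs, err := strconv.ParseFloat(value, 64); err == nil {
					rules.CrawlDelay = time.Duration(secs * float64(time.Second))
				}
			}
		}
	}
	return rules, scanner.Err()
}

// Allowed reports whether the given URL path is crawlable under the rules.
func (r *Rules) Allowed(path string) bool {
	for _, prefix := range r.Disallow {
		if strings.HasPrefix(path, prefix) {
			return false
		}
	}
	return true
}
```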
```go
package main

import (
	"log"
	"os"

	"github.com/Snider/Borg/pkg/website"
)

func main() {
	log.Println("Collecting website...")

	// Download and package the website.
	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil, "Borg/1.0", false, 0)
	if err != nil {
		log.Fatalf("Failed to collect website: %v", err)
	}

	// Serialize the DataNode to a tarball.
	tarball, err := dn.ToTar()
	if err != nil {
		log.Fatalf("Failed to serialize datanode to tar: %v", err)
	}

	// Write the tarball to a file.
	err = os.WriteFile("website.dat", tarball, 0644)
	if err != nil {
		log.Fatalf("Failed to write datanode file: %v", err)
	}

	log.Println("Successfully created website.dat")
}
```
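Judging by the flags added in this change, the trailing arguments to `DownloadAndPackageWebsite` above (the user-agent string, a boolean, and a numeric delay) presumably correspond to `--user-agent`, `--ignore-robots`, and `--min-delay`; the exact signature should be checked against the `website` package. For `--min-delay`, one plausible reading of "overrides the crawl-delay with a minimum value" is that the crawler pauses for the larger of the site's `Crawl-delay` and the configured minimum between requests. A minimal sketch under that assumption, continuing the placeholder package from the earlier sketch:

```go
package robotssketch

import "time"

// effectiveDelay returns the pause between two requests: the site's
// Crawl-delay is honored, but never shortened below the operator's
// configured minimum. (Illustrative helper, not part of the real package;
// the actual downloader may combine these values differently.)
func effectiveDelay(crawlDelay, minDelay time.Duration) time.Duration {
	if minDelay > crawlDelay {
		return minDelay
	}
	return crawlDelay
}
```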