This commit introduces a deduplication cache to avoid re-downloading files across multiple collection jobs. Key changes include: - A new `pkg/cache` package that provides content-addressable storage using SHA256 hashes of the file content. - Integration of the cache into the `collect website` command. Downloads are now skipped if the content already exists in the cache. - The addition of `--no-cache` and `--cache-dir` flags to give users control over the caching behavior. - New `borg cache stats` and `borg cache clear` commands to allow users to manage the cache. - A performance improvement to the cache implementation, which now only writes the URL-to-hash index file once at the end of the collection process, rather than on every file download. - Centralized logic for determining the default cache directory, removing code duplication. - Improved error handling and refactored duplicated cache-checking logic in the website collector. - Added comprehensive unit tests for the new cache package and an integration test to verify that the website collector correctly uses the cache. The implementation of cache size limiting and LRU eviction is still pending and will be addressed in a future commit. Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
214 lines
4.8 KiB
Go
214 lines
4.8 KiB
Go
package website
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
|
|
"github.com/Snider/Borg/pkg/cache"
|
|
"github.com/Snider/Borg/pkg/datanode"
|
|
"github.com/schollz/progressbar/v3"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// DownloadAndPackageWebsite is the exported entry point for crawling a site.
// It is declared as a variable (rather than a plain function) so tests can
// swap in a stub implementation.
var DownloadAndPackageWebsite = downloadAndPackageWebsite
|
|
|
|
// Downloader is a recursive website downloader.
type Downloader struct {
	baseURL     *url.URL                 // root of the site; links on other hosts are skipped
	dn          *datanode.DataNode       // accumulates downloaded pages and assets by relative path
	visited     map[string]bool          // URLs already fetched (or attempted), to avoid re-downloading
	maxDepth    int                      // maximum recursion depth for following page links
	progressBar *progressbar.ProgressBar // optional; incremented once per download attempt
	client      *http.Client             // HTTP client used for all requests
	errors      []error                  // non-fatal errors collected during the crawl
	cache       *cache.Cache             // optional dedup cache; nil disables caching
}
|
|
|
|
// NewDownloader creates a new Downloader.
|
|
func NewDownloader(maxDepth int, cache *cache.Cache) *Downloader {
|
|
return NewDownloaderWithClient(maxDepth, http.DefaultClient, cache)
|
|
}
|
|
|
|
// NewDownloaderWithClient creates a new Downloader with a custom http.Client.
|
|
func NewDownloaderWithClient(maxDepth int, client *http.Client, cache *cache.Cache) *Downloader {
|
|
return &Downloader{
|
|
dn: datanode.New(),
|
|
visited: make(map[string]bool),
|
|
maxDepth: maxDepth,
|
|
client: client,
|
|
errors: make([]error, 0),
|
|
cache: cache,
|
|
}
|
|
}
|
|
|
|
// downloadAndPackageWebsite downloads a website and packages it into a DataNode.
|
|
func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar, cache *cache.Cache) (*datanode.DataNode, error) {
|
|
baseURL, err := url.Parse(startURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
d := NewDownloader(maxDepth, cache)
|
|
d.baseURL = baseURL
|
|
d.progressBar = bar
|
|
d.crawl(startURL, 0)
|
|
|
|
if len(d.errors) > 0 {
|
|
var errs []string
|
|
for _, e := range d.errors {
|
|
errs = append(errs, e.Error())
|
|
}
|
|
return nil, fmt.Errorf("failed to download website:\n%s", strings.Join(errs, "\n"))
|
|
}
|
|
|
|
return d.dn, nil
|
|
}
|
|
|
|
func (d *Downloader) crawl(pageURL string, depth int) {
|
|
if depth > d.maxDepth || d.visited[pageURL] {
|
|
return
|
|
}
|
|
|
|
body, contentType := d.download(pageURL)
|
|
if body == nil {
|
|
return
|
|
}
|
|
|
|
// Don't try to parse non-html content
|
|
if !strings.HasPrefix(contentType, "text/html") {
|
|
return
|
|
}
|
|
|
|
doc, err := html.Parse(strings.NewReader(string(body)))
|
|
if err != nil {
|
|
d.errors = append(d.errors, fmt.Errorf("Error parsing HTML of %s: %w", pageURL, err))
|
|
return
|
|
}
|
|
|
|
var f func(*html.Node)
|
|
f = func(n *html.Node) {
|
|
if n.Type == html.ElementNode {
|
|
for _, a := range n.Attr {
|
|
if a.Key == "href" || a.Key == "src" {
|
|
link, err := d.resolveURL(pageURL, a.Val)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if d.isLocal(link) {
|
|
if isAsset(link) {
|
|
d.downloadAsset(link)
|
|
} else {
|
|
d.crawl(link, depth+1)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
f(c)
|
|
}
|
|
}
|
|
f(doc)
|
|
}
|
|
|
|
func (d *Downloader) downloadAsset(assetURL string) {
|
|
if d.visited[assetURL] {
|
|
return
|
|
}
|
|
d.download(assetURL)
|
|
}
|
|
|
|
func (d *Downloader) download(pageURL string) ([]byte, string) {
|
|
d.visited[pageURL] = true
|
|
if d.progressBar != nil {
|
|
d.progressBar.Add(1)
|
|
}
|
|
|
|
// Check the cache first
|
|
if d.cache != nil {
|
|
data, ok, err := d.cache.Get(pageURL)
|
|
if err != nil {
|
|
d.errors = append(d.errors, fmt.Errorf("Error getting from cache %s: %w", pageURL, err))
|
|
// Don't return, as we can still try to download it
|
|
}
|
|
if ok {
|
|
relPath := d.getRelativePath(pageURL)
|
|
d.dn.AddData(relPath, data)
|
|
return data, "" // We don't know the content type from the cache
|
|
}
|
|
}
|
|
|
|
resp, err := d.client.Get(pageURL)
|
|
if err != nil {
|
|
d.errors = append(d.errors, fmt.Errorf("Error getting %s: %w", pageURL, err))
|
|
return nil, ""
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode >= 400 {
|
|
d.errors = append(d.errors, fmt.Errorf("bad status for %s: %s", pageURL, resp.Status))
|
|
return nil, ""
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
d.errors = append(d.errors, fmt.Errorf("Error reading body of %s: %w", pageURL, err))
|
|
return nil, ""
|
|
}
|
|
|
|
relPath := d.getRelativePath(pageURL)
|
|
d.dn.AddData(relPath, body)
|
|
|
|
// Add to cache
|
|
if d.cache != nil {
|
|
d.cache.Put(pageURL, body)
|
|
}
|
|
|
|
return body, resp.Header.Get("Content-Type")
|
|
}
|
|
|
|
func (d *Downloader) getRelativePath(pageURL string) string {
|
|
u, err := url.Parse(pageURL)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
path := strings.TrimPrefix(u.Path, "/")
|
|
if path == "" {
|
|
return "index.html"
|
|
}
|
|
return path
|
|
}
|
|
|
|
func (d *Downloader) resolveURL(base, ref string) (string, error) {
|
|
baseURL, err := url.Parse(base)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
refURL, err := url.Parse(ref)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return baseURL.ResolveReference(refURL).String(), nil
|
|
}
|
|
|
|
func (d *Downloader) isLocal(pageURL string) bool {
|
|
u, err := url.Parse(pageURL)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
return u.Hostname() == d.baseURL.Hostname()
|
|
}
|
|
|
|
// isAsset reports whether pageURL points at a static asset (stylesheet,
// script, image, or icon) rather than a crawlable page. The query string and
// fragment are ignored, so "/style.css?v=2" is still recognized as an asset;
// plain URLs behave exactly as before.
func isAsset(pageURL string) bool {
	// Strip "?query" / "#fragment" so suffix matching sees only the path.
	if i := strings.IndexAny(pageURL, "?#"); i >= 0 {
		pageURL = pageURL[:i]
	}
	for _, ext := range []string{".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico"} {
		if strings.HasSuffix(pageURL, ext) {
			return true
		}
	}
	return false
}
|