Borg/pkg/website/website.go
google-labs-jules[bot] 07de6d5877 feat: Implement collection hooks/plugins system
Adds a flexible hook system to the `borg collect` commands, allowing users to run custom scripts at various stages of the collection lifecycle.

This feature introduces a new `pkg/hooks` package that encapsulates the core logic for parsing a `.borg-hooks.yaml` configuration file and executing external scripts.
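
For illustration, a minimal `.borg-hooks.yaml` might look like the sketch below. The event names are the ones listed under "Key features"; the `pattern` and `command` field names are assumptions for illustration, not the canonical schema (see `pkg/hooks` for that).

    # Sketch only: `pattern` and `command` are assumed field names.
    on_file_collected:
      - pattern: "*.html"
        command: "./scripts/on-page.sh"
    on_collection_complete:
      - command: "echo 'collection finished'"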

Key features:
- Four hook events are supported: `on_file_collected`, `on_url_found`, `on_collection_complete`, and `on_error`.
- A `--hooks` flag has been added to the `collect website` and `collect pwa` commands.
- The system automatically detects and loads a `.borg-hooks.yaml` file from the current directory if the `--hooks` flag is not provided.
- File-based hooks (`on_file_collected`) support glob pattern matching against the base filename.
- Hook scripts receive a JSON payload on stdin with relevant event context; a minimal consumer is sketched after this list.
- Commands with arguments are correctly handled by executing them through `sh -c`.
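
A hook script only has to read one JSON object from stdin. Here is a minimal consumer in Go; the JSON key names below are assumptions based on the `hooks.Event` fields, not a documented contract (check `pkg/hooks` for the real encoding):

    package main

    import (
        "encoding/json"
        "fmt"
        "os"
    )

    // event mirrors the payload a hook receives on stdin.
    // Field tags are assumed; see pkg/hooks for the real encoding.
    type event struct {
        Event string `json:"event"`
        File  string `json:"file"`
        URL   string `json:"url"`
        Type  string `json:"type"`
        Error string `json:"error"`
    }

    func main() {
        var ev event
        if err := json.NewDecoder(os.Stdin).Decode(&ev); err != nil {
            fmt.Fprintln(os.Stderr, "hook: bad payload:", err)
            os.Exit(1)
        }
        fmt.Printf("hook fired: %s %s\n", ev.Event, ev.URL)
    }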

The implementation includes a comprehensive test suite with both unit tests for the new `hooks` package and integration tests to validate the end-to-end functionality. All existing tests and examples have been updated to reflect the necessary function signature changes.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:49:08 +00:00

package website

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"

	"github.com/Snider/Borg/pkg/datanode"
	"github.com/Snider/Borg/pkg/hooks"
	"github.com/schollz/progressbar/v3"
	"golang.org/x/net/html"
)

// DownloadAndPackageWebsite downloads a website and packages it into a
// DataNode. It is a package-level variable so it can be reassigned (for
// example, stubbed in tests).
var DownloadAndPackageWebsite = downloadAndPackageWebsite

// Downloader is a recursive website downloader.
type Downloader struct {
	baseURL     *url.URL
	dn          *datanode.DataNode
	visited     map[string]bool
	maxDepth    int
	progressBar *progressbar.ProgressBar
	client      *http.Client
	errors      []error
	hookRunner  *hooks.HookRunner
}

// NewDownloader creates a new Downloader.
func NewDownloader(maxDepth int, hookRunner *hooks.HookRunner) *Downloader {
	return NewDownloaderWithClient(maxDepth, http.DefaultClient, hookRunner)
}

// NewDownloaderWithClient creates a new Downloader with a custom http.Client.
func NewDownloaderWithClient(maxDepth int, client *http.Client, hookRunner *hooks.HookRunner) *Downloader {
	return &Downloader{
		dn:         datanode.New(),
		visited:    make(map[string]bool),
		maxDepth:   maxDepth,
		client:     client,
		errors:     make([]error, 0),
		hookRunner: hookRunner,
	}
}
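
// Example (a sketch; the timeout value and runner are whatever the caller
// has configured, not defaults of this package):
//
//	client := &http.Client{Timeout: 30 * time.Second}
//	d := NewDownloaderWithClient(2, client, runner)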

// downloadAndPackageWebsite downloads a website and packages it into a DataNode.
func downloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar, hookRunner *hooks.HookRunner) (*datanode.DataNode, error) {
	baseURL, err := url.Parse(startURL)
	if err != nil {
		return nil, err
	}
	d := NewDownloader(maxDepth, hookRunner)
	d.baseURL = baseURL
	d.progressBar = bar
	d.crawl(startURL, 0)
	if len(d.errors) > 0 {
		var errs []string
		for _, e := range d.errors {
			errs = append(errs, e.Error())
		}
		return nil, fmt.Errorf("failed to download website:\n%s", strings.Join(errs, "\n"))
	}
	d.hookRunner.Trigger(hooks.Event{
		Event: hooks.OnCollectionComplete,
	})
	return d.dn, nil
}
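
// Example of the top-level entry point (a sketch; the URL, depth, and runner
// are illustrative, and a nil progress bar disables progress output):
//
//	dn, err := DownloadAndPackageWebsite("https://example.com", 2, nil, runner)
//	if err != nil {
//		log.Fatal(err)
//	}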

// crawl fetches pageURL, stores its body in the DataNode, fires the relevant
// hooks, and recursively follows local links until maxDepth is reached.
func (d *Downloader) crawl(pageURL string, depth int) {
	if depth > d.maxDepth || d.visited[pageURL] {
		return
	}
	d.visited[pageURL] = true
	if d.progressBar != nil {
		d.progressBar.Add(1)
	}
	resp, err := d.client.Get(pageURL)
	if err != nil {
		d.triggerErrorHook(fmt.Errorf("error getting %s: %w", pageURL, err))
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 400 {
		d.triggerErrorHook(fmt.Errorf("bad status for %s: %s", pageURL, resp.Status))
		return
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		d.triggerErrorHook(fmt.Errorf("error reading body of %s: %w", pageURL, err))
		return
	}
	relPath := d.getRelativePath(pageURL)
	d.dn.AddData(relPath, body)
	d.hookRunner.Trigger(hooks.Event{
		Event: hooks.OnFileCollected,
		File:  relPath,
		URL:   pageURL,
		Type:  resp.Header.Get("Content-Type"),
	})
	// Don't try to parse non-HTML content.
	if !strings.HasPrefix(resp.Header.Get("Content-Type"), "text/html") {
		return
	}
	doc, err := html.Parse(bytes.NewReader(body))
	if err != nil {
		d.triggerErrorHook(fmt.Errorf("error parsing HTML of %s: %w", pageURL, err))
		return
	}
	// Walk the parsed tree, firing a hook for every href/src found and
	// following local links: assets are fetched once, pages crawled deeper.
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode {
			for _, a := range n.Attr {
				if a.Key == "href" || a.Key == "src" {
					link, err := d.resolveURL(pageURL, a.Val)
					if err != nil {
						continue
					}
					d.hookRunner.Trigger(hooks.Event{
						Event: hooks.OnURLFound,
						URL:   link,
					})
					if d.isLocal(link) {
						if isAsset(link) {
							d.downloadAsset(link)
						} else {
							d.crawl(link, depth+1)
						}
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
}

// downloadAsset fetches a single asset URL, stores it in the DataNode, and
// fires the on_file_collected hook on success.
func (d *Downloader) downloadAsset(assetURL string) {
	if d.visited[assetURL] {
		return
	}
	d.visited[assetURL] = true
	if d.progressBar != nil {
		d.progressBar.Add(1)
	}
	resp, err := d.client.Get(assetURL)
	if err != nil {
		d.triggerErrorHook(fmt.Errorf("error getting asset %s: %w", assetURL, err))
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 400 {
		d.triggerErrorHook(fmt.Errorf("bad status for asset %s: %s", assetURL, resp.Status))
		return
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		d.triggerErrorHook(fmt.Errorf("error reading body of asset %s: %w", assetURL, err))
		return
	}
	relPath := d.getRelativePath(assetURL)
	d.dn.AddData(relPath, body)
	d.hookRunner.Trigger(hooks.Event{
		Event: hooks.OnFileCollected,
		File:  relPath,
		URL:   assetURL,
		Type:  resp.Header.Get("Content-Type"),
	})
}

// triggerErrorHook records err and fires the on_error hook with its message.
func (d *Downloader) triggerErrorHook(err error) {
	d.errors = append(d.errors, err)
	d.hookRunner.Trigger(hooks.Event{
		Event: hooks.OnError,
		Error: err.Error(),
	})
}

// getRelativePath maps a URL to the path under which its body is stored in
// the DataNode: the URL path without its leading slash, or "index.html" for
// the site root. Query strings are dropped.
func (d *Downloader) getRelativePath(pageURL string) string {
	u, err := url.Parse(pageURL)
	if err != nil {
		return ""
	}
	path := strings.TrimPrefix(u.Path, "/")
	if path == "" {
		return "index.html"
	}
	return path
}
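
// For reference, a few mappings (hostnames here are illustrative):
//
//	https://example.com/        -> index.html
//	https://example.com/a/b.css -> a/b.css
//	https://example.com/p?x=1   -> p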

// resolveURL resolves ref against base, returning an absolute URL.
func (d *Downloader) resolveURL(base, ref string) (string, error) {
	baseURL, err := url.Parse(base)
	if err != nil {
		return "", err
	}
	refURL, err := url.Parse(ref)
	if err != nil {
		return "", err
	}
	return baseURL.ResolveReference(refURL).String(), nil
}

// isLocal reports whether pageURL is on the same host as the base URL.
func (d *Downloader) isLocal(pageURL string) bool {
	u, err := url.Parse(pageURL)
	if err != nil {
		return false
	}
	return u.Hostname() == d.baseURL.Hostname()
}

// isAsset reports whether pageURL looks like a static asset, judged by its
// file extension.
func isAsset(pageURL string) bool {
	ext := []string{".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico"}
	for _, e := range ext {
		if strings.HasSuffix(pageURL, e) {
			return true
		}
	}
	return false
}