Borg/pkg/website/website.go
google-labs-jules[bot] 7adfff1d0d feat: Add TDD framework and fix build error
This commit introduces a TDD framework for the `collect` commands and resolves a build failure.

- A `TDD/` directory has been added to house tests for the `collect` commands.
- The environment variable `BORG_PLEXSUS=0` enables a mock mode that prevents external network calls during testing (see the test sketch after this commit message).
- The `collect` commands have been updated to use the command's output streams, allowing for output capturing in tests.
- A `pkg/mocks` package has been added to provide mock implementations for testing.
- The `.gitignore` file has been updated to exclude generated `.datanode` files.
- The "flag redefined" build error has been fixed by refactoring the root command initialization in `cmd/root.go` to prevent duplicate flag definitions.
2025-11-02 18:11:04 +00:00
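
A minimal sketch of how a test under `TDD/` might exercise the mock mode. The test name and assertions are assumptions, not taken from this commit; only `DownloadAndPackageWebsite` and the `BORG_PLEXSUS` variable come from the file below:

    func TestDownloadAndPackageWebsite_MockMode(t *testing.T) {
        // BORG_PLEXSUS=0 makes getHTTPClient return the canned-response
        // mock client, so no real network traffic occurs.
        t.Setenv("BORG_PLEXSUS", "0")
        dn, err := website.DownloadAndPackageWebsite("http://test.com", 1, nil)
        if err != nil {
            t.Fatalf("DownloadAndPackageWebsite: %v", err)
        }
        if dn == nil {
            t.Fatal("expected a non-nil DataNode")
        }
    }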


package website

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"

	"github.com/Snider/Borg/pkg/datanode"
	"github.com/Snider/Borg/pkg/mocks"
	"github.com/schollz/progressbar/v3"
	"golang.org/x/net/html"
)
// getHTTPClient returns the HTTP client used for all downloads. When the
// BORG_PLEXSUS environment variable is set to "0", it returns a mock client
// with canned responses so tests never touch the network; otherwise it
// returns http.DefaultClient. It is a package-level variable so tests can
// also swap it out entirely.
var getHTTPClient = func() *http.Client {
	if os.Getenv("BORG_PLEXSUS") == "0" {
		responses := map[string]*http.Response{
			"http://test.com": {
				StatusCode: http.StatusOK,
				Body:       io.NopCloser(bytes.NewBufferString(`<html><body><a href="page2.html">Page 2</a></body></html>`)),
				Header:     make(http.Header),
			},
			"http://test.com/page2.html": {
				StatusCode: http.StatusOK,
				Body:       io.NopCloser(bytes.NewBufferString(`<html><body>Hello</body></html>`)),
				Header:     make(http.Header),
			},
		}
		return mocks.NewMockClient(responses)
	}
	return http.DefaultClient
}
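
// mocks.NewMockClient lives in pkg/mocks and is not shown in this file. From
// its use above, it is assumed to wrap a canned-response http.RoundTripper,
// roughly:
//
//	type mockTransport struct{ responses map[string]*http.Response }
//
//	func (m *mockTransport) RoundTrip(req *http.Request) (*http.Response, error) {
//		if resp, ok := m.responses[req.URL.String()]; ok {
//			return resp, nil
//		}
//		return &http.Response{
//			StatusCode: http.StatusNotFound,
//			Body:       io.NopCloser(bytes.NewReader(nil)),
//			Header:     make(http.Header),
//		}, nil
//	}
//
//	func NewMockClient(responses map[string]*http.Response) *http.Client {
//		return &http.Client{Transport: &mockTransport{responses: responses}}
//	}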
// Downloader is a recursive website downloader.
type Downloader struct {
	baseURL     *url.URL                 // host against which "local" links are judged
	dn          *datanode.DataNode       // accumulates downloaded pages and assets
	visited     map[string]bool          // URLs already fetched, to avoid cycles
	maxDepth    int                      // maximum recursion depth for crawl
	progressBar *progressbar.ProgressBar // optional; may be nil
	client      *http.Client             // real or mock, from getHTTPClient
}
// NewDownloader creates a new Downloader.
func NewDownloader(maxDepth int) *Downloader {
	return &Downloader{
		dn:       datanode.New(),
		visited:  make(map[string]bool),
		maxDepth: maxDepth,
		client:   getHTTPClient(),
	}
}
// DownloadAndPackageWebsite downloads a website starting at startURL,
// following local links up to maxDepth, and packages the result into a
// DataNode. bar may be nil.
func DownloadAndPackageWebsite(startURL string, maxDepth int, bar *progressbar.ProgressBar) (*datanode.DataNode, error) {
	baseURL, err := url.Parse(startURL)
	if err != nil {
		return nil, err
	}
	d := NewDownloader(maxDepth)
	d.baseURL = baseURL
	d.progressBar = bar
	d.crawl(startURL, 0)
	return d.dn, nil
}
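
// Example (hypothetical caller; the URL is illustrative):
//
//	dn, err := website.DownloadAndPackageWebsite("https://example.com", 2, nil)
//	if err != nil {
//		// handle error
//	}
//	_ = dn // e.g. serialize the DataNode to disk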
// crawl fetches pageURL, stores it in the DataNode, and recursively follows
// local links up to maxDepth. Assets are fetched without recursion.
func (d *Downloader) crawl(pageURL string, depth int) {
	if depth > d.maxDepth || d.visited[pageURL] {
		return
	}
	d.visited[pageURL] = true
	if d.progressBar != nil {
		d.progressBar.Add(1)
	}
	resp, err := d.client.Get(pageURL)
	if err != nil {
		fmt.Printf("Error getting %s: %v\n", pageURL, err)
		return
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("Error reading body of %s: %v\n", pageURL, err)
		return
	}
	relPath := d.getRelativePath(pageURL)
	d.dn.AddData(relPath, body)
	doc, err := html.Parse(bytes.NewReader(body))
	if err != nil {
		fmt.Printf("Error parsing HTML of %s: %v\n", pageURL, err)
		return
	}
	// Walk the parsed HTML tree depth-first, collecting href/src links.
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode {
			for _, a := range n.Attr {
				if a.Key == "href" || a.Key == "src" {
					link, err := d.resolveURL(pageURL, a.Val)
					if err != nil {
						continue
					}
					if d.isLocal(link) {
						if isAsset(link) {
							d.downloadAsset(link)
						} else {
							d.crawl(link, depth+1)
						}
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
}
// downloadAsset fetches a single asset (CSS, JS, image, ...) and stores it
// in the DataNode. Assets are never parsed for further links.
func (d *Downloader) downloadAsset(assetURL string) {
	if d.visited[assetURL] {
		return
	}
	d.visited[assetURL] = true
	if d.progressBar != nil {
		d.progressBar.Add(1)
	}
	resp, err := d.client.Get(assetURL)
	if err != nil {
		fmt.Printf("Error getting asset %s: %v\n", assetURL, err)
		return
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		fmt.Printf("Error reading body of asset %s: %v\n", assetURL, err)
		return
	}
	relPath := d.getRelativePath(assetURL)
	d.dn.AddData(relPath, body)
}
// getRelativePath converts pageURL into the path used inside the DataNode.
// The site root (empty path) is stored as "index.html" so it does not end
// up under an empty key.
func (d *Downloader) getRelativePath(pageURL string) string {
	u, err := url.Parse(pageURL)
	if err != nil {
		return ""
	}
	p := strings.TrimPrefix(u.Path, "/")
	if p == "" {
		p = "index.html"
	}
	return p
}
// resolveURL resolves ref (possibly relative) against base, returning an
// absolute URL string.
func (d *Downloader) resolveURL(base, ref string) (string, error) {
	baseURL, err := url.Parse(base)
	if err != nil {
		return "", err
	}
	refURL, err := url.Parse(ref)
	if err != nil {
		return "", err
	}
	return baseURL.ResolveReference(refURL).String(), nil
}
// isLocal reports whether pageURL points at the same host as the start URL.
func (d *Downloader) isLocal(pageURL string) bool {
	u, err := url.Parse(pageURL)
	if err != nil {
		return false
	}
	return u.Hostname() == d.baseURL.Hostname()
}
// isAsset reports whether pageURL looks like a static asset, judged by file
// extension. Note that a query string after the extension defeats this check.
func isAsset(pageURL string) bool {
	exts := []string{".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico"}
	for _, e := range exts {
		if strings.HasSuffix(pageURL, e) {
			return true
		}
	}
	return false
}