Borg/pkg/website/website_test.go
google-labs-jules[bot] e3efb59d98 feat: Add deduplication cache for collections
This commit introduces a deduplication cache to avoid re-downloading files across multiple collection jobs.

Key changes include:
- A new `pkg/cache` package that provides content-addressable storage using SHA256 hashes of the file content.
- Integration of the cache into the `collect website` command. Downloads are now skipped if the content already exists in the cache.
- The addition of `--no-cache` and `--cache-dir` flags to give users control over the caching behavior.
- New `borg cache stats` and `borg cache clear` commands to allow users to manage the cache.
- A performance improvement to the cache implementation, which now only writes the URL-to-hash index file once at the end of the collection process, rather than on every file download.
- Centralized logic for determining the default cache directory, removing code duplication.
- Improved error handling and refactored duplicated cache-checking logic in the website collector.
- Added comprehensive unit tests for the new cache package and an integration test to verify that the website collector correctly uses the cache.

The implementation of cache size limiting and LRU eviction is still pending and will be addressed in a future commit.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:46:07 +00:00

262 lines
7.6 KiB
Go

package website
import (
"fmt"
"io"
"io/fs"
"net/http"
"net/http/httptest"
"strings"
"sync/atomic"
"testing"
"time"
"github.com/Snider/Borg/pkg/cache"
"github.com/schollz/progressbar/v3"
)
// --- Test Cases ---
// TestDownloadAndPackageWebsite_Good crawls the fixture site served by
// newWebsiteTestServer with a max depth of 2 and checks that every expected
// page and asset is reported as existing in the returned DataNode, and that
// style.css holds exactly the bytes the fixture server emits.
func TestDownloadAndPackageWebsite_Good(t *testing.T) {
	server := newWebsiteTestServer()
	defer server.Close()

	// Send progress output to io.Discard so the test log stays clean.
	bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
	dn, err := DownloadAndPackageWebsite(server.URL, 2, bar, nil)
	if err != nil {
		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
	}

	expectedFiles := []string{"index.html", "style.css", "image.png", "page2.html", "page3.html"}
	for _, name := range expectedFiles {
		exists, err := dn.Exists(name)
		if err != nil {
			t.Fatalf("Exists failed for %s: %v", name, err)
		}
		if !exists {
			t.Errorf("Expected to find file %s in DataNode, but it was not found", name)
		}
	}

	// Check content of one file
	file, err := dn.Open("style.css")
	if err != nil {
		t.Fatalf("Failed to open style.css: %v", err)
	}
	// Always release the handle, and surface a failing Close instead of
	// silently dropping it.
	defer func() {
		if cerr := file.Close(); cerr != nil {
			t.Errorf("Failed to close style.css: %v", cerr)
		}
	}()

	content, err := io.ReadAll(file)
	if err != nil {
		t.Fatalf("Failed to read style.css: %v", err)
	}
	if string(content) != `body { color: red; }` {
		t.Errorf("Unexpected content for style.css: %s", content)
	}
}
// TestDownloadAndPackageWebsite_Bad covers inputs for which
// DownloadAndPackageWebsite is expected to return an error: a start URL
// that cannot be fetched, a start URL answering with HTTP 500, and a start
// page that links to a resource answering with HTTP 404.
func TestDownloadAndPackageWebsite_Bad(t *testing.T) {
	t.Run("Invalid Start URL", func(t *testing.T) {
		if _, err := DownloadAndPackageWebsite("http://invalid-url", 1, nil, nil); err == nil {
			t.Fatal("Expected an error for an invalid start URL, but got nil")
		}
	})

	t.Run("Server Error on Start URL", func(t *testing.T) {
		// Every request, including the start URL, is answered with a 500.
		alwaysFail := func(w http.ResponseWriter, r *http.Request) {
			http.Error(w, "Internal Server Error", http.StatusInternalServerError)
		}
		srv := httptest.NewServer(http.HandlerFunc(alwaysFail))
		defer srv.Close()

		if _, err := DownloadAndPackageWebsite(srv.URL, 1, nil, nil); err == nil {
			t.Fatal("Expected an error for a server error on the start URL, but got nil")
		}
	})

	t.Run("Broken Link", func(t *testing.T) {
		// Only the root page exists; it links to a path that answers 404.
		rootOnly := func(w http.ResponseWriter, r *http.Request) {
			if r.URL.Path != "/" {
				http.NotFound(w, r)
				return
			}
			w.Header().Set("Content-Type", "text/html")
			fmt.Fprint(w, `<a href="/broken.html">Broken</a>`)
		}
		srv := httptest.NewServer(http.HandlerFunc(rootOnly))
		defer srv.Close()

		// We expect an error because the link is broken.
		node, err := DownloadAndPackageWebsite(srv.URL, 1, nil, nil)
		if err == nil {
			t.Fatal("Expected an error for a broken link, but got nil")
		}
		if msg := err.Error(); !strings.Contains(msg, "404 Not Found") {
			t.Errorf("Expected error to contain '404 Not Found', but got: %v", err)
		}
		if node != nil {
			t.Error("DataNode should be nil on error")
		}
	})
}
// TestDownloadAndPackageWebsite_Ugly exercises edge cases of
// DownloadAndPackageWebsite: a crawl cut short by the max-depth limit, a
// page that only links to an external host, and a server that is slow to
// respond.
func TestDownloadAndPackageWebsite_Ugly(t *testing.T) {
	t.Run("Exceed Max Depth", func(t *testing.T) {
		server := newWebsiteTestServer()
		defer server.Close()

		bar := progressbar.NewOptions(1, progressbar.OptionSetWriter(io.Discard))
		dn, err := DownloadAndPackageWebsite(server.URL, 1, bar, nil) // Max depth of 1
		if err != nil {
			t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
		}

		// page3.html is at depth 2, so it should not be present.
		exists, err := dn.Exists("page3.html")
		if err != nil {
			t.Fatalf("Exists failed for page3.html: %v", err)
		}
		if exists {
			t.Error("page3.html should not have been downloaded due to max depth")
		}

		// page2.html is at depth 1, so it should be present.
		exists, err = dn.Exists("page2.html")
		if err != nil {
			t.Fatalf("Exists failed for page2.html: %v", err)
		}
		if !exists {
			t.Error("page2.html should have been downloaded")
		}
	})

	t.Run("External Links", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.Header().Set("Content-Type", "text/html")
			fmt.Fprint(w, `<a href="http://externalsite.com/page.html">External</a>`)
		}))
		defer server.Close()

		dn, err := DownloadAndPackageWebsite(server.URL, 1, nil, nil)
		if err != nil {
			t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
		}
		if dn == nil {
			t.Fatal("DataNode should not be nil")
		}

		// We can't easily check if the external link was visited, but we can ensure
		// it didn't cause an error and didn't add any unexpected files.
		var fileCount int
		walkErr := dn.Walk(".", func(path string, d fs.DirEntry, err error) error {
			if err != nil {
				// The fs.WalkDirFunc contract allows d to be nil when err is
				// set, so bail out before touching it.
				return err
			}
			if !d.IsDir() {
				fileCount++
			}
			return nil
		})
		if walkErr != nil {
			t.Fatalf("Walk failed: %v", walkErr)
		}
		if fileCount != 1 { // Should only contain the root page
			t.Errorf("expected 1 file in datanode, but found %d", fileCount)
		}
	})

	t.Run("Timeout", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			time.Sleep(100 * time.Millisecond)
			w.Header().Set("Content-Type", "text/html")
			fmt.Fprint(w, `<h1>Hello</h1>`)
		}))
		defer server.Close()

		// This test is tricky as it depends on timing.
		// The current implementation uses the default http client with no timeout.
		// A proper implementation would allow configuring a timeout.
		// For now, we'll just test that it doesn't hang forever.
		//
		// The channel is buffered so the worker goroutine can always deliver
		// its result and exit, even if the select below has already given up.
		// The result is inspected on the test goroutine because calling
		// t.Errorf from a goroutine that outlives the test panics.
		result := make(chan error, 1)
		go func() {
			_, err := DownloadAndPackageWebsite(server.URL, 1, nil, nil)
			result <- err
		}()

		select {
		case err := <-result:
			// We expect a timeout error, but other errors are failures.
			if err != nil && !strings.Contains(err.Error(), "context deadline exceeded") {
				t.Errorf("unexpected error: %v", err)
			}
		case <-time.After(5 * time.Second):
			t.Fatal("Test timed out")
		}
	})
}
// --- Cache Test & Helpers ---
// TestDownloadAndPackageWebsite_Cache runs the same two-page crawl twice
// against a request-counting server, each time with a cache opened on the
// same temporary directory. The first run must hit the server once per
// page; the second run must not add any requests.
func TestDownloadAndPackageWebsite_Cache(t *testing.T) {
	// Incremented from the server's handler goroutines, so every access
	// goes through sync/atomic.
	var requestCount int32
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		atomic.AddInt32(&requestCount, 1)
		switch r.URL.Path {
		case "/":
			w.Header().Set("Content-Type", "text/html")
			fmt.Fprint(w, `<a href="/page2.html">Page 2</a>`)
		case "/page2.html":
			w.Header().Set("Content-Type", "text/html")
			fmt.Fprint(w, `<h1>Page 2</h1>`)
		default:
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	cacheDir := t.TempDir()
	c, err := cache.New(cacheDir)
	if err != nil {
		t.Fatalf("Failed to create cache: %v", err)
	}

	// First download
	_, err = DownloadAndPackageWebsite(server.URL, 2, nil, c)
	if err != nil {
		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
	}
	if err := c.Close(); err != nil {
		t.Fatalf("Failed to close cache: %v", err)
	}
	// Load once so the compared value and the reported value are the same
	// atomically-read number.
	if got := atomic.LoadInt32(&requestCount); got != 2 {
		t.Errorf("Expected 2 requests to the server, but got %d", got)
	}

	// Second download, using a fresh cache handle on the same directory.
	c2, err := cache.New(cacheDir)
	if err != nil {
		t.Fatalf("Failed to create new cache: %v", err)
	}
	_, err = DownloadAndPackageWebsite(server.URL, 2, nil, c2)
	if err != nil {
		t.Fatalf("DownloadAndPackageWebsite failed: %v", err)
	}
	if err := c2.Close(); err != nil {
		t.Fatalf("Failed to close cache: %v", err)
	}
	if got := atomic.LoadInt32(&requestCount); got != 2 {
		t.Errorf("Expected 2 requests to the server after caching, but got %d", got)
	}
}
// newWebsiteTestServer starts an httptest server hosting a small fixed
// site: a root page referencing page2.html, style.css and image.png, with
// page2.html in turn linking to page3.html. Any other path is answered
// with http.NotFound. The caller is responsible for closing the server.
func newWebsiteTestServer() *httptest.Server {
	type resource struct {
		contentType string
		body        string
	}

	// Path -> response table. It is only read after construction, so the
	// handler can consult it from concurrent requests.
	site := map[string]resource{
		"/": {
			contentType: "text/html",
			body: `
<!DOCTYPE html>
<html><body>
<a href="/page2.html">Page 2</a>
<link rel="stylesheet" href="style.css">
<img src="image.png">
</body></html>
`,
		},
		"/style.css": {
			contentType: "text/css",
			body:        `body { color: red; }`,
		},
		"/image.png": {
			contentType: "image/png",
			body:        "fake image data",
		},
		"/page2.html": {
			contentType: "text/html",
			body:        `<html><body><a href="/page3.html">Page 3</a></body></html>`,
		},
		"/page3.html": {
			contentType: "text/html",
			body:        `<html><body><h1>Page 3</h1></body></html>`,
		},
	}

	serve := func(w http.ResponseWriter, r *http.Request) {
		res, known := site[r.URL.Path]
		if !known {
			http.NotFound(w, r)
			return
		}
		w.Header().Set("Content-Type", res.contentType)
		fmt.Fprint(w, res.body)
	}
	return httptest.NewServer(http.HandlerFunc(serve))
}