Borg/cmd/collect_wayback_test.go
google-labs-jules[bot] 5d71a365cd feat: Add Wayback Machine integration
This commit introduces a new `wayback` command to interact with the Internet Archive's Wayback Machine.

The `wayback` command has two subcommands:
- `list`: Lists available snapshots for a given URL.
- `collect`: Collects a snapshot of a website for offline viewing.

The `collect` subcommand supports the following features:
- Recursive downloading of all assets (CSS, JS, images, etc.).
- Deduplication of content to avoid downloading the same file multiple times.
- Rate-limiting to avoid overwhelming the Wayback Machine's API.
- Rewriting of internal links for offline viewing.

The implementation follows the existing command structure and includes unit and integration tests.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:46:49 +00:00

147 lines
4.6 KiB
Go

package cmd
import (
"bytes"
"io"
"net/http"
"os"
"strings"
"testing"
)
// MockRoundTripper is a test double for http.RoundTripper. It answers requests
// without any network access, either by delegating to RoundTripFunc or by
// returning the canned Response/Err pair.
type MockRoundTripper struct {
	// Response is the canned response returned when RoundTripFunc is nil.
	Response *http.Response
	// Err is the canned error returned when RoundTripFunc is nil.
	Err error
	// RoundTripFunc, when set, handles every request and takes precedence
	// over Response and Err.
	RoundTripFunc func(req *http.Request) (*http.Response, error)
}

// Compile-time check that the mock can be used as an http.Client transport.
var _ http.RoundTripper = (*MockRoundTripper)(nil)

// RoundTrip implements http.RoundTripper. The request is forwarded to
// RoundTripFunc when one is configured; otherwise the canned Response and Err
// are returned unchanged.
func (m *MockRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
	if m.RoundTripFunc == nil {
		return m.Response, m.Err
	}
	return m.RoundTripFunc(req)
}
// NewMockClient returns an *http.Client whose transport is a MockRoundTripper
// that answers every request with statusCode and responseBody, without any
// network access.
//
// A response body can only be read once, so a fresh response is built for each
// request; sharing one response would leave every request after the first with
// an already-drained (empty) body. The transport's Response field is still
// populated with an equivalent response, and an Err assigned to the transport
// after construction is still honoured.
func NewMockClient(responseBody string, statusCode int) *http.Client {
	newResponse := func() *http.Response {
		return &http.Response{
			StatusCode: statusCode,
			Body:       io.NopCloser(bytes.NewBufferString(responseBody)),
		}
	}

	mock := &MockRoundTripper{Response: newResponse()}
	mock.RoundTripFunc = func(*http.Request) (*http.Response, error) {
		// Keep the canned-error path working for callers that set Err later.
		if mock.Err != nil {
			return nil, mock.Err
		}
		return newResponse(), nil
	}

	return &http.Client{Transport: mock}
}
// TestWaybackList runs `wayback list http://example.com` with
// http.DefaultClient replaced by a mock that returns a single snapshot row,
// and checks that the snapshot timestamp appears in the command output.
func TestWaybackList(t *testing.T) {
	// http.DefaultClient is process-wide state: restore the real client so the
	// mock does not leak into tests that run after this one.
	originalClient := http.DefaultClient
	t.Cleanup(func() {
		http.DefaultClient = originalClient
		RootCmd.SetArgs([]string{})
	})

	// First row is the column header, second row is the only snapshot.
	mockResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20220101000000", "http://example.com/", "text/html", "200", "DIGEST", "1234"]
]`
	http.DefaultClient = NewMockClient(mockResponse, http.StatusOK)

	output, err := executeCommand(RootCmd, "wayback", "list", "http://example.com")
	if err != nil {
		t.Fatalf("executeCommand returned an unexpected error: %v", err)
	}
	if !strings.Contains(output, "20220101000000") {
		t.Errorf("Expected output to contain timestamp '20220101000000', got '%s'", output)
	}
}
// TestWaybackCollect runs `wayback collect` against a mocked HTTP transport and
// verifies the files written to the output directory.
func TestWaybackCollect(t *testing.T) {
	t.Cleanup(func() {
		RootCmd.SetArgs([]string{})
	})

	t.Run("Good - Latest with Assets", func(t *testing.T) {
		mockListResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST1", "1234"]
]`
		mockAssetsResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST1", "1234"],
["com,example)/css/style.css", "20230101000000", "http://example.com/css/style.css", "text/css", "200", "DIGEST2", "5678"]
]`
		mockHTMLContent := "<html><head><link rel='stylesheet' href='/css/style.css'></head><body>Hello</body></html>"
		mockCSSContent := "body { color: red; }"

		// http.DefaultClient is process-wide state: restore the real client so
		// the mock does not leak into tests that run after this one.
		originalClient := http.DefaultClient
		t.Cleanup(func() {
			http.DefaultClient = originalClient
		})

		// This is still a simplified mock: responses are chosen by request
		// order first and by URL path second. It assumes the command requests
		// the snapshot listing, then the asset listing, then the files.
		// A more robust solution would use a mock server or route on the URL.
		// NOTE(review): requestCount is not synchronized; confirm the command
		// issues its requests sequentially.
		var requestCount int
		http.DefaultClient = &http.Client{
			Transport: &MockRoundTripper{
				RoundTripFunc: func(req *http.Request) (*http.Response, error) {
					var body string
					switch {
					case requestCount == 0:
						body = mockListResponse
					case requestCount == 1:
						body = mockAssetsResponse
					case strings.Contains(req.URL.Path, "style.css"):
						body = mockCSSContent
					default:
						body = mockHTMLContent
					}
					requestCount++
					return &http.Response{
						StatusCode: http.StatusOK,
						Body:       io.NopCloser(bytes.NewBufferString(body)),
					}, nil
				},
			},
		}

		// t.TempDir is removed automatically when the subtest finishes.
		tempDir := t.TempDir()

		_, err := executeCommand(RootCmd, "wayback", "collect", "http://example.com", "--latest", "--output", tempDir)
		if err != nil {
			t.Fatalf("executeCommand returned an unexpected error: %v", err)
		}

		// Verify TIMELINE.md. Any stat failure counts, not only "not exist".
		timelineFile := tempDir + "/TIMELINE.md"
		if _, err := os.Stat(timelineFile); err != nil {
			t.Errorf("Expected TIMELINE.md to be created in %s: %v", tempDir, err)
		}

		// Verify index.html
		indexFile := tempDir + "/20230101000000/index.html"
		content, err := os.ReadFile(indexFile)
		if os.IsNotExist(err) {
			t.Fatalf("Expected index.html to be created in %s", indexFile)
		}
		if err != nil {
			t.Fatalf("Failed to read index.html: %v", err)
		}
		if !strings.Contains(string(content), "Hello") {
			t.Errorf("index.html content is incorrect")
		}

		// Verify style.css
		cssFile := tempDir + "/20230101000000/css/style.css"
		content, err = os.ReadFile(cssFile)
		if os.IsNotExist(err) {
			t.Fatalf("Expected style.css to be created in %s", cssFile)
		}
		if err != nil {
			t.Fatalf("Failed to read style.css: %v", err)
		}
		if !strings.Contains(string(content), "color: red") {
			t.Errorf("style.css content is incorrect")
		}
	})
}