feat: Add Wayback Machine integration
This commit introduces a new `wayback` command to interact with the Internet Archive's Wayback Machine. The `wayback` command has two subcommands:

- `list`: Lists available snapshots for a given URL.
- `collect`: Collects a snapshot of a website for offline viewing.

The `collect` subcommand supports the following features:

- Recursive downloading of all assets (CSS, JS, images, etc.).
- Deduplication of content to avoid downloading the same file multiple times.
- Rate-limiting to avoid overwhelming the Wayback Machine's API.
- Rewriting of internal links for offline viewing.

The implementation follows the existing command structure and includes unit and integration tests.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
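Example usage (assuming the project builds to a `borg` binary; the subcommands and flags are the ones defined in this commit):

    borg wayback list http://example.com
    borg wayback collect http://example.com --latest --output ./archive
    borg wayback collect http://example.com --date 2022-01-01 --output ./archive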
Parent: cf2af53ed3
Commit: 5d71a365cd
4 changed files with 671 additions and 0 deletions
cmd/collect_wayback.go (new file, 226 lines)
@@ -0,0 +1,226 @@
package cmd

import (
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"text/tabwriter"
	"time"

	"github.com/Snider/Borg/pkg/wayback"
	"github.com/spf13/cobra"
)

// waybackCmd represents the wayback command
var waybackCmd = NewWaybackCmd()
var waybackListCmd = NewWaybackListCmd()
var waybackCollectCmd = NewWaybackCollectCmd()

func init() {
	RootCmd.AddCommand(GetWaybackCmd())
	GetWaybackCmd().AddCommand(GetWaybackListCmd())
	GetWaybackCmd().AddCommand(GetWaybackCollectCmd())
}

func GetWaybackCmd() *cobra.Command {
	return waybackCmd
}

func GetWaybackListCmd() *cobra.Command {
	return waybackListCmd
}

func GetWaybackCollectCmd() *cobra.Command {
	return waybackCollectCmd
}

func NewWaybackCmd() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "wayback",
		Short: "Interact with the Internet Archive Wayback Machine.",
		Long:  `List and collect historical snapshots of websites from the Internet Archive Wayback Machine.`,
	}
	return cmd
}

func NewWaybackListCmd() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "list [url]",
		Short: "List available snapshots for a URL.",
		Long:  `Queries the Wayback Machine CDX API to find all available snapshots for a given URL.`,
		Args:  cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			target := args[0]
			snapshots, err := wayback.ListSnapshots(target)
			if err != nil {
				return fmt.Errorf("failed to list snapshots: %w", err)
			}

			if len(snapshots) == 0 {
				fmt.Fprintln(cmd.OutOrStdout(), "No snapshots found.")
				return nil
			}

			w := tabwriter.NewWriter(cmd.OutOrStdout(), 0, 0, 3, ' ', 0)
			fmt.Fprintln(w, "TIMESTAMP\tMIMETYPE\tSTATUS\tLENGTH\tURL")
			for _, s := range snapshots {
				fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", s.Timestamp, s.MimeType, s.StatusCode, s.Length, s.Original)
			}
			return w.Flush()
		},
	}
	return cmd
}

func NewWaybackCollectCmd() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "collect [url]",
		Short: "Collect a snapshot of a website.",
		Long:  `Collects a snapshot of a website from the Wayback Machine.`,
		Args:  cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			urlArg := args[0]
			outputDir, _ := cmd.Flags().GetString("output")
			latest, _ := cmd.Flags().GetBool("latest")
			all, _ := cmd.Flags().GetBool("all")
			date, _ := cmd.Flags().GetString("date")

			if err := os.MkdirAll(outputDir, 0755); err != nil {
				return fmt.Errorf("failed to create output directory: %w", err)
			}

			baseURL, err := url.Parse(urlArg)
			if err != nil {
				return fmt.Errorf("failed to parse URL: %w", err)
			}

			snapshots, err := wayback.ListSnapshots(urlArg)
			if err != nil {
				return fmt.Errorf("failed to list snapshots: %w", err)
			}
			if len(snapshots) == 0 {
				fmt.Fprintln(cmd.OutOrStdout(), "No snapshots found.")
				return nil
			}

			var timestamps []string
			if latest {
				timestamps = []string{snapshots[len(snapshots)-1].Timestamp}
			} else if all {
				for _, s := range snapshots {
					timestamps = append(timestamps, s.Timestamp)
				}
			} else if date != "" {
				filtered := filterSnapshotsByDate(snapshots, date)
				if len(filtered) == 0 {
					return fmt.Errorf("no snapshots found for date: %s", date)
				}
				for _, s := range filtered {
					timestamps = append(timestamps, s.Timestamp)
				}
			} else {
				return fmt.Errorf("either --latest, --all, or --date must be specified")
			}

			timeline := ""
			downloadedDigests := make(map[string]bool)

			// List every captured URL under the site so each snapshot's assets
			// can be collected with their CDX metadata (digest, MIME type).
			assets, err := wayback.ListSnapshots(fmt.Sprintf("%s/*", urlArg))
			if err != nil {
				return fmt.Errorf("failed to list assets: %w", err)
			}

			for _, ts := range timestamps {
				fmt.Fprintf(cmd.OutOrStdout(), "Collecting snapshot from %s...\n", ts)
				snapshotDir := filepath.Join(outputDir, ts)
				if err := os.MkdirAll(snapshotDir, 0755); err != nil {
					return fmt.Errorf("failed to create snapshot directory: %w", err)
				}

				// Prefer the full CDX record for the root page so its digest
				// and MIME type are known before downloading.
				rootSnapshot := wayback.Snapshot{Timestamp: ts, Original: urlArg}
				for _, s := range snapshots {
					if s.Timestamp == ts {
						rootSnapshot = s
						break
					}
				}
				if err := downloadAndProcess(rootSnapshot, snapshotDir, baseURL, downloadedDigests); err != nil {
					return err
				}

				// Collect the assets captured at this timestamp; the digest map
				// deduplicates anything already fetched via link extraction.
				for _, asset := range assets {
					if asset.Timestamp != ts {
						continue
					}
					if err := downloadAndProcess(asset, snapshotDir, baseURL, downloadedDigests); err != nil {
						fmt.Fprintf(cmd.OutOrStdout(), "Warning: failed to process asset %s: %v\n", asset.Original, err)
					}
				}

				timeline += fmt.Sprintf("- %s: %s\n", ts, urlArg)
			}

			timelineFile := filepath.Join(outputDir, "TIMELINE.md")
			if err := os.WriteFile(timelineFile, []byte(timeline), 0644); err != nil {
				return fmt.Errorf("failed to write timeline file: %w", err)
			}

			fmt.Fprintf(cmd.OutOrStdout(), "Snapshots saved to %s\n", outputDir)
			return nil
		},
	}
	cmd.Flags().Bool("latest", false, "Collect the latest available snapshot.")
	cmd.Flags().Bool("all", false, "Collect all available snapshots.")
	cmd.Flags().String("date", "", "Collect a snapshot from a specific date (YYYY-MM-DD).")
	cmd.Flags().String("output", "", "Output directory for the collected snapshots.")
	cmd.MarkFlagRequired("output")
	return cmd
}

func downloadAndProcess(snapshot wayback.Snapshot, snapshotDir string, baseURL *url.URL, downloadedDigests map[string]bool) error {
	if snapshot.Digest != "" && downloadedDigests[snapshot.Digest] {
		return nil
	}
	time.Sleep(200 * time.Millisecond) // Simple rate-limiting
	fmt.Printf("  Downloading %s\n", snapshot.Original)
	data, err := wayback.DownloadSnapshot(snapshot)
	if err != nil {
		return fmt.Errorf("failed to download asset %s: %w", snapshot.Original, err)
	}
	if snapshot.Digest != "" {
		downloadedDigests[snapshot.Digest] = true
	}

	assetURL, err := url.Parse(snapshot.Original)
	if err != nil {
		return fmt.Errorf("failed to parse asset URL %s: %w", snapshot.Original, err)
	}
	path := assetURL.Path
	if strings.HasSuffix(path, "/") {
		path = filepath.Join(path, "index.html")
	}
	filePath := filepath.Join(snapshotDir, path)
	if err := os.MkdirAll(filepath.Dir(filePath), 0755); err != nil {
		return fmt.Errorf("failed to create asset directory for %s: %w", filePath, err)
	}
	if err := os.WriteFile(filePath, data, 0644); err != nil {
		return fmt.Errorf("failed to write asset %s: %w", filePath, err)
	}

	if strings.HasPrefix(snapshot.MimeType, "text/html") {
		rewrittenData, err := wayback.RewriteLinks(data, baseURL)
		if err != nil {
			return fmt.Errorf("failed to rewrite links for %s: %w", snapshot.Original, err)
		}
		if err := os.WriteFile(filePath, rewrittenData, 0644); err != nil {
			return fmt.Errorf("failed to write rewritten asset %s: %w", filePath, err)
		}

		links, err := wayback.ExtractLinks(data)
		if err != nil {
			return fmt.Errorf("failed to extract links from %s: %w", snapshot.Original, err)
		}

		for _, link := range links {
			// Resolve the link against the page URL; parsing (rather than
			// treating the link as a bare path) handles absolute URLs too.
			linkURL, err := url.Parse(link)
			if err != nil {
				continue
			}
			absoluteURL := assetURL.ResolveReference(linkURL)
			assetSnapshot := wayback.Snapshot{Timestamp: snapshot.Timestamp, Original: absoluteURL.String()}
			if err := downloadAndProcess(assetSnapshot, snapshotDir, baseURL, downloadedDigests); err != nil {
				fmt.Printf("Warning: failed to process asset %s: %v\n", absoluteURL.String(), err)
			}
		}
	}
	return nil
}

func filterSnapshotsByDate(snapshots []wayback.Snapshot, date string) []wayback.Snapshot {
	// Expects date in YYYY-MM-DD form; Wayback timestamps begin YYYYMMDD.
	if len(date) < 10 {
		return nil
	}
	want := date[:4] + date[5:7] + date[8:10]
	var filtered []wayback.Snapshot
	for _, s := range snapshots {
		if len(s.Timestamp) >= 8 && s.Timestamp[:8] == want {
			filtered = append(filtered, s)
		}
	}
	return filtered
}
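For orientation, the on-disk layout the collect flow produces, reconstructed from the code above and the expectations in the tests below:

    <output>/
      TIMELINE.md                # one "- <timestamp>: <url>" line per snapshot
      <timestamp>/
        index.html               # root page, links rewritten for offline use
        css/style.css            # assets mirrored under their original paths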
cmd/collect_wayback_test.go (new file, 147 lines)
@@ -0,0 +1,147 @@
package cmd

import (
	"bytes"
	"io"
	"net/http"
	"os"
	"strings"
	"testing"
)

// MockRoundTripper is a mock implementation of http.RoundTripper for testing.
type MockRoundTripper struct {
	Response      *http.Response
	Err           error
	RoundTripFunc func(req *http.Request) (*http.Response, error)
}

func (m *MockRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
	if m.RoundTripFunc != nil {
		return m.RoundTripFunc(req)
	}
	return m.Response, m.Err
}

func NewMockClient(responseBody string, statusCode int) *http.Client {
	return &http.Client{
		Transport: &MockRoundTripper{
			Response: &http.Response{
				StatusCode: statusCode,
				Body:       io.NopCloser(bytes.NewBufferString(responseBody)),
			},
		},
	}
}

func TestWaybackList(t *testing.T) {
	t.Cleanup(func() {
		RootCmd.SetArgs([]string{})
	})
	mockResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20220101000000", "http://example.com/", "text/html", "200", "DIGEST", "1234"]
]`
	http.DefaultClient = NewMockClient(mockResponse, http.StatusOK)

	output, err := executeCommand(RootCmd, "wayback", "list", "http://example.com")
	if err != nil {
		t.Fatalf("executeCommand returned an unexpected error: %v", err)
	}

	if !strings.Contains(output, "20220101000000") {
		t.Errorf("Expected output to contain timestamp '20220101000000', got '%s'", output)
	}
}

func TestWaybackCollect(t *testing.T) {
	t.Cleanup(func() {
		RootCmd.SetArgs([]string{})
	})
	t.Run("Good - Latest with Assets", func(t *testing.T) {
		mockListResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST1", "1234"]
]`
		mockAssetsResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST1", "1234"],
["com,example)/css/style.css", "20230101000000", "http://example.com/css/style.css", "text/css", "200", "DIGEST2", "5678"]
]`
		mockHTMLContent := "<html><head><link rel='stylesheet' href='/css/style.css'></head><body>Hello</body></html>"
		mockCSSContent := "body { color: red; }"

		// This is still a simplified mock: it dispatches on request order and
		// URL. A more robust solution would use a mock server or a more
		// sophisticated RoundTripper.
		var requestCount int
		http.DefaultClient = &http.Client{
			Transport: &MockRoundTripper{
				RoundTripFunc: func(req *http.Request) (*http.Response, error) {
					var body string
					if requestCount == 0 {
						body = mockListResponse
					} else if requestCount == 1 {
						body = mockAssetsResponse
					} else if strings.Contains(req.URL.Path, "style.css") {
						body = mockCSSContent
					} else {
						body = mockHTMLContent
					}
					requestCount++
					return &http.Response{
						StatusCode: http.StatusOK,
						Body:       io.NopCloser(bytes.NewBufferString(body)),
					}, nil
				},
			},
		}

		tempDir, err := os.MkdirTemp("", "borg-test")
		if err != nil {
			t.Fatalf("Failed to create temp dir: %v", err)
		}
		defer os.RemoveAll(tempDir)

		_, err = executeCommand(RootCmd, "wayback", "collect", "http://example.com", "--latest", "--output", tempDir)
		if err != nil {
			t.Fatalf("executeCommand returned an unexpected error: %v", err)
		}

		// Verify TIMELINE.md
		timelineFile := tempDir + "/TIMELINE.md"
		if _, err := os.Stat(timelineFile); os.IsNotExist(err) {
			t.Errorf("Expected TIMELINE.md to be created in %s", tempDir)
		}

		// Verify index.html
		indexFile := tempDir + "/20230101000000/index.html"
		if _, err := os.Stat(indexFile); os.IsNotExist(err) {
			t.Fatalf("Expected index.html to be created at %s", indexFile)
		}
		content, err := os.ReadFile(indexFile)
		if err != nil {
			t.Fatalf("Failed to read index.html: %v", err)
		}
		if !strings.Contains(string(content), "Hello") {
			t.Errorf("index.html content is incorrect")
		}

		// Verify style.css
		cssFile := tempDir + "/20230101000000/css/style.css"
		if _, err := os.Stat(cssFile); os.IsNotExist(err) {
			t.Fatalf("Expected style.css to be created at %s", cssFile)
		}
		content, err = os.ReadFile(cssFile)
		if err != nil {
			t.Fatalf("Failed to read style.css: %v", err)
		}
		if !strings.Contains(string(content), "color: red") {
			t.Errorf("style.css content is incorrect")
		}
	})
}
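The comment in the test above flags the order-based mock as simplified. A sketch of the "more sophisticated RoundTripper" it alludes to, routing by URL instead of request order, follows; the mockTransport name and its fields are illustrative, not part of the commit:

// mockTransport routes each request by a substring of its URL, so tests
// do not depend on the exact sequence of HTTP calls. Keep route substrings
// disjoint: map iteration order is random, and the first match wins.
type mockTransport struct {
	routes   map[string]string // URL substring -> response body
	fallback string            // body returned when no route matches
}

func (m *mockTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	body := m.fallback
	for substr, b := range m.routes {
		if strings.Contains(req.URL.String(), substr) {
			body = b
			break
		}
	}
	return &http.Response{
		StatusCode: http.StatusOK,
		Body:       io.NopCloser(bytes.NewBufferString(body)),
	}, nil
}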
pkg/wayback/wayback.go (new file, 184 lines)
@@ -0,0 +1,184 @@
package wayback

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

// Snapshot represents a single entry from the Wayback Machine CDX API.
type Snapshot struct {
	URLKey     string
	Timestamp  string
	Original   string
	MimeType   string
	StatusCode string
	Digest     string
	Length     string
}

// ListSnapshots queries the Wayback Machine's CDX API to get a list of
// available snapshots for a given URL.
func ListSnapshots(pageURL string) ([]Snapshot, error) {
	return listSnapshots(fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s&output=json", pageURL))
}

func listSnapshots(apiURL string) ([]Snapshot, error) {
	resp, err := http.Get(apiURL)
	if err != nil {
		return nil, fmt.Errorf("failed to make request to CDX API: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("CDX API returned non-200 status: %s\nBody: %s", resp.Status, string(body))
	}

	var rawSnapshots [][]string
	if err := json.NewDecoder(resp.Body).Decode(&rawSnapshots); err != nil {
		return nil, fmt.Errorf("failed to decode JSON response from CDX API: %w", err)
	}

	if len(rawSnapshots) < 2 {
		return []Snapshot{}, nil // No snapshots found is not an error
	}

	header := rawSnapshots[0]
	fieldMap := make(map[string]int, len(header))
	for i, field := range header {
		fieldMap[field] = i
	}

	requiredFields := []string{"urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length"}
	for _, field := range requiredFields {
		if _, ok := fieldMap[field]; !ok {
			return nil, fmt.Errorf("CDX API response is missing the required field: '%s'", field)
		}
	}

	snapshots := make([]Snapshot, 0, len(rawSnapshots)-1)
	for _, record := range rawSnapshots[1:] {
		if len(record) != len(header) {
			continue // Skip malformed records
		}
		snapshots = append(snapshots, Snapshot{
			URLKey:     record[fieldMap["urlkey"]],
			Timestamp:  record[fieldMap["timestamp"]],
			Original:   record[fieldMap["original"]],
			MimeType:   record[fieldMap["mimetype"]],
			StatusCode: record[fieldMap["statuscode"]],
			Digest:     record[fieldMap["digest"]],
			Length:     record[fieldMap["length"]],
		})
	}

	return snapshots, nil
}

// DownloadSnapshot downloads the raw content of a specific snapshot.
func DownloadSnapshot(snapshot Snapshot) ([]byte, error) {
	// Construct the URL for the raw snapshot content; the "id_" suffix asks
	// the Wayback Machine for the "identity" (unmodified) capture.
	rawURL := fmt.Sprintf("https://web.archive.org/web/%sid_/%s", snapshot.Timestamp, snapshot.Original)

	resp, err := http.Get(rawURL)
	if err != nil {
		return nil, fmt.Errorf("failed to make request to download snapshot: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("snapshot download returned non-200 status: %s\nURL: %s\nBody: %s", resp.Status, rawURL, string(body))
	}

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read snapshot content: %w", err)
	}

	return data, nil
}

// RewriteLinks takes HTML content and rewrites internal links to be relative.
func RewriteLinks(htmlContent []byte, baseURL *url.URL) ([]byte, error) {
	links, err := ExtractLinks(htmlContent)
	if err != nil {
		return nil, err
	}
	// This is a simplified implementation for now. A more robust solution
	// would use a proper HTML parser to replace the links.
	rewritten := string(htmlContent)
	for _, link := range links {
		newURL, changed := rewriteURL(link, baseURL)
		if changed {
			rewritten = strings.ReplaceAll(rewritten, link, newURL)
		}
	}
	return []byte(rewritten), nil
}

// ExtractLinks takes HTML content and returns a list of all asset links.
func ExtractLinks(htmlContent []byte) ([]string, error) {
	var links []string
	doc, err := html.Parse(bytes.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode {
			for _, a := range n.Attr {
				if a.Key == "href" || a.Key == "src" {
					links = append(links, a.Val)
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return links, nil
}

func rewriteURL(rawURL string, baseURL *url.URL) (string, bool) {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return rawURL, false
	}

	// If the URL is relative, we don't need to do anything.
	if !parsedURL.IsAbs() {
		return rawURL, false
	}

	// Handle Wayback Machine URLs by extracting the original URL,
	// e.g. /web/20220101120000/https://example.com/ -> https://example.com/
	if strings.HasPrefix(parsedURL.Host, "web.archive.org") {
		parts := strings.SplitN(parsedURL.Path, "/", 4)
		if len(parts) >= 4 {
			originalURL, err := url.Parse(parts[3])
			if err == nil {
				if originalURL.Host == baseURL.Host {
					return originalURL.Path, true
				}
			}
		}
	}

	// Handle absolute URLs that point to the same host
	if parsedURL.Host == baseURL.Host {
		return parsedURL.Path, true
	}

	return rawURL, false
}
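For readers evaluating pkg/wayback on its own, a minimal sketch of driving it directly; this hypothetical standalone program is not part of the commit and uses only the functions defined above:

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/Snider/Borg/pkg/wayback"
)

func main() {
	// List every capture of the page recorded by the CDX API.
	snapshots, err := wayback.ListSnapshots("http://example.com")
	if err != nil {
		log.Fatalf("list: %v", err)
	}
	if len(snapshots) == 0 {
		log.Fatal("no snapshots found")
	}

	// Download the most recent capture in its raw ("identity") form.
	latest := snapshots[len(snapshots)-1]
	data, err := wayback.DownloadSnapshot(latest)
	if err != nil {
		log.Fatalf("download: %v", err)
	}

	if err := os.WriteFile("snapshot.html", data, 0644); err != nil {
		log.Fatalf("write: %v", err)
	}
	fmt.Printf("saved %d bytes from capture %s\n", len(data), latest.Timestamp)
}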
pkg/wayback/wayback_test.go (new file, 114 lines)
@@ -0,0 +1,114 @@
package wayback

import (
	"bytes"
	"io"
	"net/http"
	"net/url"
	"strings"
	"testing"
)

// MockRoundTripper is a mock implementation of http.RoundTripper for testing.
type MockRoundTripper struct {
	Response *http.Response
	Err      error
}

func (m *MockRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
	return m.Response, m.Err
}

func NewMockClient(responseBody string, statusCode int) *http.Client {
	return &http.Client{
		Transport: &MockRoundTripper{
			Response: &http.Response{
				StatusCode: statusCode,
				Body:       io.NopCloser(bytes.NewBufferString(responseBody)),
			},
		},
	}
}

func TestListSnapshots(t *testing.T) {
	t.Run("Good", func(t *testing.T) {
		mockResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20220101000000", "http://example.com/", "text/html", "200", "DIGEST", "1234"],
["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST", "5678"]
]`
		http.DefaultClient = NewMockClient(mockResponse, http.StatusOK)

		snapshots, err := ListSnapshots("http://example.com")
		if err != nil {
			t.Fatalf("ListSnapshots returned an unexpected error: %v", err)
		}
		if len(snapshots) != 2 {
			t.Fatalf("Expected 2 snapshots, got %d", len(snapshots))
		}
		if snapshots[0].Timestamp != "20220101000000" {
			t.Errorf("Expected timestamp '20220101000000', got '%s'", snapshots[0].Timestamp)
		}
	})

	t.Run("Bad - API error", func(t *testing.T) {
		http.DefaultClient = NewMockClient("server error", http.StatusInternalServerError)
		_, err := ListSnapshots("http://example.com")
		if err == nil {
			t.Fatal("ListSnapshots did not return an error for a non-200 response")
		}
	})

	t.Run("Ugly - Malformed JSON", func(t *testing.T) {
		http.DefaultClient = NewMockClient(`[`, http.StatusOK)
		_, err := ListSnapshots("http://example.com")
		if err == nil {
			t.Fatal("ListSnapshots did not return an error for malformed JSON")
		}
	})
}

func TestDownloadSnapshot(t *testing.T) {
	t.Run("Good", func(t *testing.T) {
		mockResponse := "<html><body>Hello, World!</body></html>"
		http.DefaultClient = NewMockClient(mockResponse, http.StatusOK)

		snapshot := Snapshot{Timestamp: "20220101000000", Original: "http://example.com/"}
		data, err := DownloadSnapshot(snapshot)
		if err != nil {
			t.Fatalf("DownloadSnapshot returned an unexpected error: %v", err)
		}
		if string(data) != mockResponse {
			t.Errorf("Expected response body '%s', got '%s'", mockResponse, string(data))
		}
	})
}

func TestRewriteLinks(t *testing.T) {
	baseURL, _ := url.Parse("http://example.com")
	htmlContent := `
<html><body>
<a href="https://web.archive.org/web/20220101000000/http://example.com/page1">Page 1</a>
<a href="https://web.archive.org/web/20220101000000/http://othersite.com/page2">Page 2</a>
<a href="/relative/path">Relative Path</a>
<img src="https://web.archive.org/web/20220101000000/http://example.com/image.jpg" />
</body></html>
`
	rewritten, err := RewriteLinks([]byte(htmlContent), baseURL)
	if err != nil {
		t.Fatalf("RewriteLinks returned an unexpected error: %v", err)
	}

	if !strings.Contains(string(rewritten), `href="/page1"`) {
		t.Error("Expected link to be rewritten to /page1")
	}
	if !strings.Contains(string(rewritten), `href="https://web.archive.org/web/20220101000000/http://othersite.com/page2"`) {
		t.Error("External link should not have been rewritten")
	}
	if !strings.Contains(string(rewritten), `href="/relative/path"`) {
		t.Error("Relative link should not have been changed")
	}
	if !strings.Contains(string(rewritten), `src="/image.jpg"`) {
		t.Error("Expected image src to be rewritten to /image.jpg")
	}
}
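One caveat worth noting: these tests replace http.DefaultClient and never restore it, so the mock can leak into tests that run later in the package. A minimal guard, illustrative rather than part of the commit:

func TestWithMockedClient(t *testing.T) {
	// Swap in the mock, and restore the real client when this test ends
	// so later tests see the genuine http.DefaultClient.
	orig := http.DefaultClient
	http.DefaultClient = NewMockClient(`[]`, http.StatusOK)
	t.Cleanup(func() { http.DefaultClient = orig })

	// ... exercise ListSnapshots / DownloadSnapshot here ...
}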