Snider 2026-02-08 10:40:39 -05:00 committed by GitHub
commit 236f014223
4 changed files with 671 additions and 0 deletions

cmd/collect_wayback.go (new file, 226 lines added)
@@ -0,0 +1,226 @@
package cmd
import (
"fmt"
"net/url"
"os"
"path/filepath"
"strings"
"text/tabwriter"
"time"

"github.com/Snider/Borg/pkg/wayback"
"github.com/spf13/cobra"
)
// Package-level instances of the wayback command and its subcommands.
var waybackCmd = NewWaybackCmd()
var waybackListCmd = NewWaybackListCmd()
var waybackCollectCmd = NewWaybackCollectCmd()
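// init registers the wayback command and its subcommands on the root command.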
func init() {
RootCmd.AddCommand(GetWaybackCmd())
GetWaybackCmd().AddCommand(GetWaybackListCmd())
GetWaybackCmd().AddCommand(GetWaybackCollectCmd())
}
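// GetWaybackCmd returns the top-level "wayback" command.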
func GetWaybackCmd() *cobra.Command {
return waybackCmd
}
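// GetWaybackListCmd returns the "wayback list" subcommand.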
func GetWaybackListCmd() *cobra.Command {
return waybackListCmd
}
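// GetWaybackCollectCmd returns the "wayback collect" subcommand.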
func GetWaybackCollectCmd() *cobra.Command {
return waybackCollectCmd
}
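// NewWaybackCmd constructs the parent "wayback" command that groups the subcommands below.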
func NewWaybackCmd() *cobra.Command {
cmd := &cobra.Command{
Use: "wayback",
Short: "Interact with the Internet Archive Wayback Machine.",
Long: `List and collect historical snapshots of websites from the Internet Archive Wayback Machine.`,
}
return cmd
}
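// NewWaybackListCmd constructs the "wayback list" subcommand, which queries the
// CDX API and prints a table of the available snapshots for a URL.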
func NewWaybackListCmd() *cobra.Command {
cmd := &cobra.Command{
Use: "list [url]",
Short: "List available snapshots for a URL.",
Long: `Queries the Wayback Machine CDX API to find all available snapshots for a given URL.`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
targetURL := args[0]
snapshots, err := wayback.ListSnapshots(targetURL)
if err != nil {
return fmt.Errorf("failed to list snapshots: %w", err)
}
if len(snapshots) == 0 {
fmt.Fprintln(cmd.OutOrStdout(), "No snapshots found.")
return nil
}
w := tabwriter.NewWriter(cmd.OutOrStdout(), 0, 0, 3, ' ', 0)
fmt.Fprintln(w, "TIMESTAMP\tMIMETYPE\tSTATUS\tLENGTH\tURL")
for _, s := range snapshots {
fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", s.Timestamp, s.MimeType, s.StatusCode, s.Length, s.Original)
}
return w.Flush()
},
}
return cmd
}
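// NewWaybackCollectCmd constructs the "wayback collect" subcommand, which downloads
// one or more snapshots (and their referenced assets) into an output directory and
// writes a TIMELINE.md summary of what was collected.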
func NewWaybackCollectCmd() *cobra.Command {
cmd := &cobra.Command{
Use: "collect [url]",
Short: "Collect a snapshot of a website.",
Long: `Collects a snapshot of a website from the Wayback Machine.`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
urlArg := args[0]
outputDir, _ := cmd.Flags().GetString("output")
latest, _ := cmd.Flags().GetBool("latest")
all, _ := cmd.Flags().GetBool("all")
date, _ := cmd.Flags().GetString("date")
if err := os.MkdirAll(outputDir, 0755); err != nil {
return fmt.Errorf("failed to create output directory: %w", err)
}
baseURL, err := url.Parse(urlArg)
if err != nil {
return fmt.Errorf("failed to parse URL: %w", err)
}
snapshots, err := wayback.ListSnapshots(urlArg)
if err != nil {
return fmt.Errorf("failed to list snapshots: %w", err)
}
if len(snapshots) == 0 {
fmt.Fprintln(cmd.OutOrStdout(), "No snapshots found.")
return nil
}
var timestamps []string
if latest {
timestamps = []string{snapshots[len(snapshots)-1].Timestamp}
} else if all {
for _, s := range snapshots {
timestamps = append(timestamps, s.Timestamp)
}
} else if date != "" {
filtered := filterSnapshotsByDate(snapshots, date)
if len(filtered) == 0 {
return fmt.Errorf("no snapshots found for date: %s", date)
}
for _, s := range filtered {
timestamps = append(timestamps, s.Timestamp)
}
} else {
return fmt.Errorf("either --latest, --all, or --date must be specified")
}
timeline := ""
downloadedDigests := make(map[string]bool)
assets, err := wayback.ListSnapshots(fmt.Sprintf("%s/*", urlArg))
if err != nil {
return fmt.Errorf("failed to list assets: %w", err)
}
for _, ts := range timestamps {
fmt.Fprintf(cmd.OutOrStdout(), "Collecting snapshot from %s...\n", ts)
snapshotDir := filepath.Join(outputDir, ts)
if err := os.MkdirAll(snapshotDir, 0755); err != nil {
return fmt.Errorf("failed to create snapshot directory: %w", err)
}
rootSnapshot := wayback.Snapshot{Timestamp: ts, Original: urlArg}
if err := downloadAndProcess(rootSnapshot, snapshotDir, baseURL, downloadedDigests); err != nil {
return err
}
timeline += fmt.Sprintf("- %s: %s\n", ts, urlArg)
}
func downloadAndProcess(snapshot wayback.Snapshot, snapshotDir string, baseURL *url.URL, downloadedDigests map[string]bool) error {
if downloadedDigests[snapshot.Digest] {
return nil
}
time.Sleep(200 * time.Millisecond) // Simple rate-limiting
fmt.Printf(" Downloading %s\n", snapshot.Original)
data, err := wayback.DownloadSnapshot(snapshot)
if err != nil {
return fmt.Errorf("failed to download asset %s: %w", snapshot.Original, err)
}
downloadedDigests[snapshot.Digest] = true
assetURL, err := url.Parse(snapshot.Original)
if err != nil {
return fmt.Errorf("failed to parse asset URL %s: %w", snapshot.Original, err)
}
path := assetURL.Path
if strings.HasSuffix(path, "/") {
path = filepath.Join(path, "index.html")
}
filePath := filepath.Join(snapshotDir, path)
if err := os.MkdirAll(filepath.Dir(filePath), 0755); err != nil {
return fmt.Errorf("failed to create asset directory for %s: %w", filePath, err)
}
if err := os.WriteFile(filePath, data, 0644); err != nil {
return fmt.Errorf("failed to write asset %s: %w", filePath, err)
}
if strings.HasPrefix(snapshot.MimeType, "text/html") {
rewrittenData, err := wayback.RewriteLinks(data, baseURL)
if err != nil {
return fmt.Errorf("failed to rewrite links for %s: %w", snapshot.Original, err)
}
if err := os.WriteFile(filePath, rewrittenData, 0644); err != nil {
return fmt.Errorf("failed to write rewritten asset %s: %w", filePath, err)
}
links, err := wayback.ExtractLinks(data)
if err != nil {
return fmt.Errorf("failed to extract links from %s: %w", snapshot.Original, err)
}
for _, link := range links {
absoluteURL := assetURL.ResolveReference(&url.URL{Path: link})
assetSnapshot := wayback.Snapshot{Timestamp: snapshot.Timestamp, Original: absoluteURL.String()}
if err := downloadAndProcess(assetSnapshot, snapshotDir, baseURL, downloadedDigests); err != nil {
fmt.Printf("Warning: failed to process asset %s: %v\n", absoluteURL.String(), err)
}
}
}
return nil
timelineFile := filepath.Join(outputDir, "TIMELINE.md")
if err := os.WriteFile(timelineFile, []byte(timeline), 0644); err != nil {
return fmt.Errorf("failed to write timeline file: %w", err)
}
fmt.Fprintf(cmd.OutOrStdout(), "Snapshots saved to %s\n", outputDir)
return nil
},
}
cmd.Flags().Bool("latest", false, "Collect the latest available snapshot.")
cmd.Flags().Bool("all", false, "Collect all available snapshots.")
cmd.Flags().String("date", "", "Collect a snapshot from a specific date (YYYY-MM-DD).")
cmd.Flags().String("output", "", "Output directory for the collected snapshots.")
cmd.MarkFlagRequired("output")
return cmd
}
// filterSnapshotsByDate returns the snapshots whose timestamp falls on the
// given date, which must be in YYYY-MM-DD form.
func filterSnapshotsByDate(snapshots []wayback.Snapshot, date string) []wayback.Snapshot {
if len(date) != 10 {
return nil
}
compact := date[:4] + date[5:7] + date[8:10] // YYYY-MM-DD -> YYYYMMDD
var filtered []wayback.Snapshot
for _, s := range snapshots {
if len(s.Timestamp) >= 8 && s.Timestamp[:8] == compact {
filtered = append(filtered, s)
}
}
return filtered
}

cmd/collect_wayback_test.go (new file, 147 lines added)
@@ -0,0 +1,147 @@
package cmd
import (
"bytes"
"io"
"net/http"
"os"
"strings"
"testing"
)
// MockRoundTripper is a mock implementation of http.RoundTripper for testing.
type MockRoundTripper struct {
Response *http.Response
Err error
RoundTripFunc func(req *http.Request) (*http.Response, error)
}
func (m *MockRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
if m.RoundTripFunc != nil {
return m.RoundTripFunc(req)
}
return m.Response, m.Err
}
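// NewMockClient returns an *http.Client whose transport always replies with the
// given body and status code.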
func NewMockClient(responseBody string, statusCode int) *http.Client {
return &http.Client{
Transport: &MockRoundTripper{
Response: &http.Response{
StatusCode: statusCode,
Body: io.NopCloser(bytes.NewBufferString(responseBody)),
},
},
}
}
func TestWaybackList(t *testing.T) {
t.Cleanup(func() {
RootCmd.SetArgs([]string{})
})
mockResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20220101000000", "http://example.com/", "text/html", "200", "DIGEST", "1234"]
]`
http.DefaultClient = NewMockClient(mockResponse, http.StatusOK)
output, err := executeCommand(RootCmd, "wayback", "list", "http://example.com")
if err != nil {
t.Fatalf("executeCommand returned an unexpected error: %v", err)
}
if !strings.Contains(output, "20220101000000") {
t.Errorf("Expected output to contain timestamp '20220101000000', got '%s'", output)
}
}
func TestWaybackCollect(t *testing.T) {
t.Cleanup(func() {
RootCmd.SetArgs([]string{})
})
t.Run("Good - Latest with Assets", func(t *testing.T) {
mockListResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST1", "1234"]
]`
mockAssetsResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST1", "1234"],
["com,example)/css/style.css", "20230101000000", "http://example.com/css/style.css", "text/css", "200", "DIGEST2", "5678"]
]`
mockHTMLContent := "<html><head><link rel='stylesheet' href='/css/style.css'></head><body>Hello</body></html>"
mockCSSContent := "body { color: red; }"
// A request-aware mock: the first request returns the snapshot listing, the
// second returns the asset listing, and later requests return the downloaded
// asset bodies. A more robust test would key responses on the full request URL,
// for example via a mock server.
var requestCount int
mock := &MockRoundTripper{}
http.DefaultClient = &http.Client{Transport: mock}
mock.RoundTripFunc = func(req *http.Request) (*http.Response, error) {
var body string
if requestCount == 0 {
body = mockListResponse
} else if requestCount == 1 {
body = mockAssetsResponse
} else if strings.Contains(req.URL.Path, "style.css") {
body = mockCSSContent
} else {
body = mockHTMLContent
}
requestCount++
return &http.Response{
StatusCode: http.StatusOK,
Body: io.NopCloser(bytes.NewBufferString(body)),
}, nil
}
tempDir, err := os.MkdirTemp("", "borg-test")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
defer os.RemoveAll(tempDir)
_, err = executeCommand(RootCmd, "wayback", "collect", "http://example.com", "--latest", "--output", tempDir)
if err != nil {
t.Fatalf("executeCommand returned an unexpected error: %v", err)
}
// Verify TIMELINE.md
timelineFile := tempDir + "/TIMELINE.md"
if _, err := os.Stat(timelineFile); os.IsNotExist(err) {
t.Errorf("Expected TIMELINE.md to be created in %s", tempDir)
}
// Verify index.html
indexFile := tempDir + "/20230101000000/index.html"
if _, err := os.Stat(indexFile); os.IsNotExist(err) {
t.Fatalf("Expected index.html to be created in %s", indexFile)
}
content, err := os.ReadFile(indexFile)
if err != nil {
t.Fatalf("Failed to read index.html: %v", err)
}
if !strings.Contains(string(content), "Hello") {
t.Errorf("index.html content is incorrect")
}
// Verify style.css
cssFile := tempDir + "/20230101000000/css/style.css"
if _, err := os.Stat(cssFile); os.IsNotExist(err) {
t.Fatalf("Expected style.css to be created in %s", cssFile)
}
content, err = os.ReadFile(cssFile)
if err != nil {
t.Fatalf("Failed to read style.css: %v", err)
}
if !strings.Contains(string(content), "color: red") {
t.Errorf("style.css content is incorrect")
}
})
}

pkg/wayback/wayback.go (new file, 184 lines added)
@@ -0,0 +1,184 @@
package wayback
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"

"golang.org/x/net/html"
)
// Snapshot represents a single entry from the Wayback Machine CDX API.
type Snapshot struct {
URLKey string
Timestamp string
Original string
MimeType string
StatusCode string
Digest string
Length string
}
// ListSnapshots queries the Wayback Machine's CDX API to get a list of
// available snapshots for a given URL.
func ListSnapshots(target string) ([]Snapshot, error) {
return listSnapshots(fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s&output=json", target))
}
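// listSnapshots fetches the given CDX API endpoint and parses its JSON response
// into Snapshot values.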
func listSnapshots(apiURL string) ([]Snapshot, error) {
resp, err := http.Get(apiURL)
if err != nil {
return nil, fmt.Errorf("failed to make request to CDX API: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("CDX API returned non-200 status: %s\nBody: %s", resp.Status, string(body))
}
var rawSnapshots [][]string
if err := json.NewDecoder(resp.Body).Decode(&rawSnapshots); err != nil {
return nil, fmt.Errorf("failed to decode JSON response from CDX API: %w", err)
}
if len(rawSnapshots) < 2 {
return []Snapshot{}, nil // No snapshots found is not an error
}
header := rawSnapshots[0]
fieldMap := make(map[string]int, len(header))
for i, field := range header {
fieldMap[field] = i
}
requiredFields := []string{"urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length"}
for _, field := range requiredFields {
if _, ok := fieldMap[field]; !ok {
return nil, fmt.Errorf("CDX API response is missing the required field: '%s'", field)
}
}
snapshots := make([]Snapshot, 0, len(rawSnapshots)-1)
for _, record := range rawSnapshots[1:] {
if len(record) != len(header) {
continue // Skip malformed records
}
snapshots = append(snapshots, Snapshot{
URLKey: record[fieldMap["urlkey"]],
Timestamp: record[fieldMap["timestamp"]],
Original: record[fieldMap["original"]],
MimeType: record[fieldMap["mimetype"]],
StatusCode: record[fieldMap["statuscode"]],
Digest: record[fieldMap["digest"]],
Length: record[fieldMap["length"]],
})
}
return snapshots, nil
}
// DownloadSnapshot downloads the raw content of a specific snapshot.
func DownloadSnapshot(snapshot Snapshot) ([]byte, error) {
// The "id_" modifier requests the raw ("identity") capture, without the Wayback Machine toolbar or link rewriting.
rawURL := fmt.Sprintf("https://web.archive.org/web/%sid_/%s", snapshot.Timestamp, snapshot.Original)
resp, err := http.Get(rawURL)
if err != nil {
return nil, fmt.Errorf("failed to make request to download snapshot: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("snapshot download returned non-200 status: %s\nURL: %s\nBody: %s", resp.Status, rawURL, string(body))
}
data, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("failed to read snapshot content: %w", err)
}
return data, nil
}
// RewriteLinks takes HTML content and rewrites internal links to be relative.
func RewriteLinks(htmlContent []byte, baseURL *url.URL) ([]byte, error) {
links, err := ExtractLinks(htmlContent)
if err != nil {
return nil, err
}
// This is a simplified implementation for now. A more robust solution
// would use a proper HTML parser to replace the links.
rewritten := string(htmlContent)
for _, link := range links {
newURL, changed := rewriteURL(link, baseURL)
if changed {
rewritten = strings.ReplaceAll(rewritten, link, newURL)
}
}
return []byte(rewritten), nil
}
// ExtractLinks takes HTML content and returns a list of all asset links.
func ExtractLinks(htmlContent []byte) ([]string, error) {
var links []string
doc, err := html.Parse(bytes.NewReader(htmlContent))
if err != nil {
return nil, fmt.Errorf("failed to parse HTML: %w", err)
}
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode {
for _, a := range n.Attr {
if a.Key == "href" || a.Key == "src" {
links = append(links, a.Val)
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
return links, nil
}
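// rewriteURL converts absolute URLs (including Wayback Machine capture URLs)
// that point at baseURL's host into root-relative paths. The boolean reports
// whether the URL was changed.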
func rewriteURL(rawURL string, baseURL *url.URL) (string, bool) {
parsedURL, err := url.Parse(rawURL)
if err != nil {
return rawURL, false
}
// If the URL is relative, we don't need to do anything.
if !parsedURL.IsAbs() {
return rawURL, false
}
// Handle Wayback Machine URLs
if parsedURL.Host == "web.archive.org" {
// Extract the original URL from the Wayback Machine URL
// e.g., /web/20220101120000/https://example.com/ -> https://example.com/
parts := strings.SplitN(parsedURL.Path, "/", 4)
if len(parts) >= 4 {
originalURL, err := url.Parse(parts[3])
if err == nil {
if originalURL.Host == baseURL.Host {
return originalURL.Path, true
}
}
}
}
// Handle absolute URLs that point to the same host
if parsedURL.Host == baseURL.Host {
return parsedURL.Path, true
}
return rawURL, false
}

pkg/wayback/wayback_test.go (new file, 114 lines added)
@@ -0,0 +1,114 @@
package wayback
import (
"bytes"
"io"
"net/http"
"net/url"
"strings"
"testing"
)
// MockRoundTripper is a mock implementation of http.RoundTripper for testing.
type MockRoundTripper struct {
Response *http.Response
Err error
}
func (m *MockRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
return m.Response, m.Err
}
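// NewMockClient returns an *http.Client whose transport always replies with the
// given body and status code.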
func NewMockClient(responseBody string, statusCode int) *http.Client {
return &http.Client{
Transport: &MockRoundTripper{
Response: &http.Response{
StatusCode: statusCode,
Body: io.NopCloser(bytes.NewBufferString(responseBody)),
},
},
}
}
func TestListSnapshots(t *testing.T) {
t.Run("Good", func(t *testing.T) {
mockResponse := `[
["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["com,example)/", "20220101000000", "http://example.com/", "text/html", "200", "DIGEST", "1234"],
["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST", "5678"]
]`
http.DefaultClient = NewMockClient(mockResponse, http.StatusOK)
snapshots, err := ListSnapshots("http://example.com")
if err != nil {
t.Fatalf("ListSnapshots returned an unexpected error: %v", err)
}
if len(snapshots) != 2 {
t.Fatalf("Expected 2 snapshots, got %d", len(snapshots))
}
if snapshots[0].Timestamp != "20220101000000" {
t.Errorf("Expected timestamp '20220101000000', got '%s'", snapshots[0].Timestamp)
}
})
t.Run("Bad - API error", func(t *testing.T) {
http.DefaultClient = NewMockClient("server error", http.StatusInternalServerError)
_, err := ListSnapshots("http://example.com")
if err == nil {
t.Fatal("ListSnapshots did not return an error for a non-200 response")
}
})
t.Run("Ugly - Malformed JSON", func(t *testing.T) {
http.DefaultClient = NewMockClient(`[`, http.StatusOK)
_, err := ListSnapshots("http://example.com")
if err == nil {
t.Fatal("ListSnapshots did not return an error for malformed JSON")
}
})
}
func TestDownloadSnapshot(t *testing.T) {
t.Run("Good", func(t *testing.T) {
mockResponse := "<html><body>Hello, World!</body></html>"
http.DefaultClient = NewMockClient(mockResponse, http.StatusOK)
snapshot := Snapshot{Timestamp: "20220101000000", Original: "http://example.com/"}
data, err := DownloadSnapshot(snapshot)
if err != nil {
t.Fatalf("DownloadSnapshot returned an unexpected error: %v", err)
}
if string(data) != mockResponse {
t.Errorf("Expected response body '%s', got '%s'", mockResponse, string(data))
}
})
}
func TestRewriteLinks(t *testing.T) {
baseURL, _ := url.Parse("http://example.com")
htmlContent := `
<html><body>
<a href="https://web.archive.org/web/20220101000000/http://example.com/page1">Page 1</a>
<a href="https://web.archive.org/web/20220101000000/http://othersite.com/page2">Page 2</a>
<a href="/relative/path">Relative Path</a>
<img src="https://web.archive.org/web/20220101000000/http://example.com/image.jpg" />
</body></html>
`
rewritten, err := RewriteLinks([]byte(htmlContent), baseURL)
if err != nil {
t.Fatalf("RewriteLinks returned an unexpected error: %v", err)
}
if !strings.Contains(string(rewritten), `href="/page1"`) {
t.Error("Expected link to be rewritten to /page1")
}
if !strings.Contains(string(rewritten), `href="https://web.archive.org/web/20220101000000/http://othersite.com/page2"`) {
t.Error("External link should not have been rewritten")
}
if !strings.Contains(string(rewritten), `href="/relative/path"`) {
t.Error("Relative link should not have been changed")
}
if !strings.Contains(string(rewritten), `src="/image.jpg"`) {
t.Error("Expected image src to be rewritten to /image.jpg")
}
}