From 5d71a365cd87a7e7fef299dff04e9858580dc5ca Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Mon, 2 Feb 2026 00:46:49 +0000 Subject: [PATCH] feat: Add Wayback Machine integration This commit introduces a new `wayback` command to interact with the Internet Archive's Wayback Machine. The `wayback` command has two subcommands: - `list`: Lists available snapshots for a given URL. - `collect`: Collects a snapshot of a website for offline viewing. The `collect` subcommand supports the following features: - Recursive downloading of all assets (CSS, JS, images, etc.). - Deduplication of content to avoid downloading the same file multiple times. - Rate-limiting to avoid overwhelming the Wayback Machine's API. - Rewriting of internal links for offline viewing. The implementation follows the existing command structure and includes unit and integration tests. Co-authored-by: Snider <631881+Snider@users.noreply.github.com> --- cmd/collect_wayback.go | 226 ++++++++++++++++++++++++++++++++++++ cmd/collect_wayback_test.go | 147 +++++++++++++++++++++++ pkg/wayback/wayback.go | 184 +++++++++++++++++++++++++++++ pkg/wayback/wayback_test.go | 114 ++++++++++++++++++ 4 files changed, 671 insertions(+) create mode 100644 cmd/collect_wayback.go create mode 100644 cmd/collect_wayback_test.go create mode 100644 pkg/wayback/wayback.go create mode 100644 pkg/wayback/wayback_test.go diff --git a/cmd/collect_wayback.go b/cmd/collect_wayback.go new file mode 100644 index 0000000..533a0a6 --- /dev/null +++ b/cmd/collect_wayback.go @@ -0,0 +1,226 @@ +package cmd + +import ( + "fmt" + "path/filepath" + "time" + "github.com/Snider/Borg/pkg/wayback" + "github.com/spf13/cobra" + "net/url" + "os" + "strings" + "text/tabwriter" +) + +// waybackCmd represents the wayback command +var waybackCmd = NewWaybackCmd() +var waybackListCmd = NewWaybackListCmd() +var waybackCollectCmd = NewWaybackCollectCmd() + +func init() { + RootCmd.AddCommand(GetWaybackCmd()) + GetWaybackCmd().AddCommand(GetWaybackListCmd()) + GetWaybackCmd().AddCommand(GetWaybackCollectCmd()) +} + +func GetWaybackCmd() *cobra.Command { + return waybackCmd +} + +func GetWaybackListCmd() *cobra.Command { + return waybackListCmd +} + +func GetWaybackCollectCmd() *cobra.Command { + return waybackCollectCmd +} + +func NewWaybackCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "wayback", + Short: "Interact with the Internet Archive Wayback Machine.", + Long: `List and collect historical snapshots of websites from the Internet Archive Wayback Machine.`, + } + return cmd +} + +func NewWaybackListCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "list [url]", + Short: "List available snapshots for a URL.", + Long: `Queries the Wayback Machine CDX API to find all available snapshots for a given URL.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + url := args[0] + snapshots, err := wayback.ListSnapshots(url) + if err != nil { + return fmt.Errorf("failed to list snapshots: %w", err) + } + + if len(snapshots) == 0 { + fmt.Fprintln(cmd.OutOrStdout(), "No snapshots found.") + return nil + } + + w := tabwriter.NewWriter(cmd.OutOrStdout(), 0, 0, 3, ' ', 0) + fmt.Fprintln(w, "TIMESTAMP\tMIMETYPE\tSTATUS\tLENGTH\tURL") + for _, s := range snapshots { + fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", s.Timestamp, s.MimeType, s.StatusCode, s.Length, s.Original) + } + return w.Flush() + }, + } + return cmd +} + +func NewWaybackCollectCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "collect [url]", + Short: "Collect a snapshot of a website.", + Long: `Collects a snapshot of a website from the Wayback Machine.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + urlArg := args[0] + outputDir, _ := cmd.Flags().GetString("output") + latest, _ := cmd.Flags().GetBool("latest") + all, _ := cmd.Flags().GetBool("all") + date, _ := cmd.Flags().GetString("date") + + if err := os.MkdirAll(outputDir, 0755); err != nil { + return fmt.Errorf("failed to create output directory: %w", err) + } + + baseURL, err := url.Parse(urlArg) + if err != nil { + return fmt.Errorf("failed to parse URL: %w", err) + } + + snapshots, err := wayback.ListSnapshots(urlArg) + if err != nil { + return fmt.Errorf("failed to list snapshots: %w", err) + } + if len(snapshots) == 0 { + fmt.Fprintln(cmd.OutOrStdout(), "No snapshots found.") + return nil + } + + var timestamps []string + if latest { + timestamps = []string{snapshots[len(snapshots)-1].Timestamp} + } else if all { + for _, s := range snapshots { + timestamps = append(timestamps, s.Timestamp) + } + } else if date != "" { + filtered := filterSnapshotsByDate(snapshots, date) + if len(filtered) == 0 { + return fmt.Errorf("no snapshots found for date: %s", date) + } + for _, s := range filtered { + timestamps = append(timestamps, s.Timestamp) + } + } else { + return fmt.Errorf("either --latest, --all, or --date must be specified") + } + + timeline := "" + downloadedDigests := make(map[string]bool) + + assets, err := wayback.ListSnapshots(fmt.Sprintf("%s/*", urlArg)) + if err != nil { + return fmt.Errorf("failed to list assets: %w", err) + } + + for _, ts := range timestamps { + fmt.Fprintf(cmd.OutOrStdout(), "Collecting snapshot from %s...\n", ts) + snapshotDir := filepath.Join(outputDir, ts) + if err := os.MkdirAll(snapshotDir, 0755); err != nil { + return fmt.Errorf("failed to create snapshot directory: %w", err) + } + + rootSnapshot := wayback.Snapshot{Timestamp: ts, Original: urlArg} + if err := downloadAndProcess(rootSnapshot, snapshotDir, baseURL, downloadedDigests); err != nil { + return err + } + + timeline += fmt.Sprintf("- %s: %s\n", ts, urlArg) + } + +func downloadAndProcess(snapshot wayback.Snapshot, snapshotDir string, baseURL *url.URL, downloadedDigests map[string]bool) error { + if downloadedDigests[snapshot.Digest] { + return nil + } + time.Sleep(200 * time.Millisecond) // Simple rate-limiting + fmt.Printf(" Downloading %s\n", snapshot.Original) + data, err := wayback.DownloadSnapshot(snapshot) + if err != nil { + return fmt.Errorf("failed to download asset %s: %w", snapshot.Original, err) + } + downloadedDigests[snapshot.Digest] = true + + assetURL, err := url.Parse(snapshot.Original) + if err != nil { + return fmt.Errorf("failed to parse asset URL %s: %w", snapshot.Original, err) + } + path := assetURL.Path + if strings.HasSuffix(path, "/") { + path = filepath.Join(path, "index.html") + } + filePath := filepath.Join(snapshotDir, path) + if err := os.MkdirAll(filepath.Dir(filePath), 0755); err != nil { + return fmt.Errorf("failed to create asset directory for %s: %w", filePath, err) + } + if err := os.WriteFile(filePath, data, 0644); err != nil { + return fmt.Errorf("failed to write asset %s: %w", filePath, err) + } + + if strings.HasPrefix(snapshot.MimeType, "text/html") { + rewrittenData, err := wayback.RewriteLinks(data, baseURL) + if err != nil { + return fmt.Errorf("failed to rewrite links for %s: %w", snapshot.Original, err) + } + if err := os.WriteFile(filePath, rewrittenData, 0644); err != nil { + return fmt.Errorf("failed to write rewritten asset %s: %w", filePath, err) + } + + links, err := wayback.ExtractLinks(data) + if err != nil { + return fmt.Errorf("failed to extract links from %s: %w", snapshot.Original, err) + } + + for _, link := range links { + absoluteURL := assetURL.ResolveReference(&url.URL{Path: link}) + assetSnapshot := wayback.Snapshot{Timestamp: snapshot.Timestamp, Original: absoluteURL.String()} + if err := downloadAndProcess(assetSnapshot, snapshotDir, baseURL, downloadedDigests); err != nil { + fmt.Printf("Warning: failed to process asset %s: %v\n", absoluteURL.String(), err) + } + } + } + return nil + + timelineFile := filepath.Join(outputDir, "TIMELINE.md") + if err := os.WriteFile(timelineFile, []byte(timeline), 0644); err != nil { + return fmt.Errorf("failed to write timeline file: %w", err) + } + + fmt.Fprintf(cmd.OutOrStdout(), "Snapshots saved to %s\n", outputDir) + return nil + }, + } + cmd.Flags().Bool("latest", false, "Collect the latest available snapshot.") + cmd.Flags().Bool("all", false, "Collect all available snapshots.") + cmd.Flags().String("date", "", "Collect a snapshot from a specific date (YYYY-MM-DD).") + cmd.Flags().String("output", "", "Output directory for the collected snapshots.") + cmd.MarkFlagRequired("output") + return cmd +} + +func filterSnapshotsByDate(snapshots []wayback.Snapshot, date string) []wayback.Snapshot { + var filtered []wayback.Snapshot + for _, s := range snapshots { + if len(s.Timestamp) >= 8 && s.Timestamp[:8] == date[:4]+date[5:7]+date[8:10] { + filtered = append(filtered, s) + } + } + return filtered +} diff --git a/cmd/collect_wayback_test.go b/cmd/collect_wayback_test.go new file mode 100644 index 0000000..7d2c07a --- /dev/null +++ b/cmd/collect_wayback_test.go @@ -0,0 +1,147 @@ +package cmd + +import ( + "bytes" + "io" + "net/http" + "os" + "strings" + "testing" +) + +// MockRoundTripper is a mock implementation of http.RoundTripper for testing. +type MockRoundTripper struct { + Response *http.Response + Err error + RoundTripFunc func(req *http.Request) (*http.Response, error) +} + +func (m *MockRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + if m.RoundTripFunc != nil { + return m.RoundTripFunc(req) + } + return m.Response, m.Err +} + +func NewMockClient(responseBody string, statusCode int) *http.Client { + return &http.Client{ + Transport: &MockRoundTripper{ + Response: &http.Response{ + StatusCode: statusCode, + Body: io.NopCloser(bytes.NewBufferString(responseBody)), + }, + }, + } +} + +func TestWaybackList(t *testing.T) { + t.Cleanup(func() { + RootCmd.SetArgs([]string{}) + }) + mockResponse := `[ + ["urlkey","timestamp","original","mimetype","statuscode","digest","length"], + ["com,example)/", "20220101000000", "http://example.com/", "text/html", "200", "DIGEST", "1234"] + ]` + http.DefaultClient = NewMockClient(mockResponse, http.StatusOK) + + output, err := executeCommand(RootCmd, "wayback", "list", "http://example.com") + if err != nil { + t.Fatalf("executeCommand returned an unexpected error: %v", err) + } + + if !strings.Contains(output, "20220101000000") { + t.Errorf("Expected output to contain timestamp '20220101000000', got '%s'", output) + } +} + +func TestWaybackCollect(t *testing.T) { + t.Cleanup(func() { + RootCmd.SetArgs([]string{}) + }) + t.Run("Good - Latest with Assets", func(t *testing.T) { + mockListResponse := `[ + ["urlkey","timestamp","original","mimetype","statuscode","digest","length"], + ["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST1", "1234"] + ]` + mockAssetsResponse := `[ + ["urlkey","timestamp","original","mimetype","statuscode","digest","length"], + ["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST1", "1234"], + ["com,example)/css/style.css", "20230101000000", "http://example.com/css/style.css", "text/css", "200", "DIGEST2", "5678"] + ]` + mockHTMLContent := "Hello" + mockCSSContent := "body { color: red; }" + + // This is still a simplified mock, but it's better. + // A more robust solution would use a mock server or a more sophisticated RoundTripper. + var requestCount int + http.DefaultClient = &http.Client{ + Transport: &MockRoundTripper{ + Response: &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(bytes.NewBufferString("")), // Placeholder + }, + }, + } + http.DefaultClient.Transport.(*MockRoundTripper).Response.Body = io.NopCloser(bytes.NewBufferString(mockListResponse)) + http.DefaultClient.Transport.(*MockRoundTripper).RoundTripFunc = func(req *http.Request) (*http.Response, error) { + var body string + if requestCount == 0 { + body = mockListResponse + } else if requestCount == 1 { + body = mockAssetsResponse + } else if strings.Contains(req.URL.Path, "style.css") { + body = mockCSSContent + } else { + body = mockHTMLContent + } + requestCount++ + return &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(bytes.NewBufferString(body)), + }, nil + } + + tempDir, err := os.MkdirTemp("", "borg-test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + defer os.RemoveAll(tempDir) + + _, err = executeCommand(RootCmd, "wayback", "collect", "http://example.com", "--latest", "--output", tempDir) + if err != nil { + t.Fatalf("executeCommand returned an unexpected error: %v", err) + } + + // Verify TIMELINE.md + timelineFile := tempDir + "/TIMELINE.md" + if _, err := os.Stat(timelineFile); os.IsNotExist(err) { + t.Errorf("Expected TIMELINE.md to be created in %s", tempDir) + } + + // Verify index.html + indexFile := tempDir + "/20230101000000/index.html" + if _, err := os.Stat(indexFile); os.IsNotExist(err) { + t.Fatalf("Expected index.html to be created in %s", indexFile) + } + content, err := os.ReadFile(indexFile) + if err != nil { + t.Fatalf("Failed to read index.html: %v", err) + } + if !strings.Contains(string(content), "Hello") { + t.Errorf("index.html content is incorrect") + } + + // Verify style.css + cssFile := tempDir + "/20230101000000/css/style.css" + if _, err := os.Stat(cssFile); os.IsNotExist(err) { + t.Fatalf("Expected style.css to be created in %s", cssFile) + } + content, err = os.ReadFile(cssFile) + if err != nil { + t.Fatalf("Failed to read style.css: %v", err) + } + if !strings.Contains(string(content), "color: red") { + t.Errorf("style.css content is incorrect") + } + }) +} diff --git a/pkg/wayback/wayback.go b/pkg/wayback/wayback.go new file mode 100644 index 0000000..c2358bc --- /dev/null +++ b/pkg/wayback/wayback.go @@ -0,0 +1,184 @@ +package wayback + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" + + "golang.org/x/net/html" +) + +// Snapshot represents a single entry from the Wayback Machine CDX API. +type Snapshot struct { + URLKey string + Timestamp string + Original string + MimeType string + StatusCode string + Digest string + Length string +} + +// ListSnapshots queries the Wayback Machine's CDX API to get a list of +// available snapshots for a given URL. +func ListSnapshots(url string) ([]Snapshot, error) { + return listSnapshots(fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s&output=json", url)) +} + +func listSnapshots(apiURL string) ([]Snapshot, error) { + resp, err := http.Get(apiURL) + if err != nil { + return nil, fmt.Errorf("failed to make request to CDX API: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("CDX API returned non-200 status: %s\nBody: %s", resp.Status, string(body)) + } + + var rawSnapshots [][]string + if err := json.NewDecoder(resp.Body).Decode(&rawSnapshots); err != nil { + return nil, fmt.Errorf("failed to decode JSON response from CDX API: %w", err) + } + + if len(rawSnapshots) < 2 { + return []Snapshot{}, nil // No snapshots found is not an error + } + + header := rawSnapshots[0] + fieldMap := make(map[string]int, len(header)) + for i, field := range header { + fieldMap[field] = i + } + + requiredFields := []string{"urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length"} + for _, field := range requiredFields { + if _, ok := fieldMap[field]; !ok { + return nil, fmt.Errorf("CDX API response is missing the required field: '%s'", field) + } + } + + snapshots := make([]Snapshot, 0, len(rawSnapshots)-1) + for _, record := range rawSnapshots[1:] { + if len(record) != len(header) { + continue // Skip malformed records + } + snapshots = append(snapshots, Snapshot{ + URLKey: record[fieldMap["urlkey"]], + Timestamp: record[fieldMap["timestamp"]], + Original: record[fieldMap["original"]], + MimeType: record[fieldMap["mimetype"]], + StatusCode: record[fieldMap["statuscode"]], + Digest: record[fieldMap["digest"]], + Length: record[fieldMap["length"]], + }) + } + + return snapshots, nil +} + +// DownloadSnapshot downloads the raw content of a specific snapshot. +func DownloadSnapshot(snapshot Snapshot) ([]byte, error) { + // Construct the URL for the raw snapshot content, which includes "id_" for "identity" + rawURL := fmt.Sprintf("https://web.archive.org/web/%sid_/%s", snapshot.Timestamp, snapshot.Original) + + resp, err := http.Get(rawURL) + if err != nil { + return nil, fmt.Errorf("failed to make request to download snapshot: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("snapshot download returned non-200 status: %s\nURL: %s\nBody: %s", resp.Status, rawURL, string(body)) + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read snapshot content: %w", err) + } + + return data, nil +} + +// RewriteLinks takes HTML content and rewrites internal links to be relative. +func RewriteLinks(htmlContent []byte, baseURL *url.URL) ([]byte, error) { + links, err := ExtractLinks(htmlContent) + if err != nil { + return nil, err + } + // This is a simplified implementation for now. A more robust solution + // would use a proper HTML parser to replace the links. + rewritten := string(htmlContent) + for _, link := range links { + newURL, changed := rewriteURL(link, baseURL) + if changed { + rewritten = strings.ReplaceAll(rewritten, link, newURL) + } + } + return []byte(rewritten), nil +} + +// ExtractLinks takes HTML content and returns a list of all asset links. +func ExtractLinks(htmlContent []byte) ([]string, error) { + var links []string + doc, err := html.Parse(bytes.NewReader(htmlContent)) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML: %w", err) + } + + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.ElementNode { + for _, a := range n.Attr { + if a.Key == "href" || a.Key == "src" { + links = append(links, a.Val) + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + f(doc) + return links, nil +} + +func rewriteURL(rawURL string, baseURL *url.URL) (string, bool) { + parsedURL, err := url.Parse(rawURL) + if err != nil { + return rawURL, false + } + + // If the URL is relative, we don't need to do anything. + if !parsedURL.IsAbs() { + return rawURL, false + } + + // Handle Wayback Machine URLs + if strings.HasPrefix(parsedURL.Host, "web.archive.org") { + // Extract the original URL from the Wayback Machine URL + // e.g., /web/20220101120000/https://example.com/ -> https://example.com/ + parts := strings.SplitN(parsedURL.Path, "/", 4) + if len(parts) >= 4 { + originalURL, err := url.Parse(parts[3]) + if err == nil { + if originalURL.Host == baseURL.Host { + return originalURL.Path, true + } + } + } + } + + // Handle absolute URLs that point to the same host + if parsedURL.Host == baseURL.Host { + return parsedURL.Path, true + } + + return rawURL, false +} diff --git a/pkg/wayback/wayback_test.go b/pkg/wayback/wayback_test.go new file mode 100644 index 0000000..ccd4fd1 --- /dev/null +++ b/pkg/wayback/wayback_test.go @@ -0,0 +1,114 @@ +package wayback + +import ( + "bytes" + "io" + "net/http" + "net/url" + "strings" + "testing" +) + +// MockRoundTripper is a mock implementation of http.RoundTripper for testing. +type MockRoundTripper struct { + Response *http.Response + Err error +} + +func (m *MockRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + return m.Response, m.Err +} + +func NewMockClient(responseBody string, statusCode int) *http.Client { + return &http.Client{ + Transport: &MockRoundTripper{ + Response: &http.Response{ + StatusCode: statusCode, + Body: io.NopCloser(bytes.NewBufferString(responseBody)), + }, + }, + } +} + +func TestListSnapshots(t *testing.T) { + t.Run("Good", func(t *testing.T) { + mockResponse := `[ + ["urlkey","timestamp","original","mimetype","statuscode","digest","length"], + ["com,example)/", "20220101000000", "http://example.com/", "text/html", "200", "DIGEST", "1234"], + ["com,example)/", "20230101000000", "http://example.com/", "text/html", "200", "DIGEST", "5678"] + ]` + http.DefaultClient = NewMockClient(mockResponse, http.StatusOK) + + snapshots, err := ListSnapshots("http://example.com") + if err != nil { + t.Fatalf("ListSnapshots returned an unexpected error: %v", err) + } + if len(snapshots) != 2 { + t.Fatalf("Expected 2 snapshots, got %d", len(snapshots)) + } + if snapshots[0].Timestamp != "20220101000000" { + t.Errorf("Expected timestamp '20220101000000', got '%s'", snapshots[0].Timestamp) + } + }) + + t.Run("Bad - API error", func(t *testing.T) { + http.DefaultClient = NewMockClient("server error", http.StatusInternalServerError) + _, err := ListSnapshots("http://example.com") + if err == nil { + t.Fatal("ListSnapshots did not return an error for a non-200 response") + } + }) + + t.Run("Ugly - Malformed JSON", func(t *testing.T) { + http.DefaultClient = NewMockClient(`[`, http.StatusOK) + _, err := ListSnapshots("http://example.com") + if err == nil { + t.Fatal("ListSnapshots did not return an error for malformed JSON") + } + }) +} + +func TestDownloadSnapshot(t *testing.T) { + t.Run("Good", func(t *testing.T) { + mockResponse := "Hello, World!" + http.DefaultClient = NewMockClient(mockResponse, http.StatusOK) + + snapshot := Snapshot{Timestamp: "20220101000000", Original: "http://example.com/"} + data, err := DownloadSnapshot(snapshot) + if err != nil { + t.Fatalf("DownloadSnapshot returned an unexpected error: %v", err) + } + if string(data) != mockResponse { + t.Errorf("Expected response body '%s', got '%s'", mockResponse, string(data)) + } + }) +} + +func TestRewriteLinks(t *testing.T) { + baseURL, _ := url.Parse("http://example.com") + htmlContent := ` + + Page 1 + Page 2 + Relative Path + + + ` + rewritten, err := RewriteLinks([]byte(htmlContent), baseURL) + if err != nil { + t.Fatalf("RewriteLinks returned an unexpected error: %v", err) + } + + if !strings.Contains(string(rewritten), `href="/page1"`) { + t.Error("Expected link to be rewritten to /page1") + } + if !strings.Contains(string(rewritten), `href="https://web.archive.org/web/20220101000000/http://othersite.com/page2"`) { + t.Error("External link should not have been rewritten") + } + if !strings.Contains(string(rewritten), `href="/relative/path"`) { + t.Error("Relative link should not have been changed") + } + if !strings.Contains(string(rewritten), `src="/image.jpg"`) { + t.Error("Expected image src to be rewritten to /image.jpg") + } +}