From 5a35fa03afaf7dabf757ca320202fd88f3f84e1b Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Mon, 2 Feb 2026 00:46:40 +0000 Subject: [PATCH] feat: Add archive.org collection command This commit introduces the `borg collect archive` command, allowing users to collect items from the Internet Archive. The command includes three subcommands: - `search [query]`: Searches for items and collects them. - `item [identifier]`: Collects a specific item. - `collection [identifier]`: Collects all items in a collection. A new package, `pkg/archive`, has been created to handle all API interactions with archive.org. When no limit is set (a limit of -1, as used for collections), the implementation paginates through results 100 at a time so that all items are retrieved; a search with a limit issues a single request for at most that many items. Downloaded items are stored in an `archive/` directory, with each item's files and metadata saved in a subdirectory named after its identifier. Unit and integration tests have been added to verify the functionality of the new commands and the API client. All existing tests continue to pass. 
Co-authored-by: Snider <631881+Snider@users.noreply.github.com> --- .gitignore | 1 + cmd/collect_archive.go | 16 +++ cmd/collect_archive_collection.go | 34 ++++++ cmd/collect_archive_item.go | 22 ++++ cmd/collect_archive_search.go | 41 +++++++ cmd/collect_archive_test.go | 96 +++++++++++++++ pkg/archive/archive.go | 188 ++++++++++++++++++++++++++++++ pkg/archive/archive_test.go | 52 +++++++++ 8 files changed, 450 insertions(+) create mode 100644 cmd/collect_archive.go create mode 100644 cmd/collect_archive_collection.go create mode 100644 cmd/collect_archive_item.go create mode 100644 cmd/collect_archive_search.go create mode 100644 cmd/collect_archive_test.go create mode 100644 pkg/archive/archive.go create mode 100644 pkg/archive/archive_test.go diff --git a/.gitignore b/.gitignore index d3a3066..dadebd7 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ demo-track.smsg # Dev artifacts .playwright-mcp/ +archive/ diff --git a/cmd/collect_archive.go b/cmd/collect_archive.go new file mode 100644 index 0000000..2fce6cc --- /dev/null +++ b/cmd/collect_archive.go @@ -0,0 +1,16 @@ +package cmd + +import ( + "github.com/spf13/cobra" +) + +// collectArchiveCmd represents the collect archive command +var collectArchiveCmd = &cobra.Command{ + Use: "archive", + Short: "Collect a resource from the Internet Archive.", + Long: `Collect a resource from the Internet Archive, such as a search query, an item, or a collection.`, +} + +func init() { + collectCmd.AddCommand(collectArchiveCmd) +} diff --git a/cmd/collect_archive_collection.go b/cmd/collect_archive_collection.go new file mode 100644 index 0000000..95a1421 --- /dev/null +++ b/cmd/collect_archive_collection.go @@ -0,0 +1,34 @@ +package cmd + +import ( + "fmt" + "github.com/Snider/Borg/pkg/archive" + "github.com/spf13/cobra" +) + +// collectArchiveCollectionCmd represents the collect archive collection command +var collectArchiveCollectionCmd = &cobra.Command{ + Use: "collection [identifier]", + Short: "Collect a 
collection from the Internet Archive.", + Long: `Collect a collection and all of its items from the Internet Archive.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + identifier := args[0] + items, err := archive.GetCollection(identifier) + if err != nil { + return err + } + + for _, item := range items { + if err := archive.DownloadItem(item.Identifier, "archive", ""); err != nil { + fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s from collection: %v\n", item.Identifier, err) + } + } + + return nil + }, +} + +func init() { + collectArchiveCmd.AddCommand(collectArchiveCollectionCmd) +} diff --git a/cmd/collect_archive_item.go b/cmd/collect_archive_item.go new file mode 100644 index 0000000..814d754 --- /dev/null +++ b/cmd/collect_archive_item.go @@ -0,0 +1,22 @@ +package cmd + +import ( + "github.com/Snider/Borg/pkg/archive" + "github.com/spf13/cobra" +) + +// collectArchiveItemCmd represents the collect archive item command +var collectArchiveItemCmd = &cobra.Command{ + Use: "item [identifier]", + Short: "Collect an item from the Internet Archive.", + Long: `Collect an item and all of its files from the Internet Archive.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + identifier := args[0] + return archive.DownloadItem(identifier, "archive", "") + }, +} + +func init() { + collectArchiveCmd.AddCommand(collectArchiveItemCmd) +} diff --git a/cmd/collect_archive_search.go b/cmd/collect_archive_search.go new file mode 100644 index 0000000..268ff3c --- /dev/null +++ b/cmd/collect_archive_search.go @@ -0,0 +1,41 @@ +package cmd + +import ( + "fmt" + "github.com/Snider/Borg/pkg/archive" + "github.com/spf13/cobra" +) + +// collectArchiveSearchCmd represents the collect archive search command +var collectArchiveSearchCmd = &cobra.Command{ + Use: "search [query]", + Short: "Search for items on the Internet Archive.", + Long: `Search for items on the Internet Archive and collect them.`, + 
Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + query := args[0] + mediaType, _ := cmd.Flags().GetString("type") + limit, _ := cmd.Flags().GetInt("limit") + format, _ := cmd.Flags().GetString("format") + + items, err := archive.Search(query, mediaType, limit) + if err != nil { + return err + } + + for _, item := range items { + if err := archive.DownloadItem(item.Identifier, "archive", format); err != nil { + fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s: %v\n", item.Identifier, err) + } + } + + return nil + }, +} + +func init() { + collectArchiveCmd.AddCommand(collectArchiveSearchCmd) + collectArchiveSearchCmd.Flags().String("type", "", "Filter by mediatype (texts, software)") + collectArchiveSearchCmd.Flags().Int("limit", 10, "Max items to collect") + collectArchiveSearchCmd.Flags().String("format", "", "Preferred file format") +} diff --git a/cmd/collect_archive_test.go b/cmd/collect_archive_test.go new file mode 100644 index 0000000..23fee35 --- /dev/null +++ b/cmd/collect_archive_test.go @@ -0,0 +1,96 @@ +package cmd + +import ( + "fmt" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/Snider/Borg/pkg/archive" +) + +func TestCollectArchiveItemCmd_E2E(t *testing.T) { + tempDir := t.TempDir() + archiveDir := filepath.Join(tempDir, "archive") + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.Contains(r.URL.Path, "/metadata/") { + fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`) + } else if strings.Contains(r.URL.Path, "/download/") { + fmt.Fprintln(w, "file content") + } + })) + defer server.Close() + + originalURL := archive.BaseURL + archive.BaseURL = server.URL + defer func() { + archive.BaseURL = originalURL + }() + + // Change working directory for the test + originalWd, _ := os.Getwd() + os.Chdir(tempDir) + defer os.Chdir(originalWd) + + _, 
err := executeCommand(RootCmd, "collect", "archive", "item", "test-item") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Verify directory and files + itemDir := filepath.Join(archiveDir, "test-item") + if _, err := os.Stat(itemDir); os.IsNotExist(err) { + t.Errorf("expected directory %s to be created", itemDir) + } + + for _, f := range []string{"metadata.json", "_files.json", "test.txt", "image.jpg"} { + if _, err := os.Stat(filepath.Join(itemDir, f)); os.IsNotExist(err) { + t.Errorf("expected file %s to be created in %s", f, itemDir) + } + } +} + +func TestCollectArchiveSearchCmd_FormatFlag_E2E(t *testing.T) { + tempDir := t.TempDir() + archiveDir := filepath.Join(tempDir, "archive") + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.Contains(r.URL.Path, "/advancedsearch.php") { + fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`) + } else if strings.Contains(r.URL.Path, "/metadata/") { + fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`) + } else if strings.Contains(r.URL.Path, "/download/") { + fmt.Fprintln(w, "file content") + } + })) + defer server.Close() + + originalURL := archive.BaseURL + archive.BaseURL = server.URL + defer func() { + archive.BaseURL = originalURL + }() + + originalWd, _ := os.Getwd() + os.Chdir(tempDir) + defer os.Chdir(originalWd) + + _, err := executeCommand(RootCmd, "collect", "archive", "search", "test-query", "--format=Text") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + itemDir := filepath.Join(archiveDir, "test-item") + // Verify correct file is downloaded + if _, err := os.Stat(filepath.Join(itemDir, "test.txt")); os.IsNotExist(err) { + t.Errorf("expected test.txt to be downloaded") + } + // Verify incorrect format file is NOT downloaded + if _, err := os.Stat(filepath.Join(itemDir, "image.jpg")); err == nil { + t.Errorf("did not expect image.jpg 
to be downloaded") + } +} diff --git a/pkg/archive/archive.go b/pkg/archive/archive.go new file mode 100644 index 0000000..d5dab49 --- /dev/null +++ b/pkg/archive/archive.go @@ -0,0 +1,188 @@ +package archive + +import ( + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "os" +) + +var BaseURL = "https://archive.org" + +type Item struct { + Identifier string `json:"identifier"` +} + +type SearchResponse struct { + Response struct { + Docs []Item `json:"docs"` + } `json:"response"` +} + +type ItemMetadata struct { + Files []File `json:"files"` +} + +type File struct { + Name string `json:"name"` + Source string `json:"source"` + Format string `json:"format"` + Size string `json:"size"` +} + +func Search(query, mediaType string, limit int) ([]Item, error) { + var allItems []Item + page := 1 + const rowsPerPage = 100 // A reasonable number of results per page + + for { + baseURL, err := url.Parse(BaseURL + "/advancedsearch.php") + if err != nil { + return nil, err + } + + params := url.Values{} + params.Add("q", query) + if mediaType != "" { + params.Add("fq", "mediatype:"+mediaType) + } + params.Add("fl[]", "identifier") + params.Add("output", "json") + params.Add("page", fmt.Sprintf("%d", page)) + + if limit == -1 { + params.Add("rows", fmt.Sprintf("%d", rowsPerPage)) + } else { + params.Add("rows", fmt.Sprintf("%d", limit)) + } + + baseURL.RawQuery = params.Encode() + + resp, err := http.Get(baseURL.String()) + if err != nil { + return nil, err + } + + if resp.StatusCode != http.StatusOK { + resp.Body.Close() + return nil, fmt.Errorf("bad status: %s", resp.Status) + } + + var searchResponse SearchResponse + if err := json.NewDecoder(resp.Body).Decode(&searchResponse); err != nil { + resp.Body.Close() + return nil, err + } + resp.Body.Close() + + if len(searchResponse.Response.Docs) == 0 { + break // No more results + } + + allItems = append(allItems, searchResponse.Response.Docs...) 
+ + if limit != -1 && len(allItems) >= limit { + return allItems[:limit], nil + } + + if limit != -1 { + break // We only needed one page + } + + page++ + } + + return allItems, nil +} + +func GetItem(identifier string) (*ItemMetadata, error) { + url := fmt.Sprintf("%s/metadata/%s", BaseURL, identifier) + resp, err := http.Get(url) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("bad status: %s", resp.Status) + } + + var itemMetadata ItemMetadata + if err := json.NewDecoder(resp.Body).Decode(&itemMetadata); err != nil { + return nil, err + } + + return &itemMetadata, nil +} + +func DownloadFile(url, filepath string) error { + resp, err := http.Get(url) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("bad status: %s", resp.Status) + } + + out, err := os.Create(filepath) + if err != nil { + return err + } + defer out.Close() + + _, err = io.Copy(out, resp.Body) + return err +} + +func GetCollection(identifier string) ([]Item, error) { + return Search(fmt.Sprintf("collection:%s", identifier), "", -1) // -1 for no limit +} + +func DownloadItem(identifier, baseDir string, formatFilter string) error { + item, err := GetItem(identifier) + if err != nil { + return fmt.Errorf("could not get item metadata for %s: %w", identifier, err) + } + + itemDir := fmt.Sprintf("%s/%s", baseDir, identifier) + if err := os.MkdirAll(itemDir, 0755); err != nil { + return fmt.Errorf("could not create directory %s: %w", itemDir, err) + } + + // Save metadata + metadataJSON, err := json.MarshalIndent(item, "", " ") + if err != nil { + return fmt.Errorf("could not marshal metadata for %s: %w", identifier, err) + } + if err := os.WriteFile(fmt.Sprintf("%s/metadata.json", itemDir), metadataJSON, 0644); err != nil { + return fmt.Errorf("could not write metadata.json for %s: %w", identifier, err) + } + + // Save file list + filesJSON, err := 
json.MarshalIndent(item.Files, "", " ") + if err != nil { + return fmt.Errorf("could not marshal file list for %s: %w", identifier, err) + } + if err := os.WriteFile(fmt.Sprintf("%s/_files.json", itemDir), filesJSON, 0644); err != nil { + return fmt.Errorf("could not write _files.json for %s: %w", identifier, err) + } + + fmt.Printf("Downloading item %s...\n", identifier) + for _, file := range item.Files { + if formatFilter != "" && file.Format != formatFilter { + continue + } + downloadURL := fmt.Sprintf("%s/download/%s/%s", BaseURL, identifier, file.Name) + filePath := fmt.Sprintf("%s/%s", itemDir, file.Name) + fmt.Printf(" Downloading file %s...\n", file.Name) + if err := DownloadFile(downloadURL, filePath); err != nil { + // Log error but continue trying other files + fmt.Printf(" Error downloading %s: %v\n", file.Name, err) + } + } + return nil +} diff --git a/pkg/archive/archive_test.go b/pkg/archive/archive_test.go new file mode 100644 index 0000000..d74caec --- /dev/null +++ b/pkg/archive/archive_test.go @@ -0,0 +1,52 @@ +package archive + +import ( + "fmt" + "net/http" + "net/http/httptest" + "testing" +) + +func TestSearch(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`) + })) + defer server.Close() + + originalURL := BaseURL + BaseURL = server.URL + defer func() { + BaseURL = originalURL + }() + + items, err := Search("test", "", 1) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + if len(items) != 1 || items[0].Identifier != "test-item" { + t.Errorf("Expected to find 1 item with identifier 'test-item', but got %v", items) + } +} + +func TestGetItem(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintln(w, `{"files": [{"name": "test.txt"}]}`) + })) + defer server.Close() + + originalURL := BaseURL + BaseURL = server.URL + defer func() { 
+ BaseURL = originalURL + }() + + item, err := GetItem("test-item") + if err != nil { + t.Fatalf("GetItem failed: %v", err) + } + + if len(item.Files) != 1 || item.Files[0].Name != "test.txt" { + t.Errorf("Expected to find 1 file with name 'test.txt', but got %v", item.Files) + } +}