feat: Add archive.org collection command

This commit introduces the `borg collect archive` command, allowing users to collect items from the Internet Archive.

The command includes three subcommands:
- `search [query]`: Searches for items and collects them.
- `item [identifier]`: Collects a specific item.
- `collection [identifier]`: Collects all items in a collection.

A new package, `pkg/archive`, has been created to handle all API interactions with archive.org. The implementation includes pagination to ensure all items are retrieved from large searches or collections. Downloaded items are stored in an `archive/` directory, with each item's files and metadata saved in a subdirectory named after its identifier.

Unit and integration tests have been added to verify the functionality of the new commands and the API client. All existing tests continue to pass.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
This commit is contained in:
google-labs-jules[bot] 2026-02-02 00:46:40 +00:00
parent cf2af53ed3
commit 5a35fa03af
8 changed files with 450 additions and 0 deletions

1
.gitignore vendored
View file

@ -10,3 +10,4 @@ demo-track.smsg
# Dev artifacts
.playwright-mcp/
archive/

16
cmd/collect_archive.go Normal file
View file

@ -0,0 +1,16 @@
package cmd
import (
"github.com/spf13/cobra"
)
// collectArchiveCmd is the parent "archive" command under "borg collect".
// It has no RunE of its own; the actual behaviour lives in its subcommands
// (search, item, collection), which attach themselves in their own init funcs.
var collectArchiveCmd = &cobra.Command{
Use: "archive",
Short: "Collect a resource from the Internet Archive.",
Long: `Collect a resource from the Internet Archive, such as a search query, an item, or a collection.`,
}
// init wires the archive command into the existing "collect" command tree.
func init() {
collectCmd.AddCommand(collectArchiveCmd)
}

View file

@ -0,0 +1,34 @@
package cmd
import (
"fmt"
"github.com/Snider/Borg/pkg/archive"
"github.com/spf13/cobra"
)
// collectArchiveCollectionCmd represents the collect archive collection command.
var collectArchiveCollectionCmd = &cobra.Command{
	Use:   "collection [identifier]",
	Short: "Collect a collection from the Internet Archive.",
	Long:  `Collect a collection and all of its items from the Internet Archive.`,
	Args:  cobra.ExactArgs(1),
	RunE:  runCollectArchiveCollection,
}

// runCollectArchiveCollection fetches the member list of the named collection
// and downloads each member into the "archive" directory. Per-item download
// failures are reported on stderr but do not abort the remaining downloads.
func runCollectArchiveCollection(cmd *cobra.Command, args []string) error {
	members, err := archive.GetCollection(args[0])
	if err != nil {
		return err
	}
	for _, member := range members {
		downloadErr := archive.DownloadItem(member.Identifier, "archive", "")
		if downloadErr == nil {
			continue
		}
		fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s from collection: %v\n", member.Identifier, downloadErr)
	}
	return nil
}

func init() {
	collectArchiveCmd.AddCommand(collectArchiveCollectionCmd)
}

View file

@ -0,0 +1,22 @@
package cmd
import (
"github.com/Snider/Borg/pkg/archive"
"github.com/spf13/cobra"
)
// collectArchiveItemCmd represents the collect archive item command.
var collectArchiveItemCmd = &cobra.Command{
	Use:   "item [identifier]",
	Short: "Collect an item from the Internet Archive.",
	Long:  `Collect an item and all of its files from the Internet Archive.`,
	Args:  cobra.ExactArgs(1),
	RunE: func(cmd *cobra.Command, args []string) error {
		// Download into the default "archive" directory with no format
		// filter, matching the other collect subcommands.
		return archive.DownloadItem(args[0], "archive", "")
	},
}

func init() {
	collectArchiveCmd.AddCommand(collectArchiveItemCmd)
}

View file

@ -0,0 +1,41 @@
package cmd
import (
"fmt"
"github.com/Snider/Borg/pkg/archive"
"github.com/spf13/cobra"
)
// collectArchiveSearchCmd represents the collect archive search command.
// It searches archive.org for items matching the query and downloads each
// result; the --type, --limit, and --format flags refine the search and
// the files fetched per item.
var collectArchiveSearchCmd = &cobra.Command{
	Use:   "search [query]",
	Short: "Search for items on the Internet Archive.",
	Long:  `Search for items on the Internet Archive and collect them.`,
	Args:  cobra.ExactArgs(1),
	RunE: func(cmd *cobra.Command, args []string) error {
		query := args[0]
		// Surface flag lookup failures instead of silently proceeding
		// with zero values (previously discarded with `_`).
		mediaType, err := cmd.Flags().GetString("type")
		if err != nil {
			return err
		}
		limit, err := cmd.Flags().GetInt("limit")
		if err != nil {
			return err
		}
		format, err := cmd.Flags().GetString("format")
		if err != nil {
			return err
		}
		items, err := archive.Search(query, mediaType, limit)
		if err != nil {
			return err
		}
		// Best effort: report per-item failures on stderr and keep going.
		for _, item := range items {
			if err := archive.DownloadItem(item.Identifier, "archive", format); err != nil {
				fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s: %v\n", item.Identifier, err)
			}
		}
		return nil
	},
}

func init() {
	collectArchiveCmd.AddCommand(collectArchiveSearchCmd)
	collectArchiveSearchCmd.Flags().String("type", "", "Filter by mediatype (texts, software)")
	collectArchiveSearchCmd.Flags().Int("limit", 10, "Max items to collect")
	collectArchiveSearchCmd.Flags().String("format", "", "Preferred file format")
}

View file

@ -0,0 +1,96 @@
package cmd
import (
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"github.com/Snider/Borg/pkg/archive"
)
// TestCollectArchiveItemCmd_E2E runs "collect archive item" against a stubbed
// archive.org server and verifies the expected on-disk layout is produced.
func TestCollectArchiveItemCmd_E2E(t *testing.T) {
	tempDir := t.TempDir()
	archiveDir := filepath.Join(tempDir, "archive")

	// Stub both the metadata and download endpoints.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if strings.Contains(r.URL.Path, "/metadata/") {
			fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`)
		} else if strings.Contains(r.URL.Path, "/download/") {
			fmt.Fprintln(w, "file content")
		}
	}))
	defer server.Close()

	originalURL := archive.BaseURL
	archive.BaseURL = server.URL
	defer func() {
		archive.BaseURL = originalURL
	}()

	// Change working directory for the test. A failed chdir would make the
	// command write into the repo tree, so treat errors as fatal instead of
	// ignoring them as before.
	originalWd, err := os.Getwd()
	if err != nil {
		t.Fatalf("could not get working directory: %v", err)
	}
	if err := os.Chdir(tempDir); err != nil {
		t.Fatalf("could not chdir to temp dir: %v", err)
	}
	defer func() {
		if err := os.Chdir(originalWd); err != nil {
			t.Errorf("could not restore working directory: %v", err)
		}
	}()

	if _, err := executeCommand(RootCmd, "collect", "archive", "item", "test-item"); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Verify directory and files
	itemDir := filepath.Join(archiveDir, "test-item")
	if _, err := os.Stat(itemDir); os.IsNotExist(err) {
		t.Errorf("expected directory %s to be created", itemDir)
	}
	for _, f := range []string{"metadata.json", "_files.json", "test.txt", "image.jpg"} {
		if _, err := os.Stat(filepath.Join(itemDir, f)); os.IsNotExist(err) {
			t.Errorf("expected file %s to be created in %s", f, itemDir)
		}
	}
}
// TestCollectArchiveSearchCmd_FormatFlag_E2E verifies that --format restricts
// which files of a search result are downloaded.
func TestCollectArchiveSearchCmd_FormatFlag_E2E(t *testing.T) {
	tempDir := t.TempDir()
	archiveDir := filepath.Join(tempDir, "archive")

	// Stub search, metadata, and download endpoints.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if strings.Contains(r.URL.Path, "/advancedsearch.php") {
			fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`)
		} else if strings.Contains(r.URL.Path, "/metadata/") {
			fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`)
		} else if strings.Contains(r.URL.Path, "/download/") {
			fmt.Fprintln(w, "file content")
		}
	}))
	defer server.Close()

	originalURL := archive.BaseURL
	archive.BaseURL = server.URL
	defer func() {
		archive.BaseURL = originalURL
	}()

	// Fail fast on chdir problems rather than ignoring the errors as before.
	originalWd, err := os.Getwd()
	if err != nil {
		t.Fatalf("could not get working directory: %v", err)
	}
	if err := os.Chdir(tempDir); err != nil {
		t.Fatalf("could not chdir to temp dir: %v", err)
	}
	defer func() {
		if err := os.Chdir(originalWd); err != nil {
			t.Errorf("could not restore working directory: %v", err)
		}
	}()

	if _, err := executeCommand(RootCmd, "collect", "archive", "search", "test-query", "--format=Text"); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	itemDir := filepath.Join(archiveDir, "test-item")
	// Verify correct file is downloaded
	if _, err := os.Stat(filepath.Join(itemDir, "test.txt")); os.IsNotExist(err) {
		t.Errorf("expected test.txt to be downloaded")
	}
	// Verify incorrect format file is NOT downloaded
	if _, err := os.Stat(filepath.Join(itemDir, "image.jpg")); err == nil {
		t.Errorf("did not expect image.jpg to be downloaded")
	}
}

188
pkg/archive/archive.go Normal file
View file

@ -0,0 +1,188 @@
package archive
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"os"
)
// BaseURL is the root of the archive.org API. It is a package-level variable
// rather than a constant so tests can point the client at an httptest server.
var BaseURL = "https://archive.org"
// Item is one search result; only the identifier field is requested and decoded.
type Item struct {
Identifier string `json:"identifier"`
}
// SearchResponse mirrors the subset of the advancedsearch.php JSON payload
// that this package consumes.
type SearchResponse struct {
Response struct {
Docs []Item `json:"docs"`
} `json:"response"`
}
// ItemMetadata is the subset of the /metadata/<identifier> response we decode.
type ItemMetadata struct {
Files []File `json:"files"`
}
// File describes one downloadable file within an item. Size is kept as a
// string to match the JSON payload as declared by its tag.
type File struct {
Name string `json:"name"`
Source string `json:"source"`
Format string `json:"format"`
Size string `json:"size"`
}
// Search queries archive.org's advanced search for items matching query,
// optionally restricted to a mediatype. limit bounds the number of items
// returned; pass -1 to page through and return every result.
func Search(query, mediaType string, limit int) ([]Item, error) {
	const rowsPerPage = 100 // page size used when paging through all results

	var allItems []Item
	for page := 1; ; page++ {
		rows := rowsPerPage
		if limit != -1 {
			// A concrete limit fits in a single request.
			rows = limit
		}
		docs, err := searchPage(query, mediaType, page, rows)
		if err != nil {
			return nil, err
		}
		if len(docs) == 0 {
			break // no more results
		}
		allItems = append(allItems, docs...)
		if limit != -1 {
			if len(allItems) >= limit {
				return allItems[:limit], nil
			}
			break // we only needed one page
		}
	}
	return allItems, nil
}

// searchPage performs a single advancedsearch.php request and returns the
// decoded docs for that page. Extracted so the response body is closed on
// every path via defer instead of three hand-written Close calls.
func searchPage(query, mediaType string, page, rows int) ([]Item, error) {
	u, err := url.Parse(BaseURL + "/advancedsearch.php")
	if err != nil {
		return nil, err
	}
	params := url.Values{}
	params.Add("q", query)
	if mediaType != "" {
		params.Add("fq", "mediatype:"+mediaType)
	}
	params.Add("fl[]", "identifier")
	params.Add("output", "json")
	params.Add("page", fmt.Sprintf("%d", page))
	params.Add("rows", fmt.Sprintf("%d", rows))
	u.RawQuery = params.Encode()

	resp, err := http.Get(u.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("bad status: %s", resp.Status)
	}
	var searchResponse SearchResponse
	if err := json.NewDecoder(resp.Body).Decode(&searchResponse); err != nil {
		return nil, err
	}
	return searchResponse.Response.Docs, nil
}
// GetItem fetches and decodes the /metadata/<identifier> document for an item.
func GetItem(identifier string) (*ItemMetadata, error) {
	endpoint := fmt.Sprintf("%s/metadata/%s", BaseURL, identifier)
	resp, err := http.Get(endpoint)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("bad status: %s", resp.Status)
	}

	meta := &ItemMetadata{}
	if err := json.NewDecoder(resp.Body).Decode(meta); err != nil {
		return nil, err
	}
	return meta, nil
}
// DownloadFile fetches url and writes the response body to dest.
// The destination file's Close error is returned (previously it was
// silently dropped by a defer, which can hide write-back failures).
// The parameter was also renamed from `filepath`, which shadowed the
// standard library package name.
func DownloadFile(url, dest string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("bad status: %s", resp.Status)
	}
	out, err := os.Create(dest)
	if err != nil {
		return err
	}
	if _, err := io.Copy(out, resp.Body); err != nil {
		out.Close() // best effort; the copy error is the primary failure
		return err
	}
	return out.Close()
}
// GetCollection returns every item belonging to the named collection.
// A collection is just a search scoped to "collection:<identifier>";
// the -1 limit asks Search to page through all results.
func GetCollection(identifier string) ([]Item, error) {
	query := fmt.Sprintf("collection:%s", identifier)
	return Search(query, "", -1)
}
// DownloadItem fetches an item's metadata from archive.org and stores it,
// together with every matching file, under baseDir/<identifier>.
// An empty formatFilter downloads all files; otherwise only files whose
// Format equals the filter are fetched. Per-file download errors are
// reported and skipped so the remaining files are still attempted.
func DownloadItem(identifier, baseDir string, formatFilter string) error {
	meta, err := GetItem(identifier)
	if err != nil {
		return fmt.Errorf("could not get item metadata for %s: %w", identifier, err)
	}

	itemDir := fmt.Sprintf("%s/%s", baseDir, identifier)
	if err := os.MkdirAll(itemDir, 0755); err != nil {
		return fmt.Errorf("could not create directory %s: %w", itemDir, err)
	}

	// Persist the decoded metadata alongside the downloaded files.
	metaBytes, err := json.MarshalIndent(meta, "", " ")
	if err != nil {
		return fmt.Errorf("could not marshal metadata for %s: %w", identifier, err)
	}
	if err := os.WriteFile(fmt.Sprintf("%s/metadata.json", itemDir), metaBytes, 0644); err != nil {
		return fmt.Errorf("could not write metadata.json for %s: %w", identifier, err)
	}

	// Persist the file manifest separately for quick inspection.
	listBytes, err := json.MarshalIndent(meta.Files, "", " ")
	if err != nil {
		return fmt.Errorf("could not marshal file list for %s: %w", identifier, err)
	}
	if err := os.WriteFile(fmt.Sprintf("%s/_files.json", itemDir), listBytes, 0644); err != nil {
		return fmt.Errorf("could not write _files.json for %s: %w", identifier, err)
	}

	fmt.Printf("Downloading item %s...\n", identifier)
	for _, f := range meta.Files {
		// Skip files that do not match the requested format, if any.
		if formatFilter != "" && f.Format != formatFilter {
			continue
		}
		// NOTE(review): f.Name is interpolated into the URL unescaped —
		// presumably archive.org file names are URL-safe; confirm.
		src := fmt.Sprintf("%s/download/%s/%s", BaseURL, identifier, f.Name)
		dst := fmt.Sprintf("%s/%s", itemDir, f.Name)
		fmt.Printf(" Downloading file %s...\n", f.Name)
		if err := DownloadFile(src, dst); err != nil {
			// Best effort: report and move on to the next file.
			fmt.Printf(" Error downloading %s: %v\n", f.Name, err)
		}
	}
	return nil
}

View file

@ -0,0 +1,52 @@
package archive
import (
"fmt"
"net/http"
"net/http/httptest"
"testing"
)
// TestSearch verifies that Search decodes identifiers from a stubbed
// advancedsearch response.
func TestSearch(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`)
	}))
	defer server.Close()

	// Point the client at the stub and restore the real base URL afterwards.
	saved := BaseURL
	BaseURL = server.URL
	defer func() { BaseURL = saved }()

	got, err := Search("test", "", 1)
	if err != nil {
		t.Fatalf("Search failed: %v", err)
	}
	if len(got) != 1 || got[0].Identifier != "test-item" {
		t.Errorf("Expected to find 1 item with identifier 'test-item', but got %v", got)
	}
}
// TestGetItem verifies that GetItem decodes the file list from a stubbed
// metadata response.
func TestGetItem(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, `{"files": [{"name": "test.txt"}]}`)
	}))
	defer server.Close()

	// Point the client at the stub and restore the real base URL afterwards.
	saved := BaseURL
	BaseURL = server.URL
	defer func() { BaseURL = saved }()

	got, err := GetItem("test-item")
	if err != nil {
		t.Fatalf("GetItem failed: %v", err)
	}
	if len(got.Files) != 1 || got.Files[0].Name != "test.txt" {
		t.Errorf("Expected to find 1 file with name 'test.txt', but got %v", got.Files)
	}
}