feat: Add archive.org collection command
This commit introduces the `borg collect archive` command, allowing users to collect items from the Internet Archive. The command includes three subcommands:

- `search [query]`: Searches for items and collects them.
- `item [identifier]`: Collects a specific item.
- `collection [identifier]`: Collects all items in a collection.

A new package, `pkg/archive`, has been created to handle all API interactions with archive.org. The implementation includes pagination to ensure all items are retrieved from large searches or collections. Downloaded items are stored in an `archive/` directory, with each item's files and metadata saved in a subdirectory named after its identifier.

Unit and integration tests have been added to verify the functionality of the new commands and the API client. All existing tests continue to pass.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
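For orientation, a usage sketch of the new command tree (a hedged example: it assumes the binary is built as `borg`, and the query and identifiers are placeholders; the flags are the ones registered in cmd/collect_archive_search.go below):

    # Search and collect up to 10 matching items, optionally filtered by mediatype and file format.
    borg collect archive search "some query" --type=texts --limit=10 --format=Text

    # Collect a single item by its identifier.
    borg collect archive item some-item-identifier

    # Collect every item in a collection; pagination fetches all pages of results.
    borg collect archive collection some-collection-identifier

Each collected item is written to archive/<identifier>/, alongside a metadata.json dump and a _files.json listing of the item's files.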
Parent: cf2af53ed3
Commit: 5a35fa03af
8 changed files with 450 additions and 0 deletions
.gitignore (vendored, 1 addition)
@@ -10,3 +10,4 @@ demo-track.smsg
 # Dev artifacts
 .playwright-mcp/
+archive/
cmd/collect_archive.go (new file, 16 lines)
@@ -0,0 +1,16 @@
package cmd

import (
    "github.com/spf13/cobra"
)

// collectArchiveCmd represents the collect archive command
var collectArchiveCmd = &cobra.Command{
    Use:   "archive",
    Short: "Collect a resource from the Internet Archive.",
    Long:  `Collect a resource from the Internet Archive, such as a search query, an item, or a collection.`,
}

func init() {
    collectCmd.AddCommand(collectArchiveCmd)
}
cmd/collect_archive_collection.go (new file, 34 lines)
@@ -0,0 +1,34 @@
package cmd

import (
    "fmt"
    "github.com/Snider/Borg/pkg/archive"
    "github.com/spf13/cobra"
)

// collectArchiveCollectionCmd represents the collect archive collection command
var collectArchiveCollectionCmd = &cobra.Command{
    Use:   "collection [identifier]",
    Short: "Collect a collection from the Internet Archive.",
    Long:  `Collect a collection and all of its items from the Internet Archive.`,
    Args:  cobra.ExactArgs(1),
    RunE: func(cmd *cobra.Command, args []string) error {
        identifier := args[0]
        items, err := archive.GetCollection(identifier)
        if err != nil {
            return err
        }

        for _, item := range items {
            if err := archive.DownloadItem(item.Identifier, "archive", ""); err != nil {
                fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s from collection: %v\n", item.Identifier, err)
            }
        }

        return nil
    },
}

func init() {
    collectArchiveCmd.AddCommand(collectArchiveCollectionCmd)
}
cmd/collect_archive_item.go (new file, 22 lines)
@@ -0,0 +1,22 @@
package cmd

import (
    "github.com/Snider/Borg/pkg/archive"
    "github.com/spf13/cobra"
)

// collectArchiveItemCmd represents the collect archive item command
var collectArchiveItemCmd = &cobra.Command{
    Use:   "item [identifier]",
    Short: "Collect an item from the Internet Archive.",
    Long:  `Collect an item and all of its files from the Internet Archive.`,
    Args:  cobra.ExactArgs(1),
    RunE: func(cmd *cobra.Command, args []string) error {
        identifier := args[0]
        return archive.DownloadItem(identifier, "archive", "")
    },
}

func init() {
    collectArchiveCmd.AddCommand(collectArchiveItemCmd)
}
cmd/collect_archive_search.go (new file, 41 lines)
@@ -0,0 +1,41 @@
package cmd

import (
    "fmt"
    "github.com/Snider/Borg/pkg/archive"
    "github.com/spf13/cobra"
)

// collectArchiveSearchCmd represents the collect archive search command
var collectArchiveSearchCmd = &cobra.Command{
    Use:   "search [query]",
    Short: "Search for items on the Internet Archive.",
    Long:  `Search for items on the Internet Archive and collect them.`,
    Args:  cobra.ExactArgs(1),
    RunE: func(cmd *cobra.Command, args []string) error {
        query := args[0]
        mediaType, _ := cmd.Flags().GetString("type")
        limit, _ := cmd.Flags().GetInt("limit")
        format, _ := cmd.Flags().GetString("format")

        items, err := archive.Search(query, mediaType, limit)
        if err != nil {
            return err
        }

        for _, item := range items {
            if err := archive.DownloadItem(item.Identifier, "archive", format); err != nil {
                fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s: %v\n", item.Identifier, err)
            }
        }

        return nil
    },
}

func init() {
    collectArchiveCmd.AddCommand(collectArchiveSearchCmd)
    collectArchiveSearchCmd.Flags().String("type", "", "Filter by mediatype (texts, software)")
    collectArchiveSearchCmd.Flags().Int("limit", 10, "Max items to collect")
    collectArchiveSearchCmd.Flags().String("format", "", "Preferred file format")
}
cmd/collect_archive_test.go (new file, 96 lines)
@@ -0,0 +1,96 @@
package cmd

import (
    "fmt"
    "net/http"
    "net/http/httptest"
    "os"
    "path/filepath"
    "strings"
    "testing"

    "github.com/Snider/Borg/pkg/archive"
)

func TestCollectArchiveItemCmd_E2E(t *testing.T) {
    tempDir := t.TempDir()
    archiveDir := filepath.Join(tempDir, "archive")

    server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        if strings.Contains(r.URL.Path, "/metadata/") {
            fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`)
        } else if strings.Contains(r.URL.Path, "/download/") {
            fmt.Fprintln(w, "file content")
        }
    }))
    defer server.Close()

    originalURL := archive.BaseURL
    archive.BaseURL = server.URL
    defer func() {
        archive.BaseURL = originalURL
    }()

    // Change working directory for the test
    originalWd, _ := os.Getwd()
    os.Chdir(tempDir)
    defer os.Chdir(originalWd)

    _, err := executeCommand(RootCmd, "collect", "archive", "item", "test-item")
    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    // Verify directory and files
    itemDir := filepath.Join(archiveDir, "test-item")
    if _, err := os.Stat(itemDir); os.IsNotExist(err) {
        t.Errorf("expected directory %s to be created", itemDir)
    }

    for _, f := range []string{"metadata.json", "_files.json", "test.txt", "image.jpg"} {
        if _, err := os.Stat(filepath.Join(itemDir, f)); os.IsNotExist(err) {
            t.Errorf("expected file %s to be created in %s", f, itemDir)
        }
    }
}

func TestCollectArchiveSearchCmd_FormatFlag_E2E(t *testing.T) {
    tempDir := t.TempDir()
    archiveDir := filepath.Join(tempDir, "archive")

    server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        if strings.Contains(r.URL.Path, "/advancedsearch.php") {
            fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`)
        } else if strings.Contains(r.URL.Path, "/metadata/") {
            fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`)
        } else if strings.Contains(r.URL.Path, "/download/") {
            fmt.Fprintln(w, "file content")
        }
    }))
    defer server.Close()

    originalURL := archive.BaseURL
    archive.BaseURL = server.URL
    defer func() {
        archive.BaseURL = originalURL
    }()

    originalWd, _ := os.Getwd()
    os.Chdir(tempDir)
    defer os.Chdir(originalWd)

    _, err := executeCommand(RootCmd, "collect", "archive", "search", "test-query", "--format=Text")
    if err != nil {
        t.Fatalf("unexpected error: %v", err)
    }

    itemDir := filepath.Join(archiveDir, "test-item")
    // Verify correct file is downloaded
    if _, err := os.Stat(filepath.Join(itemDir, "test.txt")); os.IsNotExist(err) {
        t.Errorf("expected test.txt to be downloaded")
    }
    // Verify incorrect format file is NOT downloaded
    if _, err := os.Stat(filepath.Join(itemDir, "image.jpg")); err == nil {
        t.Errorf("did not expect image.jpg to be downloaded")
    }
}
pkg/archive/archive.go (new file, 188 lines)
@@ -0,0 +1,188 @@
package archive

import (
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "os"
)

var BaseURL = "https://archive.org"

type Item struct {
    Identifier string `json:"identifier"`
}

type SearchResponse struct {
    Response struct {
        Docs []Item `json:"docs"`
    } `json:"response"`
}

type ItemMetadata struct {
    Files []File `json:"files"`
}

type File struct {
    Name   string `json:"name"`
    Source string `json:"source"`
    Format string `json:"format"`
    Size   string `json:"size"`
}

func Search(query, mediaType string, limit int) ([]Item, error) {
    var allItems []Item
    page := 1
    const rowsPerPage = 100 // A reasonable number of results per page

    for {
        baseURL, err := url.Parse(BaseURL + "/advancedsearch.php")
        if err != nil {
            return nil, err
        }

        params := url.Values{}
        params.Add("q", query)
        if mediaType != "" {
            params.Add("fq", "mediatype:"+mediaType)
        }
        params.Add("fl[]", "identifier")
        params.Add("output", "json")
        params.Add("page", fmt.Sprintf("%d", page))

        if limit == -1 {
            params.Add("rows", fmt.Sprintf("%d", rowsPerPage))
        } else {
            params.Add("rows", fmt.Sprintf("%d", limit))
        }

        baseURL.RawQuery = params.Encode()

        resp, err := http.Get(baseURL.String())
        if err != nil {
            return nil, err
        }

        if resp.StatusCode != http.StatusOK {
            resp.Body.Close()
            return nil, fmt.Errorf("bad status: %s", resp.Status)
        }

        var searchResponse SearchResponse
        if err := json.NewDecoder(resp.Body).Decode(&searchResponse); err != nil {
            resp.Body.Close()
            return nil, err
        }
        resp.Body.Close()

        if len(searchResponse.Response.Docs) == 0 {
            break // No more results
        }

        allItems = append(allItems, searchResponse.Response.Docs...)

        if limit != -1 && len(allItems) >= limit {
            return allItems[:limit], nil
        }

        if limit != -1 {
            break // We only needed one page
        }

        page++
    }

    return allItems, nil
}

func GetItem(identifier string) (*ItemMetadata, error) {
    url := fmt.Sprintf("%s/metadata/%s", BaseURL, identifier)
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("bad status: %s", resp.Status)
    }

    var itemMetadata ItemMetadata
    if err := json.NewDecoder(resp.Body).Decode(&itemMetadata); err != nil {
        return nil, err
    }

    return &itemMetadata, nil
}

func DownloadFile(url, filepath string) error {
    resp, err := http.Get(url)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("bad status: %s", resp.Status)
    }

    out, err := os.Create(filepath)
    if err != nil {
        return err
    }
    defer out.Close()

    _, err = io.Copy(out, resp.Body)
    return err
}

func GetCollection(identifier string) ([]Item, error) {
    return Search(fmt.Sprintf("collection:%s", identifier), "", -1) // -1 for no limit
}

func DownloadItem(identifier, baseDir string, formatFilter string) error {
    item, err := GetItem(identifier)
    if err != nil {
        return fmt.Errorf("could not get item metadata for %s: %w", identifier, err)
    }

    itemDir := fmt.Sprintf("%s/%s", baseDir, identifier)
    if err := os.MkdirAll(itemDir, 0755); err != nil {
        return fmt.Errorf("could not create directory %s: %w", itemDir, err)
    }

    // Save metadata
    metadataJSON, err := json.MarshalIndent(item, "", " ")
    if err != nil {
        return fmt.Errorf("could not marshal metadata for %s: %w", identifier, err)
    }
    if err := os.WriteFile(fmt.Sprintf("%s/metadata.json", itemDir), metadataJSON, 0644); err != nil {
        return fmt.Errorf("could not write metadata.json for %s: %w", identifier, err)
    }

    // Save file list
    filesJSON, err := json.MarshalIndent(item.Files, "", " ")
    if err != nil {
        return fmt.Errorf("could not marshal file list for %s: %w", identifier, err)
    }
    if err := os.WriteFile(fmt.Sprintf("%s/_files.json", itemDir), filesJSON, 0644); err != nil {
        return fmt.Errorf("could not write _files.json for %s: %w", identifier, err)
    }

    fmt.Printf("Downloading item %s...\n", identifier)
    for _, file := range item.Files {
        if formatFilter != "" && file.Format != formatFilter {
            continue
        }
        downloadURL := fmt.Sprintf("%s/download/%s/%s", BaseURL, identifier, file.Name)
        filePath := fmt.Sprintf("%s/%s", itemDir, file.Name)
        fmt.Printf(" Downloading file %s...\n", file.Name)
        if err := DownloadFile(downloadURL, filePath); err != nil {
            // Log error but continue trying other files
            fmt.Printf(" Error downloading %s: %v\n", file.Name, err)
        }
    }
    return nil
}
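For callers that want the package directly rather than through the CLI, a minimal sketch of the exported API (hedged: this program is illustrative, not part of the commit, the query is a placeholder, and it assumes the default BaseURL of https://archive.org is reachable):

package main

import (
    "fmt"

    "github.com/Snider/Borg/pkg/archive"
)

func main() {
    // Search returns up to `limit` items; a limit of -1 pages through every result.
    items, err := archive.Search("some query", "texts", 5)
    if err != nil {
        fmt.Println("search failed:", err)
        return
    }
    for _, item := range items {
        // DownloadItem writes metadata.json, _files.json, and the item's files
        // into archive/<identifier>/; the final argument filters files by format.
        if err := archive.DownloadItem(item.Identifier, "archive", "Text"); err != nil {
            fmt.Println("download failed:", err)
        }
    }
}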
pkg/archive/archive_test.go (new file, 52 lines)
@@ -0,0 +1,52 @@
package archive

import (
    "fmt"
    "net/http"
    "net/http/httptest"
    "testing"
)

func TestSearch(t *testing.T) {
    server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`)
    }))
    defer server.Close()

    originalURL := BaseURL
    BaseURL = server.URL
    defer func() {
        BaseURL = originalURL
    }()

    items, err := Search("test", "", 1)
    if err != nil {
        t.Fatalf("Search failed: %v", err)
    }

    if len(items) != 1 || items[0].Identifier != "test-item" {
        t.Errorf("Expected to find 1 item with identifier 'test-item', but got %v", items)
    }
}

func TestGetItem(t *testing.T) {
    server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        fmt.Fprintln(w, `{"files": [{"name": "test.txt"}]}`)
    }))
    defer server.Close()

    originalURL := BaseURL
    BaseURL = server.URL
    defer func() {
        BaseURL = originalURL
    }()

    item, err := GetItem("test-item")
    if err != nil {
        t.Fatalf("GetItem failed: %v", err)
    }

    if len(item.Files) != 1 || item.Files[0].Name != "test.txt" {
        t.Errorf("Expected to find 1 file with name 'test.txt', but got %v", item.Files)
    }
}