diff --git a/.gitignore b/.gitignore
index d3a3066..dadebd7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ demo-track.smsg
 
 # Dev artifacts
 .playwright-mcp/
+archive/
diff --git a/cmd/collect_archive.go b/cmd/collect_archive.go
new file mode 100644
index 0000000..2fce6cc
--- /dev/null
+++ b/cmd/collect_archive.go
@@ -0,0 +1,16 @@
+package cmd
+
+import (
+	"github.com/spf13/cobra"
+)
+
+// collectArchiveCmd represents the collect archive command
+var collectArchiveCmd = &cobra.Command{
+	Use:   "archive",
+	Short: "Collect a resource from the Internet Archive.",
+	Long:  `Collect a resource from the Internet Archive, such as a search query, an item, or a collection.`,
+}
+
+func init() {
+	collectCmd.AddCommand(collectArchiveCmd)
+}
diff --git a/cmd/collect_archive_collection.go b/cmd/collect_archive_collection.go
new file mode 100644
index 0000000..95a1421
--- /dev/null
+++ b/cmd/collect_archive_collection.go
@@ -0,0 +1,34 @@
+package cmd
+
+import (
+	"fmt"
+	"github.com/Snider/Borg/pkg/archive"
+	"github.com/spf13/cobra"
+)
+
+// collectArchiveCollectionCmd represents the collect archive collection command
+var collectArchiveCollectionCmd = &cobra.Command{
+	Use:   "collection [identifier]",
+	Short: "Collect a collection from the Internet Archive.",
+	Long:  `Collect a collection and all of its items from the Internet Archive.`,
+	Args:  cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		identifier := args[0]
+		items, err := archive.GetCollection(identifier)
+		if err != nil {
+			return err
+		}
+
+		for _, item := range items {
+			if err := archive.DownloadItem(item.Identifier, "archive", ""); err != nil {
+				fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s from collection: %v\n", item.Identifier, err)
+			}
+		}
+
+		return nil
+	},
+}
+
+func init() {
+	collectArchiveCmd.AddCommand(collectArchiveCollectionCmd)
+}
diff --git a/cmd/collect_archive_item.go b/cmd/collect_archive_item.go
new file mode 100644
index 0000000..814d754
--- /dev/null
+++ b/cmd/collect_archive_item.go
@@ -0,0 +1,22 @@
+package cmd
+
+import (
+	"github.com/Snider/Borg/pkg/archive"
+	"github.com/spf13/cobra"
+)
+
+// collectArchiveItemCmd represents the collect archive item command
+var collectArchiveItemCmd = &cobra.Command{
+	Use:   "item [identifier]",
+	Short: "Collect an item from the Internet Archive.",
+	Long:  `Collect an item and all of its files from the Internet Archive.`,
+	Args:  cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		identifier := args[0]
+		return archive.DownloadItem(identifier, "archive", "")
+	},
+}
+
+func init() {
+	collectArchiveCmd.AddCommand(collectArchiveItemCmd)
+}
diff --git a/cmd/collect_archive_search.go b/cmd/collect_archive_search.go
new file mode 100644
index 0000000..268ff3c
--- /dev/null
+++ b/cmd/collect_archive_search.go
@@ -0,0 +1,41 @@
+package cmd
+
+import (
+	"fmt"
+	"github.com/Snider/Borg/pkg/archive"
+	"github.com/spf13/cobra"
+)
+
+// collectArchiveSearchCmd represents the collect archive search command
+var collectArchiveSearchCmd = &cobra.Command{
+	Use:   "search [query]",
+	Short: "Search for items on the Internet Archive.",
+	Long:  `Search for items on the Internet Archive and collect them.`,
+	Args:  cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		query := args[0]
+		mediaType, _ := cmd.Flags().GetString("type")
+		limit, _ := cmd.Flags().GetInt("limit")
+		format, _ := cmd.Flags().GetString("format")
+
+		items, err := archive.Search(query, mediaType, limit)
+		if err != nil {
+			return err
+		}
+
+		for _, item := range items {
+			if err := archive.DownloadItem(item.Identifier, "archive", format); err != nil {
+				fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s: %v\n", item.Identifier, err)
+			}
+		}
+
+		return nil
+	},
+}
+
+func init() {
+	collectArchiveCmd.AddCommand(collectArchiveSearchCmd)
+	collectArchiveSearchCmd.Flags().String("type", "", "Filter by mediatype (texts, software)")
+	collectArchiveSearchCmd.Flags().Int("limit", 10, "Max items to collect")
+	collectArchiveSearchCmd.Flags().String("format", "", "Preferred file format")
+}
diff --git a/cmd/collect_archive_test.go b/cmd/collect_archive_test.go
new file mode 100644
index 0000000..23fee35
--- /dev/null
+++ b/cmd/collect_archive_test.go
@@ -0,0 +1,96 @@
+package cmd
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/Snider/Borg/pkg/archive"
+)
+
+func TestCollectArchiveItemCmd_E2E(t *testing.T) {
+	tempDir := t.TempDir()
+	archiveDir := filepath.Join(tempDir, "archive")
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if strings.Contains(r.URL.Path, "/metadata/") {
+			fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`)
+		} else if strings.Contains(r.URL.Path, "/download/") {
+			fmt.Fprintln(w, "file content")
+		}
+	}))
+	defer server.Close()
+
+	originalURL := archive.BaseURL
+	archive.BaseURL = server.URL
+	defer func() {
+		archive.BaseURL = originalURL
+	}()
+
+	// Change working directory for the test
+	originalWd, _ := os.Getwd()
+	os.Chdir(tempDir)
+	defer os.Chdir(originalWd)
+
+	_, err := executeCommand(RootCmd, "collect", "archive", "item", "test-item")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	// Verify directory and files
+	itemDir := filepath.Join(archiveDir, "test-item")
+	if _, err := os.Stat(itemDir); os.IsNotExist(err) {
+		t.Errorf("expected directory %s to be created", itemDir)
+	}
+
+	for _, f := range []string{"metadata.json", "_files.json", "test.txt", "image.jpg"} {
+		if _, err := os.Stat(filepath.Join(itemDir, f)); os.IsNotExist(err) {
+			t.Errorf("expected file %s to be created in %s", f, itemDir)
+		}
+	}
+}
+
+func TestCollectArchiveSearchCmd_FormatFlag_E2E(t *testing.T) {
+	tempDir := t.TempDir()
+	archiveDir := filepath.Join(tempDir, "archive")
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if strings.Contains(r.URL.Path, "/advancedsearch.php") {
+			fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`)
+		} else if strings.Contains(r.URL.Path, "/metadata/") {
+			fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`)
+		} else if strings.Contains(r.URL.Path, "/download/") {
+			fmt.Fprintln(w, "file content")
+		}
+	}))
+	defer server.Close()
+
+	originalURL := archive.BaseURL
+	archive.BaseURL = server.URL
+	defer func() {
+		archive.BaseURL = originalURL
+	}()
+
+	originalWd, _ := os.Getwd()
+	os.Chdir(tempDir)
+	defer os.Chdir(originalWd)
+
+	_, err := executeCommand(RootCmd, "collect", "archive", "search", "test-query", "--format=Text")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	itemDir := filepath.Join(archiveDir, "test-item")
+	// Verify correct file is downloaded
+	if _, err := os.Stat(filepath.Join(itemDir, "test.txt")); os.IsNotExist(err) {
+		t.Errorf("expected test.txt to be downloaded")
+	}
+	// Verify incorrect format file is NOT downloaded
+	if _, err := os.Stat(filepath.Join(itemDir, "image.jpg")); err == nil {
+		t.Errorf("did not expect image.jpg to be downloaded")
+	}
+}
diff --git a/pkg/archive/archive.go b/pkg/archive/archive.go
new file mode 100644
index 0000000..d5dab49
--- /dev/null
+++ b/pkg/archive/archive.go
@@ -0,0 +1,188 @@
+package archive
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+)
+
+var BaseURL = "https://archive.org"
+
+type Item struct {
+	Identifier string `json:"identifier"`
+}
+
+type SearchResponse struct {
+	Response struct {
+		Docs []Item `json:"docs"`
+	} `json:"response"`
+}
+
+type ItemMetadata struct {
+	Files []File `json:"files"`
+}
+
+type File struct {
+	Name   string `json:"name"`
+	Source string `json:"source"`
+	Format string `json:"format"`
+	Size   string `json:"size"`
+}
+
+func Search(query, mediaType string, limit int) ([]Item, error) {
+	var allItems []Item
+	page := 1
+	const rowsPerPage = 100 // A reasonable number of results per page
+
+	for {
+		baseURL, err := url.Parse(BaseURL + "/advancedsearch.php")
+		if err != nil {
+			return nil, err
+		}
+
+		params := url.Values{}
+		params.Add("q", query)
+		if mediaType != "" {
+			params.Add("fq", "mediatype:"+mediaType)
+		}
+		params.Add("fl[]", "identifier")
+		params.Add("output", "json")
+		params.Add("page", fmt.Sprintf("%d", page))
+
+		if limit == -1 {
+			params.Add("rows", fmt.Sprintf("%d", rowsPerPage))
+		} else {
+			params.Add("rows", fmt.Sprintf("%d", limit))
+		}
+
+		baseURL.RawQuery = params.Encode()
+
+		resp, err := http.Get(baseURL.String())
+		if err != nil {
+			return nil, err
+		}
+
+		if resp.StatusCode != http.StatusOK {
+			resp.Body.Close()
+			return nil, fmt.Errorf("bad status: %s", resp.Status)
+		}
+
+		var searchResponse SearchResponse
+		if err := json.NewDecoder(resp.Body).Decode(&searchResponse); err != nil {
+			resp.Body.Close()
+			return nil, err
+		}
+		resp.Body.Close()
+
+		if len(searchResponse.Response.Docs) == 0 {
+			break // No more results
+		}
+
+		allItems = append(allItems, searchResponse.Response.Docs...)
+
+		if limit != -1 && len(allItems) >= limit {
+			return allItems[:limit], nil
+		}
+
+		if limit != -1 {
+			break // We only needed one page
+		}
+
+		page++
+	}
+
+	return allItems, nil
+}
+
+func GetItem(identifier string) (*ItemMetadata, error) {
+	url := fmt.Sprintf("%s/metadata/%s", BaseURL, identifier)
+	resp, err := http.Get(url)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("bad status: %s", resp.Status)
+	}
+
+	var itemMetadata ItemMetadata
+	if err := json.NewDecoder(resp.Body).Decode(&itemMetadata); err != nil {
+		return nil, err
+	}
+
+	return &itemMetadata, nil
+}
+
+func DownloadFile(url, filepath string) error {
+	resp, err := http.Get(url)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("bad status: %s", resp.Status)
+	}
+
+	out, err := os.Create(filepath)
+	if err != nil {
+		return err
+	}
+	defer out.Close()
+
+	_, err = io.Copy(out, resp.Body)
+	return err
+}
+
+func GetCollection(identifier string) ([]Item, error) {
+	return Search(fmt.Sprintf("collection:%s", identifier), "", -1) // -1 for no limit
+}
+
+func DownloadItem(identifier, baseDir string, formatFilter string) error {
+	item, err := GetItem(identifier)
+	if err != nil {
+		return fmt.Errorf("could not get item metadata for %s: %w", identifier, err)
+	}
+
+	itemDir := fmt.Sprintf("%s/%s", baseDir, identifier)
+	if err := os.MkdirAll(itemDir, 0755); err != nil {
+		return fmt.Errorf("could not create directory %s: %w", itemDir, err)
+	}
+
+	// Save metadata
+	metadataJSON, err := json.MarshalIndent(item, "", "  ")
+	if err != nil {
+		return fmt.Errorf("could not marshal metadata for %s: %w", identifier, err)
+	}
+	if err := os.WriteFile(fmt.Sprintf("%s/metadata.json", itemDir), metadataJSON, 0644); err != nil {
+		return fmt.Errorf("could not write metadata.json for %s: %w", identifier, err)
+	}
+
+	// Save file list
+	filesJSON, err := json.MarshalIndent(item.Files, "", "  ")
+	if err != nil {
+		return fmt.Errorf("could not marshal file list for %s: %w", identifier, err)
+	}
+	if err := os.WriteFile(fmt.Sprintf("%s/_files.json", itemDir), filesJSON, 0644); err != nil {
+		return fmt.Errorf("could not write _files.json for %s: %w", identifier, err)
+	}
+
+	fmt.Printf("Downloading item %s...\n", identifier)
+	for _, file := range item.Files {
+		if formatFilter != "" && file.Format != formatFilter {
+			continue
+		}
+		downloadURL := fmt.Sprintf("%s/download/%s/%s", BaseURL, identifier, file.Name)
+		filePath := fmt.Sprintf("%s/%s", itemDir, file.Name)
+		fmt.Printf("  Downloading file %s...\n", file.Name)
+		if err := DownloadFile(downloadURL, filePath); err != nil {
+			// Log error but continue trying other files
+			fmt.Printf("  Error downloading %s: %v\n", file.Name, err)
+		}
+	}
+	return nil
+}
diff --git a/pkg/archive/archive_test.go b/pkg/archive/archive_test.go
new file mode 100644
index 0000000..d74caec
--- /dev/null
+++ b/pkg/archive/archive_test.go
@@ -0,0 +1,52 @@
+package archive
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+func TestSearch(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`)
+	}))
+	defer server.Close()
+
+	originalURL := BaseURL
+	BaseURL = server.URL
+	defer func() {
+		BaseURL = originalURL
+	}()
+
+	items, err := Search("test", "", 1)
+	if err != nil {
+		t.Fatalf("Search failed: %v", err)
+	}
+
+	if len(items) != 1 || items[0].Identifier != "test-item" {
+		t.Errorf("Expected to find 1 item with identifier 'test-item', but got %v", items)
+	}
+}
+
+func TestGetItem(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		fmt.Fprintln(w, `{"files": [{"name": "test.txt"}]}`)
+	}))
+	defer server.Close()
+
+	originalURL := BaseURL
+	BaseURL = server.URL
+	defer func() {
+		BaseURL = originalURL
+	}()
+
+	item, err := GetItem("test-item")
+	if err != nil {
+		t.Fatalf("GetItem failed: %v", err)
+	}
+
+	if len(item.Files) != 1 || item.Files[0].Name != "test.txt" {
+		t.Errorf("Expected to find 1 file with name 'test.txt', but got %v", item.Files)
+	}
+}