Compare commits
1 commit
main
...
feat/archi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a35fa03af |
8 changed files with 450 additions and 0 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@@ -10,3 +10,4 @@ demo-track.smsg
|
|||
|
||||
# Dev artifacts
|
||||
.playwright-mcp/
|
||||
archive/
|
||||
|
|
|
|||
16
cmd/collect_archive.go
Normal file
16
cmd/collect_archive.go
Normal file
|
|
@@ -0,0 +1,16 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// collectArchiveCmd represents the collect archive command
|
||||
var collectArchiveCmd = &cobra.Command{
|
||||
Use: "archive",
|
||||
Short: "Collect a resource from the Internet Archive.",
|
||||
Long: `Collect a resource from the Internet Archive, such as a search query, an item, or a collection.`,
|
||||
}
|
||||
|
||||
func init() {
|
||||
collectCmd.AddCommand(collectArchiveCmd)
|
||||
}
|
||||
34
cmd/collect_archive_collection.go
Normal file
34
cmd/collect_archive_collection.go
Normal file
|
|
@@ -0,0 +1,34 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/Snider/Borg/pkg/archive"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// collectArchiveCollectionCmd represents the collect archive collection command
|
||||
var collectArchiveCollectionCmd = &cobra.Command{
|
||||
Use: "collection [identifier]",
|
||||
Short: "Collect a collection from the Internet Archive.",
|
||||
Long: `Collect a collection and all of its items from the Internet Archive.`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
identifier := args[0]
|
||||
items, err := archive.GetCollection(identifier)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, item := range items {
|
||||
if err := archive.DownloadItem(item.Identifier, "archive", ""); err != nil {
|
||||
fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s from collection: %v\n", item.Identifier, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
collectArchiveCmd.AddCommand(collectArchiveCollectionCmd)
|
||||
}
|
||||
22
cmd/collect_archive_item.go
Normal file
22
cmd/collect_archive_item.go
Normal file
|
|
@@ -0,0 +1,22 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"github.com/Snider/Borg/pkg/archive"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// collectArchiveItemCmd represents the collect archive item command
|
||||
var collectArchiveItemCmd = &cobra.Command{
|
||||
Use: "item [identifier]",
|
||||
Short: "Collect an item from the Internet Archive.",
|
||||
Long: `Collect an item and all of its files from the Internet Archive.`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
identifier := args[0]
|
||||
return archive.DownloadItem(identifier, "archive", "")
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
collectArchiveCmd.AddCommand(collectArchiveItemCmd)
|
||||
}
|
||||
41
cmd/collect_archive_search.go
Normal file
41
cmd/collect_archive_search.go
Normal file
|
|
@@ -0,0 +1,41 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/Snider/Borg/pkg/archive"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// collectArchiveSearchCmd represents the collect archive search command
|
||||
var collectArchiveSearchCmd = &cobra.Command{
|
||||
Use: "search [query]",
|
||||
Short: "Search for items on the Internet Archive.",
|
||||
Long: `Search for items on the Internet Archive and collect them.`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
query := args[0]
|
||||
mediaType, _ := cmd.Flags().GetString("type")
|
||||
limit, _ := cmd.Flags().GetInt("limit")
|
||||
format, _ := cmd.Flags().GetString("format")
|
||||
|
||||
items, err := archive.Search(query, mediaType, limit)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, item := range items {
|
||||
if err := archive.DownloadItem(item.Identifier, "archive", format); err != nil {
|
||||
fmt.Fprintf(cmd.ErrOrStderr(), "Error downloading item %s: %v\n", item.Identifier, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
collectArchiveCmd.AddCommand(collectArchiveSearchCmd)
|
||||
collectArchiveSearchCmd.Flags().String("type", "", "Filter by mediatype (texts, software)")
|
||||
collectArchiveSearchCmd.Flags().Int("limit", 10, "Max items to collect")
|
||||
collectArchiveSearchCmd.Flags().String("format", "", "Preferred file format")
|
||||
}
|
||||
96
cmd/collect_archive_test.go
Normal file
96
cmd/collect_archive_test.go
Normal file
|
|
@@ -0,0 +1,96 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/Snider/Borg/pkg/archive"
|
||||
)
|
||||
|
||||
func TestCollectArchiveItemCmd_E2E(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
archiveDir := filepath.Join(tempDir, "archive")
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if strings.Contains(r.URL.Path, "/metadata/") {
|
||||
fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`)
|
||||
} else if strings.Contains(r.URL.Path, "/download/") {
|
||||
fmt.Fprintln(w, "file content")
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
originalURL := archive.BaseURL
|
||||
archive.BaseURL = server.URL
|
||||
defer func() {
|
||||
archive.BaseURL = originalURL
|
||||
}()
|
||||
|
||||
// Change working directory for the test
|
||||
originalWd, _ := os.Getwd()
|
||||
os.Chdir(tempDir)
|
||||
defer os.Chdir(originalWd)
|
||||
|
||||
_, err := executeCommand(RootCmd, "collect", "archive", "item", "test-item")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// Verify directory and files
|
||||
itemDir := filepath.Join(archiveDir, "test-item")
|
||||
if _, err := os.Stat(itemDir); os.IsNotExist(err) {
|
||||
t.Errorf("expected directory %s to be created", itemDir)
|
||||
}
|
||||
|
||||
for _, f := range []string{"metadata.json", "_files.json", "test.txt", "image.jpg"} {
|
||||
if _, err := os.Stat(filepath.Join(itemDir, f)); os.IsNotExist(err) {
|
||||
t.Errorf("expected file %s to be created in %s", f, itemDir)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectArchiveSearchCmd_FormatFlag_E2E(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
archiveDir := filepath.Join(tempDir, "archive")
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if strings.Contains(r.URL.Path, "/advancedsearch.php") {
|
||||
fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`)
|
||||
} else if strings.Contains(r.URL.Path, "/metadata/") {
|
||||
fmt.Fprintln(w, `{"files": [{"name": "test.txt", "format": "Text"}, {"name": "image.jpg", "format": "JPEG"}]}`)
|
||||
} else if strings.Contains(r.URL.Path, "/download/") {
|
||||
fmt.Fprintln(w, "file content")
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
originalURL := archive.BaseURL
|
||||
archive.BaseURL = server.URL
|
||||
defer func() {
|
||||
archive.BaseURL = originalURL
|
||||
}()
|
||||
|
||||
originalWd, _ := os.Getwd()
|
||||
os.Chdir(tempDir)
|
||||
defer os.Chdir(originalWd)
|
||||
|
||||
_, err := executeCommand(RootCmd, "collect", "archive", "search", "test-query", "--format=Text")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
itemDir := filepath.Join(archiveDir, "test-item")
|
||||
// Verify correct file is downloaded
|
||||
if _, err := os.Stat(filepath.Join(itemDir, "test.txt")); os.IsNotExist(err) {
|
||||
t.Errorf("expected test.txt to be downloaded")
|
||||
}
|
||||
// Verify incorrect format file is NOT downloaded
|
||||
if _, err := os.Stat(filepath.Join(itemDir, "image.jpg")); err == nil {
|
||||
t.Errorf("did not expect image.jpg to be downloaded")
|
||||
}
|
||||
}
|
||||
188
pkg/archive/archive.go
Normal file
188
pkg/archive/archive.go
Normal file
|
|
@@ -0,0 +1,188 @@
|
|||
package archive
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
)
|
||||
|
||||
// BaseURL is the root endpoint of the Internet Archive API. It is a
// package variable (rather than a constant) so tests can point it at a
// stub HTTP server.
var BaseURL = "https://archive.org"

// Item is a single search result returned by the advancedsearch endpoint;
// only the identifier field is decoded.
type Item struct {
	Identifier string `json:"identifier"`
}

// SearchResponse mirrors the JSON envelope of advancedsearch.php output.
type SearchResponse struct {
	Response struct {
		Docs []Item `json:"docs"`
	} `json:"response"`
}

// ItemMetadata is the subset of the /metadata/{identifier} response this
// package consumes: the list of files belonging to the item.
type ItemMetadata struct {
	Files []File `json:"files"`
}

// File describes one downloadable file within an item, as reported by the
// metadata endpoint.
type File struct {
	Name   string `json:"name"`
	Source string `json:"source"`
	Format string `json:"format"`
	// Size is kept as a string, matching the raw JSON field —
	// TODO confirm the API always encodes it as a string.
	Size string `json:"size"`
}
|
||||
|
||||
func Search(query, mediaType string, limit int) ([]Item, error) {
|
||||
var allItems []Item
|
||||
page := 1
|
||||
const rowsPerPage = 100 // A reasonable number of results per page
|
||||
|
||||
for {
|
||||
baseURL, err := url.Parse(BaseURL + "/advancedsearch.php")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
params := url.Values{}
|
||||
params.Add("q", query)
|
||||
if mediaType != "" {
|
||||
params.Add("fq", "mediatype:"+mediaType)
|
||||
}
|
||||
params.Add("fl[]", "identifier")
|
||||
params.Add("output", "json")
|
||||
params.Add("page", fmt.Sprintf("%d", page))
|
||||
|
||||
if limit == -1 {
|
||||
params.Add("rows", fmt.Sprintf("%d", rowsPerPage))
|
||||
} else {
|
||||
params.Add("rows", fmt.Sprintf("%d", limit))
|
||||
}
|
||||
|
||||
baseURL.RawQuery = params.Encode()
|
||||
|
||||
resp, err := http.Get(baseURL.String())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
resp.Body.Close()
|
||||
return nil, fmt.Errorf("bad status: %s", resp.Status)
|
||||
}
|
||||
|
||||
var searchResponse SearchResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&searchResponse); err != nil {
|
||||
resp.Body.Close()
|
||||
return nil, err
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
if len(searchResponse.Response.Docs) == 0 {
|
||||
break // No more results
|
||||
}
|
||||
|
||||
allItems = append(allItems, searchResponse.Response.Docs...)
|
||||
|
||||
if limit != -1 && len(allItems) >= limit {
|
||||
return allItems[:limit], nil
|
||||
}
|
||||
|
||||
if limit != -1 {
|
||||
break // We only needed one page
|
||||
}
|
||||
|
||||
page++
|
||||
}
|
||||
|
||||
return allItems, nil
|
||||
}
|
||||
|
||||
func GetItem(identifier string) (*ItemMetadata, error) {
|
||||
url := fmt.Sprintf("%s/metadata/%s", BaseURL, identifier)
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("bad status: %s", resp.Status)
|
||||
}
|
||||
|
||||
var itemMetadata ItemMetadata
|
||||
if err := json.NewDecoder(resp.Body).Decode(&itemMetadata); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &itemMetadata, nil
|
||||
}
|
||||
|
||||
// DownloadFile streams the resource at url into a new file at filepath.
// The destination file is only created after a 200 response is received.
// An error from closing the output file is reported (when the copy itself
// succeeded), since a failed close can mean buffered writes were lost.
func DownloadFile(url, filepath string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("bad status: %s", resp.Status)
	}

	out, err := os.Create(filepath)
	if err != nil {
		return err
	}

	_, copyErr := io.Copy(out, resp.Body)
	closeErr := out.Close()
	if copyErr != nil {
		return copyErr
	}
	return closeErr
}
|
||||
|
||||
func GetCollection(identifier string) ([]Item, error) {
|
||||
return Search(fmt.Sprintf("collection:%s", identifier), "", -1) // -1 for no limit
|
||||
}
|
||||
|
||||
func DownloadItem(identifier, baseDir string, formatFilter string) error {
|
||||
item, err := GetItem(identifier)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not get item metadata for %s: %w", identifier, err)
|
||||
}
|
||||
|
||||
itemDir := fmt.Sprintf("%s/%s", baseDir, identifier)
|
||||
if err := os.MkdirAll(itemDir, 0755); err != nil {
|
||||
return fmt.Errorf("could not create directory %s: %w", itemDir, err)
|
||||
}
|
||||
|
||||
// Save metadata
|
||||
metadataJSON, err := json.MarshalIndent(item, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not marshal metadata for %s: %w", identifier, err)
|
||||
}
|
||||
if err := os.WriteFile(fmt.Sprintf("%s/metadata.json", itemDir), metadataJSON, 0644); err != nil {
|
||||
return fmt.Errorf("could not write metadata.json for %s: %w", identifier, err)
|
||||
}
|
||||
|
||||
// Save file list
|
||||
filesJSON, err := json.MarshalIndent(item.Files, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not marshal file list for %s: %w", identifier, err)
|
||||
}
|
||||
if err := os.WriteFile(fmt.Sprintf("%s/_files.json", itemDir), filesJSON, 0644); err != nil {
|
||||
return fmt.Errorf("could not write _files.json for %s: %w", identifier, err)
|
||||
}
|
||||
|
||||
fmt.Printf("Downloading item %s...\n", identifier)
|
||||
for _, file := range item.Files {
|
||||
if formatFilter != "" && file.Format != formatFilter {
|
||||
continue
|
||||
}
|
||||
downloadURL := fmt.Sprintf("%s/download/%s/%s", BaseURL, identifier, file.Name)
|
||||
filePath := fmt.Sprintf("%s/%s", itemDir, file.Name)
|
||||
fmt.Printf(" Downloading file %s...\n", file.Name)
|
||||
if err := DownloadFile(downloadURL, filePath); err != nil {
|
||||
// Log error but continue trying other files
|
||||
fmt.Printf(" Error downloading %s: %v\n", file.Name, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
52
pkg/archive/archive_test.go
Normal file
52
pkg/archive/archive_test.go
Normal file
|
|
@@ -0,0 +1,52 @@
|
|||
package archive
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSearch(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
fmt.Fprintln(w, `{"response": {"docs": [{"identifier": "test-item"}]}}`)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
originalURL := BaseURL
|
||||
BaseURL = server.URL
|
||||
defer func() {
|
||||
BaseURL = originalURL
|
||||
}()
|
||||
|
||||
items, err := Search("test", "", 1)
|
||||
if err != nil {
|
||||
t.Fatalf("Search failed: %v", err)
|
||||
}
|
||||
|
||||
if len(items) != 1 || items[0].Identifier != "test-item" {
|
||||
t.Errorf("Expected to find 1 item with identifier 'test-item', but got %v", items)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetItem(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
fmt.Fprintln(w, `{"files": [{"name": "test.txt"}]}`)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
originalURL := BaseURL
|
||||
BaseURL = server.URL
|
||||
defer func() {
|
||||
BaseURL = originalURL
|
||||
}()
|
||||
|
||||
item, err := GetItem("test-item")
|
||||
if err != nil {
|
||||
t.Fatalf("GetItem failed: %v", err)
|
||||
}
|
||||
|
||||
if len(item.Files) != 1 || item.Files[0].Name != "test.txt" {
|
||||
t.Errorf("Expected to find 1 file with name 'test.txt', but got %v", item.Files)
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue