diff --git a/Taskfile.yml b/Taskfile.yml
index 23c8914..d63d938 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -1,10 +1,15 @@
 version: '3'
 
 tasks:
+  install-deps:
+    cmds:
+      - sudo apt-get update && sudo apt-get install -y poppler-utils
   clean:
     cmds:
       - rm -f borg
   build:
+    deps:
+      - install-deps
     cmds:
       - task: clean
       - go build -o borg main.go
diff --git a/cmd/collect_website.go b/cmd/collect_website.go
index 3811f32..6835951 100644
--- a/cmd/collect_website.go
+++ b/cmd/collect_website.go
@@ -1,11 +1,17 @@
 package cmd
 
 import (
+	"encoding/json"
 	"fmt"
+	"io"
+	"io/fs"
 	"os"
+	"path/filepath"
+	"strings"
 
 	"github.com/schollz/progressbar/v3"
 	"github.com/Snider/Borg/pkg/compress"
+	"github.com/Snider/Borg/pkg/pdf"
 	"github.com/Snider/Borg/pkg/tim"
 	"github.com/Snider/Borg/pkg/trix"
 	"github.com/Snider/Borg/pkg/ui"
@@ -38,6 +44,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
 			format, _ := cmd.Flags().GetString("format")
 			compression, _ := cmd.Flags().GetString("compression")
 			password, _ := cmd.Flags().GetString("password")
+			extractPdfMetadata, _ := cmd.Flags().GetBool("extract-pdf-metadata")
 
 			if format != "datanode" && format != "tim" && format != "trix" {
 				return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@@ -56,6 +63,61 @@ func NewCollectWebsiteCmd() *cobra.Command {
 				return fmt.Errorf("error downloading and packaging website: %w", err)
 			}
 
+			if extractPdfMetadata {
+				var allMetadata []*pdf.Metadata
+				err := dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
+					if err != nil {
+						return err
+					}
+					if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
+						tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
+						if err != nil {
+							return fmt.Errorf("failed to create temp file: %w", err)
+						}
+						// Runs when this callback returns (once per PDF): close the handle
+						// (harmless if already closed), then delete the temp file.
+						defer func() {
+							tempFile.Close()
+							os.Remove(tempFile.Name())
+						}()
+
+						file, err := dn.Open(path)
+						if err != nil {
+							return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
+						}
+						defer file.Close()
+
+						if _, err := io.Copy(tempFile, file); err != nil {
+							return fmt.Errorf("failed to copy content to temp file: %w", err)
+						}
+						// Close before handing the path to the extractor and surface close errors.
+						if err := tempFile.Close(); err != nil {
+							return fmt.Errorf("failed to close temp file: %w", err)
+						}
+
+						metadata, err := pdf.ExtractMetadata(tempFile.Name())
+						if err != nil {
+							fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
+							return nil
+						}
+						metadata.File = filepath.Base(path)
+						allMetadata = append(allMetadata, metadata)
+					}
+					return nil
+				})
+				if err != nil {
+					return fmt.Errorf("error walking DataNode for PDF extraction: %w", err)
+				}
+
+				if len(allMetadata) > 0 {
+					jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
+					if err != nil {
+						return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
+					}
+					dn.AddData("INDEX.json", jsonOutput)
+				}
+			}
+
 			var data []byte
 			if format == "tim" {
 				tim, err := tim.FromDataNode(dn)
@@ -104,5 +166,6 @@ func NewCollectWebsiteCmd() *cobra.Command {
 	collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
 	collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
 	collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
+	collectWebsiteCmd.PersistentFlags().Bool("extract-pdf-metadata", false, "Extract metadata from PDF files and add INDEX.json")
 	return collectWebsiteCmd
 }
diff --git a/cmd/extract_metadata.go b/cmd/extract_metadata.go
new file mode 100644
index 0000000..f80eca3
--- /dev/null
+++ b/cmd/extract_metadata.go
@@ -0,0 +1,135 @@
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/Snider/Borg/pkg/compress"
+	"github.com/Snider/Borg/pkg/datanode"
+	"github.com/Snider/Borg/pkg/pdf"
+	"github.com/spf13/cobra"
+)
+
+// extractMetadataCmd represents the extract-metadata command
+var extractMetadataCmd = NewExtractMetadataCmd()
+
+func init() {
+	RootCmd.AddCommand(GetExtractMetadataCmd())
+}
+
+// NewExtractMetadataCmd builds the extract-metadata command, which reads an
+// archive, runs PDF metadata extraction on every .pdf entry and writes the
+// results to INDEX.json in the current working directory.
+func NewExtractMetadataCmd() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "extract-metadata [archive]",
+		Short: "Extract metadata from files in an archive.",
+		Long:  `Extract metadata from files of a specific type within a DataNode archive and create an INDEX.json file.`,
+		Args:  cobra.ExactArgs(1),
+		RunE: func(cmd *cobra.Command, args []string) error {
+			archivePath := args[0]
+			fileType, _ := cmd.Flags().GetString("type")
+
+			if fileType != "pdf" {
+				return fmt.Errorf("unsupported type: %s. Only 'pdf' is currently supported", fileType)
+			}
+
+			// Read and decompress the archive
+			compressedData, err := os.ReadFile(archivePath)
+			if err != nil {
+				return fmt.Errorf("failed to read archive file: %w", err)
+			}
+			data, err := compress.Decompress(compressedData)
+			if err != nil {
+				return fmt.Errorf("failed to decompress archive: %w", err)
+			}
+
+			// Load the DataNode
+			dn, err := datanode.FromTar(data)
+			if err != nil {
+				return fmt.Errorf("failed to load DataNode from tar: %w", err)
+			}
+
+			// Start from an empty, non-nil slice so an archive without PDFs
+			// is written as "[]" instead of "null".
+			allMetadata := []*pdf.Metadata{}
+
+			// Walk the DataNode and extract metadata from PDF files
+			err = dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
+				if err != nil {
+					return err
+				}
+				if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
+					// Create a temporary file to run extraction on
+					tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
+					if err != nil {
+						return fmt.Errorf("failed to create temp file: %w", err)
+					}
+					// Runs when this callback returns (once per PDF): close the handle
+					// (harmless if already closed), then delete the temp file.
+					defer func() {
+						tempFile.Close()
+						os.Remove(tempFile.Name())
+					}()
+
+					// Get the file content from DataNode
+					file, err := dn.Open(path)
+					if err != nil {
+						return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
+					}
+					defer file.Close()
+
+					// Copy content to temp file
+					if _, err := io.Copy(tempFile, file); err != nil {
+						return fmt.Errorf("failed to copy content to temp file: %w", err)
+					}
+					// Close the file to allow reading by the extractor, and surface close errors
+					if err := tempFile.Close(); err != nil {
+						return fmt.Errorf("failed to close temp file: %w", err)
+					}
+
+					// Extract metadata
+					metadata, err := pdf.ExtractMetadata(tempFile.Name())
+					if err != nil {
+						// Log error but continue processing other files
+						fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
+						return nil
+					}
+					metadata.File = filepath.Base(path) // Use the original filename
+					allMetadata = append(allMetadata, metadata)
+				}
+				return nil
+			})
+
+			if err != nil {
+				return fmt.Errorf("error walking DataNode: %w", err)
+			}
+
+			// Write the aggregated metadata to INDEX.json
+			jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
+			if err != nil {
+				return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
+			}
+
+			err = os.WriteFile("INDEX.json", jsonOutput, 0644)
+			if err != nil {
+				return fmt.Errorf("failed to write INDEX.json: %w", err)
+			}
+
+			fmt.Fprintln(cmd.OutOrStdout(), "Metadata extracted and saved to INDEX.json")
+			return nil
+		},
+	}
+	cmd.Flags().String("type", "pdf", "The type of files to extract metadata from (currently only 'pdf' is supported)")
+	return cmd
+}
+
+// GetExtractMetadataCmd returns the package-level extract-metadata command.
+func GetExtractMetadataCmd() *cobra.Command {
+	return extractMetadataCmd
+}
diff --git a/cmd/pdf.go b/cmd/pdf.go
new file mode 100644
index 0000000..11c19fa
--- /dev/null
+++ b/cmd/pdf.go
@@ -0,0 +1,26 @@
+package cmd
+
+import (
+	"github.com/spf13/cobra"
+)
+
+// pdfCmd represents the pdf command
+var pdfCmd = NewPdfCmd()
+
+func init() {
+	RootCmd.AddCommand(GetPdfCmd())
+}
+
+// NewPdfCmd builds the "pdf" command; it defines no action of its own here.
+func NewPdfCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:   "pdf",
+		Short: "Perform PDF operations.",
+		Long:  `A command for performing various PDF operations.`,
+	}
+}
+
+// GetPdfCmd returns the package-level pdf command.
+func GetPdfCmd() *cobra.Command {
+	return pdfCmd
+}
diff --git a/cmd/pdf_metadata.go b/cmd/pdf_metadata.go
new file mode 100644
index 0000000..0759a66
--- /dev/null
+++ b/cmd/pdf_metadata.go
@@ -0,0 +1,45 @@
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/Snider/Borg/pkg/pdf"
+	"github.com/spf13/cobra"
+)
+
+// pdfMetadataCmd represents the pdf metadata command
+var pdfMetadataCmd = NewPdfMetadataCmd()
+
+func init() {
+	GetPdfCmd().AddCommand(GetPdfMetadataCmd())
+}
+
+// NewPdfMetadataCmd builds the "pdf metadata" subcommand, which prints the
+// metadata of a single PDF file as indented JSON.
+func NewPdfMetadataCmd() *cobra.Command {
+	return &cobra.Command{
+		Use:   "metadata [file]",
+		Short: "Extract metadata from a PDF file.",
+		Long:  `Extract metadata from a PDF file and print it as JSON.`,
+		Args:  cobra.ExactArgs(1),
+		RunE: func(cmd *cobra.Command, args []string) error {
+			filePath := args[0]
+			metadata, err := pdf.ExtractMetadata(filePath)
+			if err != nil {
+				return fmt.Errorf("error extracting metadata: %w", err)
+			}
+			jsonMetadata, err := json.MarshalIndent(metadata, "", " ")
+			if err != nil {
+				return fmt.Errorf("error marshalling metadata to JSON: %w", err)
+			}
+			fmt.Fprintln(cmd.OutOrStdout(), string(jsonMetadata))
+			return nil
+		},
+	}
+}
+
+// GetPdfMetadataCmd returns the package-level pdf metadata command.
+func GetPdfMetadataCmd() *cobra.Command {
+	return pdfMetadataCmd
+}
diff --git a/pkg/pdf/metadata.go b/pkg/pdf/metadata.go
new file mode 100644
index 0000000..dd63b66
--- /dev/null
+++ b/pkg/pdf/metadata.go
@@ -0,0 +1,58 @@
+package pdf
+
+import (
+	"bufio"
+	"bytes"
+	"os/exec"
+	"strconv"
+	"strings"
+)
+
+// Metadata holds the extracted PDF metadata.
+type Metadata struct {
+	File     string   `json:"file"`
+	Title    string   `json:"title"`
+	Authors  []string `json:"authors"`
+	Abstract string   `json:"abstract"`
+	Pages    int      `json:"pages"`
+	Created  string   `json:"created"`
+}
+
+// ExtractMetadata extracts metadata from a PDF file using the pdfinfo command.
+func ExtractMetadata(filePath string) (*Metadata, error) {
+	// Build the command through execCommand (declared below) so tests can
+	// substitute a fake pdfinfo process.
+	cmd := execCommand("pdfinfo", filePath)
+	var out bytes.Buffer
+	cmd.Stdout = &out
+	if err := cmd.Run(); err != nil {
+		return nil, err
+	}
+
+	metadata := &Metadata{File: filePath}
+	scanner := bufio.NewScanner(&out)
+	for scanner.Scan() {
+		// Split on the first colon only: values such as dates contain colons.
+		parts := strings.SplitN(scanner.Text(), ":", 2)
+		if len(parts) != 2 {
+			continue
+		}
+		key := strings.TrimSpace(parts[0])
+		value := strings.TrimSpace(parts[1])
+
+		switch key {
+		case "Title":
+			metadata.Title = value
+		case "Author":
+			// The field is one comma-separated string; trim every name and
+			// drop empty entries so "A, B" yields ["A", "B"], not ["A", " B"].
+			var authors []string
+			for _, name := range strings.Split(value, ",") {
+				if name = strings.TrimSpace(name); name != "" {
+					authors = append(authors, name)
+				}
+			}
+			metadata.Authors = authors
+		case "CreationDate":
+			metadata.Created = value
+		case "Pages":
+			// A non-numeric page count is ignored and Pages is left unchanged.
+			if pages, err := strconv.Atoi(value); err == nil {
+				metadata.Pages = pages
+			}
+		}
+	}
+	// A scan failure (e.g. an over-long line) means the output was only partly parsed.
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	return metadata, nil
+}
+
+// execCommand creates the external process; it is a variable so tests can
+// replace exec.Command with a stub.
+var execCommand = exec.Command
diff --git a/pkg/pdf/metadata_test.go b/pkg/pdf/metadata_test.go
new file mode 100644
index 0000000..78ff99a
--- /dev/null
+++ b/pkg/pdf/metadata_test.go
@@ -0,0 +1,103 @@
+package pdf
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+	"testing"
+)
+
+// mockExecCommand is used to mock the exec.Command function for testing.
+func mockExecCommand(command string, args ...string) *exec.Cmd {
+	cs := []string{"-test.run=^TestHelperProcess$", "--", command}
+	cs = append(cs, args...)
+	cmd := exec.Command(os.Args[0], cs...)
+	cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
+	return cmd
+}
+
+// TestHelperProcess isn't a real test. It's used as a helper process
+// for TestExtractMetadata. It simulates the behavior of the `pdfinfo` command.
+func TestHelperProcess(t *testing.T) {
+	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
+		return
+	}
+	defer os.Exit(0)
+
+	// Skip the test binary's own flags; the mocked command follows "--".
+	args := os.Args
+	for len(args) > 0 && args[0] != "--" {
+		args = args[1:]
+	}
+	if len(args) < 2 {
+		fmt.Fprintf(os.Stderr, "No command to mock!\n")
+		os.Exit(1)
+	}
+	cmd, args := args[1], args[2:]
+	if cmd != "pdfinfo" || len(args) != 1 {
+		fmt.Fprintf(os.Stderr, "unexpected command: %s %v\n", cmd, args)
+		os.Exit(1)
+	}
+	// Simulate pdfinfo output
+	fmt.Println("Title: Test Title")
+	fmt.Println("Author: Test Author 1,Test Author 2")
+	fmt.Println("CreationDate: Sun Jan 1 00:00:00 2023")
+	fmt.Println("Pages: 42")
+}
+
+// TestExtractMetadata checks that each field of the simulated pdfinfo output is parsed.
+func TestExtractMetadata(t *testing.T) {
+	execCommand = mockExecCommand
+	t.Cleanup(func() { execCommand = exec.Command })
+
+	metadata, err := ExtractMetadata("dummy.pdf")
+	if err != nil {
+		t.Fatalf("ExtractMetadata failed: %v", err)
+	}
+
+	if metadata.Title != "Test Title" {
+		t.Errorf("expected title 'Test Title', got '%s'", metadata.Title)
+	}
+	if len(metadata.Authors) != 2 || metadata.Authors[0] != "Test Author 1" || metadata.Authors[1] != "Test Author 2" {
+		t.Errorf("expected authors '[Test Author 1, Test Author 2]', got '%v'", metadata.Authors)
+	}
+	if metadata.Created != "Sun Jan 1 00:00:00 2023" {
+		t.Errorf("expected creation date 'Sun Jan 1 00:00:00 2023', got '%s'", metadata.Created)
+	}
+	if metadata.Pages != 42 {
+		t.Errorf("expected 42 pages, got %d", metadata.Pages)
+	}
+	if metadata.File != "dummy.pdf" {
+		t.Errorf("expected file 'dummy.pdf', got '%s'", metadata.File)
+	}
+}
+
+// TestExtractMetadata_CommandError checks that a failing command is returned as an error.
+func TestExtractMetadata_CommandError(t *testing.T) {
+	execCommand = func(command string, args ...string) *exec.Cmd {
+		cs := []string{"-test.run=^TestHelperProcess_Error$", "--", command}
+		cs = append(cs, args...)
+		cmd := exec.Command(os.Args[0], cs...)
+		cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
+		return cmd
+	}
+	t.Cleanup(func() { execCommand = exec.Command })
+
+	_, err := ExtractMetadata("dummy.pdf")
+	if err == nil {
+		t.Fatal("expected an error from exec.Command, but got nil")
+	}
+	if !strings.Contains(err.Error(), "exit status 1") {
+		t.Errorf("expected error to contain 'exit status 1', got '%v'", err)
+	}
+}
+
+// TestHelperProcess_Error isn't a real test: as a helper process it writes to stderr and exits 1.
+func TestHelperProcess_Error(t *testing.T) {
+	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
+		return
+	}
+	fmt.Fprintf(os.Stderr, "pdfinfo error")
+	os.Exit(1)
+}