feat: PDF metadata extraction

This commit introduces a new feature to extract and index metadata from collected PDF files. The following changes have been made:

- Added a new `pdf` command with a `metadata` subcommand to extract metadata from a single PDF file.
- Added a new `extract-metadata` command to extract metadata from all PDF files within a given archive and create an `INDEX.json` file.
- Added a `--extract-pdf-metadata` flag to the `collect website` command to extract metadata from downloaded PDF files.
- Created a new `pdf` package to encapsulate the PDF metadata extraction logic, which uses the `pdfinfo` command from the `poppler-utils` package.
- Added unit tests for the new `pdf` package, including mocking the `pdfinfo` command.
- Modified `Taskfile.yml` to install `poppler-utils` as a dependency.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:46:59 +00:00 · 2026-02-02 00:46:59 +00:00 · c7e3ba297f
commit c7e3ba297f
parent cf2af53ed3
7 changed files with 407 additions and 0 deletions
--- a/Taskfile.yml
+++ b/Taskfile.yml
@ -1,10 +1,15 @@
 version: '3'

 tasks:
+  install-deps:
+    # `build` depends on this task, so skip it when pdfinfo is already on PATH
+    # instead of invoking sudo/apt-get on every build.
+    status:
+      - command -v pdfinfo
+    cmds:
+      - sudo apt-get update && sudo apt-get install -y poppler-utils
  clean:
    cmds:
      - rm -f borg
  build:
+    deps:
+      - install-deps
    cmds:
      - task: clean
      - go build -o borg main.go
--- a/cmd/collect_website.go
+++ b/cmd/collect_website.go
@ -1,11 +1,17 @@
 package cmd

 import (
+	"encoding/json"
 	"fmt"
+	"io"
+	"io/fs"
 	"os"
+	"path/filepath"
+	"strings"

 	"github.com/schollz/progressbar/v3"
 	"github.com/Snider/Borg/pkg/compress"
+	"github.com/Snider/Borg/pkg/pdf"
 	"github.com/Snider/Borg/pkg/tim"
 	"github.com/Snider/Borg/pkg/trix"
 	"github.com/Snider/Borg/pkg/ui"
@ -38,6 +44,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
 			format, _ := cmd.Flags().GetString("format")
 			compression, _ := cmd.Flags().GetString("compression")
 			password, _ := cmd.Flags().GetString("password")
+			extractPdfMetadata, _ := cmd.Flags().GetBool("extract-pdf-metadata")

 			if format != "datanode" && format != "tim" && format != "trix" {
 				return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@ -56,6 +63,53 @@ func NewCollectWebsiteCmd() *cobra.Command {
 				return fmt.Errorf("error downloading and packaging website: %w", err)
 			}

+			if extractPdfMetadata {
+				var allMetadata []*pdf.Metadata
+				err := dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
+					if err != nil {
+						return err
+					}
+					if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
+						tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
+						if err != nil {
+							return fmt.Errorf("failed to create temp file: %w", err)
+						}
+						defer os.Remove(tempFile.Name())
+
+						file, err := dn.Open(path)
+						if err != nil {
+							return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
+						}
+						defer file.Close()
+
+						if _, err := io.Copy(tempFile, file); err != nil {
+							return fmt.Errorf("failed to copy content to temp file: %w", err)
+						}
+						tempFile.Close()
+
+						metadata, err := pdf.ExtractMetadata(tempFile.Name())
+						if err != nil {
+							fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
+							return nil
+						}
+						metadata.File = filepath.Base(path)
+						allMetadata = append(allMetadata, metadata)
+					}
+					return nil
+				})
+				if err != nil {
+					return fmt.Errorf("error walking DataNode for PDF extraction: %w", err)
+				}
+
+				if len(allMetadata) > 0 {
+					jsonOutput, err := json.MarshalIndent(allMetadata, "", "  ")
+					if err != nil {
+						return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
+					}
+					dn.AddData("INDEX.json", jsonOutput)
+				}
+			}
+
 			var data []byte
 			if format == "tim" {
 				tim, err := tim.FromDataNode(dn)
@ -104,5 +158,6 @@ func NewCollectWebsiteCmd() *cobra.Command {
 	collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
 	collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
 	collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
+	collectWebsiteCmd.PersistentFlags().Bool("extract-pdf-metadata", false, "Extract metadata from PDF files and add INDEX.json")
 	return collectWebsiteCmd
 }
--- a/cmd/extract_metadata.go
+++ b/cmd/extract_metadata.go
@ -0,0 +1,121 @@
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/Snider/Borg/pkg/compress"
+	"github.com/Snider/Borg/pkg/datanode"
+	"github.com/Snider/Borg/pkg/pdf"
+	"github.com/spf13/cobra"
+)
+
+// extractMetadataCmd is the package-level extract-metadata command, built by
+// NewExtractMetadataCmd during package initialization.
+var extractMetadataCmd = NewExtractMetadataCmd()
+
+// init attaches the extract-metadata command to RootCmd.
+func init() {
+	RootCmd.AddCommand(GetExtractMetadataCmd())
+}
+
+// NewExtractMetadataCmd builds the extract-metadata command.
+//
+// The command takes exactly one argument, the path of an archive. It reads and
+// decompresses that file, loads it as a DataNode, and runs pdf.ExtractMetadata
+// on every non-directory entry whose path ends in ".pdf" (case-insensitive).
+// The collected records are written as indented JSON to INDEX.json in the
+// current working directory.
+//
+// A PDF whose metadata cannot be extracted is reported on the command's stderr
+// and skipped; every other failure is returned as an error. The --type flag
+// must be "pdf".
+func NewExtractMetadataCmd() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "extract-metadata [archive]",
+		Short: "Extract metadata from files in an archive.",
+		Long:  `Extract metadata from files of a specific type within a DataNode archive and create an INDEX.json file.`,
+		Args:  cobra.ExactArgs(1),
+		RunE: func(cmd *cobra.Command, args []string) error {
+			archivePath := args[0]
+			fileType, _ := cmd.Flags().GetString("type")
+
+			if fileType != "pdf" {
+				return fmt.Errorf("unsupported type: %s. Only 'pdf' is currently supported", fileType)
+			}
+
+			// Read and decompress the archive
+			compressedData, err := os.ReadFile(archivePath)
+			if err != nil {
+				return fmt.Errorf("failed to read archive file: %w", err)
+			}
+			data, err := compress.Decompress(compressedData)
+			if err != nil {
+				return fmt.Errorf("failed to decompress archive: %w", err)
+			}
+
+			// Load the DataNode
+			dn, err := datanode.FromTar(data)
+			if err != nil {
+				return fmt.Errorf("failed to load DataNode from tar: %w", err)
+			}
+
+			// Start from an empty, non-nil slice so an archive without PDFs
+			// produces "[]" in INDEX.json rather than JSON null.
+			allMetadata := []*pdf.Metadata{}
+
+			// Walk the DataNode and extract metadata from PDF files
+			err = dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
+				if err != nil {
+					return err
+				}
+				if d.IsDir() || !strings.HasSuffix(strings.ToLower(path), ".pdf") {
+					return nil
+				}
+
+				file, err := dn.Open(path)
+				if err != nil {
+					return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
+				}
+				defer file.Close()
+
+				// pdf.ExtractMetadata takes a file path, so stage the entry on disk first.
+				tempPath, err := copyToTempPDF(file)
+				if err != nil {
+					return err
+				}
+				defer os.Remove(tempPath)
+
+				metadata, err := pdf.ExtractMetadata(tempPath)
+				if err != nil {
+					// Log error but continue processing other files
+					fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
+					return nil
+				}
+				metadata.File = filepath.Base(path) // Use the original filename, not the temp file's
+				allMetadata = append(allMetadata, metadata)
+				return nil
+			})
+			if err != nil {
+				return fmt.Errorf("error walking DataNode: %w", err)
+			}
+
+			// Write the aggregated metadata to INDEX.json
+			jsonOutput, err := json.MarshalIndent(allMetadata, "", "  ")
+			if err != nil {
+				return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
+			}
+			if err := os.WriteFile("INDEX.json", jsonOutput, 0644); err != nil {
+				return fmt.Errorf("failed to write INDEX.json: %w", err)
+			}
+
+			fmt.Fprintln(cmd.OutOrStdout(), "Metadata extracted and saved to INDEX.json")
+			return nil
+		},
+	}
+	cmd.Flags().String("type", "pdf", "The type of files to extract metadata from (currently only 'pdf' is supported)")
+	return cmd
+}
+
+// copyToTempPDF writes src to a new temporary "borg-pdf-*.pdf" file and returns
+// its path. The file is always closed before returning, and it is removed again
+// if copying or closing fails, so the caller only has to delete the file after
+// a successful call.
+func copyToTempPDF(src io.Reader) (string, error) {
+	tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
+	if err != nil {
+		return "", fmt.Errorf("failed to create temp file: %w", err)
+	}
+
+	_, copyErr := io.Copy(tempFile, src)
+	closeErr := tempFile.Close()
+	if copyErr != nil {
+		os.Remove(tempFile.Name())
+		return "", fmt.Errorf("failed to copy content to temp file: %w", copyErr)
+	}
+	if closeErr != nil {
+		os.Remove(tempFile.Name())
+		return "", fmt.Errorf("failed to close temp file: %w", closeErr)
+	}
+	return tempFile.Name(), nil
+}
+
+// GetExtractMetadataCmd returns the package-level extract-metadata command.
+func GetExtractMetadataCmd() *cobra.Command {
+	return extractMetadataCmd
+}
--- a/cmd/pdf.go
+++ b/cmd/pdf.go
@ -0,0 +1,24 @@
+package cmd
+
+import (
+	"github.com/spf13/cobra"
+)
+
+// pdfCmd is the package-level pdf command, built by NewPdfCmd during package
+// initialization.
+var pdfCmd = NewPdfCmd()
+
+// init attaches the pdf command to RootCmd.
+func init() {
+	RootCmd.AddCommand(GetPdfCmd())
+}
+
+// NewPdfCmd builds the pdf command. Only its usage and help texts are set
+// here; the command is given no run function of its own.
+func NewPdfCmd() *cobra.Command {
+	pdfRoot := new(cobra.Command)
+	pdfRoot.Use = "pdf"
+	pdfRoot.Short = "Perform PDF operations."
+	pdfRoot.Long = `A command for performing various PDF operations.`
+	return pdfRoot
+}
+
+// GetPdfCmd returns the package-level pdf command.
+func GetPdfCmd() *cobra.Command {
+	return pdfCmd
+}
--- a/cmd/pdf_metadata.go
+++ b/cmd/pdf_metadata.go
@ -0,0 +1,41 @@
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+	"github.com/Snider/Borg/pkg/pdf"
+	"github.com/spf13/cobra"
+)
+
+// pdfMetadataCmd is the package-level "pdf metadata" command, built by
+// NewPdfMetadataCmd during package initialization.
+var pdfMetadataCmd = NewPdfMetadataCmd()
+
+// init attaches the metadata command as a subcommand of the pdf command.
+func init() {
+	GetPdfCmd().AddCommand(GetPdfMetadataCmd())
+}
+
+// NewPdfMetadataCmd builds the metadata command. It takes exactly one
+// argument, the path of a PDF file, passes it to pdf.ExtractMetadata, and
+// prints the result as indented JSON on the command's stdout.
+func NewPdfMetadataCmd() *cobra.Command {
+	// printMetadata is the command's RunE handler.
+	printMetadata := func(cmd *cobra.Command, args []string) error {
+		info, err := pdf.ExtractMetadata(args[0])
+		if err != nil {
+			return fmt.Errorf("error extracting metadata: %w", err)
+		}
+
+		encoded, err := json.MarshalIndent(info, "", "  ")
+		if err != nil {
+			return fmt.Errorf("error marshalling metadata to JSON: %w", err)
+		}
+
+		fmt.Fprintln(cmd.OutOrStdout(), string(encoded))
+		return nil
+	}
+
+	return &cobra.Command{
+		Use:   "metadata [file]",
+		Short: "Extract metadata from a PDF file.",
+		Long:  `Extract metadata from a PDF file and print it as JSON.`,
+		Args:  cobra.ExactArgs(1),
+		RunE:  printMetadata,
+	}
+}
+
+// GetPdfMetadataCmd returns the package-level "pdf metadata" command.
+func GetPdfMetadataCmd() *cobra.Command {
+	return pdfMetadataCmd
+}
--- a/pkg/pdf/metadata.go
+++ b/pkg/pdf/metadata.go
@ -0,0 +1,58 @@
+package pdf
+
+import (
+	"bufio"
+	"bytes"
+	"os/exec"
+	"strconv"
+	"strings"
+)
+
+// Metadata holds the extracted PDF metadata.
+type Metadata struct {
+	// File is the path passed to ExtractMetadata.
+	File string `json:"file"`
+	// Title is the value of pdfinfo's "Title" line.
+	Title string `json:"title"`
+	// Authors holds the comma-separated names from pdfinfo's "Author" line.
+	Authors []string `json:"authors"`
+	// Abstract is not filled in by ExtractMetadata.
+	Abstract string `json:"abstract"`
+	// Pages is the value of pdfinfo's "Pages" line; it stays 0 when that
+	// value is missing or not an integer.
+	Pages int `json:"pages"`
+	// Created is the value of pdfinfo's "CreationDate" line, kept verbatim.
+	Created string `json:"created"`
+}
+
+// execCommand creates the command used to run pdfinfo. It is a package-level
+// variable so that tests can substitute a fake process for the real binary.
+var execCommand = exec.Command
+
+// ExtractMetadata extracts metadata from a PDF file using the pdfinfo command.
+//
+// It runs `pdfinfo filePath` and reads the "Key: value" lines the command
+// prints on stdout; lines without a colon and keys other than Title, Author,
+// CreationDate and Pages are ignored. It returns an error when the command
+// cannot be run or exits unsuccessfully, or when its output cannot be scanned.
+func ExtractMetadata(filePath string) (*Metadata, error) {
+	cmd := execCommand("pdfinfo", filePath)
+	var out bytes.Buffer
+	cmd.Stdout = &out
+	if err := cmd.Run(); err != nil {
+		return nil, err
+	}
+
+	metadata := &Metadata{File: filePath}
+	scanner := bufio.NewScanner(&out)
+	for scanner.Scan() {
+		// Split on the first colon only: values such as dates contain colons too.
+		parts := strings.SplitN(scanner.Text(), ":", 2)
+		if len(parts) != 2 {
+			continue
+		}
+		key := strings.TrimSpace(parts[0])
+		value := strings.TrimSpace(parts[1])
+
+		switch key {
+		case "Title":
+			metadata.Title = value
+		case "Author":
+			metadata.Authors = splitAuthors(value)
+		case "CreationDate":
+			metadata.Created = value
+		case "Pages":
+			if pages, err := strconv.Atoi(value); err == nil {
+				metadata.Pages = pages
+			}
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	return metadata, nil
+}
+
+// splitAuthors splits a comma-separated author list, trimming the whitespace
+// around each name and dropping empty entries. It returns nil when no name
+// remains.
+func splitAuthors(value string) []string {
+	var authors []string
+	for _, name := range strings.Split(value, ",") {
+		if name = strings.TrimSpace(name); name != "" {
+			authors = append(authors, name)
+		}
+	}
+	return authors
+}
--- a/pkg/pdf/metadata_test.go
+++ b/pkg/pdf/metadata_test.go
@ -0,0 +1,103 @@
+package pdf
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+	"testing"
+)
+
+// mockExecCommand is used to mock the exec.Command function for testing.
+// Instead of the requested command it re-runs the current test binary so that
+// only TestHelperProcess executes, passing the original command and arguments
+// after a "--" separator. The child's environment consists solely of
+// GO_WANT_HELPER_PROCESS=1.
+func mockExecCommand(command string, args ...string) *exec.Cmd {
+	// Anchor the pattern: -test.run takes an unanchored regular expression, so a
+	// bare "TestHelperProcess" would also select TestHelperProcess_Error.
+	cs := []string{"-test.run=^TestHelperProcess$", "--", command}
+	cs = append(cs, args...)
+	cmd := exec.Command(os.Args[0], cs...)
+	cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
+	return cmd
+}
+
+// TestHelperProcess is not a real test: it is the body of the child process
+// started by mockExecCommand and stands in for the `pdfinfo` command. It does
+// nothing unless GO_WANT_HELPER_PROCESS is set to "1".
+func TestHelperProcess(t *testing.T) {
+	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
+		return
+	}
+	defer os.Exit(0)
+
+	// Everything after the first "--" is the command line being mocked.
+	var mocked []string
+	for i, arg := range os.Args {
+		if arg == "--" {
+			mocked = os.Args[i+1:]
+			break
+		}
+	}
+	if len(mocked) == 0 {
+		fmt.Fprintf(os.Stderr, "No command to mock!\n")
+		os.Exit(1)
+	}
+
+	name, rest := mocked[0], mocked[1:]
+	if name == "pdfinfo" && len(rest) == 1 {
+		// Simulate pdfinfo output
+		fmt.Println("Title:          Test Title")
+		fmt.Println("Author:         Test Author 1,Test Author 2")
+		fmt.Println("CreationDate:   Sun Jan  1 00:00:00 2023")
+		fmt.Println("Pages:          42")
+	}
+}
+
+// TestExtractMetadata points execCommand at the helper process and checks each
+// field that ExtractMetadata parses from the simulated pdfinfo output.
+func TestExtractMetadata(t *testing.T) {
+	execCommand = mockExecCommand
+	defer func() { execCommand = exec.Command }()
+
+	got, err := ExtractMetadata("dummy.pdf")
+	if err != nil {
+		t.Fatalf("ExtractMetadata failed: %v", err)
+	}
+
+	if got.Title != "Test Title" {
+		t.Errorf("expected title 'Test Title', got '%s'", got.Title)
+	}
+
+	authorsOK := len(got.Authors) == 2 &&
+		got.Authors[0] == "Test Author 1" &&
+		got.Authors[1] == "Test Author 2"
+	if !authorsOK {
+		t.Errorf("expected authors '[Test Author 1, Test Author 2]', got '%v'", got.Authors)
+	}
+
+	if got.Created != "Sun Jan  1 00:00:00 2023" {
+		t.Errorf("expected creation date 'Sun Jan  1 00:00:00 2023', got '%s'", got.Created)
+	}
+	if got.Pages != 42 {
+		t.Errorf("expected 42 pages, got %d", got.Pages)
+	}
+	if got.File != "dummy.pdf" {
+		t.Errorf("expected file 'dummy.pdf', got '%s'", got.File)
+	}
+}
+
+// TestExtractMetadata_CommandError points execCommand at a helper process that
+// exits with status 1 and checks that ExtractMetadata reports that failure.
+func TestExtractMetadata_CommandError(t *testing.T) {
+	failingCommand := func(command string, args ...string) *exec.Cmd {
+		helperArgs := append([]string{"-test.run=TestHelperProcess_Error", "--", command}, args...)
+		helper := exec.Command(os.Args[0], helperArgs...)
+		helper.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
+		return helper
+	}
+	execCommand = failingCommand
+	defer func() { execCommand = exec.Command }()
+
+	if _, err := ExtractMetadata("dummy.pdf"); err == nil {
+		t.Fatal("expected an error from exec.Command, but got nil")
+	} else if !strings.Contains(err.Error(), "exit status 1") {
+		t.Errorf("expected error to contain 'exit status 1', got '%v'", err)
+	}
+}
+
+// TestHelperProcess_Error is not a real test: it is the body of the child
+// process used by TestExtractMetadata_CommandError. It does nothing unless
+// GO_WANT_HELPER_PROCESS is set to "1".
+func TestHelperProcess_Error(t *testing.T) {
+	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
+		return
+	}
+	// Simulate an error by writing to stderr and exiting with a non-zero status
+	fmt.Fprintf(os.Stderr, "pdfinfo error")
+	os.Exit(1)
+}