feat: PDF metadata extraction

This commit introduces a new feature to extract and index metadata from collected PDF files.

The following changes have been made:
- Added a new `pdf` command with a `metadata` subcommand to extract metadata from a single PDF file.
- Added a new `extract-metadata` command to extract metadata from all PDF files within a given archive and create an `INDEX.json` file.
- Added a `--extract-pdf-metadata` flag to the `collect website` command to extract metadata from downloaded PDF files.
- Created a new `pdf` package to encapsulate the PDF metadata extraction logic, which uses the `pdfinfo` command from the `poppler-utils` package.
- Added unit tests for the new `pdf` package, including mocking the `pdfinfo` command.
- Modified `Taskfile.yml` to install `poppler-utils` as a dependency.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
This commit is contained in:
google-labs-jules[bot] 2026-02-02 00:46:59 +00:00
parent cf2af53ed3
commit c7e3ba297f
7 changed files with 407 additions and 0 deletions

View file

@ -1,10 +1,15 @@
# Taskfile describing how to build the borg binary.
version: '3'
tasks:
# Installs the poppler-utils system package through apt-get (uses sudo).
install-deps:
cmds:
- sudo apt-get update && sudo apt-get install -y poppler-utils
# Removes a previously built borg binary, if any.
clean:
cmds:
- rm -f borg
# Builds the borg binary from main.go after running the clean task.
# NOTE(review): install-deps is listed under deps, so it appears every build
# will invoke apt-get with sudo — confirm this is intended for non-Debian hosts.
build:
deps:
- install-deps
cmds:
- task: clean
- go build -o borg main.go

View file

@ -1,11 +1,17 @@
package cmd
import (
"encoding/json"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"strings"
"github.com/schollz/progressbar/v3"
"github.com/Snider/Borg/pkg/compress"
"github.com/Snider/Borg/pkg/pdf"
"github.com/Snider/Borg/pkg/tim"
"github.com/Snider/Borg/pkg/trix"
"github.com/Snider/Borg/pkg/ui"
@ -38,6 +44,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
format, _ := cmd.Flags().GetString("format")
compression, _ := cmd.Flags().GetString("compression")
password, _ := cmd.Flags().GetString("password")
extractPdfMetadata, _ := cmd.Flags().GetBool("extract-pdf-metadata")
if format != "datanode" && format != "tim" && format != "trix" {
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
@ -56,6 +63,53 @@ func NewCollectWebsiteCmd() *cobra.Command {
return fmt.Errorf("error downloading and packaging website: %w", err)
}
if extractPdfMetadata {
var allMetadata []*pdf.Metadata
err := dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
if err != nil {
return fmt.Errorf("failed to create temp file: %w", err)
}
defer os.Remove(tempFile.Name())
file, err := dn.Open(path)
if err != nil {
return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
}
defer file.Close()
if _, err := io.Copy(tempFile, file); err != nil {
return fmt.Errorf("failed to copy content to temp file: %w", err)
}
tempFile.Close()
metadata, err := pdf.ExtractMetadata(tempFile.Name())
if err != nil {
fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
return nil
}
metadata.File = filepath.Base(path)
allMetadata = append(allMetadata, metadata)
}
return nil
})
if err != nil {
return fmt.Errorf("error walking DataNode for PDF extraction: %w", err)
}
if len(allMetadata) > 0 {
jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
}
dn.AddData("INDEX.json", jsonOutput)
}
}
var data []byte
if format == "tim" {
tim, err := tim.FromDataNode(dn)
@ -104,5 +158,6 @@ func NewCollectWebsiteCmd() *cobra.Command {
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
collectWebsiteCmd.PersistentFlags().Bool("extract-pdf-metadata", false, "Extract metadata from PDF files and add INDEX.json")
return collectWebsiteCmd
}

121
cmd/extract_metadata.go Normal file
View file

@ -0,0 +1,121 @@
package cmd
import (
"encoding/json"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"strings"
"github.com/Snider/Borg/pkg/compress"
"github.com/Snider/Borg/pkg/datanode"
"github.com/Snider/Borg/pkg/pdf"
"github.com/spf13/cobra"
)
// extractMetadataCmd is the package-level extract-metadata command,
// initialized by calling NewExtractMetadataCmd.
var extractMetadataCmd = NewExtractMetadataCmd()
// init adds the extract-metadata command to RootCmd.
func init() {
RootCmd.AddCommand(GetExtractMetadataCmd())
}
// NewExtractMetadataCmd builds the "extract-metadata" command.
//
// The command takes exactly one argument, the path to an archive. It reads
// the file, decompresses it, loads the result as a DataNode from tar data,
// runs pdf.ExtractMetadata on every non-directory entry whose path ends in
// ".pdf" (case-insensitive), and writes the collected results as JSON to
// INDEX.json in the current working directory.
//
// An entry whose metadata cannot be extracted is reported on the command's
// error stream and skipped. Any other failure (reading the archive, walking
// the DataNode, spooling an entry to disk, writing INDEX.json) aborts the
// command with an error. The --type flag currently accepts only "pdf".
func NewExtractMetadataCmd() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "extract-metadata [archive]",
		Short: "Extract metadata from files in an archive.",
		Long:  `Extract metadata from files of a specific type within a DataNode archive and create an INDEX.json file.`,
		Args:  cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			archivePath := args[0]
			fileType, _ := cmd.Flags().GetString("type")

			if fileType != "pdf" {
				return fmt.Errorf("unsupported type: %s. Only 'pdf' is currently supported", fileType)
			}

			// Read and decompress the archive.
			compressedData, err := os.ReadFile(archivePath)
			if err != nil {
				return fmt.Errorf("failed to read archive file: %w", err)
			}
			data, err := compress.Decompress(compressedData)
			if err != nil {
				return fmt.Errorf("failed to decompress archive: %w", err)
			}

			// Load the DataNode.
			dn, err := datanode.FromTar(data)
			if err != nil {
				return fmt.Errorf("failed to load DataNode from tar: %w", err)
			}

			// Start from an empty, non-nil slice so that an archive without
			// any PDF produces "[]" in INDEX.json instead of "null".
			allMetadata := []*pdf.Metadata{}

			// Walk the DataNode and extract metadata from PDF files.
			err = dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
				if err != nil {
					return err
				}
				if d.IsDir() || !strings.HasSuffix(strings.ToLower(path), ".pdf") {
					return nil
				}

				// pdf.ExtractMetadata works on a file path, so the entry is
				// spooled to a temporary file first.
				tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
				if err != nil {
					return fmt.Errorf("failed to create temp file: %w", err)
				}
				tempPath := tempFile.Name()
				defer os.Remove(tempPath)

				file, err := dn.Open(path)
				if err != nil {
					// Release the descriptor before bailing out.
					tempFile.Close()
					return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
				}

				// Close both handles on every path, and before the extractor
				// reads the temp file; a failed Close may mean lost data.
				_, copyErr := io.Copy(tempFile, file)
				file.Close()
				closeErr := tempFile.Close()
				if copyErr != nil {
					return fmt.Errorf("failed to copy content to temp file: %w", copyErr)
				}
				if closeErr != nil {
					return fmt.Errorf("failed to close temp file: %w", closeErr)
				}

				metadata, err := pdf.ExtractMetadata(tempPath)
				if err != nil {
					// Report the error but continue processing other files.
					fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
					return nil
				}
				// Record the original file name, not the temp file path.
				metadata.File = filepath.Base(path)
				allMetadata = append(allMetadata, metadata)
				return nil
			})
			if err != nil {
				return fmt.Errorf("error walking DataNode: %w", err)
			}

			// Write the aggregated metadata to INDEX.json.
			jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
			if err != nil {
				return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
			}
			if err := os.WriteFile("INDEX.json", jsonOutput, 0644); err != nil {
				return fmt.Errorf("failed to write INDEX.json: %w", err)
			}

			fmt.Fprintln(cmd.OutOrStdout(), "Metadata extracted and saved to INDEX.json")
			return nil
		},
	}
	cmd.Flags().String("type", "pdf", "The type of files to extract metadata from (currently only 'pdf' is supported)")
	return cmd
}
// GetExtractMetadataCmd returns the package-level extract-metadata command.
func GetExtractMetadataCmd() *cobra.Command {
return extractMetadataCmd
}

24
cmd/pdf.go Normal file
View file

@ -0,0 +1,24 @@
package cmd
import (
"github.com/spf13/cobra"
)
// pdfCmd is the package-level pdf command, initialized by calling NewPdfCmd.
var pdfCmd = NewPdfCmd()
// init adds the pdf command to RootCmd.
func init() {
RootCmd.AddCommand(GetPdfCmd())
}
// NewPdfCmd builds the "pdf" command. The command defines only its usage and
// help text; it has no run function of its own.
func NewPdfCmd() *cobra.Command {
	pdfRoot := new(cobra.Command)
	pdfRoot.Use = "pdf"
	pdfRoot.Short = "Perform PDF operations."
	pdfRoot.Long = `A command for performing various PDF operations.`
	return pdfRoot
}
// GetPdfCmd returns the package-level pdf command.
func GetPdfCmd() *cobra.Command {
return pdfCmd
}

41
cmd/pdf_metadata.go Normal file
View file

@ -0,0 +1,41 @@
package cmd
import (
"encoding/json"
"fmt"
"github.com/Snider/Borg/pkg/pdf"
"github.com/spf13/cobra"
)
// pdfMetadataCmd is the package-level "pdf metadata" command, initialized by
// calling NewPdfMetadataCmd.
var pdfMetadataCmd = NewPdfMetadataCmd()
// init attaches the metadata command as a subcommand of the pdf command.
func init() {
GetPdfCmd().AddCommand(GetPdfMetadataCmd())
}
// NewPdfMetadataCmd builds the "metadata" command. It takes exactly one
// argument, a PDF file path, passes it to pdf.ExtractMetadata and prints the
// result as indented JSON on the command's output stream.
func NewPdfMetadataCmd() *cobra.Command {
	// printMetadata is the command's run function.
	printMetadata := func(cmd *cobra.Command, args []string) error {
		info, err := pdf.ExtractMetadata(args[0])
		if err != nil {
			return fmt.Errorf("error extracting metadata: %w", err)
		}

		encoded, err := json.MarshalIndent(info, "", " ")
		if err != nil {
			return fmt.Errorf("error marshalling metadata to JSON: %w", err)
		}

		fmt.Fprintln(cmd.OutOrStdout(), string(encoded))
		return nil
	}

	return &cobra.Command{
		Use:   "metadata [file]",
		Short: "Extract metadata from a PDF file.",
		Long:  `Extract metadata from a PDF file and print it as JSON.`,
		Args:  cobra.ExactArgs(1),
		RunE:  printMetadata,
	}
}
// GetPdfMetadataCmd returns the package-level "pdf metadata" command.
func GetPdfMetadataCmd() *cobra.Command {
return pdfMetadataCmd
}

58
pkg/pdf/metadata.go Normal file
View file

@ -0,0 +1,58 @@
package pdf
import (
"bufio"
"bytes"
"os/exec"
"strconv"
"strings"
)
// Metadata holds the extracted PDF metadata.
type Metadata struct {
	// File is the path ExtractMetadata was called with; callers may
	// overwrite it with a more meaningful name.
	File string `json:"file"`
	// Title is the value of pdfinfo's "Title" field.
	Title string `json:"title"`
	// Authors holds the comma-separated names from pdfinfo's "Author" field.
	Authors []string `json:"authors"`
	// Abstract is not populated by ExtractMetadata.
	Abstract string `json:"abstract"`
	// Pages is the value of pdfinfo's "Pages" field, or 0 if it is missing
	// or not a number.
	Pages int `json:"pages"`
	// Created is the raw text of pdfinfo's "CreationDate" field.
	Created string `json:"created"`
}

// execCommand creates the command used to run pdfinfo. It is a package
// variable so tests can replace it with a function that starts a fake
// process instead of the real binary.
var execCommand = exec.Command

// ExtractMetadata extracts metadata from a PDF file using the pdfinfo command.
//
// It runs "pdfinfo filePath", reads the "Key: value" lines from standard
// output and fills in the Title, Author, CreationDate and Pages fields it
// recognizes; all other lines are ignored. It returns the error from running
// the command unchanged (for example when pdfinfo is not installed or exits
// with a non-zero status), or the scanner error if the output cannot be read.
func ExtractMetadata(filePath string) (*Metadata, error) {
	cmd := execCommand("pdfinfo", filePath)
	var out bytes.Buffer
	cmd.Stdout = &out
	if err := cmd.Run(); err != nil {
		return nil, err
	}

	metadata := &Metadata{File: filePath}
	scanner := bufio.NewScanner(&out)
	for scanner.Scan() {
		// Split on the first colon only: values such as dates and titles
		// may contain colons themselves.
		parts := strings.SplitN(scanner.Text(), ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch key {
		case "Title":
			metadata.Title = value
		case "Author":
			metadata.Authors = splitAuthors(value)
		case "CreationDate":
			metadata.Created = value
		case "Pages":
			if pages, err := strconv.Atoi(value); err == nil {
				metadata.Pages = pages
			}
		}
	}
	// A scanner error (e.g. an over-long line) would otherwise silently
	// truncate the result.
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return metadata, nil
}

// splitAuthors splits a comma-separated author list, trimming surrounding
// whitespace from each name and dropping empty entries.
func splitAuthors(value string) []string {
	var authors []string
	for _, name := range strings.Split(value, ",") {
		if name = strings.TrimSpace(name); name != "" {
			authors = append(authors, name)
		}
	}
	return authors
}

103
pkg/pdf/metadata_test.go Normal file
View file

@ -0,0 +1,103 @@
package pdf
import (
"fmt"
"os"
"os/exec"
"strings"
"testing"
)
// mockExecCommand is used to mock the exec.Command function for testing.
//
// Instead of running the requested command it re-executes the current test
// binary, restricted to the TestHelperProcess test, and passes the original
// command name and arguments after a "--" separator. The child process gets
// an environment containing only GO_WANT_HELPER_PROCESS=1, which is what
// switches TestHelperProcess into its helper role.
func mockExecCommand(command string, args ...string) *exec.Cmd {
	// The -test.run value is a regular expression; anchor it so it selects
	// TestHelperProcess only and not TestHelperProcess_Error as well.
	cs := []string{"-test.run=^TestHelperProcess$", "--", command}
	cs = append(cs, args...)
	cmd := exec.Command(os.Args[0], cs...)
	cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
	return cmd
}
// TestHelperProcess is not a test in its own right: it is the body of the
// child process started by mockExecCommand and plays the role of `pdfinfo`.
// Unless GO_WANT_HELPER_PROCESS is "1" it returns immediately.
func TestHelperProcess(t *testing.T) {
	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
		return
	}
	// Whatever happens below, leave with status 0 unless told otherwise.
	defer os.Exit(0)

	// The mocked command line is everything after the first "--".
	var mocked []string
	for i, arg := range os.Args {
		if arg == "--" {
			mocked = os.Args[i+1:]
			break
		}
	}
	if len(mocked) == 0 {
		fmt.Fprintf(os.Stderr, "No command to mock!\n")
		os.Exit(1)
	}

	name, params := mocked[0], mocked[1:]
	if name != "pdfinfo" || len(params) != 1 {
		return
	}

	// Simulated pdfinfo output.
	for _, line := range []string{
		"Title: Test Title",
		"Author: Test Author 1,Test Author 2",
		"CreationDate: Sun Jan 1 00:00:00 2023",
		"Pages: 42",
	} {
		fmt.Println(line)
	}
}
// TestExtractMetadata runs ExtractMetadata against the mocked pdfinfo process
// and checks every field the mock output provides.
func TestExtractMetadata(t *testing.T) {
	execCommand = mockExecCommand
	defer func() { execCommand = exec.Command }()

	metadata, err := ExtractMetadata("dummy.pdf")
	if err != nil {
		t.Fatalf("ExtractMetadata failed: %v", err)
	}

	if got := metadata.Title; got != "Test Title" {
		t.Errorf("expected title 'Test Title', got '%s'", got)
	}

	authors := metadata.Authors
	authorsOK := len(authors) == 2 && authors[0] == "Test Author 1" && authors[1] == "Test Author 2"
	if !authorsOK {
		t.Errorf("expected authors '[Test Author 1, Test Author 2]', got '%v'", authors)
	}

	if got := metadata.Created; got != "Sun Jan 1 00:00:00 2023" {
		t.Errorf("expected creation date 'Sun Jan 1 00:00:00 2023', got '%s'", got)
	}
	if got := metadata.Pages; got != 42 {
		t.Errorf("expected 42 pages, got %d", got)
	}
	if got := metadata.File; got != "dummy.pdf" {
		t.Errorf("expected file 'dummy.pdf', got '%s'", got)
	}
}
// TestExtractMetadata_CommandError checks that ExtractMetadata returns the
// command's error when the pdfinfo process exits with a non-zero status. The
// substitute command re-runs the test binary restricted to
// TestHelperProcess_Error.
func TestExtractMetadata_CommandError(t *testing.T) {
	failingCommand := func(command string, args ...string) *exec.Cmd {
		childArgs := append([]string{"-test.run=TestHelperProcess_Error", "--", command}, args...)
		child := exec.Command(os.Args[0], childArgs...)
		child.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
		return child
	}
	execCommand = failingCommand
	defer func() { execCommand = exec.Command }()

	_, err := ExtractMetadata("dummy.pdf")
	if err == nil {
		t.Fatal("expected an error from exec.Command, but got nil")
	}
	if msg := err.Error(); !strings.Contains(msg, "exit status 1") {
		t.Errorf("expected error to contain 'exit status 1', got '%v'", err)
	}
}
// TestHelperProcess_Error is the child-process body used by
// TestExtractMetadata_CommandError. When GO_WANT_HELPER_PROCESS is "1" it
// writes a message to stderr and exits with status 1; otherwise it does
// nothing.
func TestHelperProcess_Error(t *testing.T) {
	if os.Getenv("GO_WANT_HELPER_PROCESS") == "1" {
		// Simulate a failing pdfinfo run.
		fmt.Fprint(os.Stderr, "pdfinfo error")
		os.Exit(1)
	}
}