Borg/pkg/pdf/metadata_test.go
google-labs-jules[bot] c7e3ba297f feat: PDF metadata extraction
This commit introduces a new feature to extract and index metadata from collected PDF files.

The following changes have been made:
- Added a new `pdf` command with a `metadata` subcommand to extract metadata from a single PDF file.
- Added a new `extract-metadata` command to extract metadata from all PDF files within a given archive and create an `INDEX.json` file.
- Added a `--extract-pdf-metadata` flag to the `collect website` command to extract metadata from downloaded PDF files.
- Created a new `pdf` package to encapsulate the PDF metadata extraction logic, which uses the `pdfinfo` command from the `poppler-utils` package.
- Added unit tests for the new `pdf` package, including mocking the `pdfinfo` command.
- Modified `Taskfile.yml` to install `poppler-utils` as a dependency.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:46:59 +00:00

103 lines
2.9 KiB
Go

package pdf
import (
"fmt"
"os"
"os/exec"
"strings"
"testing"
)
// mockExecCommand is a stand-in for exec.Command used by the tests in this
// file. Instead of launching the requested program it re-executes the current
// test binary, restricted to TestHelperProcess, and forwards the original
// command name and arguments after a "--" separator.
//
// The -test.run pattern is anchored on purpose: an unanchored
// "TestHelperProcess" is a regular expression that also matches
// TestHelperProcess_Error, which would make the outcome depend on the order in
// which the two helpers are declared.
func mockExecCommand(command string, args ...string) *exec.Cmd {
	cs := []string{"-test.run=^TestHelperProcess$", "--", command}
	cs = append(cs, args...)
	cmd := exec.Command(os.Args[0], cs...)
	// GO_WANT_HELPER_PROCESS switches TestHelperProcess from "return
	// immediately" to "behave like the mocked command".
	cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
	return cmd
}
// TestHelperProcess isn't a real test. It is the body of the child process
// that the mocked exec.Command spawns for TestExtractMetadata, and it
// simulates the behavior of the `pdfinfo` command.
//
// When GO_WANT_HELPER_PROCESS is not "1" (i.e. during a normal test run) it
// returns immediately. Otherwise it looks for the mocked command after the
// "--" separator in os.Args and:
//   - exits with status 1 if no command was supplied;
//   - exits with status 1 if the command is not `pdfinfo` with exactly one
//     argument, so an invocation the mock does not model fails loudly instead
//     of "succeeding" with empty output;
//   - otherwise prints canned pdfinfo output and exits with status 0.
func TestHelperProcess(t *testing.T) {
	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
		return
	}
	// Terminate the process ourselves once the function body is done so the
	// child ends right after emitting the simulated output.
	defer os.Exit(0)

	// Skip the test binary's own arguments; everything after "--" is the
	// command line being mocked.
	args := os.Args
	for len(args) > 0 {
		if args[0] == "--" {
			args = args[1:]
			break
		}
		args = args[1:]
	}
	if len(args) == 0 {
		fmt.Fprintf(os.Stderr, "No command to mock!\n")
		os.Exit(1)
	}

	cmd, args := args[0], args[1:]
	if cmd != "pdfinfo" || len(args) != 1 {
		fmt.Fprintf(os.Stderr, "unexpected command to mock: %s %v\n", cmd, args)
		os.Exit(1)
	}

	// Simulate pdfinfo output
	fmt.Println("Title: Test Title")
	fmt.Println("Author: Test Author 1,Test Author 2")
	fmt.Println("CreationDate: Sun Jan 1 00:00:00 2023")
	fmt.Println("Pages: 42")
}
// TestExtractMetadata runs ExtractMetadata against the mocked pdfinfo command
// and checks that every field of the result carries the value derived from
// the helper process's canned output.
func TestExtractMetadata(t *testing.T) {
	// Route command execution through the helper process for this test only.
	defer func() { execCommand = exec.Command }()
	execCommand = mockExecCommand

	got, err := ExtractMetadata("dummy.pdf")
	if err != nil {
		t.Fatalf("ExtractMetadata failed: %v", err)
	}

	if title := got.Title; title != "Test Title" {
		t.Errorf("expected title 'Test Title', got '%s'", title)
	}

	// The helper prints both authors on one comma-separated line; the result
	// is expected to hold them as two separate entries.
	authorsOK := len(got.Authors) == 2 &&
		got.Authors[0] == "Test Author 1" &&
		got.Authors[1] == "Test Author 2"
	if !authorsOK {
		t.Errorf("expected authors '[Test Author 1, Test Author 2]', got '%v'", got.Authors)
	}

	if created := got.Created; created != "Sun Jan 1 00:00:00 2023" {
		t.Errorf("expected creation date 'Sun Jan 1 00:00:00 2023', got '%s'", created)
	}

	if pages := got.Pages; pages != 42 {
		t.Errorf("expected 42 pages, got %d", pages)
	}

	if file := got.File; file != "dummy.pdf" {
		t.Errorf("expected file 'dummy.pdf', got '%s'", file)
	}
}
// TestExtractMetadata_CommandError checks that ExtractMetadata returns an
// error mentioning "exit status 1" when the command it runs exits with
// status 1.
func TestExtractMetadata_CommandError(t *testing.T) {
	defer func() { execCommand = exec.Command }()

	// Re-execute this test binary so that TestHelperProcess_Error plays the
	// role of a failing pdfinfo.
	execCommand = func(command string, args ...string) *exec.Cmd {
		argv := append([]string{"-test.run=TestHelperProcess_Error", "--", command}, args...)
		failing := exec.Command(os.Args[0], argv...)
		failing.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
		return failing
	}

	_, err := ExtractMetadata("dummy.pdf")
	if err == nil {
		t.Fatal("expected an error from exec.Command, but got nil")
	}
	if msg := err.Error(); !strings.Contains(msg, "exit status 1") {
		t.Errorf("expected error to contain 'exit status 1', got '%v'", err)
	}
}
// TestHelperProcess_Error is not a real test. It only acts when
// GO_WANT_HELPER_PROCESS is "1", in which case it plays a failing command:
// it writes a message to stderr and terminates the process with status 1.
// In a normal test run it does nothing.
func TestHelperProcess_Error(t *testing.T) {
	if os.Getenv("GO_WANT_HELPER_PROCESS") == "1" {
		// Simulate an error by writing to stderr and exiting with a non-zero status
		fmt.Fprint(os.Stderr, "pdfinfo error")
		os.Exit(1)
	}
}