Borg/pkg/pdf/metadata.go
google-labs-jules[bot] c7e3ba297f feat: PDF metadata extraction
This commit introduces a new feature to extract and index metadata from collected PDF files.

The following changes have been made:
- Added a new `pdf` command with a `metadata` subcommand to extract metadata from a single PDF file.
- Added a new `extract-metadata` command to extract metadata from all PDF files within a given archive and create an `INDEX.json` file.
- Added a `--extract-pdf-metadata` flag to the `collect website` command to extract metadata from downloaded PDF files.
- Created a new `pdf` package to encapsulate the PDF metadata extraction logic, which uses the `pdfinfo` command from the `poppler-utils` package.
- Added unit tests for the new `pdf` package, including mocking the `pdfinfo` command.
- Modified `Taskfile.yml` to install `poppler-utils` as a dependency.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:46:59 +00:00

58 lines
1.2 KiB
Go

package pdf
import (
	"bufio"
	"bytes"
	"fmt"
	"os/exec"
	"strconv"
	"strings"
)
// Metadata holds the metadata extracted from a single PDF file.
//
// It is serialized to JSON using the lower-case keys given in the struct
// tags. Fields that the extractor does not find keep their zero value.
type Metadata struct {
// File is the path of the PDF the metadata was read from, exactly as it
// was passed to ExtractMetadata.
File string `json:"file"`
// Title is the value of the "Title" line in the pdfinfo output.
Title string `json:"title"`
// Authors holds the names parsed from the comma-separated "Author" line
// in the pdfinfo output.
Authors []string `json:"authors"`
// Abstract is not assigned by ExtractMetadata in this file.
// NOTE(review): presumably populated by another component — confirm.
Abstract string `json:"abstract"`
// Pages is the integer parsed from the "Pages" line; it stays 0 when the
// line is absent or not a valid integer.
Pages int `json:"pages"`
// Created is the raw "CreationDate" value, kept as the string pdfinfo
// printed rather than parsed into a time.
Created string `json:"created"`
}
// ExtractMetadata runs the external pdfinfo command on filePath and parses
// its "Key: value" output into a Metadata.
//
// Only the Title, Author, CreationDate and Pages keys are consumed; every
// other line is ignored. Author is treated as a comma-separated list: each
// name is trimmed and empty entries are dropped, so "A, B" yields
// ["A", "B"] and an empty Author field leaves Authors nil. CreationDate is
// stored verbatim, and an unparsable Pages value leaves Pages at 0.
//
// A non-nil error is returned when pdfinfo cannot be started or exits
// unsuccessfully (the error wraps the underlying exec error and carries
// whatever pdfinfo wrote to stderr), or when its output cannot be scanned.
// The returned *Metadata is nil whenever the error is non-nil.
//
// NOTE(review): filePath is handed to pdfinfo unchanged; a name beginning
// with "-" would presumably be read as an option — confirm callers only pass
// trusted or absolute paths.
func ExtractMetadata(filePath string) (*Metadata, error) {
	cmd := exec.Command("pdfinfo", filePath)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	// Capture stderr so a failure reports pdfinfo's own diagnostic instead of
	// a bare "exit status 1".
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		if msg := strings.TrimSpace(stderr.String()); msg != "" {
			return nil, fmt.Errorf("pdfinfo %q: %w: %s", filePath, err, msg)
		}
		return nil, fmt.Errorf("pdfinfo %q: %w", filePath, err)
	}
	return parsePDFInfo(filePath, stdout.Bytes())
}

// parsePDFInfo converts raw pdfinfo output into a Metadata for filePath.
func parsePDFInfo(filePath string, output []byte) (*Metadata, error) {
	// Metadata values are free-form text, so allow lines well beyond the
	// Scanner's 64 KiB default before giving up.
	const maxLineBytes = 1 << 20

	metadata := &Metadata{File: filePath}
	scanner := bufio.NewScanner(bytes.NewReader(output))
	scanner.Buffer(make([]byte, 0, 64*1024), maxLineBytes)
	for scanner.Scan() {
		// Split on the first colon only: values such as dates and titles
		// routinely contain further colons.
		parts := strings.SplitN(scanner.Text(), ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])
		switch key {
		case "Title":
			metadata.Title = value
		case "Author":
			metadata.Authors = splitAuthors(value)
		case "CreationDate":
			metadata.Created = value
		case "Pages":
			// Best effort: a malformed page count leaves Pages at 0 rather
			// than discarding the rest of the metadata.
			if pages, err := strconv.Atoi(value); err == nil {
				metadata.Pages = pages
			}
		}
	}
	// Without this check an over-long line would silently truncate parsing
	// and return partial metadata as if it were complete.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading pdfinfo output for %q: %w", filePath, err)
	}
	return metadata, nil
}

// splitAuthors splits a comma-separated author list, trimming whitespace
// around each name and dropping empty entries.
func splitAuthors(value string) []string {
	var authors []string
	for _, name := range strings.Split(value, ",") {
		if name = strings.TrimSpace(name); name != "" {
			authors = append(authors, name)
		}
	}
	return authors
}