This commit introduces a new feature to extract and index metadata from collected PDF files.

The following changes have been made:

- Added a new `pdf` command with a `metadata` subcommand to extract metadata from a single PDF file.
- Added a new `extract-metadata` command to extract metadata from all PDF files within a given archive and create an `INDEX.json` file.
- Added a `--extract-pdf-metadata` flag to the `collect website` command to extract metadata from downloaded PDF files.
- Created a new `pdf` package to encapsulate the PDF metadata extraction logic, which uses the `pdfinfo` command from the `poppler-utils` package.
- Added unit tests for the new `pdf` package, including mocking the `pdfinfo` command.
- Modified `Taskfile.yml` to install `poppler-utils` as a dependency.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
58 lines
1.2 KiB
Go
58 lines
1.2 KiB
Go
package pdf
|
|
|
|
import (
	"bufio"
	"bytes"
	"fmt"
	"os/exec"
	"strconv"
	"strings"
)
|
|
|
|
// Metadata is the record built for one PDF file. Every field is serialized to
// JSON under the lower-case key named in its struct tag.
type Metadata struct {
	// File is the path that was handed to ExtractMetadata.
	File string `json:"file"`
	// Title is the value of pdfinfo's "Title" entry.
	Title string `json:"title"`
	// Authors is derived from pdfinfo's comma-separated "Author" entry.
	Authors []string `json:"authors"`
	// Abstract is not populated by ExtractMetadata in this file.
	Abstract string `json:"abstract"`
	// Pages is the integer value of pdfinfo's "Pages" entry.
	Pages int `json:"pages"`
	// Created is pdfinfo's "CreationDate" entry, kept as the raw string.
	Created string `json:"created"`
}
|
|
|
|
// ExtractMetadata extracts metadata from a PDF file using the pdfinfo command.
|
|
func ExtractMetadata(filePath string) (*Metadata, error) {
|
|
cmd := exec.Command("pdfinfo", filePath)
|
|
var out bytes.Buffer
|
|
cmd.Stdout = &out
|
|
err := cmd.Run()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
metadata := &Metadata{File: filePath}
|
|
scanner := bufio.NewScanner(&out)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
parts := strings.SplitN(line, ":", 2)
|
|
if len(parts) != 2 {
|
|
continue
|
|
}
|
|
key := strings.TrimSpace(parts[0])
|
|
value := strings.TrimSpace(parts[1])
|
|
|
|
switch key {
|
|
case "Title":
|
|
metadata.Title = value
|
|
case "Author":
|
|
metadata.Authors = strings.Split(value, ",")
|
|
case "CreationDate":
|
|
metadata.Created = value
|
|
case "Pages":
|
|
pages, err := strconv.Atoi(value)
|
|
if err == nil {
|
|
metadata.Pages = pages
|
|
}
|
|
}
|
|
}
|
|
|
|
return metadata, nil
|
|
}
|