This commit introduces a new feature to extract and index metadata from collected PDF files.

The following changes have been made:

- Added a new `pdf` command with a `metadata` subcommand to extract metadata from a single PDF file.
- Added a new `extract-metadata` command to extract metadata from all PDF files within a given archive and create an `INDEX.json` file.
- Added a `--extract-pdf-metadata` flag to the `collect website` command to extract metadata from downloaded PDF files.
- Created a new `pdf` package to encapsulate the PDF metadata extraction logic, which uses the `pdfinfo` command from the `poppler-utils` package.
- Added unit tests for the new `pdf` package, including mocking the `pdfinfo` command.
- Modified `Taskfile.yml` to install `poppler-utils` as a dependency.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
41 lines
1,021 B
Go
41 lines
1,021 B
Go
package cmd
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"github.com/Snider/Borg/pkg/pdf"
|
|
"github.com/spf13/cobra"
|
|
)
|
|
|
|
// pdfMetadataCmd represents the pdf metadata command
|
|
var pdfMetadataCmd = NewPdfMetadataCmd() // built once during package variable initialization; GetPdfMetadataCmd returns this same instance
|
|
|
|
func init() {
|
|
GetPdfCmd().AddCommand(GetPdfMetadataCmd())
|
|
}
|
|
|
|
func NewPdfMetadataCmd() *cobra.Command {
|
|
return &cobra.Command{
|
|
Use: "metadata [file]",
|
|
Short: "Extract metadata from a PDF file.",
|
|
Long: `Extract metadata from a PDF file and print it as JSON.`,
|
|
Args: cobra.ExactArgs(1),
|
|
RunE: func(cmd *cobra.Command, args []string) error {
|
|
filePath := args[0]
|
|
metadata, err := pdf.ExtractMetadata(filePath)
|
|
if err != nil {
|
|
return fmt.Errorf("error extracting metadata: %w", err)
|
|
}
|
|
jsonMetadata, err := json.MarshalIndent(metadata, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("error marshalling metadata to JSON: %w", err)
|
|
}
|
|
fmt.Fprintln(cmd.OutOrStdout(), string(jsonMetadata))
|
|
return nil
|
|
},
|
|
}
|
|
}
|
|
|
|
func GetPdfMetadataCmd() *cobra.Command {
|
|
return pdfMetadataCmd
|
|
}
|