This commit introduces a new feature to extract and index metadata from collected PDF files. The following changes have been made: - Added a new `pdf` command with a `metadata` subcommand to extract metadata from a single PDF file. - Added a new `extract-metadata` command to extract metadata from all PDF files within a given archive and create an `INDEX.json` file. - Added a `--extract-pdf-metadata` flag to the `collect website` command to extract metadata from downloaded PDF files. - Created a new `pdf` package to encapsulate the PDF metadata extraction logic, which uses the `pdfinfo` command from the `poppler-utils` package. - Added unit tests for the new `pdf` package, including mocking the `pdfinfo` command. - Modified `Taskfile.yml` to install `poppler-utils` as a dependency. Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
121 lines
3.5 KiB
Go
121 lines
3.5 KiB
Go
package cmd
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"io/fs"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/Snider/Borg/pkg/compress"
|
|
"github.com/Snider/Borg/pkg/datanode"
|
|
"github.com/Snider/Borg/pkg/pdf"
|
|
"github.com/spf13/cobra"
|
|
)
|
|
|
|
// extractMetadataCmd represents the extract-metadata command
|
|
var extractMetadataCmd = NewExtractMetadataCmd()
|
|
|
|
func init() {
|
|
RootCmd.AddCommand(GetExtractMetadataCmd())
|
|
}
|
|
|
|
func NewExtractMetadataCmd() *cobra.Command {
|
|
cmd := &cobra.Command{
|
|
Use: "extract-metadata [archive]",
|
|
Short: "Extract metadata from files in an archive.",
|
|
Long: `Extract metadata from files of a specific type within a DataNode archive and create an INDEX.json file.`,
|
|
Args: cobra.ExactArgs(1),
|
|
RunE: func(cmd *cobra.Command, args []string) error {
|
|
archivePath := args[0]
|
|
fileType, _ := cmd.Flags().GetString("type")
|
|
|
|
if fileType != "pdf" {
|
|
return fmt.Errorf("unsupported type: %s. Only 'pdf' is currently supported", fileType)
|
|
}
|
|
|
|
// Read and decompress the archive
|
|
compressedData, err := os.ReadFile(archivePath)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read archive file: %w", err)
|
|
}
|
|
data, err := compress.Decompress(compressedData)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to decompress archive: %w", err)
|
|
}
|
|
|
|
// Load the DataNode
|
|
dn, err := datanode.FromTar(data)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to load DataNode from tar: %w", err)
|
|
}
|
|
|
|
var allMetadata []*pdf.Metadata
|
|
|
|
// Walk the DataNode and extract metadata from PDF files
|
|
err = dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
|
|
// Create a temporary file to run extraction on
|
|
tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create temp file: %w", err)
|
|
}
|
|
defer os.Remove(tempFile.Name())
|
|
|
|
// Get the file content from DataNode
|
|
file, err := dn.Open(path)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
|
|
}
|
|
defer file.Close()
|
|
|
|
// Copy content to temp file
|
|
if _, err := io.Copy(tempFile, file); err != nil {
|
|
return fmt.Errorf("failed to copy content to temp file: %w", err)
|
|
}
|
|
tempFile.Close() // Close the file to allow reading by the extractor
|
|
|
|
// Extract metadata
|
|
metadata, err := pdf.ExtractMetadata(tempFile.Name())
|
|
if err != nil {
|
|
// Log error but continue processing other files
|
|
fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
|
|
return nil
|
|
}
|
|
metadata.File = filepath.Base(path) // Use the original filename
|
|
allMetadata = append(allMetadata, metadata)
|
|
}
|
|
return nil
|
|
})
|
|
|
|
if err != nil {
|
|
return fmt.Errorf("error walking DataNode: %w", err)
|
|
}
|
|
|
|
// Write the aggregated metadata to INDEX.json
|
|
jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
|
|
}
|
|
|
|
err = os.WriteFile("INDEX.json", jsonOutput, 0644)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to write INDEX.json: %w", err)
|
|
}
|
|
|
|
fmt.Fprintln(cmd.OutOrStdout(), "Metadata extracted and saved to INDEX.json")
|
|
return nil
|
|
},
|
|
}
|
|
cmd.Flags().String("type", "pdf", "The type of files to extract metadata from (currently only 'pdf' is supported)")
|
|
return cmd
|
|
}
|
|
|
|
func GetExtractMetadataCmd() *cobra.Command {
|
|
return extractMetadataCmd
|
|
}
|