Borg/cmd/collect_website.go
google-labs-jules[bot] c7e3ba297f feat: PDF metadata extraction
This commit introduces a new feature to extract and index metadata from collected PDF files.

The following changes have been made:
- Added a new `pdf` command with a `metadata` subcommand to extract metadata from a single PDF file.
- Added a new `extract-metadata` command to extract metadata from all PDF files within a given archive and create an `INDEX.json` file.
- Added a `--extract-pdf-metadata` flag to the `collect website` command to extract metadata from downloaded PDF files.
- Created a new `pdf` package to encapsulate the PDF metadata extraction logic, which uses the `pdfinfo` command from the `poppler-utils` package.
- Added unit tests for the new `pdf` package, including mocking the `pdfinfo` command.
- Modified `Taskfile.yml` to install `poppler-utils` as a dependency.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:46:59 +00:00

163 lines
4.9 KiB
Go

package cmd
import (
"encoding/json"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"strings"
"github.com/schollz/progressbar/v3"
"github.com/Snider/Borg/pkg/compress"
"github.com/Snider/Borg/pkg/pdf"
"github.com/Snider/Borg/pkg/tim"
"github.com/Snider/Borg/pkg/trix"
"github.com/Snider/Borg/pkg/ui"
"github.com/Snider/Borg/pkg/website"
"github.com/spf13/cobra"
)
// collectWebsiteCmd represents the collect website command
var collectWebsiteCmd = NewCollectWebsiteCmd()
func init() {
GetCollectCmd().AddCommand(GetCollectWebsiteCmd())
}
func GetCollectWebsiteCmd() *cobra.Command {
return collectWebsiteCmd
}
func NewCollectWebsiteCmd() *cobra.Command {
collectWebsiteCmd := &cobra.Command{
Use: "website [url]",
Short: "Collect a single website",
Long: `Collect a single website and store it in a DataNode.`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
websiteURL := args[0]
outputFile, _ := cmd.Flags().GetString("output")
depth, _ := cmd.Flags().GetInt("depth")
format, _ := cmd.Flags().GetString("format")
compression, _ := cmd.Flags().GetString("compression")
password, _ := cmd.Flags().GetString("password")
extractPdfMetadata, _ := cmd.Flags().GetBool("extract-pdf-metadata")
if format != "datanode" && format != "tim" && format != "trix" {
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
}
prompter := ui.NewNonInteractivePrompter(ui.GetWebsiteQuote)
prompter.Start()
defer prompter.Stop()
var bar *progressbar.ProgressBar
if prompter.IsInteractive() {
bar = ui.NewProgressBar(-1, "Crawling website")
}
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
if err != nil {
return fmt.Errorf("error downloading and packaging website: %w", err)
}
if extractPdfMetadata {
var allMetadata []*pdf.Metadata
err := dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
if err != nil {
return fmt.Errorf("failed to create temp file: %w", err)
}
defer os.Remove(tempFile.Name())
file, err := dn.Open(path)
if err != nil {
return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
}
defer file.Close()
if _, err := io.Copy(tempFile, file); err != nil {
return fmt.Errorf("failed to copy content to temp file: %w", err)
}
tempFile.Close()
metadata, err := pdf.ExtractMetadata(tempFile.Name())
if err != nil {
fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
return nil
}
metadata.File = filepath.Base(path)
allMetadata = append(allMetadata, metadata)
}
return nil
})
if err != nil {
return fmt.Errorf("error walking DataNode for PDF extraction: %w", err)
}
if len(allMetadata) > 0 {
jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
}
dn.AddData("INDEX.json", jsonOutput)
}
}
var data []byte
if format == "tim" {
tim, err := tim.FromDataNode(dn)
if err != nil {
return fmt.Errorf("error creating tim: %w", err)
}
data, err = tim.ToTar()
if err != nil {
return fmt.Errorf("error serializing tim: %w", err)
}
} else if format == "trix" {
data, err = trix.ToTrix(dn, password)
if err != nil {
return fmt.Errorf("error serializing trix: %w", err)
}
} else {
data, err = dn.ToTar()
if err != nil {
return fmt.Errorf("error serializing DataNode: %w", err)
}
}
compressedData, err := compress.Compress(data, compression)
if err != nil {
return fmt.Errorf("error compressing data: %w", err)
}
if outputFile == "" {
outputFile = "website." + format
if compression != "none" {
outputFile += "." + compression
}
}
err = os.WriteFile(outputFile, compressedData, 0644)
if err != nil {
return fmt.Errorf("error writing website to file: %w", err)
}
fmt.Fprintln(cmd.OutOrStdout(), "Website saved to", outputFile)
return nil
},
}
collectWebsiteCmd.PersistentFlags().String("output", "", "Output file for the DataNode")
collectWebsiteCmd.PersistentFlags().Int("depth", 2, "Recursion depth for downloading")
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
collectWebsiteCmd.PersistentFlags().Bool("extract-pdf-metadata", false, "Extract metadata from PDF files and add INDEX.json")
return collectWebsiteCmd
}