Merge c7e3ba297f into a77024aad4
This commit is contained in:
commit
66cbf3a95c
7 changed files with 407 additions and 0 deletions
|
|
@ -1,10 +1,15 @@
|
|||
version: '3'
|
||||
|
||||
tasks:
|
||||
install-deps:
|
||||
cmds:
|
||||
- sudo apt-get update && sudo apt-get install -y poppler-utils
|
||||
clean:
|
||||
cmds:
|
||||
- rm -f borg
|
||||
build:
|
||||
deps:
|
||||
- install-deps
|
||||
cmds:
|
||||
- task: clean
|
||||
- go build -o borg main.go
|
||||
|
|
|
|||
|
|
@ -1,11 +1,17 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/schollz/progressbar/v3"
|
||||
"github.com/Snider/Borg/pkg/compress"
|
||||
"github.com/Snider/Borg/pkg/pdf"
|
||||
"github.com/Snider/Borg/pkg/tim"
|
||||
"github.com/Snider/Borg/pkg/trix"
|
||||
"github.com/Snider/Borg/pkg/ui"
|
||||
|
|
@ -38,6 +44,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
|
|||
format, _ := cmd.Flags().GetString("format")
|
||||
compression, _ := cmd.Flags().GetString("compression")
|
||||
password, _ := cmd.Flags().GetString("password")
|
||||
extractPdfMetadata, _ := cmd.Flags().GetBool("extract-pdf-metadata")
|
||||
|
||||
if format != "datanode" && format != "tim" && format != "trix" {
|
||||
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
|
||||
|
|
@ -56,6 +63,53 @@ func NewCollectWebsiteCmd() *cobra.Command {
|
|||
return fmt.Errorf("error downloading and packaging website: %w", err)
|
||||
}
|
||||
|
||||
if extractPdfMetadata {
|
||||
var allMetadata []*pdf.Metadata
|
||||
err := dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
|
||||
tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create temp file: %w", err)
|
||||
}
|
||||
defer os.Remove(tempFile.Name())
|
||||
|
||||
file, err := dn.Open(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
if _, err := io.Copy(tempFile, file); err != nil {
|
||||
return fmt.Errorf("failed to copy content to temp file: %w", err)
|
||||
}
|
||||
tempFile.Close()
|
||||
|
||||
metadata, err := pdf.ExtractMetadata(tempFile.Name())
|
||||
if err != nil {
|
||||
fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
|
||||
return nil
|
||||
}
|
||||
metadata.File = filepath.Base(path)
|
||||
allMetadata = append(allMetadata, metadata)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("error walking DataNode for PDF extraction: %w", err)
|
||||
}
|
||||
|
||||
if len(allMetadata) > 0 {
|
||||
jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
|
||||
}
|
||||
dn.AddData("INDEX.json", jsonOutput)
|
||||
}
|
||||
}
|
||||
|
||||
var data []byte
|
||||
if format == "tim" {
|
||||
tim, err := tim.FromDataNode(dn)
|
||||
|
|
@ -104,5 +158,6 @@ func NewCollectWebsiteCmd() *cobra.Command {
|
|||
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
|
||||
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
|
||||
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
|
||||
collectWebsiteCmd.PersistentFlags().Bool("extract-pdf-metadata", false, "Extract metadata from PDF files and add INDEX.json")
|
||||
return collectWebsiteCmd
|
||||
}
|
||||
|
|
|
|||
121
cmd/extract_metadata.go
Normal file
121
cmd/extract_metadata.go
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/Snider/Borg/pkg/compress"
|
||||
"github.com/Snider/Borg/pkg/datanode"
|
||||
"github.com/Snider/Borg/pkg/pdf"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// extractMetadataCmd represents the extract-metadata command
|
||||
var extractMetadataCmd = NewExtractMetadataCmd()
|
||||
|
||||
func init() {
|
||||
RootCmd.AddCommand(GetExtractMetadataCmd())
|
||||
}
|
||||
|
||||
func NewExtractMetadataCmd() *cobra.Command {
|
||||
cmd := &cobra.Command{
|
||||
Use: "extract-metadata [archive]",
|
||||
Short: "Extract metadata from files in an archive.",
|
||||
Long: `Extract metadata from files of a specific type within a DataNode archive and create an INDEX.json file.`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
archivePath := args[0]
|
||||
fileType, _ := cmd.Flags().GetString("type")
|
||||
|
||||
if fileType != "pdf" {
|
||||
return fmt.Errorf("unsupported type: %s. Only 'pdf' is currently supported", fileType)
|
||||
}
|
||||
|
||||
// Read and decompress the archive
|
||||
compressedData, err := os.ReadFile(archivePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read archive file: %w", err)
|
||||
}
|
||||
data, err := compress.Decompress(compressedData)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to decompress archive: %w", err)
|
||||
}
|
||||
|
||||
// Load the DataNode
|
||||
dn, err := datanode.FromTar(data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load DataNode from tar: %w", err)
|
||||
}
|
||||
|
||||
var allMetadata []*pdf.Metadata
|
||||
|
||||
// Walk the DataNode and extract metadata from PDF files
|
||||
err = dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
|
||||
// Create a temporary file to run extraction on
|
||||
tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create temp file: %w", err)
|
||||
}
|
||||
defer os.Remove(tempFile.Name())
|
||||
|
||||
// Get the file content from DataNode
|
||||
file, err := dn.Open(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
// Copy content to temp file
|
||||
if _, err := io.Copy(tempFile, file); err != nil {
|
||||
return fmt.Errorf("failed to copy content to temp file: %w", err)
|
||||
}
|
||||
tempFile.Close() // Close the file to allow reading by the extractor
|
||||
|
||||
// Extract metadata
|
||||
metadata, err := pdf.ExtractMetadata(tempFile.Name())
|
||||
if err != nil {
|
||||
// Log error but continue processing other files
|
||||
fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
|
||||
return nil
|
||||
}
|
||||
metadata.File = filepath.Base(path) // Use the original filename
|
||||
allMetadata = append(allMetadata, metadata)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("error walking DataNode: %w", err)
|
||||
}
|
||||
|
||||
// Write the aggregated metadata to INDEX.json
|
||||
jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
|
||||
}
|
||||
|
||||
err = os.WriteFile("INDEX.json", jsonOutput, 0644)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to write INDEX.json: %w", err)
|
||||
}
|
||||
|
||||
fmt.Fprintln(cmd.OutOrStdout(), "Metadata extracted and saved to INDEX.json")
|
||||
return nil
|
||||
},
|
||||
}
|
||||
cmd.Flags().String("type", "pdf", "The type of files to extract metadata from (currently only 'pdf' is supported)")
|
||||
return cmd
|
||||
}
|
||||
|
||||
func GetExtractMetadataCmd() *cobra.Command {
|
||||
return extractMetadataCmd
|
||||
}
|
||||
24
cmd/pdf.go
Normal file
24
cmd/pdf.go
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// pdfCmd represents the pdf command
|
||||
var pdfCmd = NewPdfCmd()
|
||||
|
||||
func init() {
|
||||
RootCmd.AddCommand(GetPdfCmd())
|
||||
}
|
||||
|
||||
func NewPdfCmd() *cobra.Command {
|
||||
return &cobra.Command{
|
||||
Use: "pdf",
|
||||
Short: "Perform PDF operations.",
|
||||
Long: `A command for performing various PDF operations.`,
|
||||
}
|
||||
}
|
||||
|
||||
func GetPdfCmd() *cobra.Command {
|
||||
return pdfCmd
|
||||
}
|
||||
41
cmd/pdf_metadata.go
Normal file
41
cmd/pdf_metadata.go
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"github.com/Snider/Borg/pkg/pdf"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// pdfMetadataCmd represents the pdf metadata command
|
||||
var pdfMetadataCmd = NewPdfMetadataCmd()
|
||||
|
||||
func init() {
|
||||
GetPdfCmd().AddCommand(GetPdfMetadataCmd())
|
||||
}
|
||||
|
||||
func NewPdfMetadataCmd() *cobra.Command {
|
||||
return &cobra.Command{
|
||||
Use: "metadata [file]",
|
||||
Short: "Extract metadata from a PDF file.",
|
||||
Long: `Extract metadata from a PDF file and print it as JSON.`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
filePath := args[0]
|
||||
metadata, err := pdf.ExtractMetadata(filePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error extracting metadata: %w", err)
|
||||
}
|
||||
jsonMetadata, err := json.MarshalIndent(metadata, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("error marshalling metadata to JSON: %w", err)
|
||||
}
|
||||
fmt.Fprintln(cmd.OutOrStdout(), string(jsonMetadata))
|
||||
return nil
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func GetPdfMetadataCmd() *cobra.Command {
|
||||
return pdfMetadataCmd
|
||||
}
|
||||
58
pkg/pdf/metadata.go
Normal file
58
pkg/pdf/metadata.go
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
package pdf
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Metadata holds the extracted PDF metadata.
type Metadata struct {
	// File is the path that was passed to ExtractMetadata.
	File string `json:"file"`
	// Title is the value of the "Title" line of the pdfinfo output.
	Title string `json:"title"`
	// Authors is the "Author" line split on commas, with surrounding
	// whitespace removed and empty names dropped.
	Authors []string `json:"authors"`
	// Abstract is not filled in by ExtractMetadata.
	Abstract string `json:"abstract"`
	// Pages is the value of the "Pages" line; it stays 0 when that value is
	// missing or not an integer.
	Pages int `json:"pages"`
	// Created is the raw value of the "CreationDate" line.
	Created string `json:"created"`
}

// execCommand creates the external command run by ExtractMetadata. It is a
// package-level variable so that tests can substitute a fake pdfinfo.
var execCommand = exec.Command

// ExtractMetadata extracts metadata from a PDF file using the pdfinfo command.
//
// It runs "pdfinfo filePath" and parses the "Key: value" lines written to
// standard output. It returns a nil Metadata and a non-nil error when the
// command cannot be run, exits unsuccessfully, or its output cannot be read.
func ExtractMetadata(filePath string) (*Metadata, error) {
	cmd := execCommand("pdfinfo", filePath)
	var out bytes.Buffer
	cmd.Stdout = &out
	if err := cmd.Run(); err != nil {
		return nil, err
	}
	return parsePdfInfo(out.String(), filePath)
}

// parsePdfInfo turns pdfinfo's "Key: value" output into a Metadata whose File
// field is set to filePath. Lines without a colon and unrecognized keys are
// ignored.
func parsePdfInfo(output, filePath string) (*Metadata, error) {
	metadata := &Metadata{File: filePath}
	scanner := bufio.NewScanner(strings.NewReader(output))
	for scanner.Scan() {
		// Split on the first colon only: values such as dates and titles may
		// themselves contain colons.
		parts := strings.SplitN(scanner.Text(), ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch key {
		case "Title":
			metadata.Title = value
		case "Author":
			// Trim each name so "A, B" yields "B" rather than " B", and skip
			// the empty entries left by stray or trailing commas.
			metadata.Authors = nil
			for _, name := range strings.Split(value, ",") {
				if name = strings.TrimSpace(name); name != "" {
					metadata.Authors = append(metadata.Authors, name)
				}
			}
		case "CreationDate":
			metadata.Created = value
		case "Pages":
			if pages, err := strconv.Atoi(value); err == nil {
				metadata.Pages = pages
			}
		}
	}
	// Scan stops silently on a read error or an over-long line; report that
	// instead of returning partially parsed metadata.
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return metadata, nil
}
|
||||
103
pkg/pdf/metadata_test.go
Normal file
103
pkg/pdf/metadata_test.go
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
package pdf
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mockExecCommand is used to mock the exec.Command function for testing.
//
// Instead of the requested program it re-runs the current test binary,
// restricted to TestHelperProcess, and passes the original command line after
// a "--" separator. GO_WANT_HELPER_PROCESS=1 is the only variable in the
// child's environment; it tells TestHelperProcess to act as the fake command.
func mockExecCommand(command string, args ...string) *exec.Cmd {
	cs := make([]string, 0, len(args)+3)
	// -test.run takes an unanchored regular expression. Anchor it so that only
	// TestHelperProcess is selected, not TestHelperProcess_Error as well.
	cs = append(cs, "-test.run=^TestHelperProcess$", "--", command)
	cs = append(cs, args...)
	cmd := exec.Command(os.Args[0], cs...)
	cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
	return cmd
}
|
||||
|
||||
// TestHelperProcess is not a real test: it is the body of the fake `pdfinfo`
// process started for TestExtractMetadata. It returns immediately unless
// GO_WANT_HELPER_PROCESS is "1". Otherwise it looks for the mocked command
// line after the first "--" argument and prints canned pdfinfo output when
// that command is "pdfinfo" with exactly one argument.
func TestHelperProcess(t *testing.T) {
	if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
		return
	}
	// From here on the process exits with status 0 however the function returns.
	defer os.Exit(0)

	// Everything after the first "--" is the command line being mocked.
	var mocked []string
	for i, arg := range os.Args {
		if arg == "--" {
			mocked = os.Args[i+1:]
			break
		}
	}
	if len(mocked) == 0 {
		fmt.Fprintf(os.Stderr, "No command to mock!\n")
		os.Exit(1)
	}

	name, rest := mocked[0], mocked[1:]
	if name != "pdfinfo" || len(rest) != 1 {
		return
	}

	// Simulate pdfinfo output.
	fmt.Println("Title: Test Title")
	fmt.Println("Author: Test Author 1,Test Author 2")
	fmt.Println("CreationDate: Sun Jan 1 00:00:00 2023")
	fmt.Println("Pages: 42")
}
|
||||
|
||||
func TestExtractMetadata(t *testing.T) {
|
||||
execCommand = mockExecCommand
|
||||
defer func() { execCommand = exec.Command }()
|
||||
|
||||
metadata, err := ExtractMetadata("dummy.pdf")
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractMetadata failed: %v", err)
|
||||
}
|
||||
|
||||
if metadata.Title != "Test Title" {
|
||||
t.Errorf("expected title 'Test Title', got '%s'", metadata.Title)
|
||||
}
|
||||
if len(metadata.Authors) != 2 || metadata.Authors[0] != "Test Author 1" || metadata.Authors[1] != "Test Author 2" {
|
||||
t.Errorf("expected authors '[Test Author 1, Test Author 2]', got '%v'", metadata.Authors)
|
||||
}
|
||||
if metadata.Created != "Sun Jan 1 00:00:00 2023" {
|
||||
t.Errorf("expected creation date 'Sun Jan 1 00:00:00 2023', got '%s'", metadata.Created)
|
||||
}
|
||||
if metadata.Pages != 42 {
|
||||
t.Errorf("expected 42 pages, got %d", metadata.Pages)
|
||||
}
|
||||
if metadata.File != "dummy.pdf" {
|
||||
t.Errorf("expected file 'dummy.pdf', got '%s'", metadata.File)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExtractMetadata_CommandError(t *testing.T) {
|
||||
execCommand = func(command string, args ...string) *exec.Cmd {
|
||||
cs := []string{"-test.run=TestHelperProcess_Error", "--", command}
|
||||
cs = append(cs, args...)
|
||||
cmd := exec.Command(os.Args[0], cs...)
|
||||
cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
|
||||
return cmd
|
||||
}
|
||||
defer func() { execCommand = exec.Command }()
|
||||
|
||||
_, err := ExtractMetadata("dummy.pdf")
|
||||
if err == nil {
|
||||
t.Fatal("expected an error from exec.Command, but got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "exit status 1") {
|
||||
t.Errorf("expected error to contain 'exit status 1', got '%v'", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHelperProcess_Error is not a real test: it is the body of the failing
// fake command used by TestExtractMetadata_CommandError. It does nothing
// unless GO_WANT_HELPER_PROCESS is "1".
func TestHelperProcess_Error(t *testing.T) {
	if os.Getenv("GO_WANT_HELPER_PROCESS") == "1" {
		// Simulate a failure: write to stderr and exit with a non-zero status.
		fmt.Fprintf(os.Stderr, "pdfinfo error")
		os.Exit(1)
	}
}
|
||||
Loading…
Add table
Reference in a new issue