Compare commits

1 commit: main...feat/pdf-m

| Author | SHA1 | Date |
|---|---|---|
|  | c7e3ba297f |  |

11 changed files with 410 additions and 336 deletions
@@ -1,10 +1,15 @@
version: '3'

tasks:
  install-deps:
    cmds:
      - sudo apt-get update && sudo apt-get install -y poppler-utils
  clean:
    cmds:
      - rm -f borg
  build:
    deps:
      - install-deps
    cmds:
      - task: clean
      - go build -o borg main.go
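The Taskfile above targets the Task runner (taskfile.dev). A minimal usage sketch, assuming Task is installed; the borg binary name comes from the build task itself:

    # install-deps runs first because build declares it as a dependency
    task build
    ./borg --help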
@@ -1,333 +0,0 @@
package cmd

import (
    "fmt"
    "io/fs"
    "os"
    "path/filepath"
    "strings"

    "github.com/Snider/Borg/pkg/compress"
    "github.com/Snider/Borg/pkg/datanode"
    "github.com/Snider/Borg/pkg/tim"
    "github.com/Snider/Borg/pkg/trix"
    "github.com/Snider/Borg/pkg/ui"

    "github.com/spf13/cobra"
)

type CollectLocalCmd struct {
    cobra.Command
}

// NewCollectLocalCmd creates a new collect local command
func NewCollectLocalCmd() *CollectLocalCmd {
    c := &CollectLocalCmd{}
    c.Command = cobra.Command{
        Use:   "local [directory]",
        Short: "Collect files from a local directory",
        Long: `Collect files from a local directory and store them in a DataNode.

If no directory is specified, the current working directory is used.

Examples:
  borg collect local
  borg collect local ./src
  borg collect local /path/to/project --output project.tar
  borg collect local . --format stim --password secret
  borg collect local . --exclude "*.log" --exclude "node_modules"`,
        Args: cobra.MaximumNArgs(1),
        RunE: func(cmd *cobra.Command, args []string) error {
            directory := "."
            if len(args) > 0 {
                directory = args[0]
            }

            outputFile, _ := cmd.Flags().GetString("output")
            format, _ := cmd.Flags().GetString("format")
            compression, _ := cmd.Flags().GetString("compression")
            password, _ := cmd.Flags().GetString("password")
            excludes, _ := cmd.Flags().GetStringSlice("exclude")
            includeHidden, _ := cmd.Flags().GetBool("hidden")
            respectGitignore, _ := cmd.Flags().GetBool("gitignore")

            finalPath, err := CollectLocal(directory, outputFile, format, compression, password, excludes, includeHidden, respectGitignore)
            if err != nil {
                return err
            }
            fmt.Fprintln(cmd.OutOrStdout(), "Files saved to", finalPath)
            return nil
        },
    }
    c.Flags().String("output", "", "Output file for the DataNode")
    c.Flags().String("format", "datanode", "Output format (datanode, tim, trix, or stim)")
    c.Flags().String("compression", "none", "Compression format (none, gz, or xz)")
    c.Flags().String("password", "", "Password for encryption (required for stim/trix format)")
    c.Flags().StringSlice("exclude", nil, "Patterns to exclude (can be specified multiple times)")
    c.Flags().Bool("hidden", false, "Include hidden files and directories")
    c.Flags().Bool("gitignore", true, "Respect .gitignore files (default: true)")
    return c
}

func init() {
    collectCmd.AddCommand(&NewCollectLocalCmd().Command)
}

// CollectLocal collects files from a local directory into a DataNode
func CollectLocal(directory string, outputFile string, format string, compression string, password string, excludes []string, includeHidden bool, respectGitignore bool) (string, error) {
    // Validate format
    if format != "datanode" && format != "tim" && format != "trix" && format != "stim" {
        return "", fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', 'trix', or 'stim')", format)
    }
    if (format == "stim" || format == "trix") && password == "" {
        return "", fmt.Errorf("password is required for %s format", format)
    }
    if compression != "none" && compression != "gz" && compression != "xz" {
        return "", fmt.Errorf("invalid compression: %s (must be 'none', 'gz', or 'xz')", compression)
    }

    // Resolve directory path
    absDir, err := filepath.Abs(directory)
    if err != nil {
        return "", fmt.Errorf("error resolving directory path: %w", err)
    }

    info, err := os.Stat(absDir)
    if err != nil {
        return "", fmt.Errorf("error accessing directory: %w", err)
    }
    if !info.IsDir() {
        return "", fmt.Errorf("not a directory: %s", absDir)
    }

    // Load gitignore patterns if enabled
    var gitignorePatterns []string
    if respectGitignore {
        gitignorePatterns = loadGitignore(absDir)
    }

    // Create DataNode and collect files
    dn := datanode.New()
    var fileCount int

    bar := ui.NewProgressBar(-1, "Scanning files")
    defer bar.Finish()

    err = filepath.WalkDir(absDir, func(path string, d fs.DirEntry, err error) error {
        if err != nil {
            return err
        }

        // Get relative path
        relPath, err := filepath.Rel(absDir, path)
        if err != nil {
            return err
        }

        // Skip root
        if relPath == "." {
            return nil
        }

        // Skip hidden files/dirs unless explicitly included
        if !includeHidden && isHidden(relPath) {
            if d.IsDir() {
                return filepath.SkipDir
            }
            return nil
        }

        // Check gitignore patterns
        if respectGitignore && matchesGitignore(relPath, d.IsDir(), gitignorePatterns) {
            if d.IsDir() {
                return filepath.SkipDir
            }
            return nil
        }

        // Check exclude patterns
        if matchesExclude(relPath, excludes) {
            if d.IsDir() {
                return filepath.SkipDir
            }
            return nil
        }

        // Skip directories (they're implicit in DataNode)
        if d.IsDir() {
            return nil
        }

        // Read file content
        content, err := os.ReadFile(path)
        if err != nil {
            return fmt.Errorf("error reading %s: %w", relPath, err)
        }

        // Add to DataNode with forward slashes (tar convention)
        dn.AddData(filepath.ToSlash(relPath), content)
        fileCount++
        bar.Describe(fmt.Sprintf("Collected %d files", fileCount))

        return nil
    })

    if err != nil {
        return "", fmt.Errorf("error walking directory: %w", err)
    }

    if fileCount == 0 {
        return "", fmt.Errorf("no files found in %s", directory)
    }

    bar.Describe(fmt.Sprintf("Packaging %d files", fileCount))

    // Convert to output format
    var data []byte
    if format == "tim" {
        t, err := tim.FromDataNode(dn)
        if err != nil {
            return "", fmt.Errorf("error creating tim: %w", err)
        }
        data, err = t.ToTar()
        if err != nil {
            return "", fmt.Errorf("error serializing tim: %w", err)
        }
    } else if format == "stim" {
        t, err := tim.FromDataNode(dn)
        if err != nil {
            return "", fmt.Errorf("error creating tim: %w", err)
        }
        data, err = t.ToSigil(password)
        if err != nil {
            return "", fmt.Errorf("error encrypting stim: %w", err)
        }
    } else if format == "trix" {
        data, err = trix.ToTrix(dn, password)
        if err != nil {
            return "", fmt.Errorf("error serializing trix: %w", err)
        }
    } else {
        data, err = dn.ToTar()
        if err != nil {
            return "", fmt.Errorf("error serializing DataNode: %w", err)
        }
    }

    // Apply compression
    compressedData, err := compress.Compress(data, compression)
    if err != nil {
        return "", fmt.Errorf("error compressing data: %w", err)
    }

    // Determine output filename
    if outputFile == "" {
        baseName := filepath.Base(absDir)
        if baseName == "." || baseName == "/" {
            baseName = "local"
        }
        outputFile = baseName + "." + format
        if compression != "none" {
            outputFile += "." + compression
        }
    }

    err = os.WriteFile(outputFile, compressedData, 0644)
    if err != nil {
        return "", fmt.Errorf("error writing output file: %w", err)
    }

    return outputFile, nil
}

// isHidden checks if a path component starts with a dot
func isHidden(path string) bool {
    parts := strings.Split(filepath.ToSlash(path), "/")
    for _, part := range parts {
        if strings.HasPrefix(part, ".") {
            return true
        }
    }
    return false
}

// loadGitignore loads patterns from .gitignore if it exists
func loadGitignore(dir string) []string {
    var patterns []string

    gitignorePath := filepath.Join(dir, ".gitignore")
    content, err := os.ReadFile(gitignorePath)
    if err != nil {
        return patterns
    }

    lines := strings.Split(string(content), "\n")
    for _, line := range lines {
        line = strings.TrimSpace(line)
        // Skip empty lines and comments
        if line == "" || strings.HasPrefix(line, "#") {
            continue
        }
        patterns = append(patterns, line)
    }

    return patterns
}

// matchesGitignore checks if a path matches any gitignore pattern
func matchesGitignore(path string, isDir bool, patterns []string) bool {
    for _, pattern := range patterns {
        // Handle directory-only patterns
        if strings.HasSuffix(pattern, "/") {
            if !isDir {
                continue
            }
            pattern = strings.TrimSuffix(pattern, "/")
        }

        // Handle negation (simplified - just skip negated patterns)
        if strings.HasPrefix(pattern, "!") {
            continue
        }

        // Match against path components
        matched, _ := filepath.Match(pattern, filepath.Base(path))
        if matched {
            return true
        }

        // Also try matching the full path
        matched, _ = filepath.Match(pattern, path)
        if matched {
            return true
        }

        // Handle ** patterns (simplified)
        if strings.Contains(pattern, "**") {
            simplePattern := strings.ReplaceAll(pattern, "**", "*")
            matched, _ = filepath.Match(simplePattern, path)
            if matched {
                return true
            }
        }
    }
    return false
}

// matchesExclude checks if a path matches any exclude pattern
func matchesExclude(path string, excludes []string) bool {
    for _, pattern := range excludes {
        // Match against basename
        matched, _ := filepath.Match(pattern, filepath.Base(path))
        if matched {
            return true
        }

        // Match against full path
        matched, _ = filepath.Match(pattern, path)
        if matched {
            return true
        }
    }
    return false
}
@@ -1,11 +1,17 @@
package cmd

import (
    "encoding/json"
    "fmt"
    "io"
    "io/fs"
    "os"
    "path/filepath"
    "strings"

    "github.com/schollz/progressbar/v3"
    "github.com/Snider/Borg/pkg/compress"
    "github.com/Snider/Borg/pkg/pdf"
    "github.com/Snider/Borg/pkg/tim"
    "github.com/Snider/Borg/pkg/trix"
    "github.com/Snider/Borg/pkg/ui"

@@ -38,6 +44,7 @@ func NewCollectWebsiteCmd() *cobra.Command {
            format, _ := cmd.Flags().GetString("format")
            compression, _ := cmd.Flags().GetString("compression")
            password, _ := cmd.Flags().GetString("password")
            extractPdfMetadata, _ := cmd.Flags().GetBool("extract-pdf-metadata")

            if format != "datanode" && format != "tim" && format != "trix" {
                return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)

@@ -56,6 +63,53 @@ func NewCollectWebsiteCmd() *cobra.Command {
                return fmt.Errorf("error downloading and packaging website: %w", err)
            }

            if extractPdfMetadata {
                var allMetadata []*pdf.Metadata
                err := dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
                    if err != nil {
                        return err
                    }
                    if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
                        tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
                        if err != nil {
                            return fmt.Errorf("failed to create temp file: %w", err)
                        }
                        defer os.Remove(tempFile.Name())

                        file, err := dn.Open(path)
                        if err != nil {
                            return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
                        }
                        defer file.Close()

                        if _, err := io.Copy(tempFile, file); err != nil {
                            return fmt.Errorf("failed to copy content to temp file: %w", err)
                        }
                        tempFile.Close()

                        metadata, err := pdf.ExtractMetadata(tempFile.Name())
                        if err != nil {
                            fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
                            return nil
                        }
                        metadata.File = filepath.Base(path)
                        allMetadata = append(allMetadata, metadata)
                    }
                    return nil
                })
                if err != nil {
                    return fmt.Errorf("error walking DataNode for PDF extraction: %w", err)
                }

                if len(allMetadata) > 0 {
                    jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
                    if err != nil {
                        return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
                    }
                    dn.AddData("INDEX.json", jsonOutput)
                }
            }

            var data []byte
            if format == "tim" {
                tim, err := tim.FromDataNode(dn)

@@ -104,5 +158,6 @@ func NewCollectWebsiteCmd() *cobra.Command {
    collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
    collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
    collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
    collectWebsiteCmd.PersistentFlags().Bool("extract-pdf-metadata", false, "Extract metadata from PDF files and add INDEX.json")
    return collectWebsiteCmd
}
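For context, the new --extract-pdf-metadata flag is a persistent flag on the website collection command. A hedged usage sketch, assuming the command is invoked as borg collect website <url> (its Use string is not part of these hunks) and using a placeholder URL:

    # when any PDFs are found, an INDEX.json with the aggregated metadata is added to the archive
    borg collect website https://example.com --extract-pdf-metadata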
cmd/extract_metadata.go (new file, 121 lines)
@@ -0,0 +1,121 @@
package cmd

import (
    "encoding/json"
    "fmt"
    "io"
    "io/fs"
    "os"
    "path/filepath"
    "strings"

    "github.com/Snider/Borg/pkg/compress"
    "github.com/Snider/Borg/pkg/datanode"
    "github.com/Snider/Borg/pkg/pdf"
    "github.com/spf13/cobra"
)

// extractMetadataCmd represents the extract-metadata command
var extractMetadataCmd = NewExtractMetadataCmd()

func init() {
    RootCmd.AddCommand(GetExtractMetadataCmd())
}

func NewExtractMetadataCmd() *cobra.Command {
    cmd := &cobra.Command{
        Use:   "extract-metadata [archive]",
        Short: "Extract metadata from files in an archive.",
        Long:  `Extract metadata from files of a specific type within a DataNode archive and create an INDEX.json file.`,
        Args:  cobra.ExactArgs(1),
        RunE: func(cmd *cobra.Command, args []string) error {
            archivePath := args[0]
            fileType, _ := cmd.Flags().GetString("type")

            if fileType != "pdf" {
                return fmt.Errorf("unsupported type: %s. Only 'pdf' is currently supported", fileType)
            }

            // Read and decompress the archive
            compressedData, err := os.ReadFile(archivePath)
            if err != nil {
                return fmt.Errorf("failed to read archive file: %w", err)
            }
            data, err := compress.Decompress(compressedData)
            if err != nil {
                return fmt.Errorf("failed to decompress archive: %w", err)
            }

            // Load the DataNode
            dn, err := datanode.FromTar(data)
            if err != nil {
                return fmt.Errorf("failed to load DataNode from tar: %w", err)
            }

            var allMetadata []*pdf.Metadata

            // Walk the DataNode and extract metadata from PDF files
            err = dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
                if err != nil {
                    return err
                }
                if !d.IsDir() && strings.HasSuffix(strings.ToLower(path), ".pdf") {
                    // Create a temporary file to run extraction on
                    tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
                    if err != nil {
                        return fmt.Errorf("failed to create temp file: %w", err)
                    }
                    defer os.Remove(tempFile.Name())

                    // Get the file content from DataNode
                    file, err := dn.Open(path)
                    if err != nil {
                        return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
                    }
                    defer file.Close()

                    // Copy content to temp file
                    if _, err := io.Copy(tempFile, file); err != nil {
                        return fmt.Errorf("failed to copy content to temp file: %w", err)
                    }
                    tempFile.Close() // Close the file to allow reading by the extractor

                    // Extract metadata
                    metadata, err := pdf.ExtractMetadata(tempFile.Name())
                    if err != nil {
                        // Log error but continue processing other files
                        fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
                        return nil
                    }
                    metadata.File = filepath.Base(path) // Use the original filename
                    allMetadata = append(allMetadata, metadata)
                }
                return nil
            })

            if err != nil {
                return fmt.Errorf("error walking DataNode: %w", err)
            }

            // Write the aggregated metadata to INDEX.json
            jsonOutput, err := json.MarshalIndent(allMetadata, "", " ")
            if err != nil {
                return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
            }

            err = os.WriteFile("INDEX.json", jsonOutput, 0644)
            if err != nil {
                return fmt.Errorf("failed to write INDEX.json: %w", err)
            }

            fmt.Fprintln(cmd.OutOrStdout(), "Metadata extracted and saved to INDEX.json")
            return nil
        },
    }
    cmd.Flags().String("type", "pdf", "The type of files to extract metadata from (currently only 'pdf' is supported)")
    return cmd
}

func GetExtractMetadataCmd() *cobra.Command {
    return extractMetadataCmd
}
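Based on the Use string and flags above, a usage sketch for the new command; the archive name is hypothetical, and INDEX.json is written to the current working directory:

    borg extract-metadata example-site.datanode --type pdf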
cmd/pdf.go (new file, 24 lines)
@@ -0,0 +1,24 @@
package cmd

import (
    "github.com/spf13/cobra"
)

// pdfCmd represents the pdf command
var pdfCmd = NewPdfCmd()

func init() {
    RootCmd.AddCommand(GetPdfCmd())
}

func NewPdfCmd() *cobra.Command {
    return &cobra.Command{
        Use:   "pdf",
        Short: "Perform PDF operations.",
        Long:  `A command for performing various PDF operations.`,
    }
}

func GetPdfCmd() *cobra.Command {
    return pdfCmd
}
cmd/pdf_metadata.go (new file, 41 lines)
@@ -0,0 +1,41 @@
package cmd

import (
    "encoding/json"
    "fmt"
    "github.com/Snider/Borg/pkg/pdf"
    "github.com/spf13/cobra"
)

// pdfMetadataCmd represents the pdf metadata command
var pdfMetadataCmd = NewPdfMetadataCmd()

func init() {
    GetPdfCmd().AddCommand(GetPdfMetadataCmd())
}

func NewPdfMetadataCmd() *cobra.Command {
    return &cobra.Command{
        Use:   "metadata [file]",
        Short: "Extract metadata from a PDF file.",
        Long:  `Extract metadata from a PDF file and print it as JSON.`,
        Args:  cobra.ExactArgs(1),
        RunE: func(cmd *cobra.Command, args []string) error {
            filePath := args[0]
            metadata, err := pdf.ExtractMetadata(filePath)
            if err != nil {
                return fmt.Errorf("error extracting metadata: %w", err)
            }
            jsonMetadata, err := json.MarshalIndent(metadata, "", " ")
            if err != nil {
                return fmt.Errorf("error marshalling metadata to JSON: %w", err)
            }
            fmt.Fprintln(cmd.OutOrStdout(), string(jsonMetadata))
            return nil
        },
    }
}

func GetPdfMetadataCmd() *cobra.Command {
    return pdfMetadataCmd
}
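Together with cmd/pdf.go above, this registers metadata under the pdf command, so a single file can be inspected directly (the file name is hypothetical):

    borg pdf metadata paper.pdf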
examples/demo-sample.smsg (new binary file, not shown)

go.mod (2 changed lines)
@@ -60,7 +60,7 @@ require (
    github.com/wailsapp/go-webview2 v1.0.22 // indirect
    github.com/wailsapp/mimetype v1.4.1 // indirect
    github.com/xanzy/ssh-agent v0.3.3 // indirect
    golang.org/x/crypto v0.45.0 // indirect
    golang.org/x/crypto v0.44.0 // indirect
    golang.org/x/sys v0.38.0 // indirect
    golang.org/x/term v0.37.0 // indirect
    golang.org/x/text v0.31.0 // indirect
go.sum (4 changed lines)
@@ -155,8 +155,8 @@ github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q=
golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4=
golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU=
golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
pkg/pdf/metadata.go (new file, 58 lines)
@@ -0,0 +1,58 @@
package pdf

import (
    "bufio"
    "bytes"
    "os/exec"
    "strconv"
    "strings"
)

// Metadata holds the extracted PDF metadata.
type Metadata struct {
    File     string   `json:"file"`
    Title    string   `json:"title"`
    Authors  []string `json:"authors"`
    Abstract string   `json:"abstract"`
    Pages    int      `json:"pages"`
    Created  string   `json:"created"`
}

// ExtractMetadata extracts metadata from a PDF file using the pdfinfo command.
func ExtractMetadata(filePath string) (*Metadata, error) {
    cmd := exec.Command("pdfinfo", filePath)
    var out bytes.Buffer
    cmd.Stdout = &out
    err := cmd.Run()
    if err != nil {
        return nil, err
    }

    metadata := &Metadata{File: filePath}
    scanner := bufio.NewScanner(&out)
    for scanner.Scan() {
        line := scanner.Text()
        parts := strings.SplitN(line, ":", 2)
        if len(parts) != 2 {
            continue
        }
        key := strings.TrimSpace(parts[0])
        value := strings.TrimSpace(parts[1])

        switch key {
        case "Title":
            metadata.Title = value
        case "Author":
            metadata.Authors = strings.Split(value, ",")
        case "CreationDate":
            metadata.Created = value
        case "Pages":
            pages, err := strconv.Atoi(value)
            if err == nil {
                metadata.Pages = pages
            }
        }
    }

    return metadata, nil
}
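For callers outside the CLI, a minimal sketch of using the new package directly from Go, assuming pdfinfo from poppler-utils (the dependency installed by the Taskfile's install-deps task) is on PATH; the input file name is hypothetical:

    package main

    import (
        "encoding/json"
        "fmt"
        "log"

        "github.com/Snider/Borg/pkg/pdf"
    )

    func main() {
        // ExtractMetadata shells out to pdfinfo and parses its "Key: value" output.
        meta, err := pdf.ExtractMetadata("paper.pdf") // hypothetical input file
        if err != nil {
            log.Fatalf("extract failed: %v", err)
        }
        out, _ := json.MarshalIndent(meta, "", " ")
        fmt.Println(string(out))
    }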
pkg/pdf/metadata_test.go (new file, 103 lines)
@@ -0,0 +1,103 @@
package pdf

import (
    "fmt"
    "os"
    "os/exec"
    "strings"
    "testing"
)

// mockExecCommand is used to mock the exec.Command function for testing.
func mockExecCommand(command string, args ...string) *exec.Cmd {
    cs := []string{"-test.run=TestHelperProcess", "--", command}
    cs = append(cs, args...)
    cmd := exec.Command(os.Args[0], cs...)
    cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
    return cmd
}

// TestHelperProcess isn't a real test. It's used as a helper process
// for TestExtractMetadata. It simulates the behavior of the `pdfinfo` command.
func TestHelperProcess(t *testing.T) {
    if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
        return
    }
    defer os.Exit(0)

    args := os.Args
    for len(args) > 0 {
        if args[0] == "--" {
            args = args[1:]
            break
        }
        args = args[1:]
    }
    if len(args) == 0 {
        fmt.Fprintf(os.Stderr, "No command to mock!\n")
        os.Exit(1)
    }

    cmd, args := args[0], args[1:]
    if cmd == "pdfinfo" && len(args) == 1 {
        // Simulate pdfinfo output
        fmt.Println("Title: Test Title")
        fmt.Println("Author: Test Author 1,Test Author 2")
        fmt.Println("CreationDate: Sun Jan 1 00:00:00 2023")
        fmt.Println("Pages: 42")
    }
}

func TestExtractMetadata(t *testing.T) {
    execCommand = mockExecCommand
    defer func() { execCommand = exec.Command }()

    metadata, err := ExtractMetadata("dummy.pdf")
    if err != nil {
        t.Fatalf("ExtractMetadata failed: %v", err)
    }

    if metadata.Title != "Test Title" {
        t.Errorf("expected title 'Test Title', got '%s'", metadata.Title)
    }
    if len(metadata.Authors) != 2 || metadata.Authors[0] != "Test Author 1" || metadata.Authors[1] != "Test Author 2" {
        t.Errorf("expected authors '[Test Author 1, Test Author 2]', got '%v'", metadata.Authors)
    }
    if metadata.Created != "Sun Jan 1 00:00:00 2023" {
        t.Errorf("expected creation date 'Sun Jan 1 00:00:00 2023', got '%s'", metadata.Created)
    }
    if metadata.Pages != 42 {
        t.Errorf("expected 42 pages, got %d", metadata.Pages)
    }
    if metadata.File != "dummy.pdf" {
        t.Errorf("expected file 'dummy.pdf', got '%s'", metadata.File)
    }
}

func TestExtractMetadata_CommandError(t *testing.T) {
    execCommand = func(command string, args ...string) *exec.Cmd {
        cs := []string{"-test.run=TestHelperProcess_Error", "--", command}
        cs = append(cs, args...)
        cmd := exec.Command(os.Args[0], cs...)
        cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
        return cmd
    }
    defer func() { execCommand = exec.Command }()

    _, err := ExtractMetadata("dummy.pdf")
    if err == nil {
        t.Fatal("expected an error from exec.Command, but got nil")
    }
    if !strings.Contains(err.Error(), "exit status 1") {
        t.Errorf("expected error to contain 'exit status 1', got '%v'", err)
    }
}

func TestHelperProcess_Error(t *testing.T) {
    if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
        return
    }
    // Simulate an error by writing to stderr and exiting with a non-zero status
    fmt.Fprintf(os.Stderr, "pdfinfo error")
    os.Exit(1)
}
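The tests above fake the pdfinfo invocation through a package-level execCommand hook and Go's helper-process pattern rather than calling the real binary; a typical way to run them would be:

    go test ./pkg/pdf/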