// Borg/cmd/extract_metadata.go

package cmd

import (
	"encoding/json"
	"fmt"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"strings"

	"github.com/Snider/Borg/pkg/compress"
	"github.com/Snider/Borg/pkg/datanode"
	"github.com/Snider/Borg/pkg/pdf"
	"github.com/spf13/cobra"
)

// extractMetadataCmd is the package-level extract-metadata command. It is
// built once by NewExtractMetadataCmd when the package's variables are
// initialized.
var extractMetadataCmd = NewExtractMetadataCmd()

// init attaches the extract-metadata command to RootCmd.
func init() {
	// Package-level variables are initialized before init runs, so the
	// command is already built and can be referenced directly here.
	RootCmd.AddCommand(extractMetadataCmd)
}

// NewExtractMetadataCmd builds the "extract-metadata" cobra command.
//
// The command takes exactly one argument: the path to a compressed DataNode
// archive. It decompresses the archive, loads it as a DataNode, and runs
// pdf.ExtractMetadata on every non-directory entry whose path ends in ".pdf"
// (case-insensitive). The collected records are written to INDEX.json in the
// current working directory.
//
// A file whose metadata cannot be extracted is reported on the command's
// error stream and skipped; any other failure aborts the command. The --type
// flag selects the file type; only "pdf" is accepted.
func NewExtractMetadataCmd() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "extract-metadata [archive]",
		Short: "Extract metadata from files in an archive.",
		Long:  `Extract metadata from files of a specific type within a DataNode archive and create an INDEX.json file.`,
		Args:  cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			archivePath := args[0]
			fileType, err := cmd.Flags().GetString("type")
			if err != nil {
				return fmt.Errorf("failed to read type flag: %w", err)
			}

			if fileType != "pdf" {
				return fmt.Errorf("unsupported type: %s. Only 'pdf' is currently supported", fileType)
			}

			// Read and decompress the archive.
			compressedData, err := os.ReadFile(archivePath)
			if err != nil {
				return fmt.Errorf("failed to read archive file: %w", err)
			}
			data, err := compress.Decompress(compressedData)
			if err != nil {
				return fmt.Errorf("failed to decompress archive: %w", err)
			}

			// Load the DataNode.
			dn, err := datanode.FromTar(data)
			if err != nil {
				return fmt.Errorf("failed to load DataNode from tar: %w", err)
			}

			// Start from an empty, non-nil slice so that an archive without
			// any usable PDF produces "[]" in INDEX.json rather than "null".
			allMetadata := []*pdf.Metadata{}

			// Walk the DataNode and extract metadata from PDF files.
			err = dn.Walk("/", func(path string, d fs.DirEntry, err error) error {
				if err != nil {
					return err
				}
				if d.IsDir() || !strings.HasSuffix(strings.ToLower(path), ".pdf") {
					return nil
				}

				// Open the entry first so that a failure here never leaves a
				// temp file behind.
				file, err := dn.Open(path)
				if err != nil {
					return fmt.Errorf("failed to open %s from DataNode: %w", path, err)
				}
				defer file.Close()

				// The extractor works on a file path, so spool the entry to disk.
				tempPath, err := copyToTempPDF(file)
				if err != nil {
					return err
				}
				defer os.Remove(tempPath)

				metadata, err := pdf.ExtractMetadata(tempPath)
				if err != nil {
					// Report the error but continue processing other files.
					fmt.Fprintf(cmd.ErrOrStderr(), "could not extract metadata from %s: %v\n", path, err)
					return nil
				}
				metadata.File = filepath.Base(path) // Use the original filename, not the temp name.
				allMetadata = append(allMetadata, metadata)
				return nil
			})
			if err != nil {
				return fmt.Errorf("error walking DataNode: %w", err)
			}

			// Write the aggregated metadata to INDEX.json.
			jsonOutput, err := json.MarshalIndent(allMetadata, "", "  ")
			if err != nil {
				return fmt.Errorf("failed to marshal metadata to JSON: %w", err)
			}

			if err := os.WriteFile("INDEX.json", jsonOutput, 0644); err != nil {
				return fmt.Errorf("failed to write INDEX.json: %w", err)
			}

			fmt.Fprintln(cmd.OutOrStdout(), "Metadata extracted and saved to INDEX.json")
			return nil
		},
	}
	cmd.Flags().String("type", "pdf", "The type of files to extract metadata from (currently only 'pdf' is supported)")
	return cmd
}

// copyToTempPDF spools src into a new temporary "borg-pdf-*.pdf" file and
// returns its path. The file is closed before returning so that it can be
// reopened by path. On failure the temporary file is closed and removed, and
// the returned path is empty. On success the caller owns the file and must
// remove it.
func copyToTempPDF(src io.Reader) (string, error) {
	tempFile, err := os.CreateTemp("", "borg-pdf-*.pdf")
	if err != nil {
		return "", fmt.Errorf("failed to create temp file: %w", err)
	}
	name := tempFile.Name()

	if _, err := io.Copy(tempFile, src); err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		tempFile.Close()
		os.Remove(name)
		return "", fmt.Errorf("failed to copy content to temp file: %w", err)
	}
	// A failed Close can mean the data never reached disk, so treat it as fatal.
	if err := tempFile.Close(); err != nil {
		os.Remove(name)
		return "", fmt.Errorf("failed to close temp file: %w", err)
	}
	return name, nil
}

// GetExtractMetadataCmd returns the package-level extract-metadata command
// held in extractMetadataCmd.
func GetExtractMetadataCmd() *cobra.Command {
	shared := extractMetadataCmd
	return shared
}