2025-10-31 21:35:53 +00:00
|
|
|
package cmd
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"fmt"
|
|
|
|
|
"os"
|
|
|
|
|
|
2025-11-03 18:25:04 +00:00
|
|
|
"github.com/schollz/progressbar/v3"
|
2025-11-02 13:27:04 +00:00
|
|
|
"github.com/Snider/Borg/pkg/compress"
|
2025-11-14 13:47:27 +00:00
|
|
|
"github.com/Snider/Borg/pkg/tim"
|
|
|
|
|
"github.com/Snider/Borg/pkg/trix"
|
2025-11-02 01:26:52 +00:00
|
|
|
"github.com/Snider/Borg/pkg/ui"
|
2025-11-01 19:03:04 +00:00
|
|
|
"github.com/Snider/Borg/pkg/website"
|
2025-10-31 21:35:53 +00:00
|
|
|
|
|
|
|
|
"github.com/spf13/cobra"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// collectWebsiteCmd represents the collect website command
|
2025-11-14 10:36:35 +00:00
|
|
|
var collectWebsiteCmd = NewCollectWebsiteCmd()
|
2025-10-31 21:35:53 +00:00
|
|
|
|
2025-11-14 10:36:35 +00:00
|
|
|
func init() {
|
|
|
|
|
GetCollectCmd().AddCommand(GetCollectWebsiteCmd())
|
|
|
|
|
}
|
2025-11-02 01:26:52 +00:00
|
|
|
|
2025-11-14 10:36:35 +00:00
|
|
|
func GetCollectWebsiteCmd() *cobra.Command {
|
|
|
|
|
return collectWebsiteCmd
|
|
|
|
|
}
|
2025-10-31 21:35:53 +00:00
|
|
|
|
2025-11-14 10:36:35 +00:00
|
|
|
func NewCollectWebsiteCmd() *cobra.Command {
|
|
|
|
|
collectWebsiteCmd := &cobra.Command{
|
|
|
|
|
Use: "website [url]",
|
|
|
|
|
Short: "Collect a single website",
|
|
|
|
|
Long: `Collect a single website and store it in a DataNode.`,
|
|
|
|
|
Args: cobra.ExactArgs(1),
|
|
|
|
|
RunE: func(cmd *cobra.Command, args []string) error {
|
|
|
|
|
websiteURL := args[0]
|
|
|
|
|
outputFile, _ := cmd.Flags().GetString("output")
|
|
|
|
|
depth, _ := cmd.Flags().GetInt("depth")
|
|
|
|
|
format, _ := cmd.Flags().GetString("format")
|
|
|
|
|
compression, _ := cmd.Flags().GetString("compression")
|
2025-11-14 13:47:27 +00:00
|
|
|
password, _ := cmd.Flags().GetString("password")
|
|
|
|
|
|
|
|
|
|
if format != "datanode" && format != "tim" && format != "trix" {
|
|
|
|
|
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
|
|
|
|
|
}
|
2025-11-14 10:36:35 +00:00
|
|
|
|
|
|
|
|
prompter := ui.NewNonInteractivePrompter(ui.GetWebsiteQuote)
|
|
|
|
|
prompter.Start()
|
|
|
|
|
defer prompter.Stop()
|
|
|
|
|
var bar *progressbar.ProgressBar
|
|
|
|
|
if prompter.IsInteractive() {
|
|
|
|
|
bar = ui.NewProgressBar(-1, "Crawling website")
|
2025-11-02 12:39:46 +00:00
|
|
|
}
|
2025-11-14 10:36:35 +00:00
|
|
|
|
|
|
|
|
dn, err := website.DownloadAndPackageWebsite(websiteURL, depth, bar)
|
2025-11-02 12:39:46 +00:00
|
|
|
if err != nil {
|
2025-11-14 10:36:35 +00:00
|
|
|
return fmt.Errorf("error downloading and packaging website: %w", err)
|
2025-11-02 12:39:46 +00:00
|
|
|
}
|
2025-10-31 21:35:53 +00:00
|
|
|
|
2025-11-14 10:36:35 +00:00
|
|
|
var data []byte
|
2025-11-14 13:47:27 +00:00
|
|
|
if format == "tim" {
|
|
|
|
|
tim, err := tim.FromDataNode(dn)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("error creating tim: %w", err)
|
|
|
|
|
}
|
|
|
|
|
data, err = tim.ToTar()
|
2025-11-14 10:36:35 +00:00
|
|
|
if err != nil {
|
2025-11-14 13:47:27 +00:00
|
|
|
return fmt.Errorf("error serializing tim: %w", err)
|
2025-11-14 10:36:35 +00:00
|
|
|
}
|
2025-11-14 13:47:27 +00:00
|
|
|
} else if format == "trix" {
|
|
|
|
|
data, err = trix.ToTrix(dn, password)
|
2025-11-14 10:36:35 +00:00
|
|
|
if err != nil {
|
2025-11-14 13:47:27 +00:00
|
|
|
return fmt.Errorf("error serializing trix: %w", err)
|
2025-11-14 10:36:35 +00:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
data, err = dn.ToTar()
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("error serializing DataNode: %w", err)
|
|
|
|
|
}
|
|
|
|
|
}
|
2025-11-02 13:27:04 +00:00
|
|
|
|
2025-11-14 10:36:35 +00:00
|
|
|
compressedData, err := compress.Compress(data, compression)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("error compressing data: %w", err)
|
2025-11-02 13:27:04 +00:00
|
|
|
}
|
|
|
|
|
|
2025-11-14 10:36:35 +00:00
|
|
|
if outputFile == "" {
|
|
|
|
|
outputFile = "website." + format
|
|
|
|
|
if compression != "none" {
|
|
|
|
|
outputFile += "." + compression
|
|
|
|
|
}
|
|
|
|
|
}
|
2025-10-31 21:35:53 +00:00
|
|
|
|
2025-11-14 10:36:35 +00:00
|
|
|
err = os.WriteFile(outputFile, compressedData, 0644)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return fmt.Errorf("error writing website to file: %w", err)
|
|
|
|
|
}
|
2025-10-31 21:35:53 +00:00
|
|
|
|
2025-11-14 10:36:35 +00:00
|
|
|
fmt.Fprintln(cmd.OutOrStdout(), "Website saved to", outputFile)
|
|
|
|
|
return nil
|
|
|
|
|
},
|
|
|
|
|
}
|
2025-11-02 13:27:04 +00:00
|
|
|
collectWebsiteCmd.PersistentFlags().String("output", "", "Output file for the DataNode")
|
2025-10-31 21:35:53 +00:00
|
|
|
collectWebsiteCmd.PersistentFlags().Int("depth", 2, "Recursion depth for downloading")
|
2025-11-14 13:47:27 +00:00
|
|
|
collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
|
2025-11-02 13:27:04 +00:00
|
|
|
collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
|
2025-11-14 13:47:27 +00:00
|
|
|
collectWebsiteCmd.PersistentFlags().String("password", "", "Password for encryption")
|
2025-11-03 16:31:26 +00:00
|
|
|
return collectWebsiteCmd
|
|
|
|
|
}
|