This commit introduces a deduplication cache to avoid re-downloading files across multiple collection jobs.

Key changes include:

- A new `pkg/cache` package that provides content-addressable storage using SHA256 hashes of the file content.
- Integration of the cache into the `collect website` command. Downloads are now skipped if the content already exists in the cache.
- The addition of `--no-cache` and `--cache-dir` flags to give users control over the caching behavior.
- New `borg cache stats` and `borg cache clear` commands to allow users to manage the cache.
- A performance improvement to the cache implementation, which now only writes the URL-to-hash index file once at the end of the collection process, rather than on every file download.
- Centralized logic for determining the default cache directory, removing code duplication.
- Improved error handling and refactored duplicated cache-checking logic in the website collector.
- Added comprehensive unit tests for the new cache package and an integration test to verify that the website collector correctly uses the cache.

The implementation of cache size limiting and LRU eviction is still pending and will be addressed in a future commit.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
102 lines
2.2 KiB
Go
102 lines
2.2 KiB
Go
package cmd
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
|
|
"github.com/Snider/Borg/pkg/cache"
|
|
"github.com/spf13/cobra"
|
|
)
|
|
|
|
// cacheCmd represents the cache command
|
|
var cacheCmd = NewCacheCmd()
|
|
var cacheStatsCmd = NewCacheStatsCmd()
|
|
var cacheClearCmd = NewCacheClearCmd()
|
|
|
|
func init() {
|
|
RootCmd.AddCommand(GetCacheCmd())
|
|
GetCacheCmd().AddCommand(GetCacheStatsCmd())
|
|
GetCacheCmd().AddCommand(GetCacheClearCmd())
|
|
}
|
|
|
|
func GetCacheCmd() *cobra.Command {
|
|
return cacheCmd
|
|
}
|
|
|
|
func GetCacheStatsCmd() *cobra.Command {
|
|
return cacheStatsCmd
|
|
}
|
|
|
|
func GetCacheClearCmd() *cobra.Command {
|
|
return cacheClearCmd
|
|
}
|
|
|
|
func NewCacheCmd() *cobra.Command {
|
|
cacheCmd := &cobra.Command{
|
|
Use: "cache",
|
|
Short: "Manage the cache",
|
|
Long: `Manage the cache.`,
|
|
}
|
|
return cacheCmd
|
|
}
|
|
|
|
func NewCacheStatsCmd() *cobra.Command {
|
|
cacheStatsCmd := &cobra.Command{
|
|
Use: "stats",
|
|
Short: "Show cache stats",
|
|
Long: `Show cache stats.`,
|
|
RunE: func(cmd *cobra.Command, args []string) error {
|
|
cacheDir, err := GetCacheDir(cmd)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
cacheInstance, err := cache.New(cacheDir)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create cache: %w", err)
|
|
}
|
|
|
|
size, err := cacheInstance.Size()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get cache size: %w", err)
|
|
}
|
|
|
|
fmt.Printf("Cache directory: %s\n", cacheInstance.Dir())
|
|
fmt.Printf("Number of entries: %d\n", cacheInstance.NumEntries())
|
|
fmt.Printf("Total size: %d bytes\n", size)
|
|
|
|
return nil
|
|
},
|
|
}
|
|
cacheStatsCmd.Flags().String("cache-dir", "", "Custom cache location")
|
|
return cacheStatsCmd
|
|
}
|
|
|
|
func NewCacheClearCmd() *cobra.Command {
|
|
cacheClearCmd := &cobra.Command{
|
|
Use: "clear",
|
|
Short: "Clear the cache",
|
|
Long: `Clear the cache.`,
|
|
RunE: func(cmd *cobra.Command, args []string) error {
|
|
cacheDir, err := GetCacheDir(cmd)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
cacheInstance, err := cache.New(cacheDir)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create cache: %w", err)
|
|
}
|
|
|
|
err = cacheInstance.Clear()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to clear cache: %w", err)
|
|
}
|
|
|
|
fmt.Println("Cache cleared.")
|
|
|
|
return nil
|
|
},
|
|
}
|
|
cacheClearCmd.Flags().String("cache-dir", "", "Custom cache location")
|
|
return cacheClearCmd
|
|
}
|