Borg/cmd/collect_website.go
google-labs-jules[bot] 2ff65938ca feat: Circuit breaker for failing domains
Implement a circuit breaker for the website collector to prevent hammering domains that are consistently failing.

The circuit breaker has three states: CLOSED, OPEN, and HALF-OPEN. It tracks failures per-domain and will open the circuit after a configurable number of consecutive failures. After a cooldown period, the circuit will transition to HALF-OPEN and allow a limited number of test requests to check for recovery.

The following command-line flags have been added to the `collect website` command:
- `--no-circuit-breaker`: Disable the circuit breaker
- `--circuit-failures`: Number of failures to trip the circuit breaker
- `--circuit-cooldown`: Cooldown time for the circuit breaker
- `--circuit-success-threshold`: Number of successes to close the circuit breaker
- `--circuit-half-open-requests`: Number of test requests in the half-open state

The implementation also includes:
- A new `circuitbreaker` package with the core logic
- Integration into the `website` package with per-domain tracking
- Improved logging to include the domain name and state changes
- Integration tests to verify the circuit breaker's behavior

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:59:38 +00:00

139 lines
4.7 KiB
Go

package cmd
import (
"fmt"
"os"
"time"
"github.com/Snider/Borg/pkg/circuitbreaker"
"github.com/schollz/progressbar/v3"
"golang.org/x/exp/slog"
"github.com/Snider/Borg/pkg/compress"
"github.com/Snider/Borg/pkg/tim"
"github.com/Snider/Borg/pkg/trix"
"github.com/Snider/Borg/pkg/ui"
"github.com/Snider/Borg/pkg/website"
"github.com/spf13/cobra"
)
// collectWebsiteCmd is the package-level `collect website` command instance.
// It is built once, during package variable initialization, by
// NewCollectWebsiteCmd, and is the value handed out by GetCollectWebsiteCmd.
var collectWebsiteCmd = NewCollectWebsiteCmd()
// init attaches the website subcommand to the command returned by
// GetCollectCmd so it is reachable as a child of that command.
func init() {
	parent := GetCollectCmd()
	child := GetCollectWebsiteCmd()
	parent.AddCommand(child)
}
// GetCollectWebsiteCmd returns the package-level collectWebsiteCmd instance,
// i.e. the same *cobra.Command on every call rather than a fresh one.
func GetCollectWebsiteCmd() *cobra.Command {
return collectWebsiteCmd
}
// NewCollectWebsiteCmd builds a fresh `collect website` cobra command.
//
// The command takes exactly one positional argument (the URL), crawls it via
// website.DownloadAndPackageWebsite, serializes the result in the requested
// format (datanode, tim, or trix), passes the bytes through compress.Compress,
// and writes them to the output file. When --output is empty the file name
// defaults to "website.<format>", with ".<compression>" appended unless the
// compression flag is "none".
//
// Registered flags:
//   - persistent: output, depth, format, compression, password
//   - local: no-circuit-breaker, circuit-failures, circuit-cooldown,
//     circuit-success-threshold, circuit-half-open-requests
//
// RunE returns a wrapped error when a flag cannot be read, the format is
// unknown, or any of the download, serialization, compression, or write steps
// fails.
func NewCollectWebsiteCmd() *cobra.Command {
	// Named websiteCmd (not collectWebsiteCmd) so the package-level variable of
	// that name is not shadowed inside this constructor.
	websiteCmd := &cobra.Command{
		Use:   "website [url]",
		Short: "Collect a single website",
		Long:  `Collect a single website and store it in a DataNode.`,
		Args:  cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			websiteURL := args[0]
			flags := cmd.Flags()

			// A flag lookup only fails when the flag is missing or registered
			// with a different type. Report that programming error instead of
			// silently continuing with a zero value.
			outputFile, err := flags.GetString("output")
			if err != nil {
				return fmt.Errorf("reading --output flag: %w", err)
			}
			depth, err := flags.GetInt("depth")
			if err != nil {
				return fmt.Errorf("reading --depth flag: %w", err)
			}
			format, err := flags.GetString("format")
			if err != nil {
				return fmt.Errorf("reading --format flag: %w", err)
			}
			compression, err := flags.GetString("compression")
			if err != nil {
				return fmt.Errorf("reading --compression flag: %w", err)
			}
			password, err := flags.GetString("password")
			if err != nil {
				return fmt.Errorf("reading --password flag: %w", err)
			}
			noCircuitBreaker, err := flags.GetBool("no-circuit-breaker")
			if err != nil {
				return fmt.Errorf("reading --no-circuit-breaker flag: %w", err)
			}
			circuitFailures, err := flags.GetInt("circuit-failures")
			if err != nil {
				return fmt.Errorf("reading --circuit-failures flag: %w", err)
			}
			circuitCooldown, err := flags.GetDuration("circuit-cooldown")
			if err != nil {
				return fmt.Errorf("reading --circuit-cooldown flag: %w", err)
			}
			circuitSuccessThreshold, err := flags.GetInt("circuit-success-threshold")
			if err != nil {
				return fmt.Errorf("reading --circuit-success-threshold flag: %w", err)
			}
			circuitHalfOpenRequests, err := flags.GetInt("circuit-half-open-requests")
			if err != nil {
				return fmt.Errorf("reading --circuit-half-open-requests flag: %w", err)
			}

			// Reject unknown formats before doing any network work.
			// NOTE(review): compression is not validated here; whether an
			// unsupported value is rejected depends on compress.Compress,
			// which only runs after the crawl has finished — confirm.
			if format != "datanode" && format != "tim" && format != "trix" {
				return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
			}

			prompter := ui.NewNonInteractivePrompter(ui.GetWebsiteQuote)
			prompter.Start()
			defer prompter.Stop()

			// The progress bar is only created for interactive sessions; a nil
			// bar is passed through to the downloader otherwise.
			var bar *progressbar.ProgressBar
			if prompter.IsInteractive() {
				bar = ui.NewProgressBar(-1, "Crawling website")
			}

			// Circuit breaker messages go to stderr so they do not mix with the
			// command's stdout output.
			logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
				Level: slog.LevelInfo,
			}))

			opts := website.DownloadOptions{
				URL:                  websiteURL,
				MaxDepth:             depth,
				ProgressBar:          bar,
				EnableCircuitBreaker: !noCircuitBreaker,
				CBSettings: circuitbreaker.Settings{
					FailureThreshold: circuitFailures,
					SuccessThreshold: circuitSuccessThreshold,
					Cooldown:         circuitCooldown,
					HalfOpenRequests: circuitHalfOpenRequests,
					Logger:           logger,
				},
			}

			dn, err := website.DownloadAndPackageWebsite(opts)
			if err != nil {
				return fmt.Errorf("error downloading and packaging website: %w", err)
			}

			// Serialize according to the validated format; anything that is not
			// "tim" or "trix" is "datanode" at this point.
			var data []byte
			switch format {
			case "tim":
				// Named bundle (not tim) so the imported tim package is not
				// shadowed.
				bundle, convErr := tim.FromDataNode(dn)
				if convErr != nil {
					return fmt.Errorf("error creating tim: %w", convErr)
				}
				data, err = bundle.ToTar()
				if err != nil {
					return fmt.Errorf("error serializing tim: %w", err)
				}
			case "trix":
				data, err = trix.ToTrix(dn, password)
				if err != nil {
					return fmt.Errorf("error serializing trix: %w", err)
				}
			default:
				data, err = dn.ToTar()
				if err != nil {
					return fmt.Errorf("error serializing DataNode: %w", err)
				}
			}

			compressedData, err := compress.Compress(data, compression)
			if err != nil {
				return fmt.Errorf("error compressing data: %w", err)
			}

			// Derive a default file name from the format and compression when
			// the caller did not supply one.
			if outputFile == "" {
				outputFile = "website." + format
				if compression != "none" {
					outputFile += "." + compression
				}
			}

			if err := os.WriteFile(outputFile, compressedData, 0644); err != nil {
				return fmt.Errorf("error writing website to file: %w", err)
			}

			fmt.Fprintln(cmd.OutOrStdout(), "Website saved to", outputFile)
			return nil
		},
	}

	persistent := websiteCmd.PersistentFlags()
	persistent.String("output", "", "Output file for the DataNode")
	persistent.Int("depth", 2, "Recursion depth for downloading")
	persistent.String("format", "datanode", "Output format (datanode, tim, or trix)")
	persistent.String("compression", "none", "Compression format (none, gz, or xz)")
	persistent.String("password", "", "Password for encryption")

	local := websiteCmd.Flags()
	local.Bool("no-circuit-breaker", false, "Disable the circuit breaker")
	local.Int("circuit-failures", 5, "Number of failures to trip the circuit breaker")
	local.Duration("circuit-cooldown", 30*time.Second, "Cooldown time for the circuit breaker")
	local.Int("circuit-success-threshold", 2, "Number of successes to close the circuit breaker")
	local.Int("circuit-half-open-requests", 1, "Number of test requests in half-open state")

	return websiteCmd
}