This commit introduces a configurable rate-limiting system for all HTTP requests made by the application.

Key features include:
- A token bucket algorithm for rate limiting.
- Per-domain configuration via a YAML file (`--rate-config`).
- Wildcard domain matching (e.g., `*.archive.org`).
- Dynamic adjustments based on `429` responses and `Retry-After` headers.
- New CLI flags (`--rate-limit`, `--burst`) for on-the-fly configuration.

I began by creating a new `http` package to centralize the rate-limiting logic. I then integrated this package into the `website` and `github` collectors, ensuring that all outgoing HTTP requests are subject to the new rate-limiting rules.

Throughout the implementation, I added comprehensive unit and integration tests to validate the new functionality. This process also uncovered several pre-existing issues in the test suite, which I have now fixed. These fixes include:
- Correcting mock implementations for `http.Client` and `vcs.GitCloner`.
- Updating outdated function signatures in tests and examples.
- Resolving missing dependencies and syntax errors in test files.
- Stabilizing flaky tests.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
83 lines
2.1 KiB
Go
83 lines
2.1 KiB
Go
package http
|
|
|
|
import (
|
|
"net/http"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/time/rate"
|
|
)
|
|
|
|
// RateLimitingRoundTripper is an http.RoundTripper that rate limits requests based on domain.
|
|
type RateLimitingRoundTripper struct {
|
|
next http.RoundTripper
|
|
config *Config
|
|
limiters map[string]*rate.Limiter
|
|
mu sync.Mutex
|
|
}
|
|
|
|
// NewRateLimitingRoundTripper creates a new RateLimitingRoundTripper.
|
|
func NewRateLimitingRoundTripper(config *Config, next http.RoundTripper) *RateLimitingRoundTripper {
|
|
if next == nil {
|
|
next = http.DefaultTransport
|
|
}
|
|
return &RateLimitingRoundTripper{
|
|
config: config,
|
|
next: next,
|
|
limiters: make(map[string]*rate.Limiter),
|
|
}
|
|
}
|
|
|
|
func (r *RateLimitingRoundTripper) getLimiter(host string) *rate.Limiter {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
|
|
limiter, exists := r.limiters[host]
|
|
if !exists {
|
|
rateLimit := r.config.GetRate(host)
|
|
limiter = rate.NewLimiter(rate.Limit(rateLimit.RequestsPerSecond), rateLimit.Burst)
|
|
r.limiters[host] = limiter
|
|
}
|
|
return limiter
|
|
}
|
|
|
|
// RoundTrip executes a single HTTP transaction, waiting for a token from the bucket first.
|
|
func (r *RateLimitingRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
|
|
limiter := r.getLimiter(req.URL.Hostname())
|
|
err := limiter.Wait(req.Context())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
resp, err := r.next.RoundTrip(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if resp.StatusCode == http.StatusTooManyRequests {
|
|
retryAfter := resp.Header.Get("Retry-After")
|
|
var delay time.Duration
|
|
|
|
// Retry-After can be in seconds or an HTTP-date.
|
|
if seconds, err := strconv.Atoi(retryAfter); err == nil {
|
|
delay = time.Duration(seconds) * time.Second
|
|
} else if t, err := http.ParseTime(retryAfter); err == nil {
|
|
delay = time.Until(t)
|
|
} else {
|
|
// No valid Retry-After header, use a default backoff.
|
|
delay = time.Second * 5
|
|
}
|
|
|
|
// Close the response body of the 429 response to allow the transport to reuse the connection.
|
|
if resp.Body != nil {
|
|
resp.Body.Close()
|
|
}
|
|
|
|
// Wait and retry the request once.
|
|
time.Sleep(delay)
|
|
return r.next.RoundTrip(req)
|
|
}
|
|
|
|
return resp, nil
|
|
}
|