Borg/pkg/vcs/git.go
google-labs-jules[bot] 1b98ba1c3d feat: Configurable rate limiting per domain
This commit introduces a configurable rate-limiting system for all HTTP requests made by the application.

Key features include:
- A token bucket algorithm for rate limiting.
- Per-domain configuration via a YAML file (`--rate-config`).
- Wildcard domain matching (e.g., `*.archive.org`).
- Dynamic adjustments based on `429` responses and `Retry-After` headers.
- New CLI flags (`--rate-limit`, `--burst`) for on-the-fly configuration.

I began by creating a new `http` package to centralize the rate-limiting logic. I then integrated this package into the `website` and `github` collectors, ensuring that all outgoing HTTP requests are subject to the new rate-limiting rules.

Throughout the implementation, I added comprehensive unit and integration tests to validate the new functionality. This process also uncovered several pre-existing issues in the test suite, which I have now fixed. These fixes include:
- Correcting mock implementations for `http.Client` and `vcs.GitCloner`.
- Updating outdated function signatures in tests and examples.
- Resolving missing dependencies and syntax errors in test files.
- Stabilizing flaky tests.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:53:44 +00:00

100 lines
2.2 KiB
Go

package vcs
import (
"io"
"net/http"
"os"
"path/filepath"
"sync"
"github.com/Snider/Borg/pkg/datanode"
"github.com/go-git/go-git/v5"
githttp "github.com/go-git/go-git/v5/plumbing/transport/http"
)
// GitCloner is an interface for cloning Git repositories.
//
// CloneGitRepository takes the URL of the repository to clone and a writer for
// clone progress output, and returns the repository contents as a DataNode or
// an error. The gitCloner implementation in this file accepts a nil progress
// writer.
type GitCloner interface {
CloneGitRepository(repoURL string, progress io.Writer) (*datanode.DataNode, error)
}
// NewGitCloner returns a GitCloner backed by http.DefaultClient. It is a
// convenience wrapper around NewGitClonerWithClient.
func NewGitCloner() GitCloner {
	defaultClient := http.DefaultClient
	return NewGitClonerWithClient(defaultClient)
}
// NewGitClonerWithClient returns a GitCloner that stores the supplied
// http.Client. A nil client is replaced by http.DefaultClient, so the returned
// cloner always holds a non-nil client.
func NewGitClonerWithClient(client *http.Client) GitCloner {
	cloner := &gitCloner{httpClient: http.DefaultClient}
	if client != nil {
		cloner.httpClient = client
	}
	return cloner
}
// gitCloner is the GitCloner implementation defined in this file. Its
// httpClient field is set by NewGitClonerWithClient and is wrapped with
// githttp.NewClient when CloneGitRepository runs.
type gitCloner struct {
httpClient *http.Client
}
// cloneMutex guards the temporary replacement of the package-level
// githttp.DefaultClient performed by CloneGitRepository, so that concurrent
// clones do not overwrite each other's client.
var cloneMutex = &sync.Mutex{}
// CloneGitRepository clones the Git repository at repoURL into a temporary
// directory and packages its checked-out files into a DataNode.
//
// Parameters:
//   - repoURL: URL of the repository to clone.
//   - progress: writer that receives clone progress output; may be nil.
//
// It returns the populated DataNode, an empty DataNode when the clone fails
// with the "remote repository is empty" error, or the first error encountered
// while creating the temporary directory, cloning, or reading the files. The
// .git directory is never included in the result, and the temporary directory
// is removed before the function returns.
func (g *gitCloner) CloneGitRepository(repoURL string, progress io.Writer) (*datanode.DataNode, error) {
	tempPath, err := os.MkdirTemp("", "borg-clone-*")
	if err != nil {
		return nil, err
	}
	// Best-effort cleanup: a failure to remove the scratch directory must not
	// mask the outcome of the clone itself.
	defer os.RemoveAll(tempPath)

	cloneOptions := &git.CloneOptions{URL: repoURL}
	if progress != nil {
		cloneOptions.Progress = progress
	}

	if err := g.cloneInto(tempPath, cloneOptions); err != nil {
		// An empty remote is matched by message and treated as a successful
		// clone of zero files rather than as a failure.
		if err.Error() == "remote repository is empty" {
			return datanode.New(), nil
		}
		return nil, err
	}

	return packWorkTree(tempPath)
}

// cloneInto runs git.PlainClone into dir while githttp.DefaultClient is
// temporarily swapped for a client built from g.httpClient. cloneMutex is held
// only for the duration of the clone, so that reading the checked-out files
// afterwards does not block other clones.
//
// NOTE(review): go-git appears to resolve its HTTP transport through the
// protocol registry in plumbing/transport/client rather than by re-reading
// githttp.DefaultClient on each request; confirm that this swap really routes
// the clone through g.httpClient.
func (g *gitCloner) cloneInto(dir string, opts *git.CloneOptions) error {
	cloneMutex.Lock()
	defer cloneMutex.Unlock()

	originalClient := githttp.DefaultClient
	githttp.DefaultClient = githttp.NewClient(g.httpClient)
	// Deferred calls run last-in first-out, so the original client is restored
	// before the mutex is released.
	defer func() { githttp.DefaultClient = originalClient }()

	_, err := git.PlainClone(dir, false, opts)
	return err
}

// packWorkTree reads every non-directory entry under root, skipping any
// directory named .git, and stores each one in a new DataNode keyed by its
// path relative to root.
func packWorkTree(root string) (*datanode.DataNode, error) {
	dn := datanode.New()
	err := filepath.WalkDir(root, func(path string, entry os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if entry.IsDir() {
			// Repository metadata is not part of the packaged content.
			if entry.Name() == ".git" {
				return filepath.SkipDir
			}
			return nil
		}

		content, err := os.ReadFile(path)
		if err != nil {
			return err
		}
		relPath, err := filepath.Rel(root, path)
		if err != nil {
			return err
		}
		dn.AddData(relPath, content)
		return nil
	})
	if err != nil {
		return nil, err
	}
	return dn, nil
}