refactor: Use DataNode for repository collection

This commit refactors the repository collection functionality to use the new `DataNode` package instead of the old `trix` package.

The `collect` and `all` commands have been updated to use the new `vcs` package, which clones Git repositories and packages them into a `DataNode`. The `trix` package and its related commands (`cat`, `ingest`) have been removed.
This commit is contained in:
google-labs-jules[bot] 2025-10-31 21:03:26 +00:00
parent 5149b64403
commit bd65eefcd3
8 changed files with 160 additions and 246 deletions

View file

@ -3,10 +3,11 @@ package cmd
import (
"fmt"
"os"
"strings"
"borg-data-collector/pkg/borg"
"borg-data-collector/pkg/github"
"borg-data-collector/pkg/trix"
"borg-data-collector/pkg/vcs"
"github.com/spf13/cobra"
)
@ -15,7 +16,7 @@ import (
var allCmd = &cobra.Command{
Use: "all [user/org]",
Short: "Collect all public repositories from a user or organization",
Long: `Collect all public repositories from a user or organization and store them in a Trix cube.`,
Long: `Collect all public repositories from a user or organization and store them in a DataNode.`,
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
fmt.Println(borg.GetRandomAssimilationMessage())
@ -26,30 +27,30 @@ var allCmd = &cobra.Command{
return
}
outputFile, _ := cmd.Flags().GetString("output")
cube, err := trix.NewCube(outputFile)
if err != nil {
fmt.Println(err)
return
}
defer cube.Close()
outputDir, _ := cmd.Flags().GetString("output")
for _, repoURL := range repos {
fmt.Printf("Cloning %s...\n", repoURL)
tempPath, err := os.MkdirTemp("", "borg-clone-*")
if err != nil {
fmt.Println(err)
return
}
defer os.RemoveAll(tempPath)
err = addRepoToCube(repoURL, cube, tempPath)
dn, err := vcs.CloneGitRepository(repoURL)
if err != nil {
fmt.Printf("Error cloning %s: %s\n", repoURL, err)
continue
}
data, err := dn.ToTar()
if err != nil {
fmt.Printf("Error serializing DataNode for %s: %v\n", repoURL, err)
continue
}
repoName := strings.Split(repoURL, "/")[len(strings.Split(repoURL, "/"))-1]
outputFile := fmt.Sprintf("%s/%s.dat", outputDir, repoName)
err = os.WriteFile(outputFile, data, 0644)
if err != nil {
fmt.Printf("Error writing DataNode for %s to file: %v\n", repoURL, err)
continue
}
}
fmt.Println(borg.GetRandomCodeLongMessage())
@ -57,5 +58,6 @@ var allCmd = &cobra.Command{
}
func init() {
collectCmd.AddCommand(allCmd)
rootCmd.AddCommand(allCmd)
allCmd.PersistentFlags().String("output", ".", "Output directory for the DataNodes")
}

View file

@ -1,53 +0,0 @@
package cmd
import (
"fmt"
"io"
"os"
"borg-data-collector/pkg/trix"
"github.com/spf13/cobra"
)
// catCmd represents the cat command. It opens the cube named by the first
// argument, scans its entries in order, and copies the first entry whose
// name equals the second argument to standard output.
//
// Diagnostics go to standard error so they never mix with the extracted
// content on standard output. A requested entry that is not present in the
// cube is reported instead of being silently ignored.
var catCmd = &cobra.Command{
	Use:   "cat [cube-file] [file-to-extract]",
	Short: "Extract a file from a Trix cube",
	Long:  `Extract a file from a Trix cube and print its content to standard output.`,
	Args:  cobra.ExactArgs(2),
	Run: func(cmd *cobra.Command, args []string) {
		cubeFile, fileToExtract := args[0], args[1]

		reader, file, err := trix.Extract(cubeFile)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
		defer file.Close()

		for {
			hdr, err := reader.Next()
			if err == io.EOF {
				// Reached the end of the archive without a match.
				fmt.Fprintf(os.Stderr, "%s: not found in %s\n", fileToExtract, cubeFile)
				return
			}
			if err != nil {
				fmt.Fprintln(os.Stderr, err)
				return
			}
			if hdr.Name != fileToExtract {
				continue
			}
			// Only the first matching entry is printed.
			if _, err := io.Copy(os.Stdout, reader); err != nil {
				fmt.Fprintln(os.Stderr, err)
			}
			return
		}
	},
}
// init attaches catCmd to rootCmd as a subcommand when the package is loaded.
func init() {
rootCmd.AddCommand(catCmd)
}

View file

@ -2,8 +2,9 @@ package cmd
import (
"fmt"
"os"
"borg-data-collector/pkg/trix"
"borg-data-collector/pkg/vcs"
"github.com/spf13/cobra"
)
@ -12,34 +13,35 @@ import (
var collectCmd = &cobra.Command{
Use: "collect [repository-url]",
Short: "Collect a single repository",
Long: `Collect a single repository and store it in a Trix cube.`,
Long: `Collect a single repository and store it in a DataNode.`,
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
if len(args) < 1 {
fmt.Println("Please provide a repository URL")
return
}
repoURL := args[0]
clonePath, _ := cmd.Flags().GetString("path")
outputFile, _ := cmd.Flags().GetString("output")
cube, err := trix.NewCube(outputFile)
dn, err := vcs.CloneGitRepository(repoURL)
if err != nil {
fmt.Println(err)
fmt.Printf("Error cloning repository: %v\n", err)
return
}
defer cube.Close()
err = addRepoToCube(repoURL, cube, clonePath)
data, err := dn.ToTar()
if err != nil {
fmt.Println(err)
fmt.Printf("Error serializing DataNode: %v\n", err)
return
}
err = os.WriteFile(outputFile, data, 0644)
if err != nil {
fmt.Printf("Error writing DataNode to file: %v\n", err)
return
}
fmt.Printf("Repository saved to %s\n", outputFile)
},
}
func init() {
rootCmd.AddCommand(collectCmd)
collectCmd.PersistentFlags().String("path", "/tmp/borg-clone", "Path to clone the repository")
collectCmd.PersistentFlags().String("output", "borg.cube", "Output file for the Trix cube")
collectCmd.PersistentFlags().String("output", "repo.dat", "Output file for the DataNode")
}

View file

@ -1,41 +0,0 @@
package cmd
import (
"os"
"path/filepath"
"borg-data-collector/pkg/trix"
"github.com/go-git/go-git/v5"
)
// addRepoToCube clones the repository at repoURL into clonePath and adds
// every non-directory entry found under clonePath to cube, named by its path
// relative to clonePath. Clone progress is written to standard output.
//
// It returns the first error encountered while cloning, walking the tree,
// reading a file, or writing an entry to the cube.
func addRepoToCube(repoURL string, cube *trix.Cube, clonePath string) error {
	_, err := git.PlainClone(clonePath, false, &git.CloneOptions{
		URL:      repoURL,
		Progress: os.Stdout,
	})
	if err != nil {
		return err
	}

	return filepath.Walk(clonePath, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		content, err := os.ReadFile(path)
		if err != nil {
			return err
		}
		relPath, err := filepath.Rel(clonePath, path)
		if err != nil {
			return err
		}
		// Propagate write failures; dropping them would leave a truncated
		// cube that is reported as a success.
		return cube.AddFile(relPath, content)
	})
}

View file

@ -1,56 +0,0 @@
package cmd
import (
"fmt"
"os"
"borg-data-collector/pkg/borg"
"borg-data-collector/pkg/trix"
"github.com/spf13/cobra"
)
// ingestCmd represents the ingest command. It adds the file named by the
// second argument to the cube named by the first argument, creating the cube
// when os.Stat reports that it does not exist and appending to it otherwise.
// Errors are printed and end the command early; on success a short message
// from the borg package is printed.
var ingestCmd = &cobra.Command{
	Use:   "ingest [cube-file] [file-to-add]",
	Short: "Add a file to a Trix cube",
	Long:  `Add a file to a Trix cube. If the cube file does not exist, it will be created.`,
	Args:  cobra.ExactArgs(2),
	Run: func(cmd *cobra.Command, args []string) {
		cubeFile, fileToAdd := args[0], args[1]

		var (
			cube *trix.Cube
			err  error
		)
		// The Stat result is kept in its own variable: declaring err in the
		// if-initializer would shadow the err assigned in the branches, and
		// a failed open would then go unnoticed and leave cube nil.
		_, statErr := os.Stat(cubeFile)
		if os.IsNotExist(statErr) {
			cube, err = trix.NewCube(cubeFile)
		} else {
			cube, err = trix.AppendToCube(cubeFile)
		}
		if err != nil {
			fmt.Println(err)
			return
		}
		defer cube.Close()

		content, err := os.ReadFile(fileToAdd)
		if err != nil {
			fmt.Println(err)
			return
		}

		if err := cube.AddFile(fileToAdd, content); err != nil {
			fmt.Println(err)
			return
		}

		fmt.Println(borg.GetRandomCodeShortMessage())
	},
}
// init attaches ingestCmd to rootCmd as a subcommand when the package is loaded.
func init() {
rootCmd.AddCommand(ingestCmd)
}

View file

@ -1,63 +0,0 @@
package trix
import (
"archive/tar"
"os"
)
// Cube is a tar archive opened for writing. Entries are added with AddFile
// and the archive is finalized with Close.
type Cube struct {
	writer *tar.Writer
	file   *os.File
}

// trailerSize is the length of the end-of-archive marker that tar.Writer
// emits on Close: two 512-byte blocks of zeros.
const trailerSize = 1024

// NewCube creates the file at path, truncating it if it already exists, and
// returns a Cube that writes a new archive into it.
func NewCube(path string) (*Cube, error) {
	file, err := os.Create(path)
	if err != nil {
		return nil, err
	}
	return &Cube{
		writer: tar.NewWriter(file),
		file:   file,
	}, nil
}

// AddFile appends one entry named path holding content. The entry is
// recorded with mode 0600.
func (c *Cube) AddFile(path string, content []byte) error {
	hdr := &tar.Header{
		Name: path,
		Mode: 0600,
		Size: int64(len(content)),
	}
	if err := c.writer.WriteHeader(hdr); err != nil {
		return err
	}
	_, err := c.writer.Write(content)
	return err
}

// Close finalizes the archive and closes the underlying file. The file is
// closed even when finalizing the archive fails, so the handle is never
// leaked; the archive error takes precedence in the returned value.
func (c *Cube) Close() error {
	writerErr := c.writer.Close()
	fileErr := c.file.Close()
	if writerErr != nil {
		return writerErr
	}
	return fileErr
}

// Extract opens the archive at path for reading. The caller must close the
// returned file once it is done with the reader.
func Extract(path string) (*tar.Reader, *os.File, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, nil, err
	}
	return tar.NewReader(file), file, nil
}

// AppendToCube opens the existing archive at path so that further entries
// can be added to it.
//
// A finalized archive ends with an end-of-archive marker, and readers stop
// at that marker, so entries written after it would never be seen. The
// marker is therefore removed before writing resumes; Close writes a fresh
// one after the new entries.
func AppendToCube(path string) (*Cube, error) {
	// Read access is needed to inspect the tail of the file; O_APPEND keeps
	// every write at the (possibly shortened) end of the file.
	file, err := os.OpenFile(path, os.O_RDWR|os.O_APPEND, 0644)
	if err != nil {
		return nil, err
	}
	if err := trimTrailer(file); err != nil {
		file.Close()
		return nil, err
	}
	return &Cube{
		writer: tar.NewWriter(file),
		file:   file,
	}, nil
}

// trimTrailer cuts the end-of-archive marker off file when the file ends
// with one. Files that are shorter than the marker, or whose tail is not all
// zeros (for example an archive that was never finalized), are left as they
// are. Only the marker written by tar.Writer is handled; archives padded
// with additional zero blocks by other tools are not normalized.
func trimTrailer(file *os.File) error {
	info, err := file.Stat()
	if err != nil {
		return err
	}
	size := info.Size()
	if size < trailerSize {
		return nil
	}
	var tail [trailerSize]byte
	if _, err := file.ReadAt(tail[:], size-trailerSize); err != nil {
		return err
	}
	for _, b := range tail {
		if b != 0 {
			return nil
		}
	}
	return file.Truncate(size - trailerSize)
}

51
pkg/vcs/git.go Normal file
View file

@ -0,0 +1,51 @@
package vcs
import (
"os"
"path/filepath"
"borg-data-collector/pkg/datanode"
"github.com/go-git/go-git/v5"
)
// CloneGitRepository clones the Git repository at repoURL into a fresh
// temporary directory and returns a DataNode holding the contents of that
// directory.
//
// Every non-directory entry found under the clone is read and stored under
// its path relative to the clone root; the walk applies no filtering.
// Clone progress is written to standard output, and the temporary directory
// is removed before the function returns. The first error from creating the
// directory, cloning, walking, or reading a file is returned unchanged.
func CloneGitRepository(repoURL string) (*datanode.DataNode, error) {
	workDir, err := os.MkdirTemp("", "borg-clone-*")
	if err != nil {
		return nil, err
	}
	defer os.RemoveAll(workDir)

	cloneOpts := &git.CloneOptions{
		URL:      repoURL,
		Progress: os.Stdout,
	}
	if _, err := git.PlainClone(workDir, false, cloneOpts); err != nil {
		return nil, err
	}

	node := datanode.New()

	// collect stores a single walked entry in node; directories are skipped.
	collect := func(entry string, fi os.FileInfo, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if fi.IsDir() {
			return nil
		}
		payload, err := os.ReadFile(entry)
		if err != nil {
			return err
		}
		name, err := filepath.Rel(workDir, entry)
		if err != nil {
			return err
		}
		node.AddData(name, payload)
		return nil
	}

	if err := filepath.Walk(workDir, collect); err != nil {
		return nil, err
	}
	return node, nil
}

72
pkg/vcs/git_test.go Normal file
View file

@ -0,0 +1,72 @@
package vcs
import (
"os"
"os/exec"
"path/filepath"
"testing"
)
// runGit runs git with args inside dir (the current directory when dir is
// empty) and fails the test, including git's combined output, on any error.
func runGit(t *testing.T, dir string, args ...string) {
	t.Helper()
	cmd := exec.Command("git", args...)
	cmd.Dir = dir
	if out, err := cmd.CombinedOutput(); err != nil {
		t.Fatalf("git %v failed: %v\n%s", args, err, out)
	}
}

// TestCloneGitRepository builds a local bare repository containing a single
// committed file, clones it through CloneGitRepository using a file:// URL,
// and checks that the file is present in the resulting DataNode.
//
// The setup does not depend on the machine's git configuration: the branch
// name is pinned on both sides and the commit identity is supplied on the
// command line.
func TestCloneGitRepository(t *testing.T) {
	if _, err := exec.LookPath("git"); err != nil {
		t.Skip("git executable not found in PATH")
	}

	// Bare repository that plays the role of the remote.
	bareRepoPath := t.TempDir()
	runGit(t, bareRepoPath, "init", "--bare")
	// Pin HEAD so it matches the branch pushed below even when
	// init.defaultBranch is set to something other than master.
	runGit(t, bareRepoPath, "symbolic-ref", "HEAD", "refs/heads/master")

	// Working clone used only to create and push the initial commit.
	clonePath := t.TempDir()
	runGit(t, "", "clone", bareRepoPath, clonePath)
	runGit(t, clonePath, "symbolic-ref", "HEAD", "refs/heads/master")

	filePath := filepath.Join(clonePath, "foo.txt")
	if err := os.WriteFile(filePath, []byte("foo"), 0644); err != nil {
		t.Fatalf("Failed to write file: %v", err)
	}
	runGit(t, clonePath, "add", "foo.txt")
	// Provide an identity explicitly; environments without a global git
	// identity would otherwise refuse to commit.
	runGit(t, clonePath,
		"-c", "user.name=borg-test",
		"-c", "user.email=borg-test@example.invalid",
		"commit", "-m", "Initial commit")
	runGit(t, clonePath, "push", "origin", "master")

	// Clone the repository using the function we're testing.
	dn, err := CloneGitRepository("file://" + bareRepoPath)
	if err != nil {
		t.Fatalf("CloneGitRepository failed: %v", err)
	}

	// Verify the DataNode contains the committed file.
	exists, err := dn.Exists("foo.txt")
	if err != nil {
		t.Fatalf("Exists failed: %v", err)
	}
	if !exists {
		t.Errorf("Expected to find file foo.txt in DataNode, but it was not found")
	}
}