From bd65eefcd349ed1597ddbfa06eeeeca67f9803d6 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 31 Oct 2025 21:03:26 +0000 Subject: [PATCH] refactor: Use DataNode for repository collection This commit refactors the repository collection functionality to use the new `DataNode` package instead of the old `trix` package. The `collect` and `all` commands have been updated to use the new `vcs` package, which clones Git repositories and packages them into a `DataNode`. The `trix` package and its related commands (`cat`, `ingest`) have been removed. --- cmd/all.go | 40 +++++++++++++------------ cmd/cat.go | 53 --------------------------------- cmd/collect.go | 30 ++++++++++--------- cmd/helpers.go | 41 -------------------------- cmd/ingest.go | 56 ----------------------------------- pkg/trix/trix.go | 63 --------------------------------------- pkg/vcs/git.go | 51 ++++++++++++++++++++++++++++++++ pkg/vcs/git_test.go | 72 +++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 160 insertions(+), 246 deletions(-) delete mode 100644 cmd/cat.go delete mode 100644 cmd/helpers.go delete mode 100644 cmd/ingest.go delete mode 100644 pkg/trix/trix.go create mode 100644 pkg/vcs/git.go create mode 100644 pkg/vcs/git_test.go diff --git a/cmd/all.go b/cmd/all.go index dcb3dac..1924aa9 100644 --- a/cmd/all.go +++ b/cmd/all.go @@ -3,10 +3,11 @@ package cmd import ( "fmt" "os" + "strings" "borg-data-collector/pkg/borg" "borg-data-collector/pkg/github" - "borg-data-collector/pkg/trix" + "borg-data-collector/pkg/vcs" "github.com/spf13/cobra" ) @@ -15,7 +16,7 @@ import ( var allCmd = &cobra.Command{ Use: "all [user/org]", Short: "Collect all public repositories from a user or organization", - Long: `Collect all public repositories from a user or organization and store them in a Trix cube.`, + Long: `Collect all public repositories from a user or organization and store them in a DataNode.`, Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args []string) { fmt.Println(borg.GetRandomAssimilationMessage()) @@ -26,30 +27,30 @@ var allCmd = &cobra.Command{ return } - outputFile, _ := cmd.Flags().GetString("output") - - cube, err := trix.NewCube(outputFile) - if err != nil { - fmt.Println(err) - return - } - defer cube.Close() + outputDir, _ := cmd.Flags().GetString("output") for _, repoURL := range repos { fmt.Printf("Cloning %s...\n", repoURL) - tempPath, err := os.MkdirTemp("", "borg-clone-*") - if err != nil { - fmt.Println(err) - return - } - defer os.RemoveAll(tempPath) - - err = addRepoToCube(repoURL, cube, tempPath) + dn, err := vcs.CloneGitRepository(repoURL) if err != nil { fmt.Printf("Error cloning %s: %s\n", repoURL, err) continue } + + data, err := dn.ToTar() + if err != nil { + fmt.Printf("Error serializing DataNode for %s: %v\n", repoURL, err) + continue + } + + repoName := strings.Split(repoURL, "/")[len(strings.Split(repoURL, "/"))-1] + outputFile := fmt.Sprintf("%s/%s.dat", outputDir, repoName) + err = os.WriteFile(outputFile, data, 0644) + if err != nil { + fmt.Printf("Error writing DataNode for %s to file: %v\n", repoURL, err) + continue + } } fmt.Println(borg.GetRandomCodeLongMessage()) @@ -57,5 +58,6 @@ var allCmd = &cobra.Command{ } func init() { - collectCmd.AddCommand(allCmd) + rootCmd.AddCommand(allCmd) + allCmd.PersistentFlags().String("output", ".", "Output directory for the DataNodes") } diff --git a/cmd/cat.go b/cmd/cat.go deleted file mode 100644 index 9efce67..0000000 --- a/cmd/cat.go +++ /dev/null @@ -1,53 +0,0 @@ -package cmd - -import ( - "fmt" - "io" - "os" - - "borg-data-collector/pkg/trix" - - "github.com/spf13/cobra" -) - -// catCmd represents the cat command -var catCmd = &cobra.Command{ - Use: "cat [cube-file] [file-to-extract]", - Short: "Extract a file from a Trix cube", - Long: `Extract a file from a Trix cube and print its content to standard output.`, - Args: cobra.ExactArgs(2), - Run: func(cmd *cobra.Command, args []string) { - cubeFile := args[0] - fileToExtract := args[1] - - reader, file, err := trix.Extract(cubeFile) - if err != nil { - fmt.Println(err) - return - } - defer file.Close() - - for { - hdr, err := reader.Next() - if err == io.EOF { - break - } - if err != nil { - fmt.Println(err) - return - } - - if hdr.Name == fileToExtract { - if _, err := io.Copy(os.Stdout, reader); err != nil { - fmt.Println(err) - return - } - return - } - } - }, -} - -func init() { - rootCmd.AddCommand(catCmd) -} diff --git a/cmd/collect.go b/cmd/collect.go index 33fcbe9..779441c 100644 --- a/cmd/collect.go +++ b/cmd/collect.go @@ -2,8 +2,9 @@ package cmd import ( "fmt" + "os" - "borg-data-collector/pkg/trix" + "borg-data-collector/pkg/vcs" "github.com/spf13/cobra" ) @@ -12,34 +13,35 @@ import ( var collectCmd = &cobra.Command{ Use: "collect [repository-url]", Short: "Collect a single repository", - Long: `Collect a single repository and store it in a Trix cube.`, + Long: `Collect a single repository and store it in a DataNode.`, Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args []string) { - if len(args) < 1 { - fmt.Println("Please provide a repository URL") - return - } repoURL := args[0] - clonePath, _ := cmd.Flags().GetString("path") outputFile, _ := cmd.Flags().GetString("output") - cube, err := trix.NewCube(outputFile) + dn, err := vcs.CloneGitRepository(repoURL) if err != nil { - fmt.Println(err) + fmt.Printf("Error cloning repository: %v\n", err) return } - defer cube.Close() - err = addRepoToCube(repoURL, cube, clonePath) + data, err := dn.ToTar() if err != nil { - fmt.Println(err) + fmt.Printf("Error serializing DataNode: %v\n", err) return } + + err = os.WriteFile(outputFile, data, 0644) + if err != nil { + fmt.Printf("Error writing DataNode to file: %v\n", err) + return + } + + fmt.Printf("Repository saved to %s\n", outputFile) }, } func init() { rootCmd.AddCommand(collectCmd) - collectCmd.PersistentFlags().String("path", "/tmp/borg-clone", "Path to clone the repository") - collectCmd.PersistentFlags().String("output", "borg.cube", "Output file for the Trix cube") + collectCmd.PersistentFlags().String("output", "repo.dat", "Output file for the DataNode") } diff --git a/cmd/helpers.go b/cmd/helpers.go deleted file mode 100644 index f318023..0000000 --- a/cmd/helpers.go +++ /dev/null @@ -1,41 +0,0 @@ -package cmd - -import ( - "os" - "path/filepath" - - "borg-data-collector/pkg/trix" - - "github.com/go-git/go-git/v5" -) - -func addRepoToCube(repoURL string, cube *trix.Cube, clonePath string) error { - _, err := git.PlainClone(clonePath, false, &git.CloneOptions{ - URL: repoURL, - Progress: os.Stdout, - }) - - if err != nil { - return err - } - - err = filepath.Walk(clonePath, func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if !info.IsDir() { - content, err := os.ReadFile(path) - if err != nil { - return err - } - relPath, err := filepath.Rel(clonePath, path) - if err != nil { - return err - } - cube.AddFile(relPath, content) - } - return nil - }) - - return err -} diff --git a/cmd/ingest.go b/cmd/ingest.go deleted file mode 100644 index 23a8f32..0000000 --- a/cmd/ingest.go +++ /dev/null @@ -1,56 +0,0 @@ -package cmd - -import ( - "fmt" - "os" - - "borg-data-collector/pkg/borg" - "borg-data-collector/pkg/trix" - - "github.com/spf13/cobra" -) - -// ingestCmd represents the ingest command -var ingestCmd = &cobra.Command{ - Use: "ingest [cube-file] [file-to-add]", - Short: "Add a file to a Trix cube", - Long: `Add a file to a Trix cube. If the cube file does not exist, it will be created.`, - Args: cobra.ExactArgs(2), - Run: func(cmd *cobra.Command, args []string) { - cubeFile := args[0] - fileToAdd := args[1] - - var cube *trix.Cube - var err error - - if _, err := os.Stat(cubeFile); os.IsNotExist(err) { - cube, err = trix.NewCube(cubeFile) - } else { - cube, err = trix.AppendToCube(cubeFile) - } - - if err != nil { - fmt.Println(err) - return - } - defer cube.Close() - - content, err := os.ReadFile(fileToAdd) - if err != nil { - fmt.Println(err) - return - } - - err = cube.AddFile(fileToAdd, content) - if err != nil { - fmt.Println(err) - return - } - - fmt.Println(borg.GetRandomCodeShortMessage()) - }, -} - -func init() { - rootCmd.AddCommand(ingestCmd) -} diff --git a/pkg/trix/trix.go b/pkg/trix/trix.go deleted file mode 100644 index 79f4fd2..0000000 --- a/pkg/trix/trix.go +++ /dev/null @@ -1,63 +0,0 @@ -package trix - -import ( - "archive/tar" - "os" -) - -type Cube struct { - writer *tar.Writer - file *os.File -} - -func NewCube(path string) (*Cube, error) { - file, err := os.Create(path) - if err != nil { - return nil, err - } - return &Cube{ - writer: tar.NewWriter(file), - file: file, - }, nil -} - -func (c *Cube) AddFile(path string, content []byte) error { - hdr := &tar.Header{ - Name: path, - Mode: 0600, - Size: int64(len(content)), - } - if err := c.writer.WriteHeader(hdr); err != nil { - return err - } - if _, err := c.writer.Write(content); err != nil { - return err - } - return nil -} - -func (c *Cube) Close() error { - if err := c.writer.Close(); err != nil { - return err - } - return c.file.Close() -} - -func Extract(path string) (*tar.Reader, *os.File, error) { - file, err := os.Open(path) - if err != nil { - return nil, nil, err - } - return tar.NewReader(file), file, nil -} - -func AppendToCube(path string) (*Cube, error) { - file, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0644) - if err != nil { - return nil, err - } - return &Cube{ - writer: tar.NewWriter(file), - file: file, - }, nil -} diff --git a/pkg/vcs/git.go b/pkg/vcs/git.go new file mode 100644 index 0000000..7432ea3 --- /dev/null +++ b/pkg/vcs/git.go @@ -0,0 +1,51 @@ +package vcs + +import ( + "os" + "path/filepath" + + "borg-data-collector/pkg/datanode" + + "github.com/go-git/go-git/v5" +) + +// CloneGitRepository clones a Git repository from a URL and packages it into a DataNode. +func CloneGitRepository(repoURL string) (*datanode.DataNode, error) { + tempPath, err := os.MkdirTemp("", "borg-clone-*") + if err != nil { + return nil, err + } + defer os.RemoveAll(tempPath) + + _, err = git.PlainClone(tempPath, false, &git.CloneOptions{ + URL: repoURL, + Progress: os.Stdout, + }) + if err != nil { + return nil, err + } + + dn := datanode.New() + err = filepath.Walk(tempPath, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !info.IsDir() { + content, err := os.ReadFile(path) + if err != nil { + return err + } + relPath, err := filepath.Rel(tempPath, path) + if err != nil { + return err + } + dn.AddData(relPath, content) + } + return nil + }) + if err != nil { + return nil, err + } + + return dn, nil +} diff --git a/pkg/vcs/git_test.go b/pkg/vcs/git_test.go new file mode 100644 index 0000000..c074318 --- /dev/null +++ b/pkg/vcs/git_test.go @@ -0,0 +1,72 @@ +package vcs + +import ( + "os" + "os/exec" + "path/filepath" + "testing" +) + +func TestCloneGitRepository(t *testing.T) { + // Create a temporary directory for the bare repository + bareRepoPath, err := os.MkdirTemp("", "bare-repo-") + if err != nil { + t.Fatalf("Failed to create temp dir for bare repo: %v", err) + } + defer os.RemoveAll(bareRepoPath) + + // Initialize a bare git repository + cmd := exec.Command("git", "init", "--bare") + cmd.Dir = bareRepoPath + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to init bare repo: %v", err) + } + + // Clone the bare repository to a temporary directory to add a commit + clonePath, err := os.MkdirTemp("", "clone-") + if err != nil { + t.Fatalf("Failed to create temp dir for clone: %v", err) + } + defer os.RemoveAll(clonePath) + + cmd = exec.Command("git", "clone", bareRepoPath, clonePath) + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to clone bare repo: %v", err) + } + + // Create a file and commit it + filePath := filepath.Join(clonePath, "foo.txt") + if err := os.WriteFile(filePath, []byte("foo"), 0644); err != nil { + t.Fatalf("Failed to write file: %v", err) + } + cmd = exec.Command("git", "add", "foo.txt") + cmd.Dir = clonePath + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to git add: %v", err) + } + cmd = exec.Command("git", "commit", "-m", "Initial commit") + cmd.Dir = clonePath + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to git commit: %v", err) + } + cmd = exec.Command("git", "push", "origin", "master") + cmd.Dir = clonePath + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to git push: %v", err) + } + + // Clone the repository using the function we're testing + dn, err := CloneGitRepository("file://" + bareRepoPath) + if err != nil { + t.Fatalf("CloneGitRepository failed: %v", err) + } + + // Verify the DataNode contains the correct file + exists, err := dn.Exists("foo.txt") + if err != nil { + t.Fatalf("Exists failed: %v", err) + } + if !exists { + t.Errorf("Expected to find file foo.txt in DataNode, but it was not found") + } +}