From 5a864a905946b251db3323e7bc0a227a9a4d03fc Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sun, 2 Nov 2025 13:27:04 +0000 Subject: [PATCH] feat: Add optional compression to collect commands This change introduces optional compression to the `collect` commands. Users can now specify `--compression` with `gz` or `xz` to compress the output. The `serve` command has also been enhanced to transparently decompress and serve these files. --- cmd/collect_github_repo.go | 20 +++++++++-- cmd/collect_pwa.go | 20 +++++++++-- cmd/collect_website.go | 20 +++++++++-- cmd/serve.go | 9 ++++- docs/README.md | 20 +++++++++++ examples/compress_datanode.sh | 8 +++++ examples/inspect_datanode.go | 9 ++++- go.mod | 1 + go.sum | 2 ++ pkg/compress/compress.go | 64 +++++++++++++++++++++++++++++++++++ pkg/tarfs/tarfs.go | 44 ++++++++++-------------- 11 files changed, 182 insertions(+), 35 deletions(-) create mode 100755 examples/compress_datanode.sh create mode 100644 pkg/compress/compress.go diff --git a/cmd/collect_github_repo.go b/cmd/collect_github_repo.go index e48e505..237d9af 100644 --- a/cmd/collect_github_repo.go +++ b/cmd/collect_github_repo.go @@ -4,6 +4,7 @@ import ( "fmt" "os" + "github.com/Snider/Borg/pkg/compress" "github.com/Snider/Borg/pkg/matrix" "github.com/Snider/Borg/pkg/ui" "github.com/Snider/Borg/pkg/vcs" @@ -21,6 +22,7 @@ var collectGithubRepoCmd = &cobra.Command{ repoURL := args[0] outputFile, _ := cmd.Flags().GetString("output") format, _ := cmd.Flags().GetString("format") + compression, _ := cmd.Flags().GetString("compression") bar := ui.NewProgressBar(-1, "Cloning repository") defer bar.Finish() @@ -51,7 +53,20 @@ var collectGithubRepoCmd = &cobra.Command{ } } - err = os.WriteFile(outputFile, data, 0644) + compressedData, err := compress.Compress(data, compression) + if err != nil { + fmt.Printf("Error compressing data: %v\n", err) + return + } + + if outputFile == "" { + outputFile = 
"repo." + format + if compression != "none" { + outputFile += "." + compression + } + } + + err = os.WriteFile(outputFile, compressedData, 0644) if err != nil { fmt.Printf("Error writing DataNode to file: %v\n", err) return @@ -63,6 +78,7 @@ var collectGithubRepoCmd = &cobra.Command{ func init() { collectGithubCmd.AddCommand(collectGithubRepoCmd) - collectGithubRepoCmd.PersistentFlags().String("output", "repo.dat", "Output file for the DataNode") + collectGithubRepoCmd.PersistentFlags().String("output", "", "Output file for the DataNode") collectGithubRepoCmd.PersistentFlags().String("format", "datanode", "Output format (datanode or matrix)") + collectGithubRepoCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)") } diff --git a/cmd/collect_pwa.go b/cmd/collect_pwa.go index 7c371ff..6dd57af 100644 --- a/cmd/collect_pwa.go +++ b/cmd/collect_pwa.go @@ -4,6 +4,7 @@ import ( "fmt" "os" + "github.com/Snider/Borg/pkg/compress" "github.com/Snider/Borg/pkg/matrix" "github.com/Snider/Borg/pkg/pwa" "github.com/Snider/Borg/pkg/ui" @@ -23,6 +24,7 @@ Example: pwaURL, _ := cmd.Flags().GetString("uri") outputFile, _ := cmd.Flags().GetString("output") format, _ := cmd.Flags().GetString("format") + compression, _ := cmd.Flags().GetString("compression") if pwaURL == "" { fmt.Println("Error: uri is required") @@ -64,7 +66,20 @@ Example: } } - err = os.WriteFile(outputFile, data, 0644) + compressedData, err := compress.Compress(data, compression) + if err != nil { + fmt.Printf("Error compressing data: %v\n", err) + return + } + + if outputFile == "" { + outputFile = "pwa." + format + if compression != "none" { + outputFile += "." 
+ compression + } + } + + err = os.WriteFile(outputFile, compressedData, 0644) if err != nil { fmt.Printf("Error writing PWA to file: %v\n", err) return @@ -77,6 +92,7 @@ Example: func init() { collectCmd.AddCommand(collectPWACmd) collectPWACmd.Flags().String("uri", "", "The URI of the PWA to collect") - collectPWACmd.Flags().String("output", "pwa.dat", "Output file for the DataNode") + collectPWACmd.Flags().String("output", "", "Output file for the DataNode") collectPWACmd.Flags().String("format", "datanode", "Output format (datanode or matrix)") + collectPWACmd.Flags().String("compression", "none", "Compression format (none, gz, or xz)") } diff --git a/cmd/collect_website.go b/cmd/collect_website.go index 1a964af..b3f7c37 100644 --- a/cmd/collect_website.go +++ b/cmd/collect_website.go @@ -4,6 +4,7 @@ import ( "fmt" "os" + "github.com/Snider/Borg/pkg/compress" "github.com/Snider/Borg/pkg/matrix" "github.com/Snider/Borg/pkg/ui" "github.com/Snider/Borg/pkg/website" @@ -22,6 +23,7 @@ var collectWebsiteCmd = &cobra.Command{ outputFile, _ := cmd.Flags().GetString("output") depth, _ := cmd.Flags().GetInt("depth") format, _ := cmd.Flags().GetString("format") + compression, _ := cmd.Flags().GetString("compression") bar := ui.NewProgressBar(-1, "Crawling website") defer bar.Finish() @@ -52,7 +54,20 @@ var collectWebsiteCmd = &cobra.Command{ } } - err = os.WriteFile(outputFile, data, 0644) + compressedData, err := compress.Compress(data, compression) + if err != nil { + fmt.Printf("Error compressing data: %v\n", err) + return + } + + if outputFile == "" { + outputFile = "website." + format + if compression != "none" { + outputFile += "." 
+ compression
+			}
+		}
+
+		err = os.WriteFile(outputFile, compressedData, 0644)
 		if err != nil {
 			fmt.Printf("Error writing website to file: %v\n", err)
 			return
@@ -64,7 +79,8 @@ var collectWebsiteCmd = &cobra.Command{
 
 func init() {
 	collectCmd.AddCommand(collectWebsiteCmd)
-	collectWebsiteCmd.PersistentFlags().String("output", "website.dat", "Output file for the DataNode")
+	collectWebsiteCmd.PersistentFlags().String("output", "", "Output file for the DataNode")
 	collectWebsiteCmd.PersistentFlags().Int("depth", 2, "Recursion depth for downloading")
 	collectWebsiteCmd.PersistentFlags().String("format", "datanode", "Output format (datanode or matrix)")
+	collectWebsiteCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
 }
diff --git a/cmd/serve.go b/cmd/serve.go
index 531e719..87e225f 100644
--- a/cmd/serve.go
+++ b/cmd/serve.go
@@ -6,6 +6,7 @@ import (
 	"os"
 	"strings"
 
+	"github.com/Snider/Borg/pkg/compress"
 	"github.com/Snider/Borg/pkg/datanode"
 	"github.com/Snider/Borg/pkg/tarfs"
 
@@ -22,12 +23,22 @@ var serveCmd = &cobra.Command{
 		dataFile := args[0]
 		port, _ := cmd.Flags().GetString("port")
 
-		data, err := os.ReadFile(dataFile)
+		rawData, err := os.ReadFile(dataFile)
 		if err != nil {
 			fmt.Printf("Error reading data file: %v\n", err)
 			return
 		}
 
+		data, err := compress.Decompress(rawData)
+		if err != nil {
+			fmt.Printf("Error decompressing data: %v\n", err)
+			return
+		}
+
 		var fs http.FileSystem
-		if strings.HasSuffix(dataFile, ".matrix") {
+		// Strip a compression extension before sniffing the payload type so
+		// that a compressed bundle such as "site.matrix.gz" still takes the
+		// matrix path.
+		baseName := strings.TrimSuffix(strings.TrimSuffix(dataFile, ".gz"), ".xz")
+		if strings.HasSuffix(baseName, ".matrix") {
 			fs, err = tarfs.New(data)
diff --git a/docs/README.md b/docs/README.md
index fe82825..4151a29 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -20,6 +20,7 @@ borg collect github repo [repository-url] [flags]
 **Flags:**
-- `--output string`: Output file for the DataNode (default "repo.dat")
+- `--output string`: Output file for the DataNode (default "repo.<format>", plus ".<compression>" when compression is enabled)
 - `--format string`: Output format (datanode or matrix) (default "datanode")
+- `--compression string`: Compression format (none, gz, or xz) (default "none")
 
 **Example:**
 ```
@@ -39,6 +40,7 @@ borg collect website [url]
[flags]
-- `--output string`: Output file for the DataNode (default "website.dat")
+- `--output string`: Output file for the DataNode (default "website.<format>", plus ".<compression>" when compression is enabled)
 - `--depth int`: Recursion depth for downloading (default 2)
 - `--format string`: Output format (datanode or matrix) (default "datanode")
+- `--compression string`: Compression format (none, gz, or xz) (default "none")
 
 **Example:**
 ```
@@ -58,6 +60,7 @@ borg collect pwa [flags]
 - `--uri string`: The URI of the PWA to collect
-- `--output string`: Output file for the DataNode (default "pwa.dat")
+- `--output string`: Output file for the DataNode (default "pwa.<format>", plus ".<compression>" when compression is enabled)
 - `--format string`: Output format (datanode or matrix) (default "datanode")
+- `--compression string`: Compression format (none, gz, or xz) (default "none")
 
 **Example:**
 ```
@@ -85,6 +88,23 @@ borg serve [file] [flags]
 ./borg serve borg.matrix --port 9999
 ```
 
+## Compression
+
+All `collect` commands support optional compression. The following compression formats are available:
+
+- `none`: No compression (default)
+- `gz`: Gzip compression
+- `xz`: XZ compression
+
+To use compression, specify the desired format with the `--compression` flag. When `--output` is not given, the default output filename gets the matching extension appended (e.g. `repo.datanode.gz`); an explicit `--output` path is used exactly as given.
+
+**Example:**
+```
+./borg collect github repo https://github.com/Snider/Borg --compression gz
+```
+
+The `serve` command can transparently serve compressed files; the compression format is detected from the file's leading bytes.
+
 ## Terminal Isolation Matrix
 
 The `matrix` format creates a `runc` compatible bundle. This bundle can be executed by `runc` to create a container with the collected files. This is useful for creating isolated environments for testing or analysis.
diff --git a/examples/compress_datanode.sh b/examples/compress_datanode.sh
new file mode 100755
index 0000000..a5b95df
--- /dev/null
+++ b/examples/compress_datanode.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Example of using the 'borg collect' command with the '--compression' flag.
+
+# This script clones the specified Git repository and saves it as a gzip-compressed DataNode (repo.datanode.gz).
+# The main executable 'borg' is built from the project's root.
+# Make sure you have built the project by running 'go build -o borg main.go' in the root directory. + +./borg collect github repo https://github.com/Snider/Borg --compression gz diff --git a/examples/inspect_datanode.go b/examples/inspect_datanode.go index 9c97ede..a6c67bf 100644 --- a/examples/inspect_datanode.go +++ b/examples/inspect_datanode.go @@ -5,6 +5,7 @@ import ( "io/fs" "os" + "github.com/Snider/Borg/pkg/compress" "github.com/Snider/Borg/pkg/datanode" ) @@ -16,12 +17,18 @@ func main() { datFile := os.Args[1] - data, err := os.ReadFile(datFile) + rawData, err := os.ReadFile(datFile) if err != nil { fmt.Printf("Error reading .dat file: %v\n", err) os.Exit(1) } + data, err := compress.Decompress(rawData) + if err != nil { + fmt.Printf("Error decompressing data: %v\n", err) + os.Exit(1) + } + dn, err := datanode.FromTar(data) if err != nil { fmt.Printf("Error creating DataNode from tarball: %v\n", err) diff --git a/go.mod b/go.mod index 2b473d4..ca335d6 100644 --- a/go.mod +++ b/go.mod @@ -33,6 +33,7 @@ require ( github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect github.com/skeema/knownhosts v1.3.1 // indirect github.com/spf13/pflag v1.0.10 // indirect + github.com/ulikunitz/xz v0.5.15 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect golang.org/x/crypto v0.43.0 // indirect golang.org/x/sys v0.37.0 // indirect diff --git a/go.sum b/go.sum index 693b03f..12ad192 100644 --- a/go.sum +++ b/go.sum @@ -94,6 +94,8 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/ulikunitz/xz v0.5.15 h1:9DNdB5s+SgV3bQ2ApL10xRc35ck0DuIX/isZvIk+ubY= +github.com/ulikunitz/xz v0.5.15/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= 
github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM=
 github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
diff --git a/pkg/compress/compress.go b/pkg/compress/compress.go
new file mode 100644
index 0000000..07e4d28
--- /dev/null
+++ b/pkg/compress/compress.go
@@ -0,0 +1,75 @@
+package compress
+
+import (
+	"bytes"
+	"compress/gzip"
+	"fmt"
+	"io"
+
+	"github.com/ulikunitz/xz"
+)
+
+// Magic numbers used to sniff the format of compressed input.
+var (
+	gzipMagic = []byte{0x1f, 0x8b}
+	xzMagic   = []byte{0xfd, '7', 'z', 'X', 'Z', 0x00}
+)
+
+// Compress compresses data using the specified format.
+//
+// Supported formats are "gz" and "xz"; "none" and "" return data unchanged.
+// Any other format is an error, so a mistyped --compression value cannot
+// yield an uncompressed file that carries a compression extension.
+func Compress(data []byte, format string) ([]byte, error) {
+	var buf bytes.Buffer
+	var writer io.WriteCloser
+
+	switch format {
+	case "", "none":
+		return data, nil
+	case "gz":
+		writer = gzip.NewWriter(&buf)
+	case "xz":
+		w, err := xz.NewWriter(&buf)
+		if err != nil {
+			return nil, fmt.Errorf("creating xz writer: %w", err)
+		}
+		writer = w
+	default:
+		return nil, fmt.Errorf("unsupported compression format %q", format)
+	}
+
+	if _, err := writer.Write(data); err != nil {
+		return nil, fmt.Errorf("compressing as %s: %w", format, err)
+	}
+	// Close flushes buffered data and writes the stream footer.
+	if err := writer.Close(); err != nil {
+		return nil, fmt.Errorf("finishing %s stream: %w", format, err)
+	}
+
+	return buf.Bytes(), nil
+}
+
+// Decompress decompresses data, detecting the format automatically.
+//
+// Detection is by magic number: gzip and xz streams are expanded, and
+// anything else is returned unchanged.
+func Decompress(data []byte) ([]byte, error) {
+	switch {
+	case bytes.HasPrefix(data, gzipMagic):
+		reader, err := gzip.NewReader(bytes.NewReader(data))
+		if err != nil {
+			return nil, fmt.Errorf("opening gzip stream: %w", err)
+		}
+		defer reader.Close()
+		return io.ReadAll(reader)
+	case bytes.HasPrefix(data, xzMagic):
+		reader, err := xz.NewReader(bytes.NewReader(data))
+		if err != nil {
+			return nil, fmt.Errorf("opening xz stream: %w", err)
+		}
+		return io.ReadAll(reader)
+	}
+
+	return data, nil
+}
diff --git a/pkg/tarfs/tarfs.go b/pkg/tarfs/tarfs.go
index 9a4b440..6abbee4 100644
--- a/pkg/tarfs/tarfs.go
+++ b/pkg/tarfs/tarfs.go
@@ -13,15 +13,13 @@ import (
 
 // TarFS is a http.FileSystem that serves files from a tar archive.
type TarFS struct {
-	files map[string]*tar.Header
-	data  []byte
+	files map[string]*tarFile
 }
 
 // New creates a new TarFS from a tar archive.
 func New(data []byte) (*TarFS, error) {
 	fs := &TarFS{
-		files: make(map[string]*tar.Header),
-		data:  data,
+		files: make(map[string]*tarFile),
 	}
 
 	tr := tar.NewReader(bytes.NewReader(data))
@@ -35,7 +33,15 @@ func New(data []byte) (*TarFS, error) {
 		}
 
 		if strings.HasPrefix(hdr.Name, "rootfs/") {
-			fs.files[strings.TrimPrefix(hdr.Name, "rootfs/")] = hdr
+			content, err := io.ReadAll(tr)
+			if err != nil {
+				return nil, err
+			}
+			fs.files[strings.TrimPrefix(hdr.Name, "rootfs/")] = &tarFile{
+				header:  hdr,
+				content: bytes.NewReader(content),
+				modTime: hdr.ModTime,
+			}
 		}
 	}
 
@@ -45,26 +51,18 @@ func New(data []byte) (*TarFS, error) {
 // Open opens a file from the tar archive.
 func (fs *TarFS) Open(name string) (http.File, error) {
 	name = strings.TrimPrefix(name, "/")
-	if hdr, ok := fs.files[name]; ok {
-		// This is a bit inefficient, but it's the simplest way to
-		// get the file content without pre-indexing everything.
-		tr := tar.NewReader(bytes.NewReader(fs.data))
-		for {
-			h, err := tr.Next()
-			if err == io.EOF {
-				break
-			}
-			if err != nil {
-				return nil, err
-			}
-			if h.Name == hdr.Name {
-				return &tarFile{
-					header:  hdr,
-					content: tr,
-					modTime: hdr.ModTime,
-				}, nil
-			}
-		}
+	if file, ok := fs.files[name]; ok {
+		// Hand every caller its own reader. net/http serves each request
+		// on its own goroutine, and returning the indexed *tarFile would
+		// make concurrent requests race on a single read offset. The
+		// indexed reader is never read from, so a value copy is an
+		// independent cursor at offset 0 over the same bytes.
+		content := *file.content
+		return &tarFile{
+			header:  file.header,
+			content: &content,
+			modTime: file.modTime,
+		}, nil
 	}
 
 	return nil, os.ErrNotExist
@@ -73,14 +71,14 @@ func (fs *TarFS) Open(name string) (http.File, error) {
 // tarFile is a http.File that represents a file in a tar archive.
 type tarFile struct {
 	header  *tar.Header
-	content io.Reader
+	content *bytes.Reader
 	modTime time.Time
 }
 
 func (f *tarFile) Close() error               { return nil }
 func (f *tarFile) Read(p []byte) (int, error) { return f.content.Read(p) }
 func (f *tarFile) Seek(offset int64, whence int) (int64, error) {
-	return 0, io.ErrUnexpectedEOF
+	return f.content.Seek(offset, whence)
 }
 
 func (f *tarFile) Readdir(count int) ([]os.FileInfo, error) {