From efee04bfdb6d9f2a4975483293088207e43fe1e7 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 31 Oct 2025 20:32:46 +0000 Subject: [PATCH 1/4] feat: Add PWA download and serve commands This commit introduces two new commands: `pwa` and `serve`. The `pwa` command downloads a Progressive Web Application (PWA) from a given URL. It discovers the PWA's manifest, downloads the assets referenced in the manifest (start URL and icons), and packages them into a single `.tar` file. The `serve` command takes a `.tar` file created by the `pwa` command and serves its contents using a standard Go HTTP file server. It unpacks the tarball into an in-memory filesystem, making it a self-contained and efficient way to host the downloaded PWA. --- cmd/pwa.go | 50 +++++++++++ cmd/serve.go | 169 ++++++++++++++++++++++++++++++++++++ go.mod | 7 +- go.sum | 10 +++ pkg/pwa/pwa.go | 202 ++++++++++++++++++++++++++++++++++++++++++++ pkg/pwa/pwa_test.go | 131 ++++++++++++++++++++++++++++ 6 files changed, 566 insertions(+), 3 deletions(-) create mode 100644 cmd/pwa.go create mode 100644 cmd/serve.go create mode 100644 pkg/pwa/pwa.go create mode 100644 pkg/pwa/pwa_test.go diff --git a/cmd/pwa.go b/cmd/pwa.go new file mode 100644 index 0000000..d3deb09 --- /dev/null +++ b/cmd/pwa.go @@ -0,0 +1,50 @@ +package cmd + +import ( + "fmt" + "os" + + "borg-data-collector/pkg/pwa" + + "github.com/spf13/cobra" +) + +// pwaCmd represents the pwa command +var pwaCmd = &cobra.Command{ + Use: "pwa [url]", + Short: "Download a PWA from a URL", + Long: `Downloads a Progressive Web Application (PWA) from a given URL by finding its manifest.`, + Args: cobra.ExactArgs(1), + Run: func(cmd *cobra.Command, args []string) { + pwaURL := args[0] + outputFile, _ := cmd.Flags().GetString("output") + + fmt.Println("Finding PWA manifest...") + manifestURL, err := pwa.FindManifestURL(pwaURL) + if err != nil { + fmt.Printf("Error finding manifest: 
%v\n", err) + return + } + fmt.Printf("Found manifest: %s\n", manifestURL) + + fmt.Println("Downloading and packaging PWA...") + pwaData, err := pwa.DownloadAndPackagePWA(pwaURL, manifestURL) + if err != nil { + fmt.Printf("Error downloading and packaging PWA: %v\n", err) + return + } + + err = os.WriteFile(outputFile, pwaData, 0644) + if err != nil { + fmt.Printf("Error writing PWA to file: %v\n", err) + return + } + + fmt.Printf("PWA saved to %s\n", outputFile) + }, +} + +func init() { + rootCmd.AddCommand(pwaCmd) + pwaCmd.PersistentFlags().String("output", "pwa.tar", "Output file for the PWA tarball") +} diff --git a/cmd/serve.go b/cmd/serve.go new file mode 100644 index 0000000..b780df7 --- /dev/null +++ b/cmd/serve.go @@ -0,0 +1,169 @@ +package cmd + +import ( + "archive/tar" + "bytes" + "fmt" + "io" + "io/fs" + "net/http" + "os" + "path" + "strings" + "time" + + "github.com/spf13/cobra" +) + +// serveCmd represents the serve command +var serveCmd = &cobra.Command{ + Use: "serve [file]", + Short: "Serve a packaged PWA file", + Long: `Serves the contents of a packaged PWA file using a static file server.`, + Args: cobra.ExactArgs(1), + Run: func(cmd *cobra.Command, args []string) { + pwaFile := args[0] + port, _ := cmd.Flags().GetString("port") + + pwaData, err := os.ReadFile(pwaFile) + if err != nil { + fmt.Printf("Error reading PWA file: %v\n", err) + return + } + + memFS, err := newMemoryFS(pwaData) + if err != nil { + fmt.Printf("Error creating in-memory filesystem: %v\n", err) + return + } + + http.Handle("/", http.FileServer(http.FS(memFS))) + + fmt.Printf("Serving PWA on http://localhost:%s\n", port) + err = http.ListenAndServe(":"+port, nil) + if err != nil { + fmt.Printf("Error starting server: %v\n", err) + return + } + }, +} + +// memoryFS is an in-memory filesystem that implements fs.FS +type memoryFS struct { + files map[string]*memoryFile +} + +func newMemoryFS(tarball []byte) (*memoryFS, error) { + memFS := &memoryFS{files: 
make(map[string]*memoryFile)} + tarReader := tar.NewReader(bytes.NewReader(tarball)) + + for { + header, err := tarReader.Next() + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + + if header.Typeflag == tar.TypeReg { + data, err := io.ReadAll(tarReader) + if err != nil { + return nil, err + } + name := strings.TrimPrefix(header.Name, "/") + memFS.files[name] = &memoryFile{ + name: name, + content: data, + modTime: header.ModTime, + } + } + } + + return memFS, nil +} + +func (m *memoryFS) Open(name string) (fs.File, error) { + name = strings.TrimPrefix(name, "/") + if name == "" { + name = "index.html" + } + if file, ok := m.files[name]; ok { + return &memoryFileReader{file: file}, nil + } + return nil, fs.ErrNotExist +} + +// memoryFile represents a file in the in-memory filesystem +type memoryFile struct { + name string + content []byte + modTime time.Time +} + +func (m *memoryFile) Stat() (fs.FileInfo, error) { + return &memoryFileInfo{file: m}, nil +} + +func (m *memoryFile) Read(p []byte) (int, error) { + return 0, nil // This is implemented by memoryFileReader +} + +func (m *memoryFile) Close() error { + return nil +} + +// memoryFileInfo implements fs.FileInfo for a memoryFile +type memoryFileInfo struct { + file *memoryFile +} + +func (m *memoryFileInfo) Name() string { + return path.Base(m.file.name) +} + +func (m *memoryFileInfo) Size() int64 { + return int64(len(m.file.content)) +} + +func (m *memoryFileInfo) Mode() fs.FileMode { + return 0444 +} + +func (m *memoryFileInfo) ModTime() time.Time { + return m.file.modTime +} + +func (m *memoryFileInfo) IsDir() bool { + return false +} + +func (m *memoryFileInfo) Sys() interface{} { + return nil +} + +// memoryFileReader implements fs.File for a memoryFile +type memoryFileReader struct { + file *memoryFile + reader *bytes.Reader +} + +func (m *memoryFileReader) Stat() (fs.FileInfo, error) { + return m.file.Stat() +} + +func (m *memoryFileReader) Read(p []byte) (int, error) { + if 
m.reader == nil { + m.reader = bytes.NewReader(m.file.content) + } + return m.reader.Read(p) +} + +func (m *memoryFileReader) Close() error { + return nil +} + +func init() { + rootCmd.AddCommand(serveCmd) + serveCmd.PersistentFlags().String("port", "8080", "Port to serve the PWA on") +} diff --git a/go.mod b/go.mod index d98196b..78af759 100644 --- a/go.mod +++ b/go.mod @@ -18,13 +18,14 @@ require ( github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect github.com/kevinburke/ssh_config v1.2.0 // indirect + github.com/leaanthony/debme v1.2.1 // indirect github.com/pjbgf/sha1cd v0.3.2 // indirect github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect github.com/skeema/knownhosts v1.3.1 // indirect github.com/spf13/pflag v1.0.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect - golang.org/x/crypto v0.37.0 // indirect - golang.org/x/net v0.39.0 // indirect - golang.org/x/sys v0.32.0 // indirect + golang.org/x/crypto v0.43.0 // indirect + golang.org/x/net v0.46.0 // indirect + golang.org/x/sys v0.37.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index ae7851b..1005854 100644 --- a/go.sum +++ b/go.sum @@ -31,6 +31,10 @@ github.com/kevinburke/ssh_config v1.2.0/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/leaanthony/debme v1.2.1 h1:9Tgwf+kjcrbMQ4WnPcEIUcQuIZYqdWftzZkBr+i/oOc= +github.com/leaanthony/debme v1.2.1/go.mod h1:3V+sCm5tYAgQymvSOfYQ5Xx2JCr+OXiD9Jkw3otUjiA= +github.com/leaanthony/slicer v1.5.0/go.mod h1:FwrApmf8gOrpzEWM2J/9Lh79tyq8KTX5AzRtwV7m4AY= +github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU= github.com/pjbgf/sha1cd v0.3.2 
h1:a9wb0bp1oC2TGwStyn0Umc/IGKQnEgF0vVaZ8QF8eo4= github.com/pjbgf/sha1cd v0.3.2/go.mod h1:zQWigSxVmsHEZow5qaLtPYxpcKMMQpa09ixqBxuCS6A= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -54,9 +58,13 @@ github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= +golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= +golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= +golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -65,6 +73,8 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= 
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/pkg/pwa/pwa.go b/pkg/pwa/pwa.go new file mode 100644 index 0000000..6ee3465 --- /dev/null +++ b/pkg/pwa/pwa.go @@ -0,0 +1,202 @@ +package pwa + +import ( + "archive/tar" + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "path" + + "golang.org/x/net/html" +) + +// Manifest represents a simple PWA manifest structure. +type Manifest struct { + Name string `json:"name"` + ShortName string `json:"short_name"` + StartURL string `json:"start_url"` + Icons []Icon `json:"icons"` +} + +// Icon represents an icon in the PWA manifest. +type Icon struct { + Src string `json:"src"` + Sizes string `json:"sizes"` + Type string `json:"type"` +} + +// FindManifestURL finds the manifest URL from a given HTML page. 
+func FindManifestURL(pageURL string) (string, error) { + resp, err := http.Get(pageURL) + if err != nil { + return "", err + } + defer resp.Body.Close() + + doc, err := html.Parse(resp.Body) + if err != nil { + return "", err + } + + var manifestPath string + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "link" { + isManifest := false + for _, a := range n.Attr { + if a.Key == "rel" && a.Val == "manifest" { + isManifest = true + break + } + } + if isManifest { + for _, a := range n.Attr { + if a.Key == "href" { + manifestPath = a.Val + return // exit once found + } + } + } + } + for c := n.FirstChild; c != nil && manifestPath == ""; c = c.NextSibling { + f(c) + } + } + f(doc) + + if manifestPath == "" { + return "", fmt.Errorf("manifest not found") + } + + resolvedURL, err := resolveURL(pageURL, manifestPath) + if err != nil { + return "", fmt.Errorf("could not resolve manifest URL: %w", err) + } + + return resolvedURL.String(), nil +} + +// DownloadAndPackagePWA downloads all assets of a PWA and packages them into a tarball. +func DownloadAndPackagePWA(baseURL string, manifestURL string) ([]byte, error) { + manifestAbsURL, err := resolveURL(baseURL, manifestURL) + if err != nil { + return nil, fmt.Errorf("could not resolve manifest URL: %w", err) + } + + resp, err := http.Get(manifestAbsURL.String()) + if err != nil { + return nil, fmt.Errorf("could not download manifest: %w", err) + } + defer resp.Body.Close() + + manifestBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("could not read manifest body: %w", err) + } + + var manifest Manifest + if err := json.Unmarshal(manifestBody, &manifest); err != nil { + return nil, fmt.Errorf("could not parse manifest JSON: %w", err) + } + + // Create a buffer to write our archive to. 
+ buf := new(bytes.Buffer) + tw := tar.NewWriter(buf) + + // Add the manifest to the archive + hdr := &tar.Header{ + Name: "manifest.json", + Mode: 0600, + Size: int64(len(manifestBody)), + } + if err := tw.WriteHeader(hdr); err != nil { + return nil, err + } + if _, err := tw.Write(manifestBody); err != nil { + return nil, err + } + + // Add the start_url to the archive + if manifest.StartURL != "" { + startURLAbs, err := resolveURL(manifestAbsURL.String(), manifest.StartURL) + if err != nil { + return nil, fmt.Errorf("could not resolve start_url: %w", err) + } + err = downloadAndAddFileToTar(tw, startURLAbs, manifest.StartURL) + if err != nil { + return nil, fmt.Errorf("failed to download start_url asset: %w", err) + } + } + + // Add the icons to the archive + for _, icon := range manifest.Icons { + iconURLAbs, err := resolveURL(manifestAbsURL.String(), icon.Src) + if err != nil { + fmt.Printf("Warning: could not resolve icon URL %s: %v\n", icon.Src, err) + continue + } + err = downloadAndAddFileToTar(tw, iconURLAbs, icon.Src) + if err != nil { + fmt.Printf("Warning: failed to download icon %s: %v\n", icon.Src, err) + } + } + + // Add the base HTML to the archive + baseURLAbs, _ := url.Parse(baseURL) + err = downloadAndAddFileToTar(tw, baseURLAbs, "index.html") + if err != nil { + return nil, fmt.Errorf("failed to download base HTML: %w", err) + } + + if err := tw.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + +func resolveURL(base, ref string) (*url.URL, error) { + baseURL, err := url.Parse(base) + if err != nil { + return nil, err + } + refURL, err := url.Parse(ref) + if err != nil { + return nil, err + } + return baseURL.ResolveReference(refURL), nil +} + +func downloadAndAddFileToTar(tw *tar.Writer, fileURL *url.URL, internalPath string) error { + resp, err := http.Get(fileURL.String()) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("bad status: %s", 
resp.Status) + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + return err + } + + hdr := &tar.Header{ + Name: path.Clean(internalPath), + Mode: 0600, + Size: int64(len(data)), + } + if err := tw.WriteHeader(hdr); err != nil { + return err + } + if _, err := tw.Write(data); err != nil { + return err + } + + return nil +} diff --git a/pkg/pwa/pwa_test.go b/pkg/pwa/pwa_test.go new file mode 100644 index 0000000..ec615a2 --- /dev/null +++ b/pkg/pwa/pwa_test.go @@ -0,0 +1,131 @@ +package pwa + +import ( + "archive/tar" + "bytes" + "net/http" + "net/http/httptest" + "testing" +) + +func TestFindManifestURL(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(` + + + + Test PWA + + + +

Hello, PWA!

+ + + `)) + })) + defer server.Close() + + expectedURL := server.URL + "/manifest.json" + actualURL, err := FindManifestURL(server.URL) + if err != nil { + t.Fatalf("FindManifestURL failed: %v", err) + } + + if actualURL != expectedURL { + t.Errorf("Expected manifest URL %s, but got %s", expectedURL, actualURL) + } +} + +func TestDownloadAndPackagePWA(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/": + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(` + + + + Test PWA + + + +

Hello, PWA!

+ + + `)) + case "/manifest.json": + w.Header().Set("Content-Type", "application/json") + w.Write([]byte(`{ + "name": "Test PWA", + "short_name": "TestPWA", + "start_url": "index.html", + "icons": [ + { + "src": "icon.png", + "sizes": "192x192", + "type": "image/png" + } + ] + }`)) + case "/index.html": + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(`

Hello, PWA!

`)) + case "/icon.png": + w.Header().Set("Content-Type", "image/png") + w.Write([]byte("fake image data")) + default: + http.NotFound(w, r) + } + })) + defer server.Close() + + tarball, err := DownloadAndPackagePWA(server.URL, server.URL+"/manifest.json") + if err != nil { + t.Fatalf("DownloadAndPackagePWA failed: %v", err) + } + + tarReader := tar.NewReader(bytes.NewReader(tarball)) + expectedFiles := []string{"manifest.json", "index.html", "icon.png"} + foundFiles := make(map[string]bool) + + for { + header, err := tarReader.Next() + if err != nil { + break + } + foundFiles[header.Name] = true + } + + for _, file := range expectedFiles { + if !foundFiles[file] { + t.Errorf("Expected to find file %s in tarball, but it was not found", file) + } + } +} + +func TestResolveURL(t *testing.T) { + tests := []struct { + base string + ref string + want string + }{ + {"http://example.com/", "foo.html", "http://example.com/foo.html"}, + {"http://example.com/foo/", "bar.html", "http://example.com/foo/bar.html"}, + {"http://example.com/foo", "bar.html", "http://example.com/bar.html"}, + {"http://example.com/foo/", "/bar.html", "http://example.com/bar.html"}, + {"http://example.com/foo", "/bar.html", "http://example.com/bar.html"}, + {"http://example.com/", "http://example.com/foo/bar.html", "http://example.com/foo/bar.html"}, + } + + for _, tt := range tests { + got, err := resolveURL(tt.base, tt.ref) + if err != nil { + t.Errorf("resolveURL(%q, %q) returned error: %v", tt.base, tt.ref, err) + continue + } + if got.String() != tt.want { + t.Errorf("resolveURL(%q, %q) = %q, want %q", tt.base, tt.ref, got.String(), tt.want) + } + } +} From 5149b6440334c4c30e8c98fbbaf7109df08db2f2 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 31 Oct 2025 20:47:11 +0000 Subject: [PATCH 2/4] feat: Implement DataNode and update PWA commands This commit introduces a new `DataNode` package, which provides an in-memory, 
`fs.FS`-compatible filesystem with a `debme`-like interface. The `DataNode` can be serialized to and from a TAR archive, making it suitable for storing downloaded assets. The `pwa` and `serve` commands have been refactored to use the `DataNode`. The `pwa` command now packages downloaded PWA assets into a `DataNode` and saves it as a `.dat` file. The `serve` command loads a `.dat` file into a `DataNode` and serves its contents. --- cmd/pwa.go | 10 +- cmd/serve.go | 130 +------------- pkg/datanode/datanode.go | 317 ++++++++++++++++++++++++++++++++++ pkg/datanode/datanode_test.go | 124 +++++++++++++ pkg/pwa/pwa.go | 57 ++---- pkg/pwa/pwa_test.go | 23 +-- 6 files changed, 473 insertions(+), 188 deletions(-) create mode 100644 pkg/datanode/datanode.go create mode 100644 pkg/datanode/datanode_test.go diff --git a/cmd/pwa.go b/cmd/pwa.go index d3deb09..e288fcb 100644 --- a/cmd/pwa.go +++ b/cmd/pwa.go @@ -28,12 +28,18 @@ var pwaCmd = &cobra.Command{ fmt.Printf("Found manifest: %s\n", manifestURL) fmt.Println("Downloading and packaging PWA...") - pwaData, err := pwa.DownloadAndPackagePWA(pwaURL, manifestURL) + dn, err := pwa.DownloadAndPackagePWA(pwaURL, manifestURL) if err != nil { fmt.Printf("Error downloading and packaging PWA: %v\n", err) return } + pwaData, err := dn.ToTar() + if err != nil { + fmt.Printf("Error serializing PWA data: %v\n", err) + return + } + err = os.WriteFile(outputFile, pwaData, 0644) if err != nil { fmt.Printf("Error writing PWA to file: %v\n", err) @@ -46,5 +52,5 @@ var pwaCmd = &cobra.Command{ func init() { rootCmd.AddCommand(pwaCmd) - pwaCmd.PersistentFlags().String("output", "pwa.tar", "Output file for the PWA tarball") + pwaCmd.PersistentFlags().String("output", "pwa.dat", "Output file for the PWA DataNode") } diff --git a/cmd/serve.go b/cmd/serve.go index b780df7..57beb87 100644 --- a/cmd/serve.go +++ b/cmd/serve.go @@ -1,16 +1,11 @@ package cmd import ( - "archive/tar" - "bytes" "fmt" - "io" - "io/fs" "net/http" "os" - "path" - "strings" - 
"time" + + "borg-data-collector/pkg/datanode" "github.com/spf13/cobra" ) @@ -31,13 +26,13 @@ var serveCmd = &cobra.Command{ return } - memFS, err := newMemoryFS(pwaData) + dn, err := datanode.FromTar(pwaData) if err != nil { - fmt.Printf("Error creating in-memory filesystem: %v\n", err) + fmt.Printf("Error creating DataNode from tarball: %v\n", err) return } - http.Handle("/", http.FileServer(http.FS(memFS))) + http.Handle("/", http.FileServer(http.FS(dn))) fmt.Printf("Serving PWA on http://localhost:%s\n", port) err = http.ListenAndServe(":"+port, nil) @@ -48,121 +43,6 @@ var serveCmd = &cobra.Command{ }, } -// memoryFS is an in-memory filesystem that implements fs.FS -type memoryFS struct { - files map[string]*memoryFile -} - -func newMemoryFS(tarball []byte) (*memoryFS, error) { - memFS := &memoryFS{files: make(map[string]*memoryFile)} - tarReader := tar.NewReader(bytes.NewReader(tarball)) - - for { - header, err := tarReader.Next() - if err == io.EOF { - break - } - if err != nil { - return nil, err - } - - if header.Typeflag == tar.TypeReg { - data, err := io.ReadAll(tarReader) - if err != nil { - return nil, err - } - name := strings.TrimPrefix(header.Name, "/") - memFS.files[name] = &memoryFile{ - name: name, - content: data, - modTime: header.ModTime, - } - } - } - - return memFS, nil -} - -func (m *memoryFS) Open(name string) (fs.File, error) { - name = strings.TrimPrefix(name, "/") - if name == "" { - name = "index.html" - } - if file, ok := m.files[name]; ok { - return &memoryFileReader{file: file}, nil - } - return nil, fs.ErrNotExist -} - -// memoryFile represents a file in the in-memory filesystem -type memoryFile struct { - name string - content []byte - modTime time.Time -} - -func (m *memoryFile) Stat() (fs.FileInfo, error) { - return &memoryFileInfo{file: m}, nil -} - -func (m *memoryFile) Read(p []byte) (int, error) { - return 0, nil // This is implemented by memoryFileReader -} - -func (m *memoryFile) Close() error { - return nil -} - -// 
memoryFileInfo implements fs.FileInfo for a memoryFile -type memoryFileInfo struct { - file *memoryFile -} - -func (m *memoryFileInfo) Name() string { - return path.Base(m.file.name) -} - -func (m *memoryFileInfo) Size() int64 { - return int64(len(m.file.content)) -} - -func (m *memoryFileInfo) Mode() fs.FileMode { - return 0444 -} - -func (m *memoryFileInfo) ModTime() time.Time { - return m.file.modTime -} - -func (m *memoryFileInfo) IsDir() bool { - return false -} - -func (m *memoryFileInfo) Sys() interface{} { - return nil -} - -// memoryFileReader implements fs.File for a memoryFile -type memoryFileReader struct { - file *memoryFile - reader *bytes.Reader -} - -func (m *memoryFileReader) Stat() (fs.FileInfo, error) { - return m.file.Stat() -} - -func (m *memoryFileReader) Read(p []byte) (int, error) { - if m.reader == nil { - m.reader = bytes.NewReader(m.file.content) - } - return m.reader.Read(p) -} - -func (m *memoryFileReader) Close() error { - return nil -} - func init() { rootCmd.AddCommand(serveCmd) serveCmd.PersistentFlags().String("port", "8080", "Port to serve the PWA on") diff --git a/pkg/datanode/datanode.go b/pkg/datanode/datanode.go new file mode 100644 index 0000000..fe2f43b --- /dev/null +++ b/pkg/datanode/datanode.go @@ -0,0 +1,317 @@ +package datanode + +import ( + "archive/tar" + "bytes" + "io" + "io/fs" + "os" + "path" + "sort" + "strings" + "time" +) + +// DataNode is an in-memory filesystem that is compatible with fs.FS. +type DataNode struct { + files map[string]*dataFile +} + +// New creates a new, empty DataNode. +func New() *DataNode { + return &DataNode{files: make(map[string]*dataFile)} +} + +// FromTar creates a new DataNode from a tarball. 
+func FromTar(tarball []byte) (*DataNode, error) { + dn := New() + tarReader := tar.NewReader(bytes.NewReader(tarball)) + + for { + header, err := tarReader.Next() + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + + if header.Typeflag == tar.TypeReg { + data, err := io.ReadAll(tarReader) + if err != nil { + return nil, err + } + dn.AddData(header.Name, data) + } + } + + return dn, nil +} + +// ToTar serializes the DataNode to a tarball. +func (d *DataNode) ToTar() ([]byte, error) { + buf := new(bytes.Buffer) + tw := tar.NewWriter(buf) + + for _, file := range d.files { + hdr := &tar.Header{ + Name: file.name, + Mode: 0600, + Size: int64(len(file.content)), + ModTime: file.modTime, + } + if err := tw.WriteHeader(hdr); err != nil { + return nil, err + } + if _, err := tw.Write(file.content); err != nil { + return nil, err + } + } + + if err := tw.Close(); err != nil { + return nil, err + } + + return buf.Bytes(), nil +} + +// AddData adds a file to the DataNode. +func (d *DataNode) AddData(name string, content []byte) { + name = strings.TrimPrefix(name, "/") + d.files[name] = &dataFile{ + name: name, + content: content, + modTime: time.Now(), + } +} + +// Open opens a file from the DataNode. +func (d *DataNode) Open(name string) (fs.File, error) { + name = strings.TrimPrefix(name, "/") + if file, ok := d.files[name]; ok { + return &dataFileReader{file: file}, nil + } + // Check if it's a directory + prefix := name + "/" + if name == "." || name == "" { + prefix = "" + } + for p := range d.files { + if strings.HasPrefix(p, prefix) { + return &dirFile{path: name, modTime: time.Now()}, nil + } + } + return nil, fs.ErrNotExist +} + +// ReadDir reads and returns all directory entries for the named directory. +func (d *DataNode) ReadDir(name string) ([]fs.DirEntry, error) { + name = strings.TrimPrefix(name, "/") + if name == "." 
{ + name = "" + } + + entries := []fs.DirEntry{} + seen := make(map[string]bool) + + prefix := "" + if name != "" { + prefix = name + "/" + } + + for p := range d.files { + if !strings.HasPrefix(p, prefix) { + continue + } + + relPath := strings.TrimPrefix(p, prefix) + firstComponent := strings.Split(relPath, "/")[0] + + if seen[firstComponent] { + continue + } + seen[firstComponent] = true + + if strings.Contains(relPath, "/") { + // It's a directory + dir := &dirInfo{name: firstComponent, modTime: time.Now()} + entries = append(entries, fs.FileInfoToDirEntry(dir)) + } else { + // It's a file + file := d.files[p] + info, _ := file.Stat() + entries = append(entries, fs.FileInfoToDirEntry(info)) + } + } + + // Sort for stable order in tests + sort.Slice(entries, func(i, j int) bool { + return entries[i].Name() < entries[j].Name() + }) + + return entries, nil +} + +// Stat returns the FileInfo structure describing file. +func (d *DataNode) Stat(name string) (fs.FileInfo, error) { + name = strings.TrimPrefix(name, "/") + if file, ok := d.files[name]; ok { + return file.Stat() + } + // Check if it's a directory + prefix := name + "/" + if name == "." || name == "" { + prefix = "" + } + for p := range d.files { + if strings.HasPrefix(p, prefix) { + return &dirInfo{name: path.Base(name), modTime: time.Now()}, nil + } + } + + return nil, fs.ErrNotExist +} + +// ExistsOptions allows customizing the Exists check. +type ExistsOptions struct { + WantType fs.FileMode +} + +// Exists returns true if the file or directory exists. 
+func (d *DataNode) Exists(name string, opts ...ExistsOptions) (bool, error) { + info, err := d.Stat(name) + if err != nil { + if err == fs.ErrNotExist || os.IsNotExist(err) { + return false, nil + } + return false, err + } + if len(opts) > 0 { + if opts[0].WantType == fs.ModeDir && !info.IsDir() { + return false, nil + } + if opts[0].WantType != fs.ModeDir && info.IsDir() { + return false, nil + } + } + return true, nil +} + +// WalkOptions allows customizing the Walk behavior. +type WalkOptions struct { + MaxDepth int + Filter func(path string, d fs.DirEntry) bool + SkipErrors bool +} + +// Walk recursively descends the file tree rooted at root, calling fn for each file or directory. +func (d *DataNode) Walk(root string, fn fs.WalkDirFunc, opts ...WalkOptions) error { + var maxDepth int + var filter func(string, fs.DirEntry) bool + var skipErrors bool + if len(opts) > 0 { + maxDepth = opts[0].MaxDepth + filter = opts[0].Filter + skipErrors = opts[0].SkipErrors + } + + return fs.WalkDir(d, root, func(path string, de fs.DirEntry, err error) error { + if err != nil { + if skipErrors { + return nil + } + return fn(path, de, err) + } + if filter != nil && !filter(path, de) { + return nil + } + if maxDepth > 0 { + currentDepth := strings.Count(strings.TrimPrefix(path, root), "/") + if de.IsDir() && currentDepth >= maxDepth { + return fs.SkipDir + } + } + return fn(path, de, nil) + }) +} + +// CopyFile copies a file from the DataNode to the local filesystem. +func (d *DataNode) CopyFile(sourcePath string, target string, perm os.FileMode) error { + sourceFile, err := d.Open(sourcePath) + if err != nil { + return err + } + defer sourceFile.Close() + + targetFile, err := os.OpenFile(target, os.O_CREATE|os.O_RDWR, perm) + if err != nil { + return err + } + defer targetFile.Close() + + _, err = io.Copy(targetFile, sourceFile) + return err +} + +// dataFile represents a file in the DataNode. 
+type dataFile struct { + name string + content []byte + modTime time.Time +} + +func (d *dataFile) Stat() (fs.FileInfo, error) { return &dataFileInfo{file: d}, nil } +func (d *dataFile) Read(p []byte) (int, error) { return 0, io.EOF } +func (d *dataFile) Close() error { return nil } + +// dataFileInfo implements fs.FileInfo for a dataFile. +type dataFileInfo struct{ file *dataFile } + +func (d *dataFileInfo) Name() string { return path.Base(d.file.name) } +func (d *dataFileInfo) Size() int64 { return int64(len(d.file.content)) } +func (d *dataFileInfo) Mode() fs.FileMode { return 0444 } +func (d *dataFileInfo) ModTime() time.Time { return d.file.modTime } +func (d *dataFileInfo) IsDir() bool { return false } +func (d *dataFileInfo) Sys() interface{} { return nil } + +// dataFileReader implements fs.File for a dataFile. +type dataFileReader struct { + file *dataFile + reader *bytes.Reader +} + +func (d *dataFileReader) Stat() (fs.FileInfo, error) { return d.file.Stat() } +func (d *dataFileReader) Read(p []byte) (int, error) { + if d.reader == nil { + d.reader = bytes.NewReader(d.file.content) + } + return d.reader.Read(p) +} +func (d *dataFileReader) Close() error { return nil } + +// dirInfo implements fs.FileInfo for an implicit directory. +type dirInfo struct { + name string + modTime time.Time +} + +func (d *dirInfo) Name() string { return d.name } +func (d *dirInfo) Size() int64 { return 0 } +func (d *dirInfo) Mode() fs.FileMode { return fs.ModeDir | 0555 } +func (d *dirInfo) ModTime() time.Time { return d.modTime } +func (d *dirInfo) IsDir() bool { return true } +func (d *dirInfo) Sys() interface{} { return nil } + +// dirFile implements fs.File for a directory. 
+type dirFile struct { + path string + modTime time.Time +} + +func (d *dirFile) Stat() (fs.FileInfo, error) { + return &dirInfo{name: path.Base(d.path), modTime: d.modTime}, nil +} +func (d *dirFile) Read([]byte) (int, error) { + return 0, &fs.PathError{Op: "read", Path: d.path, Err: fs.ErrInvalid} +} +func (d *dirFile) Close() error { return nil } diff --git a/pkg/datanode/datanode_test.go b/pkg/datanode/datanode_test.go new file mode 100644 index 0000000..847d20b --- /dev/null +++ b/pkg/datanode/datanode_test.go @@ -0,0 +1,124 @@ +package datanode + +import ( + "io/fs" + "os" + "reflect" + "sort" + "testing" +) + +func TestDataNode(t *testing.T) { + dn := New() + dn.AddData("foo.txt", []byte("foo")) + dn.AddData("bar/baz.txt", []byte("baz")) + dn.AddData("bar/qux.txt", []byte("qux")) + + // Test Open + file, err := dn.Open("foo.txt") + if err != nil { + t.Fatalf("Open failed: %v", err) + } + file.Close() + + _, err = dn.Open("nonexistent.txt") + if err == nil { + t.Fatalf("Expected error opening nonexistent file, got nil") + } + + // Test Stat + info, err := dn.Stat("bar/baz.txt") + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + if info.Name() != "baz.txt" { + t.Errorf("Expected name baz.txt, got %s", info.Name()) + } + if info.Size() != 3 { + t.Errorf("Expected size 3, got %d", info.Size()) + } + if info.IsDir() { + t.Errorf("Expected baz.txt to not be a directory") + } + + dirInfo, err := dn.Stat("bar") + if err != nil { + t.Fatalf("Stat directory failed: %v", err) + } + if !dirInfo.IsDir() { + t.Errorf("Expected 'bar' to be a directory") + } + + // Test Exists + exists, err := dn.Exists("foo.txt") + if err != nil || !exists { + t.Errorf("Expected foo.txt to exist, err: %v", err) + } + exists, err = dn.Exists("bar") + if err != nil || !exists { + t.Errorf("Expected 'bar' directory to exist, err: %v", err) + } + exists, err = dn.Exists("nonexistent") + if err != nil || exists { + t.Errorf("Expected 'nonexistent' to not exist, err: %v", err) + } + + // 
Test ReadDir + entries, err := dn.ReadDir(".") + if err != nil { + t.Fatalf("ReadDir failed: %v", err) + } + expectedRootEntries := []string{"bar", "foo.txt"} + if len(entries) != len(expectedRootEntries) { + t.Errorf("Expected %d entries in root, got %d", len(expectedRootEntries), len(entries)) + } + var rootEntryNames []string + for _, e := range entries { + rootEntryNames = append(rootEntryNames, e.Name()) + } + sort.Strings(rootEntryNames) + if !reflect.DeepEqual(rootEntryNames, expectedRootEntries) { + t.Errorf("Expected entries %v, got %v", expectedRootEntries, rootEntryNames) + } + + barEntries, err := dn.ReadDir("bar") + if err != nil { + t.Fatalf("ReadDir('bar') failed: %v", err) + } + expectedBarEntries := []string{"baz.txt", "qux.txt"} + if len(barEntries) != len(expectedBarEntries) { + t.Errorf("Expected %d entries in 'bar', got %d", len(expectedBarEntries), len(barEntries)) + } + + // Test Walk + var paths []string + dn.Walk(".", func(path string, d fs.DirEntry, err error) error { + paths = append(paths, path) + return nil + }) + expectedPaths := []string{".", "bar", "bar/baz.txt", "bar/qux.txt", "foo.txt"} + sort.Strings(paths) + if !reflect.DeepEqual(paths, expectedPaths) { + t.Errorf("Walk expected paths %v, got %v", expectedPaths, paths) + } + + // Test CopyFile + tmpfile, err := os.CreateTemp("", "datanode-test-") + if err != nil { + t.Fatalf("CreateTemp failed: %v", err) + } + defer os.Remove(tmpfile.Name()) + + err = dn.CopyFile("foo.txt", tmpfile.Name(), 0644) + if err != nil { + t.Fatalf("CopyFile failed: %v", err) + } + + content, err := os.ReadFile(tmpfile.Name()) + if err != nil { + t.Fatalf("ReadFile failed: %v", err) + } + if string(content) != "foo" { + t.Errorf("Expected foo, got %s", string(content)) + } +} diff --git a/pkg/pwa/pwa.go b/pkg/pwa/pwa.go index 6ee3465..6cef4c5 100644 --- a/pkg/pwa/pwa.go +++ b/pkg/pwa/pwa.go @@ -1,8 +1,6 @@ package pwa import ( - "archive/tar" - "bytes" "encoding/json" "fmt" "io" @@ -10,6 +8,8 @@ import ( 
"net/url" "path" + "borg-data-collector/pkg/datanode" + "golang.org/x/net/html" ) @@ -79,8 +79,8 @@ func FindManifestURL(pageURL string) (string, error) { return resolvedURL.String(), nil } -// DownloadAndPackagePWA downloads all assets of a PWA and packages them into a tarball. -func DownloadAndPackagePWA(baseURL string, manifestURL string) ([]byte, error) { +// DownloadAndPackagePWA downloads all assets of a PWA and packages them into a DataNode. +func DownloadAndPackagePWA(baseURL string, manifestURL string) (*datanode.DataNode, error) { manifestAbsURL, err := resolveURL(baseURL, manifestURL) if err != nil { return nil, fmt.Errorf("could not resolve manifest URL: %w", err) @@ -102,60 +102,39 @@ func DownloadAndPackagePWA(baseURL string, manifestURL string) ([]byte, error) { return nil, fmt.Errorf("could not parse manifest JSON: %w", err) } - // Create a buffer to write our archive to. - buf := new(bytes.Buffer) - tw := tar.NewWriter(buf) + dn := datanode.New() + dn.AddData("manifest.json", manifestBody) - // Add the manifest to the archive - hdr := &tar.Header{ - Name: "manifest.json", - Mode: 0600, - Size: int64(len(manifestBody)), - } - if err := tw.WriteHeader(hdr); err != nil { - return nil, err - } - if _, err := tw.Write(manifestBody); err != nil { - return nil, err - } - - // Add the start_url to the archive if manifest.StartURL != "" { startURLAbs, err := resolveURL(manifestAbsURL.String(), manifest.StartURL) if err != nil { return nil, fmt.Errorf("could not resolve start_url: %w", err) } - err = downloadAndAddFileToTar(tw, startURLAbs, manifest.StartURL) + err = downloadAndAddFile(dn, startURLAbs, manifest.StartURL) if err != nil { return nil, fmt.Errorf("failed to download start_url asset: %w", err) } } - // Add the icons to the archive for _, icon := range manifest.Icons { iconURLAbs, err := resolveURL(manifestAbsURL.String(), icon.Src) if err != nil { fmt.Printf("Warning: could not resolve icon URL %s: %v\n", icon.Src, err) continue } - err = 
downloadAndAddFileToTar(tw, iconURLAbs, icon.Src) + err = downloadAndAddFile(dn, iconURLAbs, icon.Src) if err != nil { fmt.Printf("Warning: failed to download icon %s: %v\n", icon.Src, err) } } - // Add the base HTML to the archive baseURLAbs, _ := url.Parse(baseURL) - err = downloadAndAddFileToTar(tw, baseURLAbs, "index.html") + err = downloadAndAddFile(dn, baseURLAbs, "index.html") if err != nil { return nil, fmt.Errorf("failed to download base HTML: %w", err) } - if err := tw.Close(); err != nil { - return nil, err - } - - return buf.Bytes(), nil + return dn, nil } func resolveURL(base, ref string) (*url.URL, error) { @@ -170,7 +149,7 @@ func resolveURL(base, ref string) (*url.URL, error) { return baseURL.ResolveReference(refURL), nil } -func downloadAndAddFileToTar(tw *tar.Writer, fileURL *url.URL, internalPath string) error { +func downloadAndAddFile(dn *datanode.DataNode, fileURL *url.URL, internalPath string) error { resp, err := http.Get(fileURL.String()) if err != nil { return err @@ -185,18 +164,6 @@ func downloadAndAddFileToTar(tw *tar.Writer, fileURL *url.URL, internalPath stri if err != nil { return err } - - hdr := &tar.Header{ - Name: path.Clean(internalPath), - Mode: 0600, - Size: int64(len(data)), - } - if err := tw.WriteHeader(hdr); err != nil { - return err - } - if _, err := tw.Write(data); err != nil { - return err - } - + dn.AddData(path.Clean(internalPath), data) return nil } diff --git a/pkg/pwa/pwa_test.go b/pkg/pwa/pwa_test.go index ec615a2..186412d 100644 --- a/pkg/pwa/pwa_test.go +++ b/pkg/pwa/pwa_test.go @@ -1,8 +1,6 @@ package pwa import ( - "archive/tar" - "bytes" "net/http" "net/http/httptest" "testing" @@ -80,26 +78,19 @@ func TestDownloadAndPackagePWA(t *testing.T) { })) defer server.Close() - tarball, err := DownloadAndPackagePWA(server.URL, server.URL+"/manifest.json") + dn, err := DownloadAndPackagePWA(server.URL, server.URL+"/manifest.json") if err != nil { t.Fatalf("DownloadAndPackagePWA failed: %v", err) } - tarReader := 
tar.NewReader(bytes.NewReader(tarball)) expectedFiles := []string{"manifest.json", "index.html", "icon.png"} - foundFiles := make(map[string]bool) - - for { - header, err := tarReader.Next() - if err != nil { - break - } - foundFiles[header.Name] = true - } - for _, file := range expectedFiles { - if !foundFiles[file] { - t.Errorf("Expected to find file %s in tarball, but it was not found", file) + exists, err := dn.Exists(file) + if err != nil { + t.Fatalf("Exists failed for %s: %v", file, err) + } + if !exists { + t.Errorf("Expected to find file %s in DataNode, but it was not found", file) } } } From bd65eefcd349ed1597ddbfa06eeeeca67f9803d6 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 31 Oct 2025 21:03:26 +0000 Subject: [PATCH 3/4] refactor: Use DataNode for repository collection This commit refactors the repository collection functionality to use the new `DataNode` package instead of the old `trix` package. The `collect` and `all` commands have been updated to use the new `vcs` package, which clones Git repositories and packages them into a `DataNode`. The `trix` package and its related commands (`cat`, `ingest`) have been removed. 
--- cmd/all.go | 40 +++++++++++++------------ cmd/cat.go | 53 --------------------------------- cmd/collect.go | 30 ++++++++++--------- cmd/helpers.go | 41 -------------------------- cmd/ingest.go | 56 ----------------------------------- pkg/trix/trix.go | 63 --------------------------------------- pkg/vcs/git.go | 51 ++++++++++++++++++++++++++++++++ pkg/vcs/git_test.go | 72 +++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 160 insertions(+), 246 deletions(-) delete mode 100644 cmd/cat.go delete mode 100644 cmd/helpers.go delete mode 100644 cmd/ingest.go delete mode 100644 pkg/trix/trix.go create mode 100644 pkg/vcs/git.go create mode 100644 pkg/vcs/git_test.go diff --git a/cmd/all.go b/cmd/all.go index dcb3dac..1924aa9 100644 --- a/cmd/all.go +++ b/cmd/all.go @@ -3,10 +3,11 @@ package cmd import ( "fmt" "os" + "strings" "borg-data-collector/pkg/borg" "borg-data-collector/pkg/github" - "borg-data-collector/pkg/trix" + "borg-data-collector/pkg/vcs" "github.com/spf13/cobra" ) @@ -15,7 +16,7 @@ import ( var allCmd = &cobra.Command{ Use: "all [user/org]", Short: "Collect all public repositories from a user or organization", - Long: `Collect all public repositories from a user or organization and store them in a Trix cube.`, + Long: `Collect all public repositories from a user or organization and store them in a DataNode.`, Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args []string) { fmt.Println(borg.GetRandomAssimilationMessage()) @@ -26,30 +27,30 @@ var allCmd = &cobra.Command{ return } - outputFile, _ := cmd.Flags().GetString("output") - - cube, err := trix.NewCube(outputFile) - if err != nil { - fmt.Println(err) - return - } - defer cube.Close() + outputDir, _ := cmd.Flags().GetString("output") for _, repoURL := range repos { fmt.Printf("Cloning %s...\n", repoURL) - tempPath, err := os.MkdirTemp("", "borg-clone-*") - if err != nil { - fmt.Println(err) - return - } - defer os.RemoveAll(tempPath) - - err = addRepoToCube(repoURL, cube, 
tempPath) + dn, err := vcs.CloneGitRepository(repoURL) if err != nil { fmt.Printf("Error cloning %s: %s\n", repoURL, err) continue } + + data, err := dn.ToTar() + if err != nil { + fmt.Printf("Error serializing DataNode for %s: %v\n", repoURL, err) + continue + } + + repoName := strings.Split(repoURL, "/")[len(strings.Split(repoURL, "/"))-1] + outputFile := fmt.Sprintf("%s/%s.dat", outputDir, repoName) + err = os.WriteFile(outputFile, data, 0644) + if err != nil { + fmt.Printf("Error writing DataNode for %s to file: %v\n", repoURL, err) + continue + } } fmt.Println(borg.GetRandomCodeLongMessage()) @@ -57,5 +58,6 @@ var allCmd = &cobra.Command{ } func init() { - collectCmd.AddCommand(allCmd) + rootCmd.AddCommand(allCmd) + allCmd.PersistentFlags().String("output", ".", "Output directory for the DataNodes") } diff --git a/cmd/cat.go b/cmd/cat.go deleted file mode 100644 index 9efce67..0000000 --- a/cmd/cat.go +++ /dev/null @@ -1,53 +0,0 @@ -package cmd - -import ( - "fmt" - "io" - "os" - - "borg-data-collector/pkg/trix" - - "github.com/spf13/cobra" -) - -// catCmd represents the cat command -var catCmd = &cobra.Command{ - Use: "cat [cube-file] [file-to-extract]", - Short: "Extract a file from a Trix cube", - Long: `Extract a file from a Trix cube and print its content to standard output.`, - Args: cobra.ExactArgs(2), - Run: func(cmd *cobra.Command, args []string) { - cubeFile := args[0] - fileToExtract := args[1] - - reader, file, err := trix.Extract(cubeFile) - if err != nil { - fmt.Println(err) - return - } - defer file.Close() - - for { - hdr, err := reader.Next() - if err == io.EOF { - break - } - if err != nil { - fmt.Println(err) - return - } - - if hdr.Name == fileToExtract { - if _, err := io.Copy(os.Stdout, reader); err != nil { - fmt.Println(err) - return - } - return - } - } - }, -} - -func init() { - rootCmd.AddCommand(catCmd) -} diff --git a/cmd/collect.go b/cmd/collect.go index 33fcbe9..779441c 100644 --- a/cmd/collect.go +++ b/cmd/collect.go @@ -2,8 
+2,9 @@ package cmd import ( "fmt" + "os" - "borg-data-collector/pkg/trix" + "borg-data-collector/pkg/vcs" "github.com/spf13/cobra" ) @@ -12,34 +13,35 @@ import ( var collectCmd = &cobra.Command{ Use: "collect [repository-url]", Short: "Collect a single repository", - Long: `Collect a single repository and store it in a Trix cube.`, + Long: `Collect a single repository and store it in a DataNode.`, Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args []string) { - if len(args) < 1 { - fmt.Println("Please provide a repository URL") - return - } repoURL := args[0] - clonePath, _ := cmd.Flags().GetString("path") outputFile, _ := cmd.Flags().GetString("output") - cube, err := trix.NewCube(outputFile) + dn, err := vcs.CloneGitRepository(repoURL) if err != nil { - fmt.Println(err) + fmt.Printf("Error cloning repository: %v\n", err) return } - defer cube.Close() - err = addRepoToCube(repoURL, cube, clonePath) + data, err := dn.ToTar() if err != nil { - fmt.Println(err) + fmt.Printf("Error serializing DataNode: %v\n", err) return } + + err = os.WriteFile(outputFile, data, 0644) + if err != nil { + fmt.Printf("Error writing DataNode to file: %v\n", err) + return + } + + fmt.Printf("Repository saved to %s\n", outputFile) }, } func init() { rootCmd.AddCommand(collectCmd) - collectCmd.PersistentFlags().String("path", "/tmp/borg-clone", "Path to clone the repository") - collectCmd.PersistentFlags().String("output", "borg.cube", "Output file for the Trix cube") + collectCmd.PersistentFlags().String("output", "repo.dat", "Output file for the DataNode") } diff --git a/cmd/helpers.go b/cmd/helpers.go deleted file mode 100644 index f318023..0000000 --- a/cmd/helpers.go +++ /dev/null @@ -1,41 +0,0 @@ -package cmd - -import ( - "os" - "path/filepath" - - "borg-data-collector/pkg/trix" - - "github.com/go-git/go-git/v5" -) - -func addRepoToCube(repoURL string, cube *trix.Cube, clonePath string) error { - _, err := git.PlainClone(clonePath, false, &git.CloneOptions{ - URL: 
repoURL, - Progress: os.Stdout, - }) - - if err != nil { - return err - } - - err = filepath.Walk(clonePath, func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if !info.IsDir() { - content, err := os.ReadFile(path) - if err != nil { - return err - } - relPath, err := filepath.Rel(clonePath, path) - if err != nil { - return err - } - cube.AddFile(relPath, content) - } - return nil - }) - - return err -} diff --git a/cmd/ingest.go b/cmd/ingest.go deleted file mode 100644 index 23a8f32..0000000 --- a/cmd/ingest.go +++ /dev/null @@ -1,56 +0,0 @@ -package cmd - -import ( - "fmt" - "os" - - "borg-data-collector/pkg/borg" - "borg-data-collector/pkg/trix" - - "github.com/spf13/cobra" -) - -// ingestCmd represents the ingest command -var ingestCmd = &cobra.Command{ - Use: "ingest [cube-file] [file-to-add]", - Short: "Add a file to a Trix cube", - Long: `Add a file to a Trix cube. If the cube file does not exist, it will be created.`, - Args: cobra.ExactArgs(2), - Run: func(cmd *cobra.Command, args []string) { - cubeFile := args[0] - fileToAdd := args[1] - - var cube *trix.Cube - var err error - - if _, err := os.Stat(cubeFile); os.IsNotExist(err) { - cube, err = trix.NewCube(cubeFile) - } else { - cube, err = trix.AppendToCube(cubeFile) - } - - if err != nil { - fmt.Println(err) - return - } - defer cube.Close() - - content, err := os.ReadFile(fileToAdd) - if err != nil { - fmt.Println(err) - return - } - - err = cube.AddFile(fileToAdd, content) - if err != nil { - fmt.Println(err) - return - } - - fmt.Println(borg.GetRandomCodeShortMessage()) - }, -} - -func init() { - rootCmd.AddCommand(ingestCmd) -} diff --git a/pkg/trix/trix.go b/pkg/trix/trix.go deleted file mode 100644 index 79f4fd2..0000000 --- a/pkg/trix/trix.go +++ /dev/null @@ -1,63 +0,0 @@ -package trix - -import ( - "archive/tar" - "os" -) - -type Cube struct { - writer *tar.Writer - file *os.File -} - -func NewCube(path string) (*Cube, error) { - file, err := 
os.Create(path) - if err != nil { - return nil, err - } - return &Cube{ - writer: tar.NewWriter(file), - file: file, - }, nil -} - -func (c *Cube) AddFile(path string, content []byte) error { - hdr := &tar.Header{ - Name: path, - Mode: 0600, - Size: int64(len(content)), - } - if err := c.writer.WriteHeader(hdr); err != nil { - return err - } - if _, err := c.writer.Write(content); err != nil { - return err - } - return nil -} - -func (c *Cube) Close() error { - if err := c.writer.Close(); err != nil { - return err - } - return c.file.Close() -} - -func Extract(path string) (*tar.Reader, *os.File, error) { - file, err := os.Open(path) - if err != nil { - return nil, nil, err - } - return tar.NewReader(file), file, nil -} - -func AppendToCube(path string) (*Cube, error) { - file, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0644) - if err != nil { - return nil, err - } - return &Cube{ - writer: tar.NewWriter(file), - file: file, - }, nil -} diff --git a/pkg/vcs/git.go b/pkg/vcs/git.go new file mode 100644 index 0000000..7432ea3 --- /dev/null +++ b/pkg/vcs/git.go @@ -0,0 +1,51 @@ +package vcs + +import ( + "os" + "path/filepath" + + "borg-data-collector/pkg/datanode" + + "github.com/go-git/go-git/v5" +) + +// CloneGitRepository clones a Git repository from a URL and packages it into a DataNode. 
+func CloneGitRepository(repoURL string) (*datanode.DataNode, error) { + tempPath, err := os.MkdirTemp("", "borg-clone-*") + if err != nil { + return nil, err + } + defer os.RemoveAll(tempPath) + + _, err = git.PlainClone(tempPath, false, &git.CloneOptions{ + URL: repoURL, + Progress: os.Stdout, + }) + if err != nil { + return nil, err + } + + dn := datanode.New() + err = filepath.Walk(tempPath, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !info.IsDir() { + content, err := os.ReadFile(path) + if err != nil { + return err + } + relPath, err := filepath.Rel(tempPath, path) + if err != nil { + return err + } + dn.AddData(relPath, content) + } + return nil + }) + if err != nil { + return nil, err + } + + return dn, nil +} diff --git a/pkg/vcs/git_test.go b/pkg/vcs/git_test.go new file mode 100644 index 0000000..c074318 --- /dev/null +++ b/pkg/vcs/git_test.go @@ -0,0 +1,72 @@ +package vcs + +import ( + "os" + "os/exec" + "path/filepath" + "testing" +) + +func TestCloneGitRepository(t *testing.T) { + // Create a temporary directory for the bare repository + bareRepoPath, err := os.MkdirTemp("", "bare-repo-") + if err != nil { + t.Fatalf("Failed to create temp dir for bare repo: %v", err) + } + defer os.RemoveAll(bareRepoPath) + + // Initialize a bare git repository + cmd := exec.Command("git", "init", "--bare") + cmd.Dir = bareRepoPath + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to init bare repo: %v", err) + } + + // Clone the bare repository to a temporary directory to add a commit + clonePath, err := os.MkdirTemp("", "clone-") + if err != nil { + t.Fatalf("Failed to create temp dir for clone: %v", err) + } + defer os.RemoveAll(clonePath) + + cmd = exec.Command("git", "clone", bareRepoPath, clonePath) + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to clone bare repo: %v", err) + } + + // Create a file and commit it + filePath := filepath.Join(clonePath, "foo.txt") + if err := os.WriteFile(filePath, 
[]byte("foo"), 0644); err != nil { + t.Fatalf("Failed to write file: %v", err) + } + cmd = exec.Command("git", "add", "foo.txt") + cmd.Dir = clonePath + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to git add: %v", err) + } + cmd = exec.Command("git", "commit", "-m", "Initial commit") + cmd.Dir = clonePath + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to git commit: %v", err) + } + cmd = exec.Command("git", "push", "origin", "master") + cmd.Dir = clonePath + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to git push: %v", err) + } + + // Clone the repository using the function we're testing + dn, err := CloneGitRepository("file://" + bareRepoPath) + if err != nil { + t.Fatalf("CloneGitRepository failed: %v", err) + } + + // Verify the DataNode contains the correct file + exists, err := dn.Exists("foo.txt") + if err != nil { + t.Fatalf("Exists failed: %v", err) + } + if !exists { + t.Errorf("Expected to find file foo.txt in DataNode, but it was not found") + } +} From 8e82bada062ea12211e880afca457299dd9af9a9 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 31 Oct 2025 21:35:53 +0000 Subject: [PATCH 4/4] feat: Add recursive website downloader and progress bar This commit introduces a new `collect website` command that recursively downloads a website to a specified depth. - A new `pkg/website` package contains the logic for the recursive download. - A new `pkg/ui` package provides a progress bar for long-running operations, which is used by the website downloader. - The `collect pwa` subcommand has been restored to be PWA-specific. 
--- cmd/collect.go | 37 +------- cmd/collect_git.go | 47 ++++++++++ cmd/{pwa.go => collect_pwa.go} | 16 ++-- cmd/collect_website.go | 49 ++++++++++ go.mod | 4 + go.sum | 8 ++ pkg/pwa/pwa.go | 4 +- pkg/pwa/pwa_test.go | 6 +- pkg/ui/progressbar.go | 15 +++ pkg/website/website.go | 166 +++++++++++++++++++++++++++++++++ pkg/website/website_test.go | 82 ++++++++++++++++ 11 files changed, 387 insertions(+), 47 deletions(-) create mode 100644 cmd/collect_git.go rename cmd/{pwa.go => collect_pwa.go} (68%) create mode 100644 cmd/collect_website.go create mode 100644 pkg/ui/progressbar.go create mode 100644 pkg/website/website.go create mode 100644 pkg/website/website_test.go diff --git a/cmd/collect.go b/cmd/collect.go index 779441c..57960b2 100644 --- a/cmd/collect.go +++ b/cmd/collect.go @@ -1,47 +1,16 @@ package cmd import ( - "fmt" - "os" - - "borg-data-collector/pkg/vcs" - "github.com/spf13/cobra" ) // collectCmd represents the collect command var collectCmd = &cobra.Command{ - Use: "collect [repository-url]", - Short: "Collect a single repository", - Long: `Collect a single repository and store it in a DataNode.`, - Args: cobra.ExactArgs(1), - Run: func(cmd *cobra.Command, args []string) { - repoURL := args[0] - outputFile, _ := cmd.Flags().GetString("output") - - dn, err := vcs.CloneGitRepository(repoURL) - if err != nil { - fmt.Printf("Error cloning repository: %v\n", err) - return - } - - data, err := dn.ToTar() - if err != nil { - fmt.Printf("Error serializing DataNode: %v\n", err) - return - } - - err = os.WriteFile(outputFile, data, 0644) - if err != nil { - fmt.Printf("Error writing DataNode to file: %v\n", err) - return - } - - fmt.Printf("Repository saved to %s\n", outputFile) - }, + Use: "collect", + Short: "Collect a resource and store it in a DataNode.", + Long: `Collect a resource from a git repository, a website, or other URI and store it in a DataNode.`, } func init() { rootCmd.AddCommand(collectCmd) - collectCmd.PersistentFlags().String("output", 
"repo.dat", "Output file for the DataNode") } diff --git a/cmd/collect_git.go b/cmd/collect_git.go new file mode 100644 index 0000000..2fcd450 --- /dev/null +++ b/cmd/collect_git.go @@ -0,0 +1,47 @@ +package cmd + +import ( + "fmt" + "os" + + "borg-data-collector/pkg/vcs" + + "github.com/spf13/cobra" +) + +// collectGitCmd represents the collect git command +var collectGitCmd = &cobra.Command{ + Use: "git [repository-url]", + Short: "Collect a single Git repository", + Long: `Collect a single Git repository and store it in a DataNode.`, + Args: cobra.ExactArgs(1), + Run: func(cmd *cobra.Command, args []string) { + repoURL := args[0] + outputFile, _ := cmd.Flags().GetString("output") + + dn, err := vcs.CloneGitRepository(repoURL) + if err != nil { + fmt.Printf("Error cloning repository: %v\n", err) + return + } + + data, err := dn.ToTar() + if err != nil { + fmt.Printf("Error serializing DataNode: %v\n", err) + return + } + + err = os.WriteFile(outputFile, data, 0644) + if err != nil { + fmt.Printf("Error writing DataNode to file: %v\n", err) + return + } + + fmt.Printf("Repository saved to %s\n", outputFile) + }, +} + +func init() { + collectCmd.AddCommand(collectGitCmd) + collectGitCmd.PersistentFlags().String("output", "repo.dat", "Output file for the DataNode") +} diff --git a/cmd/pwa.go b/cmd/collect_pwa.go similarity index 68% rename from cmd/pwa.go rename to cmd/collect_pwa.go index e288fcb..d7b5402 100644 --- a/cmd/pwa.go +++ b/cmd/collect_pwa.go @@ -9,18 +9,18 @@ import ( "github.com/spf13/cobra" ) -// pwaCmd represents the pwa command -var pwaCmd = &cobra.Command{ +// collectPWACmd represents the collect pwa command +var collectPWACmd = &cobra.Command{ Use: "pwa [url]", - Short: "Download a PWA from a URL", - Long: `Downloads a Progressive Web Application (PWA) from a given URL by finding its manifest.`, + Short: "Collect a single PWA", + Long: `Collect a single PWA and store it in a DataNode.`, Args: cobra.ExactArgs(1), Run: func(cmd *cobra.Command, args 
[]string) { pwaURL := args[0] outputFile, _ := cmd.Flags().GetString("output") fmt.Println("Finding PWA manifest...") - manifestURL, err := pwa.FindManifestURL(pwaURL) + manifestURL, err := pwa.FindManifest(pwaURL) if err != nil { fmt.Printf("Error finding manifest: %v\n", err) return @@ -36,7 +36,7 @@ var pwaCmd = &cobra.Command{ pwaData, err := dn.ToTar() if err != nil { - fmt.Printf("Error serializing PWA data: %v\n", err) + fmt.Printf("Error converting PWA to bytes: %v\n", err) return } @@ -51,6 +51,6 @@ var pwaCmd = &cobra.Command{ } func init() { - rootCmd.AddCommand(pwaCmd) - pwaCmd.PersistentFlags().String("output", "pwa.dat", "Output file for the PWA DataNode") + collectCmd.AddCommand(collectPWACmd) + collectPWACmd.PersistentFlags().String("output", "pwa.dat", "Output file for the DataNode") } diff --git a/cmd/collect_website.go b/cmd/collect_website.go new file mode 100644 index 0000000..90911a2 --- /dev/null +++ b/cmd/collect_website.go @@ -0,0 +1,49 @@ +package cmd + +import ( + "fmt" + "os" + + "borg-data-collector/pkg/website" + + "github.com/spf13/cobra" +) + +// collectWebsiteCmd represents the collect website command +var collectWebsiteCmd = &cobra.Command{ + Use: "website [url]", + Short: "Collect a single website", + Long: `Collect a single website and store it in a DataNode.`, + Args: cobra.ExactArgs(1), + Run: func(cmd *cobra.Command, args []string) { + websiteURL := args[0] + outputFile, _ := cmd.Flags().GetString("output") + depth, _ := cmd.Flags().GetInt("depth") + + dn, err := website.DownloadAndPackageWebsite(websiteURL, depth) + if err != nil { + fmt.Printf("Error downloading and packaging website: %v\n", err) + return + } + + websiteData, err := dn.ToTar() + if err != nil { + fmt.Printf("Error converting website to bytes: %v\n", err) + return + } + + err = os.WriteFile(outputFile, websiteData, 0644) + if err != nil { + fmt.Printf("Error writing website to file: %v\n", err) + return + } + + fmt.Printf("Website saved to %s\n", outputFile) 
+ }, +} + +func init() { + collectCmd.AddCommand(collectWebsiteCmd) + collectWebsiteCmd.PersistentFlags().String("output", "website.dat", "Output file for the DataNode") + collectWebsiteCmd.PersistentFlags().Int("depth", 2, "Recursion depth for downloading") +} diff --git a/go.mod b/go.mod index 78af759..5b4dcca 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,10 @@ require ( github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect github.com/kevinburke/ssh_config v1.2.0 // indirect github.com/leaanthony/debme v1.2.1 // indirect + github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/pjbgf/sha1cd v0.3.2 // indirect + github.com/rivo/uniseg v0.4.7 // indirect + github.com/schollz/progressbar/v3 v3.18.0 // indirect github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect github.com/skeema/knownhosts v1.3.1 // indirect github.com/spf13/pflag v1.0.10 // indirect @@ -27,5 +30,6 @@ require ( golang.org/x/crypto v0.43.0 // indirect golang.org/x/net v0.46.0 // indirect golang.org/x/sys v0.37.0 // indirect + golang.org/x/term v0.36.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 1005854..8957994 100644 --- a/go.sum +++ b/go.sum @@ -35,11 +35,17 @@ github.com/leaanthony/debme v1.2.1 h1:9Tgwf+kjcrbMQ4WnPcEIUcQuIZYqdWftzZkBr+i/oO github.com/leaanthony/debme v1.2.1/go.mod h1:3V+sCm5tYAgQymvSOfYQ5Xx2JCr+OXiD9Jkw3otUjiA= github.com/leaanthony/slicer v1.5.0/go.mod h1:FwrApmf8gOrpzEWM2J/9Lh79tyq8KTX5AzRtwV7m4AY= github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= github.com/pjbgf/sha1cd v0.3.2 h1:a9wb0bp1oC2TGwStyn0Umc/IGKQnEgF0vVaZ8QF8eo4= github.com/pjbgf/sha1cd v0.3.2/go.mod 
h1:zQWigSxVmsHEZow5qaLtPYxpcKMMQpa09ixqBxuCS6A= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= +github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA= +github.com/schollz/progressbar/v3 v3.18.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= @@ -76,6 +82,8 @@ golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q= +golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/pkg/pwa/pwa.go b/pkg/pwa/pwa.go index 6cef4c5..1679fa9 100644 --- a/pkg/pwa/pwa.go +++ b/pkg/pwa/pwa.go @@ -28,8 +28,8 @@ type Icon struct { Type string `json:"type"` } -// 
FindManifestURL finds the manifest URL from a given HTML page. -func FindManifestURL(pageURL string) (string, error) { +// FindManifest finds the manifest URL from a given HTML page. +func FindManifest(pageURL string) (string, error) { resp, err := http.Get(pageURL) if err != nil { return "", err diff --git a/pkg/pwa/pwa_test.go b/pkg/pwa/pwa_test.go index 186412d..f90fdb3 100644 --- a/pkg/pwa/pwa_test.go +++ b/pkg/pwa/pwa_test.go @@ -6,7 +6,7 @@ import ( "testing" ) -func TestFindManifestURL(t *testing.T) { +func TestFindManifest(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") w.Write([]byte(` @@ -25,9 +25,9 @@ func TestFindManifestURL(t *testing.T) { defer server.Close() expectedURL := server.URL + "/manifest.json" - actualURL, err := FindManifestURL(server.URL) + actualURL, err := FindManifest(server.URL) if err != nil { - t.Fatalf("FindManifestURL failed: %v", err) + t.Fatalf("FindManifest failed: %v", err) } if actualURL != expectedURL { diff --git a/pkg/ui/progressbar.go b/pkg/ui/progressbar.go new file mode 100644 index 0000000..8f143e1 --- /dev/null +++ b/pkg/ui/progressbar.go @@ -0,0 +1,15 @@ +package ui + +import ( + "github.com/schollz/progressbar/v3" +) + +// NewProgressBar creates a new progress bar with the specified total and description. 
+func NewProgressBar(total int, description string) *progressbar.ProgressBar { + return progressbar.NewOptions(total, + progressbar.OptionSetDescription(description), + progressbar.OptionSetWidth(15), + progressbar.OptionShowCount(), + progressbar.OptionClearOnFinish(), + ) +} diff --git a/pkg/website/website.go b/pkg/website/website.go new file mode 100644 index 0000000..2096a30 --- /dev/null +++ b/pkg/website/website.go @@ -0,0 +1,166 @@ +package website + +import ( + "fmt" + "io" + "net/http" + "net/url" + "strings" + + "borg-data-collector/pkg/datanode" + "github.com/schollz/progressbar/v3" + + "golang.org/x/net/html" +) + +// Downloader is a recursive website downloader. +type Downloader struct { + baseURL *url.URL + dn *datanode.DataNode + visited map[string]bool + maxDepth int + progressBar *progressbar.ProgressBar +} + +// NewDownloader creates a new Downloader. +func NewDownloader(maxDepth int) *Downloader { + return &Downloader{ + dn: datanode.New(), + visited: make(map[string]bool), + maxDepth: maxDepth, + } +} + +// DownloadAndPackageWebsite downloads a website and packages it into a DataNode. 
+func DownloadAndPackageWebsite(startURL string, maxDepth int) (*datanode.DataNode, error) { + baseURL, err := url.Parse(startURL) + if err != nil { + return nil, err + } + + d := NewDownloader(maxDepth) + d.baseURL = baseURL + + fmt.Println("Downloading website...") + d.progressBar = progressbar.NewOptions(1, progressbar.OptionSetDescription("Downloading")) + d.crawl(startURL, 0) + + return d.dn, nil +} + +func (d *Downloader) crawl(pageURL string, depth int) { + if depth > d.maxDepth || d.visited[pageURL] { + return + } + d.visited[pageURL] = true + d.progressBar.Add(1) + + resp, err := http.Get(pageURL) + if err != nil { + fmt.Printf("Error getting %s: %v\n", pageURL, err) + return + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + fmt.Printf("Error reading body of %s: %v\n", pageURL, err) + return + } + + relPath := d.getRelativePath(pageURL) + d.dn.AddData(relPath, body) + + doc, err := html.Parse(strings.NewReader(string(body))) + if err != nil { + fmt.Printf("Error parsing HTML of %s: %v\n", pageURL, err) + return + } + + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.ElementNode { + for _, a := range n.Attr { + if a.Key == "href" || a.Key == "src" { + link, err := d.resolveURL(pageURL, a.Val) + if err != nil { + continue + } + if d.isLocal(link) { + if isAsset(link) { + d.downloadAsset(link) + } else { + d.crawl(link, depth+1) + } + } + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + f(doc) +} + +func (d *Downloader) downloadAsset(assetURL string) { + if d.visited[assetURL] { + return + } + d.visited[assetURL] = true + d.progressBar.Add(1) + + resp, err := http.Get(assetURL) + if err != nil { + fmt.Printf("Error getting asset %s: %v\n", assetURL, err) + return + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + fmt.Printf("Error reading body of asset %s: %v\n", assetURL, err) + return + } + + relPath := 
d.getRelativePath(assetURL) + d.dn.AddData(relPath, body) +} + +func (d *Downloader) getRelativePath(pageURL string) string { + u, err := url.Parse(pageURL) + if err != nil { + return "" + } + return strings.TrimPrefix(u.Path, "/") +} + +func (d *Downloader) resolveURL(base, ref string) (string, error) { + baseURL, err := url.Parse(base) + if err != nil { + return "", err + } + refURL, err := url.Parse(ref) + if err != nil { + return "", err + } + return baseURL.ResolveReference(refURL).String(), nil +} + +func (d *Downloader) isLocal(pageURL string) bool { + u, err := url.Parse(pageURL) + if err != nil { + return false + } + return u.Hostname() == d.baseURL.Hostname() +} + +func isAsset(pageURL string) bool { + ext := []string{".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico"} + for _, e := range ext { + if strings.HasSuffix(pageURL, e) { + return true + } + } + return false +} diff --git a/pkg/website/website_test.go b/pkg/website/website_test.go new file mode 100644 index 0000000..9a8ec85 --- /dev/null +++ b/pkg/website/website_test.go @@ -0,0 +1,82 @@ +package website + +import ( + "net/http" + "net/http/httptest" + "testing" +) + +func TestDownloadAndPackageWebsite(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/": + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(` + + + + Test Website + + + +
+					<h1>Hello, Website!</h1>
+ Page 2 + + + + `)) + case "/style.css": + w.Header().Set("Content-Type", "text/css") + w.Write([]byte(`body { color: red; }`)) + case "/image.png": + w.Header().Set("Content-Type", "image/png") + w.Write([]byte("fake image data")) + case "/page2.html": + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(` + + + + Page 2 + + +
+					<h1>Page 2</h1>
+ Page 3 + + + `)) + case "/page3.html": + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(` + + + + Page 3 + + +
+					<h1>Page 3</h1>
+ + + `)) + default: + http.NotFound(w, r) + } + })) + defer server.Close() + + dn, err := DownloadAndPackageWebsite(server.URL, 2) + if err != nil { + t.Fatalf("DownloadAndPackageWebsite failed: %v", err) + } + + expectedFiles := []string{"", "style.css", "image.png", "page2.html", "page3.html"} + for _, file := range expectedFiles { + exists, err := dn.Exists(file) + if err != nil { + t.Fatalf("Exists failed for %s: %v", file, err) + } + if !exists { + t.Errorf("Expected to find file %s in DataNode, but it was not found", file) + } + } +}