From c3865faf56ffc85f60caf580af8427fdcaaa391f Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Mon, 2 Feb 2026 00:52:46 +0000 Subject: [PATCH] feat(search): Add full-text search and indexing for archives This commit introduces two new commands: `borg search` and `borg index`. The `borg index` command builds a trigram index for an archive, which can be used to significantly speed up searches. The `borg search` command allows users to search for patterns within archives. It supports regular expressions, context control, file type filtering, and result limits. The command will automatically use a pre-built index if one is available, falling back to a full scan if not. This commit also includes: - Unit tests for the new commands. - Documentation for the new commands in `docs/cli.md`. - Updates to `.gitignore` to exclude index files. - Improvements to the test infrastructure to prevent state pollution. Co-authored-by: Snider <631881+Snider@users.noreply.github.com> --- .gitignore | 1 + cmd/index.go | 134 ++++++++++++++++++ cmd/index_test.go | 54 +++++++ cmd/root_test.go | 38 +++-- cmd/search.go | 342 +++++++++++++++++++++++++++++++++++++++++++++ cmd/search_test.go | 78 +++++++++++ docs/cli.md | 28 ++++ 7 files changed, 654 insertions(+), 21 deletions(-) create mode 100644 cmd/index.go create mode 100644 cmd/index_test.go create mode 100644 cmd/search.go create mode 100644 cmd/search_test.go diff --git a/.gitignore b/.gitignore index d3a3066..06f85bd 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ demo-track.smsg # Dev artifacts .playwright-mcp/ +.borg-index/ diff --git a/cmd/index.go b/cmd/index.go new file mode 100644 index 0000000..2f0245b --- /dev/null +++ b/cmd/index.go @@ -0,0 +1,134 @@ +package cmd + +import ( + "bytes" + "encoding/gob" + "encoding/json" + "fmt" + "io" + "io/fs" + "os" + "path/filepath" + + "github.com/Snider/Borg/pkg/compress" + 
"github.com/Snider/Borg/pkg/datanode" + "github.com/spf13/cobra" +) + +// indexCmd represents the index command +var indexCmd = NewIndexCmd() + +func init() { + RootCmd.AddCommand(GetIndexCmd()) +} + +func NewIndexCmd() *cobra.Command { + return &cobra.Command{ + Use: "index ", + Short: "Build search index for an archive.", + Long: `Build a search index for a .dat, .tim, or .trix archive.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + archivePath, err := filepath.Abs(args[0]) + if err != nil { + return fmt.Errorf("failed to get absolute path for archive: %w", err) + } + + // Read and decompress the archive + compressedData, err := os.ReadFile(archivePath) + if err != nil { + return fmt.Errorf("failed to read archive: %w", err) + } + tarData, err := compress.Decompress(compressedData) + if err != nil { + return fmt.Errorf("failed to decompress archive: %w", err) + } + + // Load the DataNode + dn, err := datanode.FromTar(tarData) + if err != nil { + return fmt.Errorf("failed to load datanode: %w", err) + } + + // Build the index + trigramIndex := make(map[[3]byte][]uint32) + var fileList []string + + err = dn.Walk(".", func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + return nil + } + + // Add file to list and map + fileID := uint32(len(fileList)) + fileList = append(fileList, path) + + // Read file content + file, err := dn.Open(path) + if err != nil { + return err + } + defer file.Close() + content, err := io.ReadAll(file) + if err != nil { + return err + } + + // Generate and add trigrams + if len(content) < 3 { + return nil + } + for i := 0; i <= len(content)-3; i++ { + var trigram [3]byte + copy(trigram[:], content[i:i+3]) + + postings := trigramIndex[trigram] + if len(postings) == 0 || postings[len(postings)-1] != fileID { + trigramIndex[trigram] = append(postings, fileID) + } + } + return nil + }) + if err != nil { + return fmt.Errorf("failed to walk datanode: %w", 
err) + } + + // Save the index + indexDir := filepath.Join(filepath.Dir(archivePath), ".borg-index") + if err := os.MkdirAll(indexDir, 0755); err != nil { + return fmt.Errorf("failed to create index directory: %w", err) + } + + // Save file list + fileListPath := filepath.Join(indexDir, "files.json") + fileListData, err := json.MarshalIndent(fileList, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal file list: %w", err) + } + if err := os.WriteFile(fileListPath, fileListData, 0644); err != nil { + return fmt.Errorf("failed to write file list: %w", err) + } + + // Save trigram index + trigramIndexPath := filepath.Join(indexDir, "trigram.idx") + var buf bytes.Buffer + encoder := gob.NewEncoder(&buf) + if err := encoder.Encode(trigramIndex); err != nil { + return fmt.Errorf("failed to encode trigram index: %w", err) + } + if err := os.WriteFile(trigramIndexPath, buf.Bytes(), 0644); err != nil { + return fmt.Errorf("failed to write trigram index: %w", err) + } + + fmt.Fprintf(cmd.OutOrStdout(), "Successfully built index for %s\n", args[0]) + return nil + }, + } +} + +func GetIndexCmd() *cobra.Command { + return indexCmd +} diff --git a/cmd/index_test.go b/cmd/index_test.go new file mode 100644 index 0000000..b95f3f0 --- /dev/null +++ b/cmd/index_test.go @@ -0,0 +1,54 @@ +package cmd + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/Snider/Borg/pkg/datanode" +) + +func TestIndexCommand_Good(t *testing.T) { + // Create a temporary directory + tmpDir := t.TempDir() + archivePath := filepath.Join(tmpDir, "test.dat") + + // Create a sample DataNode + dn := datanode.New() + dn.AddData("file1.txt", []byte("hello world")) + dn.AddData("file2.go", []byte("package main")) + tarData, err := dn.ToTar() + if err != nil { + t.Fatalf("failed to create tar: %v", err) + } + if err := os.WriteFile(archivePath, tarData, 0644); err != nil { + t.Fatalf("failed to write archive: %v", err) + } + + // Run the index command + output, err := 
executeCommand(RootCmd, "index", archivePath) + if err != nil { + t.Fatalf("index command failed: %v", err) + } + + if !strings.Contains(output, "Successfully built index") { + t.Errorf("expected success message, got: %s", output) + } + + // Verify that the index directory and files were created + indexDir := filepath.Join(tmpDir, ".borg-index") + if _, err := os.Stat(indexDir); os.IsNotExist(err) { + t.Fatalf(".borg-index directory was not created") + } + + filesJSONPath := filepath.Join(indexDir, "files.json") + if _, err := os.Stat(filesJSONPath); os.IsNotExist(err) { + t.Fatalf("files.json was not created") + } + + trigramIdxPath := filepath.Join(indexDir, "trigram.idx") + if _, err := os.Stat(trigramIdxPath); os.IsNotExist(err) { + t.Fatalf("trigram.idx was not created") + } +} diff --git a/cmd/root_test.go b/cmd/root_test.go index 5b257aa..af1b3cb 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -18,12 +18,16 @@ func executeCommand(root *cobra.Command, args ...string) (string, error) { // executeCommandC is a helper function to execute a cobra command and return the output. func executeCommandC(root *cobra.Command, args ...string) (*cobra.Command, string, error) { - buf := new(bytes.Buffer) - root.SetOut(buf) - root.SetErr(buf) - root.SetArgs(args) + // We need to create a new instance of the root command for each test to avoid state pollution. + testRootCmd := NewRootCmd() + initAllCommands(testRootCmd) // Pass the new instance to the init function. - c, err := root.ExecuteC() + buf := new(bytes.Buffer) + testRootCmd.SetOut(buf) + testRootCmd.SetErr(buf) + testRootCmd.SetArgs(args) + + c, err := testRootCmd.ExecuteC() return c, buf.String(), err } @@ -45,11 +49,6 @@ func TestRootCmd_Good(t *testing.T) { }) t.Run("Help flag", func(t *testing.T) { - // We need to reset the command's state before each run. 
- RootCmd.ResetFlags() - RootCmd.ResetCommands() - initAllCommands() - output, err := executeCommand(RootCmd, "--help") if err != nil { t.Fatalf("unexpected error: %v", err) @@ -62,11 +61,6 @@ func TestRootCmd_Good(t *testing.T) { func TestRootCmd_Bad(t *testing.T) { t.Run("Unknown command", func(t *testing.T) { - // We need to reset the command's state before each run. - RootCmd.ResetFlags() - RootCmd.ResetCommands() - initAllCommands() - _, err := executeCommand(RootCmd, "unknown-command") if err == nil { t.Fatal("expected an error for an unknown command, but got none") @@ -75,10 +69,12 @@ func TestRootCmd_Bad(t *testing.T) { } // initAllCommands re-initializes all commands for testing. -func initAllCommands() { - RootCmd.AddCommand(GetAllCmd()) - RootCmd.AddCommand(GetCollectCmd()) - RootCmd.AddCommand(GetCompileCmd()) - RootCmd.AddCommand(GetRunCmd()) - RootCmd.AddCommand(GetServeCmd()) +func initAllCommands(cmd *cobra.Command) { + cmd.AddCommand(GetAllCmd()) + cmd.AddCommand(GetCollectCmd()) + cmd.AddCommand(GetCompileCmd()) + cmd.AddCommand(GetRunCmd()) + cmd.AddCommand(GetServeCmd()) + cmd.AddCommand(GetIndexCmd()) + cmd.AddCommand(GetSearchCmd()) } diff --git a/cmd/search.go b/cmd/search.go new file mode 100644 index 0000000..1c7f17f --- /dev/null +++ b/cmd/search.go @@ -0,0 +1,342 @@ +package cmd + +import ( + "bufio" + "bytes" + "encoding/gob" + "encoding/json" + "fmt" + "io/fs" + "os" + "path/filepath" + "regexp" + "strings" + + "github.com/Snider/Borg/pkg/compress" + "github.com/Snider/Borg/pkg/datanode" + "github.com/spf13/cobra" +) + +// searchCmd represents the search command +var searchCmd = NewSearchCmd() + +func init() { + RootCmd.AddCommand(GetSearchCmd()) +} + +type searchResult struct { + FilePath string + LineNum int + Line string +} + +func NewSearchCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "search ", + Short: "Search for a pattern in an archive.", + Long: `Search for a pattern in a .dat, .tim, or .trix archive.`, + Args: 
cobra.ExactArgs(2), + RunE: func(cmd *cobra.Command, args []string) error { + archivePath, err := filepath.Abs(args[0]) + if err != nil { + return fmt.Errorf("could not get absolute path for archive: %w", err) + } + pattern := args[1] + + // Read and decompress the archive + compressedData, err := os.ReadFile(archivePath) + if err != nil { + return fmt.Errorf("failed to read archive: %w", err) + } + tarData, err := compress.Decompress(compressedData) + if err != nil { + return fmt.Errorf("failed to decompress archive: %w", err) + } + dn, err := datanode.FromTar(tarData) + if err != nil { + return fmt.Errorf("failed to load datanode: %w", err) + } + + indexDir := filepath.Join(filepath.Dir(archivePath), ".borg-index") + indexPath := filepath.Join(indexDir, "trigram.idx") + + var results []searchResult + + if _, err := os.Stat(indexPath); err == nil { + results, err = searchWithIndex(dn, archivePath, pattern, cmd) + if err != nil { + return fmt.Errorf("error searching with index: %w", err) + } + } else { + results, err = searchWithoutIndex(dn, pattern, cmd) + if err != nil { + return fmt.Errorf("error searching without index: %w", err) + } + } + + return printResults(cmd, dn, results) + }, + } + + cmd.Flags().Bool("regex", false, "Use regex pattern") + cmd.Flags().IntP("context", "C", 0, "Show N lines around match") + cmd.Flags().String("type", "", "Filter by file extension") + cmd.Flags().Int("max-results", 0, "Limit output to N results") + + return cmd +} + +func printResults(cmd *cobra.Command, dn *datanode.DataNode, results []searchResult) error { + contextLines, _ := cmd.Flags().GetInt("context") + maxResults, _ := cmd.Flags().GetInt("max-results") + + if maxResults > 0 && len(results) > maxResults { + results = results[:maxResults] + } + + // Group results by file + resultsByFile := make(map[string][]searchResult) + for _, res := range results { + resultsByFile[res.FilePath] = append(resultsByFile[res.FilePath], res) + } + + // Process each file + for filePath, 
fileResults := range resultsByFile { + if contextLines > 0 { + file, err := dn.Open(filePath) + if err != nil { + return fmt.Errorf("could not open file %s from archive: %w", filePath, err) + } + + var lines []string + scanner := bufio.NewScanner(file) + for scanner.Scan() { + lines = append(lines, scanner.Text()) + } + file.Close() + + for _, res := range fileResults { + start := res.LineNum - 1 - contextLines + if start < 0 { + start = 0 + } + + end := res.LineNum + contextLines + if end > len(lines) { + end = len(lines) + } + + for j := start; j < end; j++ { + lineNum := j + 1 + line := lines[j] + prefix := " " + if lineNum == res.LineNum { + prefix = ">" + } + fmt.Fprintf(cmd.OutOrStdout(), "%s %s:%d: %s\n", prefix, filePath, lineNum, line) + } + fmt.Fprintln(cmd.OutOrStdout(), "--") + } + } else { + for _, res := range fileResults { + fmt.Fprintf(cmd.OutOrStdout(), "%s:%d: %s\n", res.FilePath, res.LineNum, res.Line) + } + } + } + + return nil +} + +func searchWithoutIndex(dn *datanode.DataNode, pattern string, cmd *cobra.Command) ([]searchResult, error) { + var results []searchResult + + useRegex, _ := cmd.Flags().GetBool("regex") + fileType, _ := cmd.Flags().GetString("type") + + var re *regexp.Regexp + var err error + if useRegex { + re, err = regexp.Compile(pattern) + if err != nil { + return nil, fmt.Errorf("invalid regex pattern: %w", err) + } + } + + err = dn.Walk(".", func(path string, d fs.DirEntry, err error) error { + if err != nil || d.IsDir() { + return err + } + if fileType != "" && !strings.HasSuffix(path, "."+fileType) { + return nil + } + + file, err := dn.Open(path) + if err != nil { + return err + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for lineNum := 1; scanner.Scan(); lineNum++ { + line := scanner.Text() + match := false + if useRegex { + if re.MatchString(line) { + match = true + } + } else { + if strings.Contains(line, pattern) { + match = true + } + } + + if match { + results = append(results, searchResult{ + 
FilePath: path, + LineNum: lineNum, + Line: strings.TrimSpace(line), + }) + } + } + return scanner.Err() + }) + + if err != nil { + return nil, fmt.Errorf("error walking datanode: %w", err) + } + + return results, nil +} + +func searchWithIndex(dn *datanode.DataNode, archivePath, pattern string, cmd *cobra.Command) ([]searchResult, error) { + indexDir := filepath.Join(filepath.Dir(archivePath), ".borg-index") + + // Load file list + fileListPath := filepath.Join(indexDir, "files.json") + fileListData, err := os.ReadFile(fileListPath) + if err != nil { + return nil, fmt.Errorf("could not read file list: %w", err) + } + var fileList []string + if err := json.Unmarshal(fileListData, &fileList); err != nil { + return nil, fmt.Errorf("could not unmarshal file list: %w", err) + } + + // Load trigram index + trigramIndexPath := filepath.Join(indexDir, "trigram.idx") + trigramIndexData, err := os.ReadFile(trigramIndexPath) + if err != nil { + return nil, fmt.Errorf("could not read trigram index: %w", err) + } + var trigramIndex map[[3]byte][]uint32 + decoder := gob.NewDecoder(bytes.NewReader(trigramIndexData)) + if err := decoder.Decode(&trigramIndex); err != nil { + return nil, fmt.Errorf("could not decode trigram index: %w", err) + } + + // Find candidate files + candidateFiles := findCandidateFiles(pattern, trigramIndex, fileList) + + // Search within candidate files + var results []searchResult + useRegex, _ := cmd.Flags().GetBool("regex") + fileType, _ := cmd.Flags().GetString("type") + + var re *regexp.Regexp + if useRegex { + re, err = regexp.Compile(pattern) + if err != nil { + return nil, fmt.Errorf("invalid regex pattern: %w", err) + } + } + + for path := range candidateFiles { + if fileType != "" && !strings.HasSuffix(path, "."+fileType) { + continue + } + + file, err := dn.Open(path) + if err != nil { + return nil, fmt.Errorf("could not open file from archive: %w", err) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for lineNum := 1; 
scanner.Scan(); lineNum++ { + line := scanner.Text() + match := false + if useRegex { + if re.MatchString(line) { + match = true + } + } else { + if strings.Contains(line, pattern) { + match = true + } + } + + if match { + results = append(results, searchResult{ + FilePath: path, + LineNum: lineNum, + Line: strings.TrimSpace(line), + }) + } + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("error scanning file %s: %w", path, err) + } + } + + return results, nil +} + +func findCandidateFiles(pattern string, trigramIndex map[[3]byte][]uint32, fileList []string) map[string]struct{} { + if len(pattern) < 3 { + // Fallback for short patterns + candidateFiles := make(map[string]struct{}) + for _, file := range fileList { + candidateFiles[file] = struct{}{} + } + return candidateFiles + } + + // Generate trigrams from pattern + var trigrams [][3]byte + for i := 0; i <= len(pattern)-3; i++ { + var trigram [3]byte + copy(trigram[:], pattern[i:i+3]) + trigrams = append(trigrams, trigram) + } + + // Find intersection of file IDs + var intersection map[uint32]struct{} + for i, trigram := range trigrams { + postings := trigramIndex[trigram] + if i == 0 { + intersection = make(map[uint32]struct{}) + for _, fileID := range postings { + intersection[fileID] = struct{}{} + } + } else { + newIntersection := make(map[uint32]struct{}) + for _, fileID := range postings { + if _, ok := intersection[fileID]; ok { + newIntersection[fileID] = struct{}{} + } + } + intersection = newIntersection + } + } + + candidateFiles := make(map[string]struct{}) + for fileID := range intersection { + candidateFiles[fileList[fileID]] = struct{}{} + } + + return candidateFiles +} + +func GetSearchCmd() *cobra.Command { + return searchCmd +} diff --git a/cmd/search_test.go b/cmd/search_test.go new file mode 100644 index 0000000..04f38c4 --- /dev/null +++ b/cmd/search_test.go @@ -0,0 +1,78 @@ +package cmd + +import ( + "os" + "path/filepath" + "strings" + "testing" + + 
"github.com/Snider/Borg/pkg/datanode" +) + +func TestSearchCommand_WithoutIndex(t *testing.T) { + // Create a temporary directory + tmpDir := t.TempDir() + archivePath := filepath.Join(tmpDir, "test.dat") + + // Create a sample DataNode + dn := datanode.New() + dn.AddData("file1.txt", []byte("hello world")) + dn.AddData("file2.go", []byte("package main\n\nfunc main() {\n\tprintln(\"hello\")\n}")) + tarData, err := dn.ToTar() + if err != nil { + t.Fatalf("failed to create tar: %v", err) + } + if err := os.WriteFile(archivePath, tarData, 0644); err != nil { + t.Fatalf("failed to write archive: %v", err) + } + + // Run the search command + output, err := executeCommand(RootCmd, "search", archivePath, "hello") + if err != nil { + t.Fatalf("search command failed: %v", err) + } + + if !strings.Contains(output, "file1.txt:1: hello world") { + t.Errorf("expected to find 'hello' in file1.txt, got: %s", output) + } + if !strings.Contains(output, "file2.go:4: println(\"hello\")") { + t.Errorf("expected to find 'hello' in file2.go, got: %s", output) + } +} + +func TestSearchCommand_WithIndex(t *testing.T) { + // Create a temporary directory + tmpDir := t.TempDir() + archivePath := filepath.Join(tmpDir, "test.dat") + + // Create a sample DataNode + dn := datanode.New() + dn.AddData("file1.txt", []byte("hello world")) + dn.AddData("file2.go", []byte("package main\n\nfunc main() {\n\tprintln(\"hello\")\n}")) + tarData, err := dn.ToTar() + if err != nil { + t.Fatalf("failed to create tar: %v", err) + } + if err := os.WriteFile(archivePath, tarData, 0644); err != nil { + t.Fatalf("failed to write archive: %v", err) + } + + // Run the index command + _, err = executeCommand(RootCmd, "index", archivePath) + if err != nil { + t.Fatalf("index command failed: %v", err) + } + + // Run the search command + output, err := executeCommand(RootCmd, "search", archivePath, "hello") + if err != nil { + t.Fatalf("search command failed: %v", err) + } + + if !strings.Contains(output, "file1.txt:1: 
hello world") { + t.Errorf("expected to find 'hello' in file1.txt, got: %s", output) + } + if !strings.Contains(output, "file2.go:4: println(\"hello\")") { + t.Errorf("expected to find 'hello' in file2.go, got: %s", output) + } +} diff --git a/docs/cli.md b/docs/cli.md index 55c0185..ef0e226 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -74,6 +74,34 @@ Examples: - `borg decode borg.trix --output borg.dat --password "secret"` - `borg decode borg.tim --output borg.dat --i-am-in-isolation` +### index + +Build a search index for an archive to speed up searches. + +- `borg index <archive>` + +Example: +- `borg index my-project.dat` + +This will create a `.borg-index` directory next to the archive. + +### search + +Search for a pattern within an archive. Uses a pre-built index if available. + +- `borg search <archive> <pattern>` + +Flags: +- `--regex`: Treat the pattern as a regular expression. +- `-C, --context N`: Show N lines of context before and after each match. +- `--type <ext>`: Filter search by file extension (e.g., `go`, `md`). +- `--max-results N`: Limit the number of results returned. + +Examples: +- `borg search my-project.dat "TODO:"` +- `borg search my-project.dat "func.*main" --regex --type go` +- `borg search my-project.dat "important" -C 3` + ## Compression All collect commands accept `--compression` with values: