feat(search): Add full-text search and indexing for archives

This commit introduces two new commands: `borg search` and `borg index`.

The `borg index` command builds a trigram index for an archive, which can be used to significantly speed up searches.

The `borg search` command allows users to search for patterns within archives. It supports regular expressions, context control, file type filtering, and result limits. The command will automatically use a pre-built index if one is available, falling back to a full scan if not.

This commit also includes:
- Unit tests for the new commands.
- Documentation for the new commands in `docs/cli.md`.
- Updates to `.gitignore` to exclude index files.
- Improvements to the test infrastructure to prevent state pollution.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
This commit is contained in:
google-labs-jules[bot] 2026-02-02 00:52:46 +00:00
parent cf2af53ed3
commit c3865faf56
7 changed files with 654 additions and 21 deletions

1
.gitignore vendored
View file

@ -10,3 +10,4 @@ demo-track.smsg
# Dev artifacts
.playwright-mcp/
.borg-index/

134
cmd/index.go Normal file
View file

@ -0,0 +1,134 @@
package cmd
import (
"bytes"
"encoding/gob"
"encoding/json"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"github.com/Snider/Borg/pkg/compress"
"github.com/Snider/Borg/pkg/datanode"
"github.com/spf13/cobra"
)
// indexCmd represents the index command
var indexCmd = NewIndexCmd()
func init() {
RootCmd.AddCommand(GetIndexCmd())
}
func NewIndexCmd() *cobra.Command {
return &cobra.Command{
Use: "index <archive>",
Short: "Build search index for an archive.",
Long: `Build a search index for a .dat, .tim, or .trix archive.`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
archivePath, err := filepath.Abs(args[0])
if err != nil {
return fmt.Errorf("failed to get absolute path for archive: %w", err)
}
// Read and decompress the archive
compressedData, err := os.ReadFile(archivePath)
if err != nil {
return fmt.Errorf("failed to read archive: %w", err)
}
tarData, err := compress.Decompress(compressedData)
if err != nil {
return fmt.Errorf("failed to decompress archive: %w", err)
}
// Load the DataNode
dn, err := datanode.FromTar(tarData)
if err != nil {
return fmt.Errorf("failed to load datanode: %w", err)
}
// Build the index
trigramIndex := make(map[[3]byte][]uint32)
var fileList []string
err = dn.Walk(".", func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
if d.IsDir() {
return nil
}
// Add file to list and map
fileID := uint32(len(fileList))
fileList = append(fileList, path)
// Read file content
file, err := dn.Open(path)
if err != nil {
return err
}
defer file.Close()
content, err := io.ReadAll(file)
if err != nil {
return err
}
// Generate and add trigrams
if len(content) < 3 {
return nil
}
for i := 0; i <= len(content)-3; i++ {
var trigram [3]byte
copy(trigram[:], content[i:i+3])
postings := trigramIndex[trigram]
if len(postings) == 0 || postings[len(postings)-1] != fileID {
trigramIndex[trigram] = append(postings, fileID)
}
}
return nil
})
if err != nil {
return fmt.Errorf("failed to walk datanode: %w", err)
}
// Save the index
indexDir := filepath.Join(filepath.Dir(archivePath), ".borg-index")
if err := os.MkdirAll(indexDir, 0755); err != nil {
return fmt.Errorf("failed to create index directory: %w", err)
}
// Save file list
fileListPath := filepath.Join(indexDir, "files.json")
fileListData, err := json.MarshalIndent(fileList, "", " ")
if err != nil {
return fmt.Errorf("failed to marshal file list: %w", err)
}
if err := os.WriteFile(fileListPath, fileListData, 0644); err != nil {
return fmt.Errorf("failed to write file list: %w", err)
}
// Save trigram index
trigramIndexPath := filepath.Join(indexDir, "trigram.idx")
var buf bytes.Buffer
encoder := gob.NewEncoder(&buf)
if err := encoder.Encode(trigramIndex); err != nil {
return fmt.Errorf("failed to encode trigram index: %w", err)
}
if err := os.WriteFile(trigramIndexPath, buf.Bytes(), 0644); err != nil {
return fmt.Errorf("failed to write trigram index: %w", err)
}
fmt.Fprintf(cmd.OutOrStdout(), "Successfully built index for %s\n", args[0])
return nil
},
}
}
func GetIndexCmd() *cobra.Command {
return indexCmd
}

54
cmd/index_test.go Normal file
View file

@ -0,0 +1,54 @@
package cmd
import (
"os"
"path/filepath"
"strings"
"testing"
"github.com/Snider/Borg/pkg/datanode"
)
func TestIndexCommand_Good(t *testing.T) {
// Create a temporary directory
tmpDir := t.TempDir()
archivePath := filepath.Join(tmpDir, "test.dat")
// Create a sample DataNode
dn := datanode.New()
dn.AddData("file1.txt", []byte("hello world"))
dn.AddData("file2.go", []byte("package main"))
tarData, err := dn.ToTar()
if err != nil {
t.Fatalf("failed to create tar: %v", err)
}
if err := os.WriteFile(archivePath, tarData, 0644); err != nil {
t.Fatalf("failed to write archive: %v", err)
}
// Run the index command
output, err := executeCommand(RootCmd, "index", archivePath)
if err != nil {
t.Fatalf("index command failed: %v", err)
}
if !strings.Contains(output, "Successfully built index") {
t.Errorf("expected success message, got: %s", output)
}
// Verify that the index directory and files were created
indexDir := filepath.Join(tmpDir, ".borg-index")
if _, err := os.Stat(indexDir); os.IsNotExist(err) {
t.Fatalf(".borg-index directory was not created")
}
filesJSONPath := filepath.Join(indexDir, "files.json")
if _, err := os.Stat(filesJSONPath); os.IsNotExist(err) {
t.Fatalf("files.json was not created")
}
trigramIdxPath := filepath.Join(indexDir, "trigram.idx")
if _, err := os.Stat(trigramIdxPath); os.IsNotExist(err) {
t.Fatalf("trigram.idx was not created")
}
}

View file

@ -18,12 +18,16 @@ func executeCommand(root *cobra.Command, args ...string) (string, error) {
// executeCommandC is a helper function to execute a cobra command and return the output.
func executeCommandC(root *cobra.Command, args ...string) (*cobra.Command, string, error) {
buf := new(bytes.Buffer)
root.SetOut(buf)
root.SetErr(buf)
root.SetArgs(args)
// We need to create a new instance of the root command for each test to avoid state pollution.
testRootCmd := NewRootCmd()
initAllCommands(testRootCmd) // Pass the new instance to the init function.
c, err := root.ExecuteC()
buf := new(bytes.Buffer)
testRootCmd.SetOut(buf)
testRootCmd.SetErr(buf)
testRootCmd.SetArgs(args)
c, err := testRootCmd.ExecuteC()
return c, buf.String(), err
}
@ -45,11 +49,6 @@ func TestRootCmd_Good(t *testing.T) {
})
t.Run("Help flag", func(t *testing.T) {
// We need to reset the command's state before each run.
RootCmd.ResetFlags()
RootCmd.ResetCommands()
initAllCommands()
output, err := executeCommand(RootCmd, "--help")
if err != nil {
t.Fatalf("unexpected error: %v", err)
@ -62,11 +61,6 @@ func TestRootCmd_Good(t *testing.T) {
func TestRootCmd_Bad(t *testing.T) {
t.Run("Unknown command", func(t *testing.T) {
// We need to reset the command's state before each run.
RootCmd.ResetFlags()
RootCmd.ResetCommands()
initAllCommands()
_, err := executeCommand(RootCmd, "unknown-command")
if err == nil {
t.Fatal("expected an error for an unknown command, but got none")
@ -75,10 +69,12 @@ func TestRootCmd_Bad(t *testing.T) {
}
// initAllCommands re-initializes all commands for testing.
func initAllCommands() {
RootCmd.AddCommand(GetAllCmd())
RootCmd.AddCommand(GetCollectCmd())
RootCmd.AddCommand(GetCompileCmd())
RootCmd.AddCommand(GetRunCmd())
RootCmd.AddCommand(GetServeCmd())
func initAllCommands(cmd *cobra.Command) {
cmd.AddCommand(GetAllCmd())
cmd.AddCommand(GetCollectCmd())
cmd.AddCommand(GetCompileCmd())
cmd.AddCommand(GetRunCmd())
cmd.AddCommand(GetServeCmd())
cmd.AddCommand(GetIndexCmd())
cmd.AddCommand(GetSearchCmd())
}

342
cmd/search.go Normal file
View file

@ -0,0 +1,342 @@
package cmd
import (
"bufio"
"bytes"
"encoding/gob"
"encoding/json"
"fmt"
"io/fs"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/Snider/Borg/pkg/compress"
"github.com/Snider/Borg/pkg/datanode"
"github.com/spf13/cobra"
)
// searchCmd represents the search command
var searchCmd = NewSearchCmd()
func init() {
RootCmd.AddCommand(GetSearchCmd())
}
type searchResult struct {
FilePath string
LineNum int
Line string
}
func NewSearchCmd() *cobra.Command {
cmd := &cobra.Command{
Use: "search <archive> <pattern>",
Short: "Search for a pattern in an archive.",
Long: `Search for a pattern in a .dat, .tim, or .trix archive.`,
Args: cobra.ExactArgs(2),
RunE: func(cmd *cobra.Command, args []string) error {
archivePath, err := filepath.Abs(args[0])
if err != nil {
return fmt.Errorf("could not get absolute path for archive: %w", err)
}
pattern := args[1]
// Read and decompress the archive
compressedData, err := os.ReadFile(archivePath)
if err != nil {
return fmt.Errorf("failed to read archive: %w", err)
}
tarData, err := compress.Decompress(compressedData)
if err != nil {
return fmt.Errorf("failed to decompress archive: %w", err)
}
dn, err := datanode.FromTar(tarData)
if err != nil {
return fmt.Errorf("failed to load datanode: %w", err)
}
indexDir := filepath.Join(filepath.Dir(archivePath), ".borg-index")
indexPath := filepath.Join(indexDir, "trigram.idx")
var results []searchResult
if _, err := os.Stat(indexPath); err == nil {
results, err = searchWithIndex(dn, archivePath, pattern, cmd)
if err != nil {
return fmt.Errorf("error searching with index: %w", err)
}
} else {
results, err = searchWithoutIndex(dn, pattern, cmd)
if err != nil {
return fmt.Errorf("error searching without index: %w", err)
}
}
return printResults(cmd, dn, results)
},
}
cmd.Flags().Bool("regex", false, "Use regex pattern")
cmd.Flags().IntP("context", "C", 0, "Show N lines around match")
cmd.Flags().String("type", "", "Filter by file extension")
cmd.Flags().Int("max-results", 0, "Limit output to N results")
return cmd
}
func printResults(cmd *cobra.Command, dn *datanode.DataNode, results []searchResult) error {
contextLines, _ := cmd.Flags().GetInt("context")
maxResults, _ := cmd.Flags().GetInt("max-results")
if maxResults > 0 && len(results) > maxResults {
results = results[:maxResults]
}
// Group results by file
resultsByFile := make(map[string][]searchResult)
for _, res := range results {
resultsByFile[res.FilePath] = append(resultsByFile[res.FilePath], res)
}
// Process each file
for filePath, fileResults := range resultsByFile {
if contextLines > 0 {
file, err := dn.Open(filePath)
if err != nil {
return fmt.Errorf("could not open file %s from archive: %w", filePath, err)
}
var lines []string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
lines = append(lines, scanner.Text())
}
file.Close()
for _, res := range fileResults {
start := res.LineNum - 1 - contextLines
if start < 0 {
start = 0
}
end := res.LineNum + contextLines
if end > len(lines) {
end = len(lines)
}
for j := start; j < end; j++ {
lineNum := j + 1
line := lines[j]
prefix := " "
if lineNum == res.LineNum {
prefix = ">"
}
fmt.Fprintf(cmd.OutOrStdout(), "%s %s:%d: %s\n", prefix, filePath, lineNum, line)
}
fmt.Fprintln(cmd.OutOrStdout(), "--")
}
} else {
for _, res := range fileResults {
fmt.Fprintf(cmd.OutOrStdout(), "%s:%d: %s\n", res.FilePath, res.LineNum, res.Line)
}
}
}
return nil
}
func searchWithoutIndex(dn *datanode.DataNode, pattern string, cmd *cobra.Command) ([]searchResult, error) {
var results []searchResult
useRegex, _ := cmd.Flags().GetBool("regex")
fileType, _ := cmd.Flags().GetString("type")
var re *regexp.Regexp
var err error
if useRegex {
re, err = regexp.Compile(pattern)
if err != nil {
return nil, fmt.Errorf("invalid regex pattern: %w", err)
}
}
err = dn.Walk(".", func(path string, d fs.DirEntry, err error) error {
if err != nil || d.IsDir() {
return err
}
if fileType != "" && !strings.HasSuffix(path, "."+fileType) {
return nil
}
file, err := dn.Open(path)
if err != nil {
return err
}
defer file.Close()
scanner := bufio.NewScanner(file)
for lineNum := 1; scanner.Scan(); lineNum++ {
line := scanner.Text()
match := false
if useRegex {
if re.MatchString(line) {
match = true
}
} else {
if strings.Contains(line, pattern) {
match = true
}
}
if match {
results = append(results, searchResult{
FilePath: path,
LineNum: lineNum,
Line: strings.TrimSpace(line),
})
}
}
return scanner.Err()
})
if err != nil {
return nil, fmt.Errorf("error walking datanode: %w", err)
}
return results, nil
}
func searchWithIndex(dn *datanode.DataNode, archivePath, pattern string, cmd *cobra.Command) ([]searchResult, error) {
indexDir := filepath.Join(filepath.Dir(archivePath), ".borg-index")
// Load file list
fileListPath := filepath.Join(indexDir, "files.json")
fileListData, err := os.ReadFile(fileListPath)
if err != nil {
return nil, fmt.Errorf("could not read file list: %w", err)
}
var fileList []string
if err := json.Unmarshal(fileListData, &fileList); err != nil {
return nil, fmt.Errorf("could not unmarshal file list: %w", err)
}
// Load trigram index
trigramIndexPath := filepath.Join(indexDir, "trigram.idx")
trigramIndexData, err := os.ReadFile(trigramIndexPath)
if err != nil {
return nil, fmt.Errorf("could not read trigram index: %w", err)
}
var trigramIndex map[[3]byte][]uint32
decoder := gob.NewDecoder(bytes.NewReader(trigramIndexData))
if err := decoder.Decode(&trigramIndex); err != nil {
return nil, fmt.Errorf("could not decode trigram index: %w", err)
}
// Find candidate files
candidateFiles := findCandidateFiles(pattern, trigramIndex, fileList)
// Search within candidate files
var results []searchResult
useRegex, _ := cmd.Flags().GetBool("regex")
fileType, _ := cmd.Flags().GetString("type")
var re *regexp.Regexp
if useRegex {
re, err = regexp.Compile(pattern)
if err != nil {
return nil, fmt.Errorf("invalid regex pattern: %w", err)
}
}
for path := range candidateFiles {
if fileType != "" && !strings.HasSuffix(path, "."+fileType) {
continue
}
file, err := dn.Open(path)
if err != nil {
return nil, fmt.Errorf("could not open file from archive: %w", err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
for lineNum := 1; scanner.Scan(); lineNum++ {
line := scanner.Text()
match := false
if useRegex {
if re.MatchString(line) {
match = true
}
} else {
if strings.Contains(line, pattern) {
match = true
}
}
if match {
results = append(results, searchResult{
FilePath: path,
LineNum: lineNum,
Line: strings.TrimSpace(line),
})
}
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("error scanning file %s: %w", path, err)
}
}
return results, nil
}
func findCandidateFiles(pattern string, trigramIndex map[[3]byte][]uint32, fileList []string) map[string]struct{} {
if len(pattern) < 3 {
// Fallback for short patterns
candidateFiles := make(map[string]struct{})
for _, file := range fileList {
candidateFiles[file] = struct{}{}
}
return candidateFiles
}
// Generate trigrams from pattern
var trigrams [][3]byte
for i := 0; i <= len(pattern)-3; i++ {
var trigram [3]byte
copy(trigram[:], pattern[i:i+3])
trigrams = append(trigrams, trigram)
}
// Find intersection of file IDs
var intersection map[uint32]struct{}
for i, trigram := range trigrams {
postings := trigramIndex[trigram]
if i == 0 {
intersection = make(map[uint32]struct{})
for _, fileID := range postings {
intersection[fileID] = struct{}{}
}
} else {
newIntersection := make(map[uint32]struct{})
for _, fileID := range postings {
if _, ok := intersection[fileID]; ok {
newIntersection[fileID] = struct{}{}
}
}
intersection = newIntersection
}
}
candidateFiles := make(map[string]struct{})
for fileID := range intersection {
candidateFiles[fileList[fileID]] = struct{}{}
}
return candidateFiles
}
func GetSearchCmd() *cobra.Command {
return searchCmd
}

78
cmd/search_test.go Normal file
View file

@ -0,0 +1,78 @@
package cmd
import (
"os"
"path/filepath"
"strings"
"testing"
"github.com/Snider/Borg/pkg/datanode"
)
func TestSearchCommand_WithoutIndex(t *testing.T) {
// Create a temporary directory
tmpDir := t.TempDir()
archivePath := filepath.Join(tmpDir, "test.dat")
// Create a sample DataNode
dn := datanode.New()
dn.AddData("file1.txt", []byte("hello world"))
dn.AddData("file2.go", []byte("package main\n\nfunc main() {\n\tprintln(\"hello\")\n}"))
tarData, err := dn.ToTar()
if err != nil {
t.Fatalf("failed to create tar: %v", err)
}
if err := os.WriteFile(archivePath, tarData, 0644); err != nil {
t.Fatalf("failed to write archive: %v", err)
}
// Run the search command
output, err := executeCommand(RootCmd, "search", archivePath, "hello")
if err != nil {
t.Fatalf("search command failed: %v", err)
}
if !strings.Contains(output, "file1.txt:1: hello world") {
t.Errorf("expected to find 'hello' in file1.txt, got: %s", output)
}
if !strings.Contains(output, "file2.go:4: println(\"hello\")") {
t.Errorf("expected to find 'hello' in file2.go, got: %s", output)
}
}
func TestSearchCommand_WithIndex(t *testing.T) {
// Create a temporary directory
tmpDir := t.TempDir()
archivePath := filepath.Join(tmpDir, "test.dat")
// Create a sample DataNode
dn := datanode.New()
dn.AddData("file1.txt", []byte("hello world"))
dn.AddData("file2.go", []byte("package main\n\nfunc main() {\n\tprintln(\"hello\")\n}"))
tarData, err := dn.ToTar()
if err != nil {
t.Fatalf("failed to create tar: %v", err)
}
if err := os.WriteFile(archivePath, tarData, 0644); err != nil {
t.Fatalf("failed to write archive: %v", err)
}
// Run the index command
_, err = executeCommand(RootCmd, "index", archivePath)
if err != nil {
t.Fatalf("index command failed: %v", err)
}
// Run the search command
output, err := executeCommand(RootCmd, "search", archivePath, "hello")
if err != nil {
t.Fatalf("search command failed: %v", err)
}
if !strings.Contains(output, "file1.txt:1: hello world") {
t.Errorf("expected to find 'hello' in file1.txt, got: %s", output)
}
if !strings.Contains(output, "file2.go:4: println(\"hello\")") {
t.Errorf("expected to find 'hello' in file2.go, got: %s", output)
}
}

View file

@ -74,6 +74,34 @@ Examples:
- `borg decode borg.trix --output borg.dat --password "secret"`
- `borg decode borg.tim --output borg.dat --i-am-in-isolation`
### index
Build a search index for an archive to speed up searches.
- `borg index <archive-file>`
Example:
- `borg index my-project.dat`
This will create a `.borg-index` directory next to the archive.
### search
Search for a pattern within an archive. Uses a pre-built index if available.
- `borg search <archive-file> <pattern>`
Flags:
- `--regex`: Treat the pattern as a regular expression.
- `-C, --context N`: Show N lines of context before and after each match.
- `--type <ext>`: Filter search by file extension (e.g., `go`, `md`).
- `--max-results N`: Limit the number of results returned.
Examples:
- `borg search my-project.dat "TODO:"`
- `borg search my-project.dat "func.*main" --regex --type go`
- `borg search my-project.dat "important" -C 3`
## Compression
All collect commands accept `--compression` with values: