diff --git a/.gitignore b/.gitignore
index d3a3066..3ea16c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ demo-track.smsg
 
 # Dev artifacts
 .playwright-mcp/
+discord/
diff --git a/cmd/collect_discord.go b/cmd/collect_discord.go
new file mode 100644
index 0000000..1f049f2
--- /dev/null
+++ b/cmd/collect_discord.go
@@ -0,0 +1,236 @@
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/spf13/cobra"
+)
+
+// collectDiscordCmd is the parent "discord" command. It defines no action of
+// its own and only serves as the attachment point for Discord subcommands.
+var collectDiscordCmd = &cobra.Command{
+	Use:   "discord",
+	Short: "Collect a Discord server export.",
+	Long:  `Collect a Discord server export from DiscordChatExporter and store it in a searchable archive.`,
+}
+
+// DiscordExport represents the top-level structure of a DiscordChatExporter JSON export.
+// This struct is based on a common format, but may need adjustments for different export versions.
+type DiscordExport struct {
+	Guild    Guild     `json:"guild"`
+	Channels []Channel `json:"channels"`
+	Messages []Message `json:"messages"`
+}
+
+// Guild represents the server information.
+type Guild struct {
+	ID   string `json:"id"`
+	Name string `json:"name"`
+}
+
+// Channel represents a channel in the server.
+type Channel struct {
+	ID   string `json:"id"`
+	Name string `json:"name"`
+}
+
+// Message represents a single message in a channel.
+//
+// ChannelID refers to the ID of the Channel the message belongs to.
+// Timestamp is decoded by encoding/json, which expects an RFC 3339 string.
+type Message struct {
+	ID          string       `json:"id"`
+	ChannelID   string       `json:"channelId"`
+	Author      Author       `json:"author"`
+	Timestamp   time.Time    `json:"timestamp"`
+	Content     string       `json:"content"`
+	Attachments []Attachment `json:"attachments"`
+}
+
+// Author represents the message author.
+type Author struct {
+	ID        string `json:"id"`
+	Name      string `json:"name"`
+	AvatarURL string `json:"avatarUrl"`
+}
+
+// Attachment represents a file attached to a message.
+type Attachment struct {
+	URL      string `json:"url"`      // remote location the file is downloaded from
+	FileName string `json:"fileName"` // file name as recorded in the export
+}
+
+// sanitizeFilename turns name into a string that is safe to use as a single
+// path element.
+//
+// Path separators, characters that are reserved on common filesystems and
+// ASCII control characters are replaced with a dash. A name that would
+// otherwise be empty or refer to a directory ("." or "..") is replaced with
+// "_" so the result can never alias or escape its parent directory.
+func sanitizeFilename(name string) string {
+	cleaned := strings.Map(func(r rune) rune {
+		switch r {
+		case '/', '\\', ':', '*', '?', '"', '<', '>', '|':
+			return '-'
+		}
+		if r < 0x20 || r == 0x7f {
+			return '-'
+		}
+		return r
+	}, name)
+
+	switch cleaned {
+	case "", ".", "..":
+		return "_"
+	}
+	return cleaned
+}
+
+// fetchDiscordAttachment issues a GET request for url and returns the response
+// body, which the caller must close. A non-200 response is reported as an
+// error so that an HTTP error page is never stored as attachment content.
+func fetchDiscordAttachment(url string) (io.ReadCloser, error) {
+	resp, err := http.Get(url)
+	if err != nil {
+		return nil, err
+	}
+	if resp.StatusCode != http.StatusOK {
+		resp.Body.Close()
+		return nil, fmt.Errorf("unexpected HTTP status %d", resp.StatusCode)
+	}
+	return resp.Body, nil
+}
+
+// saveDiscordAttachment streams body into the file name inside dir, creating
+// dir if necessary. A failure to flush the file on close is reported as well,
+// because it means the attachment was not stored completely.
+func saveDiscordAttachment(dir, name string, body io.Reader) (err error) {
+	if mkErr := os.MkdirAll(dir, 0755); mkErr != nil {
+		return fmt.Errorf("could not create attachments directory: %w", mkErr)
+	}
+
+	outFile, err := os.Create(filepath.Join(dir, name))
+	if err != nil {
+		return fmt.Errorf("could not create attachment file: %w", err)
+	}
+	defer func() {
+		if closeErr := outFile.Close(); closeErr != nil && err == nil {
+			err = fmt.Errorf("could not save attachment: %w", closeErr)
+		}
+	}()
+
+	if _, copyErr := io.Copy(outFile, body); copyErr != nil {
+		return fmt.Errorf("could not save attachment: %w", copyErr)
+	}
+	return nil
+}
+
+// collectDiscordImportCmd imports a DiscordChatExporter JSON export and writes
+// an archive below discord/<server>/, relative to the working directory:
+//
+//   - INDEX.json: one search entry per message, in export order
+//   - channels/<channel>.md: the messages of one channel, oldest first
+//   - attachments/<file>: the downloaded attachment files
+//
+// Attachments that cannot be downloaded are reported as warnings and linked
+// by their original URL; filesystem errors abort the import.
+var collectDiscordImportCmd = &cobra.Command{
+	Use:   "import [path]",
+	Short: "Import a DiscordChatExporter JSON export.",
+	Long:  `Import a DiscordChatExporter JSON export and convert it to a searchable archive.`,
+	Args:  cobra.ExactArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		filePath := args[0]
+		stdout, stderr := cmd.OutOrStdout(), cmd.ErrOrStderr()
+		fmt.Fprintln(stdout, "Importing Discord export from:", filePath)
+
+		// Read the JSON file
+		jsonFile, err := os.Open(filePath)
+		if err != nil {
+			return fmt.Errorf("could not open file: %w", err)
+		}
+		defer jsonFile.Close()
+
+		byteValue, err := io.ReadAll(jsonFile)
+		if err != nil {
+			return fmt.Errorf("could not read file: %w", err)
+		}
+
+		// Unmarshal the JSON data
+		var export DiscordExport
+		if err := json.Unmarshal(byteValue, &export); err != nil {
+			return fmt.Errorf("could not unmarshal json: %w", err)
+		}
+
+		// Group messages by channel
+		messagesByChannel := make(map[string][]Message)
+		for _, msg := range export.Messages {
+			messagesByChannel[msg.ChannelID] = append(messagesByChannel[msg.ChannelID], msg)
+		}
+
+		// Create a searchable index
+		type SearchEntry struct {
+			ID        string    `json:"id"`
+			Channel   string    `json:"channel"`
+			Author    string    `json:"author"`
+			Timestamp time.Time `json:"timestamp"`
+			Content   string    `json:"content"`
+		}
+
+		channelNames := make(map[string]string, len(export.Channels))
+		for _, ch := range export.Channels {
+			channelNames[ch.ID] = ch.Name
+		}
+
+		// Allocated (non-nil) so that an export without messages is written
+		// as "[]" rather than "null".
+		searchIndex := make([]SearchEntry, 0, len(export.Messages))
+		for _, msg := range export.Messages {
+			searchIndex = append(searchIndex, SearchEntry{
+				ID:        msg.ID,
+				Channel:   channelNames[msg.ChannelID],
+				Author:    msg.Author.Name,
+				Timestamp: msg.Timestamp,
+				Content:   msg.Content,
+			})
+		}
+
+		// Create the main output directory
+		outputDir := filepath.Join("discord", sanitizeFilename(export.Guild.Name))
+		if err := os.MkdirAll(outputDir, 0755); err != nil {
+			return fmt.Errorf("could not create output directory: %w", err)
+		}
+
+		// Save the index to a file
+		indexData, err := json.MarshalIndent(searchIndex, "", " ")
+		if err != nil {
+			return fmt.Errorf("could not marshal search index: %w", err)
+		}
+
+		indexPath := filepath.Join(outputDir, "INDEX.json")
+		if err := os.WriteFile(indexPath, indexData, 0644); err != nil {
+			return fmt.Errorf("could not write search index: %w", err)
+		}
+
+		// Create the output directory for markdown files
+		channelsDir := filepath.Join(outputDir, "channels")
+		if err := os.MkdirAll(channelsDir, 0755); err != nil {
+			return fmt.Errorf("could not create output directory: %w", err)
+		}
+
+		attachmentsDir := filepath.Join(outputDir, "attachments")
+		// Names already written to attachmentsDir during this run. Different
+		// messages frequently carry attachments with the same file name, and
+		// without this they would silently overwrite each other.
+		usedAttachmentNames := make(map[string]bool)
+
+		// Process each channel and convert messages to Markdown
+		for _, channel := range export.Channels {
+			// Sort messages by timestamp; the stable sort keeps export order
+			// for messages that share a timestamp.
+			msgs := messagesByChannel[channel.ID]
+			sort.SliceStable(msgs, func(i, j int) bool {
+				return msgs[i].Timestamp.Before(msgs[j].Timestamp)
+			})
+
+			var markdownContent strings.Builder
+			fmt.Fprintf(&markdownContent, "# %s\n\n", channel.Name)
+
+			for _, msg := range msgs {
+				markdownContent.WriteString("---\n")
+				fmt.Fprintf(&markdownContent, "**%s** `%s`\n\n", msg.Author.Name, msg.Timestamp.Format("2006-01-02 15:04:05"))
+				markdownContent.WriteString(msg.Content)
+				markdownContent.WriteString("\n")
+
+				for _, att := range msg.Attachments {
+					// Download attachment
+					body, err := fetchDiscordAttachment(att.URL)
+					if err != nil {
+						// Log the error but don't block the entire process
+						fmt.Fprintf(stderr, "Warning: could not download attachment %s: %v\n", att.URL, err)
+						fmt.Fprintf(&markdownContent, "\n[Failed to download %s](%s)", att.FileName, att.URL)
+						continue
+					}
+
+					// Pick a local name; on a clash, prefix the message ID
+					// (and a counter) until the name is unused.
+					sanitizedAttachmentName := sanitizeFilename(att.FileName)
+					localName := sanitizedAttachmentName
+					for n := 1; usedAttachmentNames[localName]; n++ {
+						localName = fmt.Sprintf("%s-%d-%s", sanitizeFilename(msg.ID), n, sanitizedAttachmentName)
+					}
+					usedAttachmentNames[localName] = true
+
+					// Save attachment
+					saveErr := saveDiscordAttachment(attachmentsDir, localName, body)
+					body.Close()
+					if saveErr != nil {
+						return saveErr
+					}
+
+					// Update markdown to link to local file. Markdown links
+					// always use forward slashes, regardless of the host OS.
+					fmt.Fprintf(&markdownContent, "\n[%s](%s)", att.FileName, "../attachments/"+localName)
+				}
+				markdownContent.WriteString("\n\n")
+			}
+
+			// Sanitize channel name for the filename
+			sanitizedChannelName := sanitizeFilename(channel.Name)
+
+			// Write the markdown to a file
+			markdownPath := filepath.Join(channelsDir, fmt.Sprintf("%s.md", sanitizedChannelName))
+			if err := os.WriteFile(markdownPath, []byte(markdownContent.String()), 0644); err != nil {
+				return fmt.Errorf("could not write markdown file for channel %s: %w", channel.Name, err)
+			}
+		}
+
+		fmt.Fprintf(stdout, "Successfully created archive in %s\n", outputDir)
+		return nil
+	},
+}
+
+// init registers the discord command under collect and the import
+// subcommand under discord.
+func init() {
+	collectCmd.AddCommand(collectDiscordCmd)
+	collectDiscordCmd.AddCommand(collectDiscordImportCmd)
+}
diff --git a/cmd/collect_discord_test.go b/cmd/collect_discord_test.go
new file mode 100644
index 0000000..3f8ca6c
--- /dev/null
+++ b/cmd/collect_discord_test.go
@@ -0,0 +1,132 @@
+package cmd
+
+import (
+	"encoding/json"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/Snider/Borg/pkg/mocks"
+)
+
+// TestCollectDiscordImportCmd_Good runs the import against the sample export
+// with a mocked HTTP client and checks the index, the downloaded attachment
+// and the generated channel markdown.
+func TestCollectDiscordImportCmd_Good(t *testing.T) {
+	// Mock HTTP client. http.DefaultClient is process-wide state, so the
+	// original is put back when the test finishes; otherwise the mock would
+	// leak into every test that runs afterwards.
+	originalClient := http.DefaultClient
+	http.DefaultClient = mocks.NewMockClient(map[string]*http.Response{
+		"https://example.com/file.txt": {
+			StatusCode: http.StatusOK,
+			Body:       io.NopCloser(strings.NewReader("attachment content")),
+		},
+	})
+	t.Cleanup(func() { http.DefaultClient = originalClient })
+
+	// Create a temporary directory
+	tempDir := t.TempDir()
+
+	// Read the sample export from testdata
+	sampleData, err := os.ReadFile("testdata/sample_export.json")
+	if err != nil {
+		t.Fatalf("failed to read sample export file: %v", err)
+	}
+	jsonPath := filepath.Join(tempDir, "export.json")
+	if err := os.WriteFile(jsonPath, sampleData, 0644); err != nil {
+		t.Fatalf("failed to write sample json: %v", err)
+	}
+
+	// Change working directory to tempDir to check relative output path
+	oldWd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("failed to get current working directory: %v", err)
+	}
+	if err := os.Chdir(tempDir); err != nil {
+		t.Fatalf("failed to change working directory: %v", err)
+	}
+	// Registered after t.TempDir, so this runs before the temporary
+	// directory is removed (cleanups run last-in, first-out).
+	t.Cleanup(func() {
+		if err := os.Chdir(oldWd); err != nil {
+			t.Errorf("failed to restore working directory: %v", err)
+		}
+	})
+
+	rootCmd := NewRootCmd()
+	rootCmd.AddCommand(GetCollectCmd())
+
+	// Execute command
+	_, err = executeCommand(rootCmd, "collect", "discord", "import", "export.json")
+	if err != nil {
+		t.Fatalf("collect discord import command failed: %v", err)
+	}
+
+	// Verify output
+	sanitizedServerName := "Test-Server"
+	expectedBaseDir := filepath.Join("discord", sanitizedServerName)
+
+	// Verify INDEX.json
+	indexPath := filepath.Join(expectedBaseDir, "INDEX.json")
+	indexContent, err := os.ReadFile(indexPath)
+	if err != nil {
+		t.Fatalf("failed to read INDEX.json: %v", err)
+	}
+	type SearchEntry struct {
+		ID        string    `json:"id"`
+		Channel   string    `json:"channel"`
+		Author    string    `json:"author"`
+		Timestamp time.Time `json:"timestamp"`
+		Content   string    `json:"content"`
+	}
+	var index []SearchEntry
+	if err := json.Unmarshal(indexContent, &index); err != nil {
+		t.Fatalf("failed to unmarshal INDEX.json: %v", err)
+	}
+	if len(index) != 3 {
+		t.Fatalf("expected 3 messages in index, got %d", len(index))
+	}
+	if index[1].Content != "This is a test message." {
+		t.Errorf("unexpected content in index entry: %s", index[1].Content)
+	}
+
+	// Verify attachment
+	attachmentPath := filepath.Join(expectedBaseDir, "attachments", "file with spaces.txt")
+	attachmentContent, err := os.ReadFile(attachmentPath)
+	if err != nil {
+		t.Fatalf("failed to read attachment: %v", err)
+	}
+	if string(attachmentContent) != "attachment content" {
+		t.Errorf("unexpected content in attachment. Got: %s", string(attachmentContent))
+	}
+
+	// Verify random.md
+	randomMdPath := filepath.Join(expectedBaseDir, "channels", "random.md")
+	randomMdContent, err := os.ReadFile(randomMdPath)
+	if err != nil {
+		t.Fatalf("failed to read random.md: %v", err)
+	}
+	expectedRandomContent := "# random\n\n---\n**User2** `2024-01-01 12:01:00`\n\nThis is a test message.\n\n[file with spaces.txt](../attachments/file with spaces.txt)\n\n"
+	if string(randomMdContent) != expectedRandomContent {
+		t.Errorf("unexpected content in random.md.\nGot:\n%s\nExpected:\n%s", string(randomMdContent), expectedRandomContent)
+	}
+}
+
+// TestCollectDiscordImportCmd_Bad checks that a missing export file is
+// reported as an open error.
+func TestCollectDiscordImportCmd_Bad(t *testing.T) {
+	rootCmd := NewRootCmd()
+	rootCmd.AddCommand(GetCollectCmd())
+
+	// Execute command with non-existent file
+	_, err := executeCommand(rootCmd, "collect", "discord", "import", "non-existent.json")
+	if err == nil {
+		t.Fatal("expected an error, but got none")
+	}
+	if !strings.Contains(err.Error(), "could not open file") {
+		t.Errorf("unexpected error message: %v", err)
+	}
+}
+
+// TestCollectDiscordImportCmd_Ugly checks that calling the command without
+// the path argument is rejected by argument validation.
+func TestCollectDiscordImportCmd_Ugly(t *testing.T) {
+	rootCmd := NewRootCmd()
+	rootCmd.AddCommand(GetCollectCmd())
+	_, err := executeCommand(rootCmd, "collect", "discord", "import")
+	if err == nil {
+		t.Fatal("expected an error for no arguments, but got none")
+	}
+	if !strings.Contains(err.Error(), "accepts 1 arg(s), received 0") {
+		t.Errorf("unexpected error message: %v", err)
+	}
+}
diff --git a/cmd/testdata/sample_export.json b/cmd/testdata/sample_export.json
new file mode 100644
index 0000000..29c6d14
--- /dev/null
+++ b/cmd/testdata/sample_export.json
@@ -0,0 +1,59 @@
+{
+  "guild": {
+    "id": "12345",
+    "name": "Test/Server"
+  },
+  "channels": [
+    {
+      "id": "channel1",
+      "name": "general"
+    },
+    {
+      "id": "channel2",
+      "name": "random"
+    }
+  ],
+  "messages": [
+    {
+      "id": "msg1",
+      "channelId": "channel1",
+      "author": {
+        "id": "user1",
+        "name": "Jules",
+        "avatarUrl": ""
+      },
+      "timestamp": "2024-01-01T12:00:00Z",
+      "content": "Hello, world!",
+      "attachments": []
+    },
+    {
+      "id": "msg2",
+      "channelId": "channel2",
+      "author": {
+        "id": "user2",
+        "name": "User2",
+        "avatarUrl": ""
+      },
+      "timestamp": "2024-01-01T12:01:00Z",
+      "content": "This is a test message.",
+      "attachments": [
+        {
+          "url": "https://example.com/file.txt",
+          "fileName": "file with spaces.txt"
+        }
+      ]
+    },
+    {
+      "id": "msg3",
+      "channelId": "channel1",
+      "author": {
+        "id": "user1",
+        "name": "Jules",
+        "avatarUrl": ""
+      },
+      "timestamp": "2024-01-01T12:02:00Z",
+      "content": "Another message in general.",
+      "attachments": []
+    }
+  ]
+}
diff --git a/docs/cli.md b/docs/cli.md
index 55c0185..d1210c0 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -21,8 +21,10 @@ Subcommands:
 - `borg collect github repos [--output ] [--format ...] [--compression ...]`
 - `borg collect website [--depth N] [--output ] [--format ...] [--compression ...]`
 - `borg collect pwa --uri [--output ] [--format ...] [--compression ...]`
+- `borg collect discord import <path>`
 
 Examples:
+- `borg collect discord import ./discord-export/export.json`
 - `borg collect github repo https://github.com/Snider/Borg --output borg.dat`
 - `borg collect website https://example.com --depth 1 --output site.dat`
 - `borg collect pwa --uri https://squoosh.app --output squoosh.dat`