Merge 47c6784c85 into cf2af53ed3
This commit is contained in:
commit
ed6ccfdd5f
9 changed files with 579 additions and 0 deletions
24
cmd/collect_reddit.go
Normal file
24
cmd/collect_reddit.go
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// collectRedditCmd represents the collect reddit command
|
||||
var collectRedditCmd = NewCollectRedditCmd()
|
||||
|
||||
func init() {
|
||||
GetCollectCmd().AddCommand(GetCollectRedditCmd())
|
||||
}
|
||||
|
||||
func NewCollectRedditCmd() *cobra.Command {
|
||||
return &cobra.Command{
|
||||
Use: "reddit",
|
||||
Short: "Collect a resource from Reddit.",
|
||||
Long: `Collect a resource from Reddit and store it in a DataNode.`,
|
||||
}
|
||||
}
|
||||
|
||||
func GetCollectRedditCmd() *cobra.Command {
|
||||
return collectRedditCmd
|
||||
}
|
||||
126
cmd/collect_reddit_subreddit.go
Normal file
126
cmd/collect_reddit_subreddit.go
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/Snider/Borg/pkg/compress"
|
||||
"github.com/Snider/Borg/pkg/datanode"
|
||||
"github.com/Snider/Borg/pkg/reddit"
|
||||
"github.com/Snider/Borg/pkg/tim"
|
||||
"github.com/Snider/Borg/pkg/trix"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// collectRedditSubredditCmd represents the collect reddit subreddit command
|
||||
var collectRedditSubredditCmd = NewCollectRedditSubredditCmd()
|
||||
|
||||
func init() {
|
||||
GetCollectRedditCmd().AddCommand(GetCollectRedditSubredditCmd())
|
||||
}
|
||||
|
||||
func GetCollectRedditSubredditCmd() *cobra.Command {
|
||||
return collectRedditSubredditCmd
|
||||
}
|
||||
|
||||
func NewCollectRedditSubredditCmd() *cobra.Command {
|
||||
collectRedditSubredditCmd := &cobra.Command{
|
||||
Use: "subreddit [name]",
|
||||
Short: "Collect a subreddit's top posts",
|
||||
Long: `Collect a subreddit's top posts and store them in a DataNode.`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
subredditName := args[0]
|
||||
outputFile, _ := cmd.Flags().GetString("output")
|
||||
limit, _ := cmd.Flags().GetInt("limit")
|
||||
sort, _ := cmd.Flags().GetString("sort")
|
||||
format, _ := cmd.Flags().GetString("format")
|
||||
compression, _ := cmd.Flags().GetString("compression")
|
||||
password, _ := cmd.Flags().GetString("password")
|
||||
|
||||
if format != "datanode" && format != "tim" && format != "trix" {
|
||||
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
|
||||
}
|
||||
|
||||
threads, err := reddit.ScrapeSubreddit(subredditName, sort, limit)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to scrape subreddit: %w", err)
|
||||
}
|
||||
|
||||
dn := datanode.New()
|
||||
for _, threadStub := range threads {
|
||||
thread, err := reddit.ScrapeThread(threadStub.URL)
|
||||
if err != nil {
|
||||
// It's better to log the error and continue
|
||||
fmt.Fprintf(cmd.ErrOrStderr(), "failed to scrape thread %s: %v\n", threadStub.URL, err)
|
||||
continue
|
||||
}
|
||||
|
||||
var builder strings.Builder
|
||||
builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title))
|
||||
builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post))
|
||||
for _, comment := range thread.Comments {
|
||||
builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author))
|
||||
builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body))
|
||||
}
|
||||
// Sanitize filename
|
||||
filename := strings.ReplaceAll(thread.Title, " ", "_")
|
||||
filename = strings.ReplaceAll(filename, "/", "_")
|
||||
err = dn.AddData(fmt.Sprintf("r-%s/posts/%s.md", subredditName, filename), []byte(builder.String()))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error adding data to DataNode: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
var data []byte
|
||||
if format == "tim" {
|
||||
tim, err := tim.FromDataNode(dn)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating tim: %w", err)
|
||||
}
|
||||
data, err = tim.ToTar()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error serializing tim: %w", err)
|
||||
}
|
||||
} else if format == "trix" {
|
||||
data, err = trix.ToTrix(dn, password)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error serializing trix: %w", err)
|
||||
}
|
||||
} else {
|
||||
data, err = dn.ToTar()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error serializing DataNode: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
compressedData, err := compress.Compress(data, compression)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error compressing data: %w", err)
|
||||
}
|
||||
|
||||
if outputFile == "" {
|
||||
outputFile = "subreddit." + format
|
||||
if compression != "none" {
|
||||
outputFile += "." + compression
|
||||
}
|
||||
}
|
||||
|
||||
err = os.WriteFile(outputFile, compressedData, 0644)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error writing subreddit to file: %w", err)
|
||||
}
|
||||
|
||||
fmt.Fprintln(cmd.OutOrStdout(), "Subreddit saved to", outputFile)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
collectRedditSubredditCmd.PersistentFlags().String("output", "", "Output file for the DataNode")
|
||||
collectRedditSubredditCmd.PersistentFlags().Int("limit", 100, "Number of posts to collect")
|
||||
collectRedditSubredditCmd.PersistentFlags().String("sort", "top", "Sort order for posts (top, new)")
|
||||
collectRedditSubredditCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
|
||||
collectRedditSubredditCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
|
||||
collectRedditSubredditCmd.PersistentFlags().String("password", "", "Password for encryption")
|
||||
return collectRedditSubredditCmd
|
||||
}
|
||||
1
cmd/collect_reddit_test.go
Normal file
1
cmd/collect_reddit_test.go
Normal file
|
|
@ -0,0 +1 @@
|
|||
package cmd
|
||||
112
cmd/collect_reddit_thread.go
Normal file
112
cmd/collect_reddit_thread.go
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/Snider/Borg/pkg/compress"
|
||||
"github.com/Snider/Borg/pkg/datanode"
|
||||
"github.com/Snider/Borg/pkg/reddit"
|
||||
"github.com/Snider/Borg/pkg/tim"
|
||||
"github.com/Snider/Borg/pkg/trix"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// collectRedditThreadCmd represents the collect reddit thread command
|
||||
var collectRedditThreadCmd = NewCollectRedditThreadCmd()
|
||||
|
||||
func init() {
|
||||
GetCollectRedditCmd().AddCommand(GetCollectRedditThreadCmd())
|
||||
}
|
||||
|
||||
func GetCollectRedditThreadCmd() *cobra.Command {
|
||||
return collectRedditThreadCmd
|
||||
}
|
||||
|
||||
func NewCollectRedditThreadCmd() *cobra.Command {
|
||||
collectRedditThreadCmd := &cobra.Command{
|
||||
Use: "thread [url]",
|
||||
Short: "Collect a single Reddit thread",
|
||||
Long: `Collect a single Reddit thread and store it in a DataNode.`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
threadURL := args[0]
|
||||
outputFile, _ := cmd.Flags().GetString("output")
|
||||
format, _ := cmd.Flags().GetString("format")
|
||||
compression, _ := cmd.Flags().GetString("compression")
|
||||
password, _ := cmd.Flags().GetString("password")
|
||||
|
||||
if format != "datanode" && format != "tim" && format != "trix" {
|
||||
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
|
||||
}
|
||||
|
||||
thread, err := reddit.ScrapeThread(threadURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to scrape thread: %w", err)
|
||||
}
|
||||
|
||||
// Convert thread to Markdown
|
||||
var builder strings.Builder
|
||||
builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title))
|
||||
builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post))
|
||||
for _, comment := range thread.Comments {
|
||||
builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author))
|
||||
builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body))
|
||||
}
|
||||
|
||||
dn := datanode.New()
|
||||
err = dn.AddData("thread.md", []byte(builder.String()))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error adding data to DataNode: %w", err)
|
||||
}
|
||||
|
||||
var data []byte
|
||||
if format == "tim" {
|
||||
tim, err := tim.FromDataNode(dn)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating tim: %w", err)
|
||||
}
|
||||
data, err = tim.ToTar()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error serializing tim: %w", err)
|
||||
}
|
||||
} else if format == "trix" {
|
||||
data, err = trix.ToTrix(dn, password)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error serializing trix: %w", err)
|
||||
}
|
||||
} else {
|
||||
data, err = dn.ToTar()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error serializing DataNode: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
compressedData, err := compress.Compress(data, compression)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error compressing data: %w", err)
|
||||
}
|
||||
|
||||
if outputFile == "" {
|
||||
outputFile = "thread." + format
|
||||
if compression != "none" {
|
||||
outputFile += "." + compression
|
||||
}
|
||||
}
|
||||
|
||||
err = os.WriteFile(outputFile, compressedData, 0644)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error writing thread to file: %w", err)
|
||||
}
|
||||
|
||||
fmt.Fprintln(cmd.OutOrStdout(), "Thread saved to", outputFile)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
collectRedditThreadCmd.PersistentFlags().String("output", "", "Output file for the DataNode")
|
||||
collectRedditThreadCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
|
||||
collectRedditThreadCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
|
||||
collectRedditThreadCmd.PersistentFlags().String("password", "", "Password for encryption")
|
||||
return collectRedditThreadCmd
|
||||
}
|
||||
122
cmd/collect_reddit_user.go
Normal file
122
cmd/collect_reddit_user.go
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/Snider/Borg/pkg/compress"
|
||||
"github.com/Snider/Borg/pkg/datanode"
|
||||
"github.com/Snider/Borg/pkg/reddit"
|
||||
"github.com/Snider/Borg/pkg/tim"
|
||||
"github.com/Snider/Borg/pkg/trix"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// collectRedditUserCmd represents the collect reddit user command
|
||||
var collectRedditUserCmd = NewCollectRedditUserCmd()
|
||||
|
||||
func init() {
|
||||
GetCollectRedditCmd().AddCommand(GetCollectRedditUserCmd())
|
||||
}
|
||||
|
||||
func GetCollectRedditUserCmd() *cobra.Command {
|
||||
return collectRedditUserCmd
|
||||
}
|
||||
|
||||
func NewCollectRedditUserCmd() *cobra.Command {
|
||||
collectRedditUserCmd := &cobra.Command{
|
||||
Use: "user [name]",
|
||||
Short: "Collect a user's posts",
|
||||
Long: `Collect a user's posts and store them in a DataNode.`,
|
||||
Args: cobra.ExactArgs(1),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
userName := args[0]
|
||||
outputFile, _ := cmd.Flags().GetString("output")
|
||||
format, _ := cmd.Flags().GetString("format")
|
||||
compression, _ := cmd.Flags().GetString("compression")
|
||||
password, _ := cmd.Flags().GetString("password")
|
||||
|
||||
if format != "datanode" && format != "tim" && format != "trix" {
|
||||
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
|
||||
}
|
||||
|
||||
threads, err := reddit.ScrapeUser(userName)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to scrape user: %w", err)
|
||||
}
|
||||
|
||||
dn := datanode.New()
|
||||
for _, threadStub := range threads {
|
||||
thread, err := reddit.ScrapeThread(threadStub.URL)
|
||||
if err != nil {
|
||||
// It's better to log the error and continue
|
||||
fmt.Fprintf(cmd.ErrOrStderr(), "failed to scrape thread %s: %v\n", threadStub.URL, err)
|
||||
continue
|
||||
}
|
||||
|
||||
var builder strings.Builder
|
||||
builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title))
|
||||
builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post))
|
||||
for _, comment := range thread.Comments {
|
||||
builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author))
|
||||
builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body))
|
||||
}
|
||||
// Sanitize filename
|
||||
filename := strings.ReplaceAll(thread.Title, " ", "_")
|
||||
filename = strings.ReplaceAll(filename, "/", "_")
|
||||
err = dn.AddData(fmt.Sprintf("u-%s/posts/%s.md", userName, filename), []byte(builder.String()))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error adding data to DataNode: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
var data []byte
|
||||
if format == "tim" {
|
||||
tim, err := tim.FromDataNode(dn)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating tim: %w", err)
|
||||
}
|
||||
data, err = tim.ToTar()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error serializing tim: %w", err)
|
||||
}
|
||||
} else if format == "trix" {
|
||||
data, err = trix.ToTrix(dn, password)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error serializing trix: %w", err)
|
||||
}
|
||||
} else {
|
||||
data, err = dn.ToTar()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error serializing DataNode: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
compressedData, err := compress.Compress(data, compression)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error compressing data: %w", err)
|
||||
}
|
||||
|
||||
if outputFile == "" {
|
||||
outputFile = "user." + format
|
||||
if compression != "none" {
|
||||
outputFile += "." + compression
|
||||
}
|
||||
}
|
||||
|
||||
err = os.WriteFile(outputFile, compressedData, 0644)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error writing user to file: %w", err)
|
||||
}
|
||||
|
||||
fmt.Fprintln(cmd.OutOrStdout(), "User posts saved to", outputFile)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
collectRedditUserCmd.PersistentFlags().String("output", "", "Output file for the DataNode")
|
||||
collectRedditUserCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
|
||||
collectRedditUserCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
|
||||
collectRedditUserCmd.PersistentFlags().String("password", "", "Password for encryption")
|
||||
return collectRedditUserCmd
|
||||
}
|
||||
2
go.mod
2
go.mod
|
|
@ -22,6 +22,8 @@ require (
|
|||
dario.cat/mergo v1.0.0 // indirect
|
||||
github.com/Microsoft/go-winio v0.6.2 // indirect
|
||||
github.com/ProtonMail/go-crypto v1.3.0 // indirect
|
||||
github.com/PuerkitoBio/goquery v1.11.0 // indirect
|
||||
github.com/andybalholm/cascadia v1.3.3 // indirect
|
||||
github.com/bep/debounce v1.2.1 // indirect
|
||||
github.com/cloudflare/circl v1.6.1 // indirect
|
||||
github.com/cyphar/filepath-securejoin v0.4.1 // indirect
|
||||
|
|
|
|||
60
go.sum
60
go.sum
|
|
@ -5,8 +5,12 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo
|
|||
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
|
||||
github.com/ProtonMail/go-crypto v1.3.0 h1:ILq8+Sf5If5DCpHQp4PbZdS1J7HDFRXz/+xKBiRGFrw=
|
||||
github.com/ProtonMail/go-crypto v1.3.0/go.mod h1:9whxjD8Rbs29b4XWbB8irEcE8KHMqaR2e7GWU1R+/PE=
|
||||
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
|
||||
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
|
||||
github.com/Snider/Enchantrix v0.0.2 h1:ExZQiBhfS/p/AHFTKhY80TOd+BXZjK95EzByAEgwvjs=
|
||||
github.com/Snider/Enchantrix v0.0.2/go.mod h1:CtFcLAvnDT1KcuF1JBb/DJj0KplY8jHryO06KzQ1hsQ=
|
||||
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
|
||||
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8=
|
||||
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4=
|
||||
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
|
||||
|
|
@ -49,6 +53,7 @@ github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y
|
|||
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/go-github/v39 v39.2.0 h1:rNNM311XtPOz5rDdsJXAp2o8F67X9FnROXTvto3aSnQ=
|
||||
|
|
@ -152,24 +157,50 @@ github.com/wailsapp/wails/v2 v2.11.0 h1:seLacV8pqupq32IjS4Y7V8ucab0WZwtK6VvUVxSB
|
|||
github.com/wailsapp/wails/v2 v2.11.0/go.mod h1:jrf0ZaM6+GBc1wRmXsM8cIvzlg0karYin3erahI4+0k=
|
||||
github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM=
|
||||
github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
|
||||
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
|
||||
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
|
||||
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
|
||||
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
||||
golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU=
|
||||
golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc=
|
||||
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
|
||||
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
|
||||
golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc=
|
||||
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20210505024714-0287a6fb4125/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
|
||||
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
||||
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
||||
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
|
||||
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
|
||||
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||
golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo=
|
||||
golang.org/x/oauth2 v0.33.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
|
||||
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200810151505-1b9f1253b3ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
|
|
@ -177,22 +208,51 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w
|
|||
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
|
||||
golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
|
||||
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
|
||||
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
|
||||
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
|
||||
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
|
||||
golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU=
|
||||
golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
|
||||
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
|
||||
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
|
||||
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
|
|
|
|||
131
pkg/reddit/reddit.go
Normal file
131
pkg/reddit/reddit.go
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
package reddit
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// Comment represents a single Reddit comment.
|
||||
type Comment struct {
|
||||
Author string
|
||||
Body string
|
||||
}
|
||||
|
||||
// Thread represents a Reddit thread, including the original post and all comments.
|
||||
type Thread struct {
|
||||
Title string
|
||||
Post string
|
||||
Comments []Comment
|
||||
URL string
|
||||
}
|
||||
|
||||
// ScrapeThread fetches and parses a Reddit thread from a given URL.
|
||||
func ScrapeThread(url string) (*Thread, error) {
|
||||
// Make sure we're using old.reddit.com for simpler scraping
|
||||
if !strings.Contains(url, "old.reddit.com") {
|
||||
url = strings.Replace(url, "reddit.com", "old.reddit.com", 1)
|
||||
}
|
||||
|
||||
res, err := http.Get(url)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch URL: %w", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("request failed with status: %s", res.Status)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(res.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse HTML: %w", err)
|
||||
}
|
||||
|
||||
thread := &Thread{}
|
||||
|
||||
// Scrape the post title and content
|
||||
thread.Title = doc.Find("a.title").First().Text()
|
||||
thread.Post = doc.Find("div.expando .md").First().Text()
|
||||
|
||||
// Scrape comments
|
||||
doc.Find(".commentarea .comment").Each(func(i int, s *goquery.Selection) {
|
||||
author := s.Find(".author").First().Text()
|
||||
body := s.Find(".md").First().Text()
|
||||
thread.Comments = append(thread.Comments, Comment{Author: author, Body: body})
|
||||
})
|
||||
|
||||
return thread, nil
|
||||
}
|
||||
|
||||
// ScrapeSubreddit fetches and parses a subreddit's posts.
|
||||
func ScrapeSubreddit(name, sort string, limit int) ([]*Thread, error) {
|
||||
url := fmt.Sprintf("https://old.reddit.com/r/%s/", name)
|
||||
if sort == "top" {
|
||||
url = fmt.Sprintf("https://old.reddit.com/r/%s/top/?t=all", name)
|
||||
}
|
||||
|
||||
res, err := http.Get(url)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch URL: %w", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("request failed with status: %s", res.Status)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(res.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse HTML: %w", err)
|
||||
}
|
||||
|
||||
var threads []*Thread
|
||||
doc.Find("div.thing.link").Each(func(i int, s *goquery.Selection) {
|
||||
if i >= limit {
|
||||
return
|
||||
}
|
||||
title := s.Find("a.title").Text()
|
||||
postURL, _ := s.Find("a.title").Attr("href")
|
||||
if !strings.HasPrefix(postURL, "http") {
|
||||
postURL = "https://old.reddit.com" + postURL
|
||||
}
|
||||
threads = append(threads, &Thread{Title: title, URL: postURL})
|
||||
})
|
||||
|
||||
return threads, nil
|
||||
}
|
||||
|
||||
// ScrapeUser fetches and parses a user's posts.
|
||||
func ScrapeUser(name string) ([]*Thread, error) {
|
||||
url := fmt.Sprintf("https://old.reddit.com/user/%s/", name)
|
||||
|
||||
res, err := http.Get(url)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to fetch URL: %w", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("request failed with status: %s", res.Status)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(res.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse HTML: %w", err)
|
||||
}
|
||||
|
||||
var threads []*Thread
|
||||
doc.Find("div.thing.link").Each(func(i int, s *goquery.Selection) {
|
||||
title := s.Find("a.title").Text()
|
||||
postURL, _ := s.Find("a.title").Attr("href")
|
||||
if !strings.HasPrefix(postURL, "http") {
|
||||
postURL = "https://old.reddit.com" + postURL
|
||||
}
|
||||
threads = append(threads, &Thread{Title: title, URL: postURL})
|
||||
})
|
||||
|
||||
return threads, nil
|
||||
}
|
||||
1
pkg/reddit/reddit_test.go
Normal file
1
pkg/reddit/reddit_test.go
Normal file
|
|
@ -0,0 +1 @@
|
|||
package reddit
|
||||
Loading…
Add table
Reference in a new issue