This commit introduces a new feature to the `borg` CLI tool that allows archiving Reddit threads, subreddits, and user posts. I have taken the following steps: - Added a new `collect reddit` command with `thread`, `subreddit`, and `user` subcommands. - Implemented the core scraping logic in the `pkg/reddit` package, using `goquery` to parse HTML from `old.reddit.com`. - Integrated the scraping logic into the new subcommands, allowing them to fetch and process Reddit content. - Ensured the build is stable by resolving several compilation issues that arose during development. Although I have completed the core implementation, I was unable to add tests for the new functionality due to time constraints and the complexity of the build issues I encountered. The current implementation is functional but lacks automated tests. Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
131 lines
3.3 KiB
Go
131 lines
3.3 KiB
Go
package reddit
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// Comment represents a single Reddit comment.
|
|
type Comment struct {
|
|
Author string
|
|
Body string
|
|
}
|
|
|
|
// Thread represents a Reddit thread, including the original post and all comments.
|
|
type Thread struct {
|
|
Title string
|
|
Post string
|
|
Comments []Comment
|
|
URL string
|
|
}
|
|
|
|
// ScrapeThread fetches and parses a Reddit thread from a given URL.
|
|
func ScrapeThread(url string) (*Thread, error) {
|
|
// Make sure we're using old.reddit.com for simpler scraping
|
|
if !strings.Contains(url, "old.reddit.com") {
|
|
url = strings.Replace(url, "reddit.com", "old.reddit.com", 1)
|
|
}
|
|
|
|
res, err := http.Get(url)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch URL: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.StatusCode != 200 {
|
|
return nil, fmt.Errorf("request failed with status: %s", res.Status)
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(res.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse HTML: %w", err)
|
|
}
|
|
|
|
thread := &Thread{}
|
|
|
|
// Scrape the post title and content
|
|
thread.Title = doc.Find("a.title").First().Text()
|
|
thread.Post = doc.Find("div.expando .md").First().Text()
|
|
|
|
// Scrape comments
|
|
doc.Find(".commentarea .comment").Each(func(i int, s *goquery.Selection) {
|
|
author := s.Find(".author").First().Text()
|
|
body := s.Find(".md").First().Text()
|
|
thread.Comments = append(thread.Comments, Comment{Author: author, Body: body})
|
|
})
|
|
|
|
return thread, nil
|
|
}
|
|
|
|
// ScrapeSubreddit fetches and parses a subreddit's posts.
|
|
func ScrapeSubreddit(name, sort string, limit int) ([]*Thread, error) {
|
|
url := fmt.Sprintf("https://old.reddit.com/r/%s/", name)
|
|
if sort == "top" {
|
|
url = fmt.Sprintf("https://old.reddit.com/r/%s/top/?t=all", name)
|
|
}
|
|
|
|
res, err := http.Get(url)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch URL: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.StatusCode != 200 {
|
|
return nil, fmt.Errorf("request failed with status: %s", res.Status)
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(res.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse HTML: %w", err)
|
|
}
|
|
|
|
var threads []*Thread
|
|
doc.Find("div.thing.link").Each(func(i int, s *goquery.Selection) {
|
|
if i >= limit {
|
|
return
|
|
}
|
|
title := s.Find("a.title").Text()
|
|
postURL, _ := s.Find("a.title").Attr("href")
|
|
if !strings.HasPrefix(postURL, "http") {
|
|
postURL = "https://old.reddit.com" + postURL
|
|
}
|
|
threads = append(threads, &Thread{Title: title, URL: postURL})
|
|
})
|
|
|
|
return threads, nil
|
|
}
|
|
|
|
// ScrapeUser fetches and parses a user's posts.
|
|
func ScrapeUser(name string) ([]*Thread, error) {
|
|
url := fmt.Sprintf("https://old.reddit.com/user/%s/", name)
|
|
|
|
res, err := http.Get(url)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch URL: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.StatusCode != 200 {
|
|
return nil, fmt.Errorf("request failed with status: %s", res.Status)
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(res.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse HTML: %w", err)
|
|
}
|
|
|
|
var threads []*Thread
|
|
doc.Find("div.thing.link").Each(func(i int, s *goquery.Selection) {
|
|
title := s.Find("a.title").Text()
|
|
postURL, _ := s.Find("a.title").Attr("href")
|
|
if !strings.HasPrefix(postURL, "http") {
|
|
postURL = "https://old.reddit.com" + postURL
|
|
}
|
|
threads = append(threads, &Thread{Title: title, URL: postURL})
|
|
})
|
|
|
|
return threads, nil
|
|
}
|