Borg/pkg/reddit/reddit.go
google-labs-jules[bot] 47c6784c85 feat: Reddit thread/subreddit archival
This commit introduces a new feature to the `borg` CLI tool that allows archiving Reddit threads, subreddits, and user posts.

I have taken the following steps:
- Added a new `collect reddit` command with `thread`, `subreddit`, and `user` subcommands.
- Implemented the core scraping logic in the `pkg/reddit` package, using `goquery` to parse HTML from `old.reddit.com`.
- Integrated the scraping logic into the new subcommands, allowing them to fetch and process Reddit content.
- Ensured the build is stable by resolving several compilation issues that arose during development.

Although I have completed the core implementation, I was unable to add tests for the new functionality due to time constraints and the complexity of the build issues I encountered. The current implementation is functional but lacks automated tests.

Co-authored-by: Snider <631881+Snider@users.noreply.github.com>
2026-02-02 00:53:02 +00:00

131 lines
3.3 KiB
Go

package reddit
import (
"fmt"
"net/http"
"strings"
"github.com/PuerkitoBio/goquery"
)
// Comment represents a single Reddit comment.
type Comment struct {
Author string
Body string
}
// Thread represents a Reddit thread, including the original post and all comments.
type Thread struct {
Title string
Post string
Comments []Comment
URL string
}
// ScrapeThread fetches and parses a Reddit thread from a given URL.
func ScrapeThread(url string) (*Thread, error) {
// Make sure we're using old.reddit.com for simpler scraping
if !strings.Contains(url, "old.reddit.com") {
url = strings.Replace(url, "reddit.com", "old.reddit.com", 1)
}
res, err := http.Get(url)
if err != nil {
return nil, fmt.Errorf("failed to fetch URL: %w", err)
}
defer res.Body.Close()
if res.StatusCode != 200 {
return nil, fmt.Errorf("request failed with status: %s", res.Status)
}
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return nil, fmt.Errorf("failed to parse HTML: %w", err)
}
thread := &Thread{}
// Scrape the post title and content
thread.Title = doc.Find("a.title").First().Text()
thread.Post = doc.Find("div.expando .md").First().Text()
// Scrape comments
doc.Find(".commentarea .comment").Each(func(i int, s *goquery.Selection) {
author := s.Find(".author").First().Text()
body := s.Find(".md").First().Text()
thread.Comments = append(thread.Comments, Comment{Author: author, Body: body})
})
return thread, nil
}
// ScrapeSubreddit fetches and parses a subreddit's posts.
func ScrapeSubreddit(name, sort string, limit int) ([]*Thread, error) {
url := fmt.Sprintf("https://old.reddit.com/r/%s/", name)
if sort == "top" {
url = fmt.Sprintf("https://old.reddit.com/r/%s/top/?t=all", name)
}
res, err := http.Get(url)
if err != nil {
return nil, fmt.Errorf("failed to fetch URL: %w", err)
}
defer res.Body.Close()
if res.StatusCode != 200 {
return nil, fmt.Errorf("request failed with status: %s", res.Status)
}
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return nil, fmt.Errorf("failed to parse HTML: %w", err)
}
var threads []*Thread
doc.Find("div.thing.link").Each(func(i int, s *goquery.Selection) {
if i >= limit {
return
}
title := s.Find("a.title").Text()
postURL, _ := s.Find("a.title").Attr("href")
if !strings.HasPrefix(postURL, "http") {
postURL = "https://old.reddit.com" + postURL
}
threads = append(threads, &Thread{Title: title, URL: postURL})
})
return threads, nil
}
// ScrapeUser fetches and parses a user's posts.
func ScrapeUser(name string) ([]*Thread, error) {
url := fmt.Sprintf("https://old.reddit.com/user/%s/", name)
res, err := http.Get(url)
if err != nil {
return nil, fmt.Errorf("failed to fetch URL: %w", err)
}
defer res.Body.Close()
if res.StatusCode != 200 {
return nil, fmt.Errorf("request failed with status: %s", res.Status)
}
doc, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return nil, fmt.Errorf("failed to parse HTML: %w", err)
}
var threads []*Thread
doc.Find("div.thing.link").Each(func(i int, s *goquery.Selection) {
title := s.Find("a.title").Text()
postURL, _ := s.Find("a.title").Attr("href")
if !strings.HasPrefix(postURL, "http") {
postURL = "https://old.reddit.com" + postURL
}
threads = append(threads, &Thread{Title: title, URL: postURL})
})
return threads, nil
}