298 lines
7.7 KiB
Go
298 lines
7.7 KiB
Go
|
|
package collect
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"fmt"
|
||
|
|
"net/http"
|
||
|
|
"path/filepath"
|
||
|
|
"strings"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
core "github.com/host-uk/core/pkg/framework/core"
|
||
|
|
"golang.org/x/net/html"
|
||
|
|
)
|
||
|
|
|
||
|
|
// httpClient is the package-level HTTP client used for collection requests.
// Its Timeout caps each request at 30 seconds. Use SetHTTPClient to override
// it, for example in tests.
var httpClient = &http.Client{Timeout: 30 * time.Second}
|
||
|
|
|
||
|
|
// BitcoinTalkCollector collects forum posts from BitcoinTalk.
type BitcoinTalkCollector struct {
	// TopicID is the numeric topic identifier.
	TopicID string

	// URL is a full URL to a BitcoinTalk topic page, intended as an
	// alternative to TopicID (the ID being extracted from it).
	// NOTE(review): confirm that Collect actually derives the topic ID from
	// this field; Name only reports the placeholder "url" when TopicID is
	// empty and URL is set.
	URL string

	// Pages limits collection to this many pages. 0 means all pages.
	Pages int
}
|
||
|
|
|
||
|
|
// Name returns the collector name.
|
||
|
|
func (b *BitcoinTalkCollector) Name() string {
|
||
|
|
id := b.TopicID
|
||
|
|
if id == "" && b.URL != "" {
|
||
|
|
id = "url"
|
||
|
|
}
|
||
|
|
return fmt.Sprintf("bitcointalk:%s", id)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Collect gathers posts from a BitcoinTalk topic.
|
||
|
|
func (b *BitcoinTalkCollector) Collect(ctx context.Context, cfg *Config) (*Result, error) {
|
||
|
|
result := &Result{Source: b.Name()}
|
||
|
|
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitStart(b.Name(), "Starting BitcoinTalk collection")
|
||
|
|
}
|
||
|
|
|
||
|
|
topicID := b.TopicID
|
||
|
|
if topicID == "" {
|
||
|
|
return result, core.E("collect.BitcoinTalk.Collect", "topic ID is required", nil)
|
||
|
|
}
|
||
|
|
|
||
|
|
if cfg.DryRun {
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitProgress(b.Name(), fmt.Sprintf("[dry-run] Would collect topic %s", topicID), nil)
|
||
|
|
}
|
||
|
|
return result, nil
|
||
|
|
}
|
||
|
|
|
||
|
|
baseDir := filepath.Join(cfg.OutputDir, "bitcointalk", topicID, "posts")
|
||
|
|
if err := cfg.Output.EnsureDir(baseDir); err != nil {
|
||
|
|
return result, core.E("collect.BitcoinTalk.Collect", "failed to create output directory", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
postNum := 0
|
||
|
|
offset := 0
|
||
|
|
pageCount := 0
|
||
|
|
postsPerPage := 20
|
||
|
|
|
||
|
|
for {
|
||
|
|
if ctx.Err() != nil {
|
||
|
|
return result, core.E("collect.BitcoinTalk.Collect", "context cancelled", ctx.Err())
|
||
|
|
}
|
||
|
|
|
||
|
|
if b.Pages > 0 && pageCount >= b.Pages {
|
||
|
|
break
|
||
|
|
}
|
||
|
|
|
||
|
|
if cfg.Limiter != nil {
|
||
|
|
if err := cfg.Limiter.Wait(ctx, "bitcointalk"); err != nil {
|
||
|
|
return result, err
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
pageURL := fmt.Sprintf("https://bitcointalk.org/index.php?topic=%s.%d", topicID, offset)
|
||
|
|
|
||
|
|
posts, err := b.fetchPage(ctx, pageURL)
|
||
|
|
if err != nil {
|
||
|
|
result.Errors++
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitError(b.Name(), fmt.Sprintf("Failed to fetch page at offset %d: %v", offset, err), nil)
|
||
|
|
}
|
||
|
|
break
|
||
|
|
}
|
||
|
|
|
||
|
|
if len(posts) == 0 {
|
||
|
|
break
|
||
|
|
}
|
||
|
|
|
||
|
|
for _, post := range posts {
|
||
|
|
postNum++
|
||
|
|
filePath := filepath.Join(baseDir, fmt.Sprintf("%d.md", postNum))
|
||
|
|
content := formatPostMarkdown(postNum, post)
|
||
|
|
|
||
|
|
if err := cfg.Output.Write(filePath, content); err != nil {
|
||
|
|
result.Errors++
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
result.Items++
|
||
|
|
result.Files = append(result.Files, filePath)
|
||
|
|
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitItem(b.Name(), fmt.Sprintf("Post %d by %s", postNum, post.Author), nil)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
pageCount++
|
||
|
|
offset += postsPerPage
|
||
|
|
|
||
|
|
// If we got fewer posts than expected, we've reached the end
|
||
|
|
if len(posts) < postsPerPage {
|
||
|
|
break
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitComplete(b.Name(), fmt.Sprintf("Collected %d posts", result.Items), result)
|
||
|
|
}
|
||
|
|
|
||
|
|
return result, nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// btPost represents a parsed BitcoinTalk forum post. All fields hold plain
// text extracted from the page HTML.
type btPost struct {
	// Author is the text of the post's poster-info element.
	Author string
	// Date is the text of the post header's "smalltext" element; it is kept
	// as the raw string and is not parsed into a time value.
	Date string
	// Content is the text of the post body.
	Content string
}
|
||
|
|
|
||
|
|
// fetchPage fetches and parses a single BitcoinTalk topic page.
|
||
|
|
func (b *BitcoinTalkCollector) fetchPage(ctx context.Context, pageURL string) ([]btPost, error) {
|
||
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, pageURL, nil)
|
||
|
|
if err != nil {
|
||
|
|
return nil, core.E("collect.BitcoinTalk.fetchPage", "failed to create request", err)
|
||
|
|
}
|
||
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; CoreCollector/1.0)")
|
||
|
|
|
||
|
|
resp, err := httpClient.Do(req)
|
||
|
|
if err != nil {
|
||
|
|
return nil, core.E("collect.BitcoinTalk.fetchPage", "request failed", err)
|
||
|
|
}
|
||
|
|
defer func() { _ = resp.Body.Close() }()
|
||
|
|
|
||
|
|
if resp.StatusCode != http.StatusOK {
|
||
|
|
return nil, core.E("collect.BitcoinTalk.fetchPage",
|
||
|
|
fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
|
||
|
|
}
|
||
|
|
|
||
|
|
doc, err := html.Parse(resp.Body)
|
||
|
|
if err != nil {
|
||
|
|
return nil, core.E("collect.BitcoinTalk.fetchPage", "failed to parse HTML", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
return extractPosts(doc), nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// extractPosts extracts post data from a parsed HTML document.
|
||
|
|
// It looks for the common BitcoinTalk post structure using div.post elements.
|
||
|
|
func extractPosts(doc *html.Node) []btPost {
|
||
|
|
var posts []btPost
|
||
|
|
var walk func(*html.Node)
|
||
|
|
|
||
|
|
walk = func(n *html.Node) {
|
||
|
|
if n.Type == html.ElementNode && n.Data == "div" {
|
||
|
|
for _, attr := range n.Attr {
|
||
|
|
if attr.Key == "class" && strings.Contains(attr.Val, "post") {
|
||
|
|
post := parsePost(n)
|
||
|
|
if post.Content != "" {
|
||
|
|
posts = append(posts, post)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
|
walk(c)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
walk(doc)
|
||
|
|
return posts
|
||
|
|
}
|
||
|
|
|
||
|
|
// parsePost extracts author, date, and content from a post div.
|
||
|
|
func parsePost(node *html.Node) btPost {
|
||
|
|
post := btPost{}
|
||
|
|
var walk func(*html.Node)
|
||
|
|
|
||
|
|
walk = func(n *html.Node) {
|
||
|
|
if n.Type == html.ElementNode {
|
||
|
|
for _, attr := range n.Attr {
|
||
|
|
if attr.Key == "class" {
|
||
|
|
switch {
|
||
|
|
case strings.Contains(attr.Val, "poster_info"):
|
||
|
|
post.Author = extractText(n)
|
||
|
|
case strings.Contains(attr.Val, "headerandpost"):
|
||
|
|
// Look for date in smalltext
|
||
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
|
if c.Type == html.ElementNode && c.Data == "div" {
|
||
|
|
for _, a := range c.Attr {
|
||
|
|
if a.Key == "class" && strings.Contains(a.Val, "smalltext") {
|
||
|
|
post.Date = strings.TrimSpace(extractText(c))
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
case strings.Contains(attr.Val, "inner"):
|
||
|
|
post.Content = strings.TrimSpace(extractText(n))
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
|
walk(c)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
walk(node)
|
||
|
|
return post
|
||
|
|
}
|
||
|
|
|
||
|
|
// extractText recursively extracts text content from an HTML node.
|
||
|
|
func extractText(n *html.Node) string {
|
||
|
|
if n.Type == html.TextNode {
|
||
|
|
return n.Data
|
||
|
|
}
|
||
|
|
|
||
|
|
var b strings.Builder
|
||
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
|
text := extractText(c)
|
||
|
|
if text != "" {
|
||
|
|
if b.Len() > 0 && c.Type == html.ElementNode && (c.Data == "br" || c.Data == "p" || c.Data == "div") {
|
||
|
|
b.WriteString("\n")
|
||
|
|
}
|
||
|
|
b.WriteString(text)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return b.String()
|
||
|
|
}
|
||
|
|
|
||
|
|
// formatPostMarkdown formats a BitcoinTalk post as markdown.
|
||
|
|
func formatPostMarkdown(num int, post btPost) string {
|
||
|
|
var b strings.Builder
|
||
|
|
fmt.Fprintf(&b, "# Post %d by %s\n\n", num, post.Author)
|
||
|
|
|
||
|
|
if post.Date != "" {
|
||
|
|
fmt.Fprintf(&b, "**Date:** %s\n\n", post.Date)
|
||
|
|
}
|
||
|
|
|
||
|
|
b.WriteString(post.Content)
|
||
|
|
b.WriteString("\n")
|
||
|
|
|
||
|
|
return b.String()
|
||
|
|
}
|
||
|
|
|
||
|
|
// ParsePostsFromHTML parses BitcoinTalk posts from raw HTML content.
|
||
|
|
// This is exported for testing purposes.
|
||
|
|
func ParsePostsFromHTML(htmlContent string) ([]btPost, error) {
|
||
|
|
doc, err := html.Parse(strings.NewReader(htmlContent))
|
||
|
|
if err != nil {
|
||
|
|
return nil, core.E("collect.ParsePostsFromHTML", "failed to parse HTML", err)
|
||
|
|
}
|
||
|
|
return extractPosts(doc), nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// FormatPostMarkdown is exported for testing purposes.
|
||
|
|
func FormatPostMarkdown(num int, author, date, content string) string {
|
||
|
|
return formatPostMarkdown(num, btPost{Author: author, Date: date, Content: content})
|
||
|
|
}
|
||
|
|
|
||
|
|
// FetchPageFunc is an injectable function type for fetching pages, used in testing.
|
||
|
|
type FetchPageFunc func(ctx context.Context, url string) ([]btPost, error)
|
||
|
|
|
||
|
|
// BitcoinTalkCollectorWithFetcher wraps BitcoinTalkCollector with a custom fetcher for testing.
|
||
|
|
type BitcoinTalkCollectorWithFetcher struct {
|
||
|
|
BitcoinTalkCollector
|
||
|
|
Fetcher FetchPageFunc
|
||
|
|
}
|
||
|
|
|
||
|
|
// SetHTTPClient replaces the package-level HTTP client.
|
||
|
|
// Use this in tests to inject a custom transport or timeout.
|
||
|
|
func SetHTTPClient(c *http.Client) {
|
||
|
|
httpClient = c
|
||
|
|
}
|