cli/pkg/collect/bitcointalk.go

298 lines
7.7 KiB
Go
Raw Normal View History

feat: infrastructure packages and lint cleanup (#281) * ci: consolidate duplicate workflows and merge CodeQL configs Remove 17 duplicate workflow files that were split copies of the combined originals. Each family (CI, CodeQL, Coverage, PR Build, Alpha Release) had the same job duplicated across separate push/pull_request/schedule/manual trigger files. Merge codeql.yml and codescan.yml into a single codeql.yml with a language matrix covering go, javascript-typescript, python, and actions — matching the previous default setup coverage. Remaining workflows (one per family): - ci.yml (push + PR + manual) - codeql.yml (push + PR + schedule, all languages) - coverage.yml (push + PR + manual) - alpha-release.yml (push + manual) - pr-build.yml (PR + manual) - release.yml (tag push) - agent-verify.yml, auto-label.yml, auto-project.yml Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * feat: add collect, config, crypt, plugin packages and fix all lint issues Add four new infrastructure packages with CLI commands: - pkg/config: layered configuration (defaults → file → env → flags) - pkg/crypt: crypto primitives (Argon2id, AES-GCM, ChaCha20, HMAC, checksums) - pkg/plugin: plugin system with GitHub-based install/update/remove - pkg/collect: collection subsystem (GitHub, BitcoinTalk, market, papers, excavate) Fix all golangci-lint issues across the entire codebase (~100 errcheck, staticcheck SA1012/SA1019/ST1005, unused, ineffassign fixes) so that `core go qa` passes with 0 issues. Closes #167, #168, #170, #250, #251, #252, #253, #254, #255, #256 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 11:34:43 +00:00
package collect
import (
"context"
"fmt"
"net/http"
"path/filepath"
"strings"
"time"
core "github.com/host-uk/core/pkg/framework/core"
"golang.org/x/net/html"
)
// httpClient is the single HTTP client shared by every collection request in
// this package. Requests time out after 30 seconds. Tests can swap it out
// with SetHTTPClient.
var httpClient = &http.Client{Timeout: 30 * time.Second}
// BitcoinTalkCollector collects forum posts from a single BitcoinTalk topic.
// Its Collect method fetches the topic page by page and hands every parsed
// post to the configured output as a Markdown document.
type BitcoinTalkCollector struct {
// TopicID is the numeric topic identifier. It is interpolated into both
// the page URL and the output directory path.
TopicID string
// URL is a full URL to a BitcoinTalk topic page, intended as an
// alternative way of identifying the topic. Name reports the identifier
// as "url" when URL is set and TopicID is empty.
// NOTE(review): confirm that Collect derives the topic ID from URL when
// TopicID is empty.
URL string
// Pages limits collection to this many pages. 0 (or any value that is
// not positive) means all pages.
Pages int
}
// Name returns the collector name in the form "bitcointalk:<id>".
// The id is TopicID when that is set, the literal "url" when only URL is set,
// and empty when neither field is set.
func (b *BitcoinTalkCollector) Name() string {
	suffix := "url"
	if b.TopicID != "" || b.URL == "" {
		// Either an explicit topic ID wins, or there is nothing to fall
		// back on and the (empty) TopicID is used as-is.
		suffix = b.TopicID
	}
	return fmt.Sprintf("bitcointalk:%s", suffix)
}
// Collect gathers posts from a BitcoinTalk topic and writes each one as a
// Markdown file at <cfg.OutputDir>/bitcointalk/<topicID>/posts/<n>.md, where
// n counts posts from 1 in the order they are encountered.
//
// The topic is identified by TopicID. When TopicID is empty, the ID is taken
// from the "topic" query parameter of URL. An error is returned when neither
// yields an ID.
//
// Pages are requested in steps of 20 posts until one of the following holds:
// Pages pages have been processed (only when Pages > 0), a page yields no
// posts, a page yields fewer than 20 posts, or a fetch fails. A failed fetch
// or a failed write is counted in result.Errors and does not make Collect
// return an error. Collect returns an error only for a missing topic ID, a
// failure to create the output directory, a cancelled context, or a
// rate-limiter error. The (possibly partial) result is returned alongside
// any error.
//
// In dry-run mode nothing is fetched or written.
func (b *BitcoinTalkCollector) Collect(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: b.Name()}

	if cfg.Dispatcher != nil {
		cfg.Dispatcher.EmitStart(b.Name(), "Starting BitcoinTalk collection")
	}

	topicID := b.TopicID
	if topicID == "" && b.URL != "" {
		// URL is offered as an alternative to TopicID, so honour it rather
		// than rejecting a collector that was only given a URL.
		topicID = topicIDFromURL(b.URL)
	}
	if topicID == "" {
		return result, core.E("collect.BitcoinTalk.Collect", "topic ID is required", nil)
	}

	if cfg.DryRun {
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitProgress(b.Name(), fmt.Sprintf("[dry-run] Would collect topic %s", topicID), nil)
		}
		return result, nil
	}

	baseDir := filepath.Join(cfg.OutputDir, "bitcointalk", topicID, "posts")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.BitcoinTalk.Collect", "failed to create output directory", err)
	}

	// The topic URL addresses pages by post offset, so advance by one page
	// worth of posts per iteration.
	const postsPerPage = 20

	postNum := 0
	offset := 0
	pageCount := 0

	for {
		if ctx.Err() != nil {
			return result, core.E("collect.BitcoinTalk.Collect", "context cancelled", ctx.Err())
		}

		if b.Pages > 0 && pageCount >= b.Pages {
			break
		}

		if cfg.Limiter != nil {
			if err := cfg.Limiter.Wait(ctx, "bitcointalk"); err != nil {
				return result, err
			}
		}

		pageURL := fmt.Sprintf("https://bitcointalk.org/index.php?topic=%s.%d", topicID, offset)

		posts, err := b.fetchPage(ctx, pageURL)
		if err != nil {
			// Best effort: keep what was collected so far and stop paging.
			result.Errors++
			if cfg.Dispatcher != nil {
				cfg.Dispatcher.EmitError(b.Name(), fmt.Sprintf("Failed to fetch page at offset %d: %v", offset, err), nil)
			}
			break
		}

		if len(posts) == 0 {
			break
		}

		for _, post := range posts {
			postNum++
			filePath := filepath.Join(baseDir, fmt.Sprintf("%d.md", postNum))
			content := formatPostMarkdown(postNum, post)

			if err := cfg.Output.Write(filePath, content); err != nil {
				result.Errors++
				continue
			}

			result.Items++
			result.Files = append(result.Files, filePath)

			if cfg.Dispatcher != nil {
				cfg.Dispatcher.EmitItem(b.Name(), fmt.Sprintf("Post %d by %s", postNum, post.Author), nil)
			}
		}

		pageCount++
		offset += postsPerPage

		// If we got fewer posts than expected, we've reached the end.
		if len(posts) < postsPerPage {
			break
		}
	}

	if cfg.Dispatcher != nil {
		cfg.Dispatcher.EmitComplete(b.Name(), fmt.Sprintf("Collected %d posts", result.Items), result)
	}

	return result, nil
}

// topicIDFromURL returns the leading run of digits of the "topic" query
// parameter in rawURL (for example "12345" for "...?topic=12345.40;all"), or
// "" when the URL carries no such parameter.
func topicIDFromURL(rawURL string) string {
	q := strings.IndexByte(rawURL, '?')
	if q < 0 {
		return ""
	}
	query := rawURL[q+1:]
	if frag := strings.IndexByte(query, '#'); frag >= 0 {
		query = query[:frag]
	}

	// Accept both '&' and ';' as parameter separators.
	params := strings.FieldsFunc(query, func(r rune) bool { return r == '&' || r == ';' })
	for _, param := range params {
		if !strings.HasPrefix(param, "topic=") {
			continue
		}
		value := param[len("topic="):]
		// Drop the ".<offset>" / ".msg<id>" suffix and anything else that is
		// not part of the numeric ID.
		if end := strings.IndexFunc(value, func(r rune) bool { return r < '0' || r > '9' }); end >= 0 {
			value = value[:end]
		}
		if value != "" {
			return value
		}
	}
	return ""
}
// btPost represents a parsed BitcoinTalk forum post: the poster's name
// (Author), the post date exactly as it appears on the page (Date), and the
// text of the post body (Content).
type btPost struct {
	Author, Date, Content string
}
// fetchPage downloads one BitcoinTalk topic page with the package-level HTTP
// client and returns the posts parsed from it.
//
// The request is bound to ctx and sent with a fixed User-Agent header. An
// error is returned when the request cannot be built or sent, when the
// response status is anything other than 200 OK, or when the body cannot be
// parsed as HTML.
func (b *BitcoinTalkCollector) fetchPage(ctx context.Context, pageURL string) ([]btPost, error) {
	const op = "collect.BitcoinTalk.fetchPage"

	request, err := http.NewRequestWithContext(ctx, http.MethodGet, pageURL, nil)
	if err != nil {
		return nil, core.E(op, "failed to create request", err)
	}
	request.Header.Set("User-Agent", "Mozilla/5.0 (compatible; CoreCollector/1.0)")

	response, err := httpClient.Do(request)
	if err != nil {
		return nil, core.E(op, "request failed", err)
	}
	// A close error on a body we only read from is not actionable.
	defer func() { _ = response.Body.Close() }()

	if status := response.StatusCode; status != http.StatusOK {
		return nil, core.E(op, fmt.Sprintf("unexpected status code: %d", status), nil)
	}

	root, err := html.Parse(response.Body)
	if err != nil {
		return nil, core.E(op, "failed to parse HTML", err)
	}
	return extractPosts(root), nil
}
// extractPosts extracts post data from a parsed HTML document.
//
// It visits every node under doc and, for each <div> whose class list
// contains the exact token "post", parses the element with parsePost. Posts
// whose content is empty are dropped. Posts are returned in document order.
//
// The class is matched as a whitespace-separated token rather than as a
// substring so that related classes such as "poster_info" or "headerandpost"
// are not mistaken for post containers and reported as extra posts.
func extractPosts(doc *html.Node) []btPost {
	// isPostDiv reports whether n is a <div> carrying the class token "post".
	isPostDiv := func(n *html.Node) bool {
		if n.Type != html.ElementNode || n.Data != "div" {
			return false
		}
		for _, attr := range n.Attr {
			if attr.Key != "class" {
				continue
			}
			for _, token := range strings.Fields(attr.Val) {
				if token == "post" {
					return true
				}
			}
		}
		return false
	}

	var posts []btPost
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if isPostDiv(n) {
			if post := parsePost(n); post.Content != "" {
				posts = append(posts, post)
			}
		}
		// Keep descending even below a match, as post containers may be
		// nested anywhere in the tree.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return posts
}
// parsePost extracts author, date, and content from a post div.
//
// It walks node and all of its descendants and inspects each element's class
// attribute (substring match):
//   - "poster_info": the element's text becomes Author;
//   - "headerandpost": the text of a direct child <div> whose class contains
//     "smalltext" becomes Date;
//   - "inner": the element's text becomes Content.
//
// All three values are stored with surrounding whitespace removed, so that
// indentation and newlines from the markup do not leak into the generated
// Markdown heading. When several elements match the same rule, the last one
// visited wins. Fields with no matching element stay empty.
func parsePost(node *html.Node) btPost {
	post := btPost{}
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			for _, attr := range n.Attr {
				if attr.Key != "class" {
					continue
				}
				switch {
				case strings.Contains(attr.Val, "poster_info"):
					post.Author = strings.TrimSpace(extractText(n))
				case strings.Contains(attr.Val, "headerandpost"):
					// Look for the date in a directly nested smalltext div.
					for c := n.FirstChild; c != nil; c = c.NextSibling {
						if c.Type != html.ElementNode || c.Data != "div" {
							continue
						}
						for _, a := range c.Attr {
							if a.Key == "class" && strings.Contains(a.Val, "smalltext") {
								post.Date = strings.TrimSpace(extractText(c))
							}
						}
					}
				case strings.Contains(attr.Val, "inner"):
					post.Content = strings.TrimSpace(extractText(n))
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(node)
	return post
}
// extractText recursively extracts text content from an HTML node.
//
// A text node yields its data unchanged. For any other node the text of its
// children is concatenated in order, with a newline inserted:
//   - for every <br> child, and
//   - before the text of every non-empty <p> or <div> child,
//
// but only when some text has already been written, so the result never
// starts with an inserted newline. Children that produce no text add nothing.
func extractText(n *html.Node) string {
	if n.Type == html.TextNode {
		return n.Data
	}

	var b strings.Builder
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.ElementNode && c.Data == "br" {
			// <br> has no children, so it never yields text of its own; the
			// line break has to be emitted here or it is lost entirely.
			if b.Len() > 0 {
				b.WriteString("\n")
			}
			continue
		}

		text := extractText(c)
		if text == "" {
			continue
		}
		if b.Len() > 0 && c.Type == html.ElementNode && (c.Data == "p" || c.Data == "div") {
			b.WriteString("\n")
		}
		b.WriteString(text)
	}
	return b.String()
}
// formatPostMarkdown renders a BitcoinTalk post as a Markdown document: a
// "# Post <num> by <author>" heading, an optional bold "Date" line (omitted
// when the post has no date), and the post content followed by a newline.
func formatPostMarkdown(num int, post btPost) string {
	sections := []string{fmt.Sprintf("# Post %d by %s\n\n", num, post.Author)}
	if post.Date != "" {
		sections = append(sections, fmt.Sprintf("**Date:** %s\n\n", post.Date))
	}
	sections = append(sections, post.Content, "\n")
	return strings.Join(sections, "")
}
// ParsePostsFromHTML parses htmlContent as an HTML document and returns the
// BitcoinTalk posts found in it. An error is returned only when the HTML
// parser itself fails.
// This is exported for testing purposes.
func ParsePostsFromHTML(htmlContent string) ([]btPost, error) {
	root, parseErr := html.Parse(strings.NewReader(htmlContent))
	if parseErr == nil {
		return extractPosts(root), nil
	}
	return nil, core.E("collect.ParsePostsFromHTML", "failed to parse HTML", parseErr)
}
// FormatPostMarkdown is exported for testing purposes.
func FormatPostMarkdown(num int, author, date, content string) string {
return formatPostMarkdown(num, btPost{Author: author, Date: date, Content: content})
}
// FetchPageFunc is an injectable function type for fetching pages, used in testing.
// It has the same shape as BitcoinTalkCollector.fetchPage: given a context
// and a page URL it returns the posts parsed from that page, or an error.
type FetchPageFunc func(ctx context.Context, url string) ([]btPost, error)
// BitcoinTalkCollectorWithFetcher wraps BitcoinTalkCollector with a custom fetcher for testing.
//
// NOTE(review): the methods promoted from the embedded BitcoinTalkCollector
// do not consult Fetcher (Collect calls the collector's own fetchPage), and
// nothing else visible in this file reads the field. Confirm where it is
// meant to be wired in.
type BitcoinTalkCollectorWithFetcher struct {
// BitcoinTalkCollector is embedded, so its fields and methods are promoted
// to the wrapper.
BitcoinTalkCollector
// Fetcher is the replacement page-fetching function.
Fetcher FetchPageFunc
}
// SetHTTPClient replaces the package-level HTTP client.
// Use this in tests to inject a custom transport or timeout.
//
// Passing nil installs a fresh client with the default 30-second timeout
// instead of leaving the package with a nil client, which would panic on the
// next request. The assignment is not synchronised, so call it before any
// collection is running.
func SetHTTPClient(c *http.Client) {
	if c == nil {
		c = &http.Client{Timeout: 30 * time.Second}
	}
	httpClient = c
}