// cli/pkg/collect/papers.go

package collect

import (
	"context"
	"encoding/xml"
	"fmt"
	"net/http"
	"net/url"
	"path/filepath"
	"strings"

	core "github.com/host-uk/core/pkg/framework/core"
	"golang.org/x/net/html"
)

// Paper source identifiers.
const (
	PaperSourceIACR  = "iacr"
	PaperSourceArXiv = "arxiv"
	PaperSourceAll   = "all"
)

// PapersCollector collects papers from IACR and arXiv.
type PapersCollector struct {
	// Source is one of PaperSourceIACR, PaperSourceArXiv, or PaperSourceAll.
	Source string
	// Category is the arXiv category (e.g. "cs.CR" for cryptography).
	Category string
	// Query is the search query string.
	Query string
}
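
// Example usage (a hypothetical sketch; the field values are illustrative,
// and ctx/cfg are assumed to come from the surrounding CLI wiring):
//
//	pc := &PapersCollector{
//		Source:   PaperSourceArXiv,
//		Category: "cs.CR",
//		Query:    "lattice cryptanalysis",
//	}
//	result, err := pc.Collect(ctx, cfg)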

// Name returns the collector name.
func (p *PapersCollector) Name() string {
	return fmt.Sprintf("papers:%s", p.Source)
}

// paper represents a parsed academic paper.
type paper struct {
	ID       string
	Title    string
	Authors  []string
	Abstract string
	Date     string
	URL      string
	Source   string
}

// Collect gathers papers from the configured sources.
func (p *PapersCollector) Collect(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: p.Name()}
	if p.Query == "" {
		return result, core.E("collect.Papers.Collect", "query is required", nil)
	}
	if cfg.Dispatcher != nil {
		cfg.Dispatcher.EmitStart(p.Name(), fmt.Sprintf("Starting paper collection for %q", p.Query))
	}
	if cfg.DryRun {
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitProgress(p.Name(), fmt.Sprintf("[dry-run] Would search papers for %q", p.Query), nil)
		}
		return result, nil
	}
	switch p.Source {
	case PaperSourceIACR:
		return p.collectIACR(ctx, cfg)
	case PaperSourceArXiv:
		return p.collectArXiv(ctx, cfg)
	case PaperSourceAll:
		iacrResult, iacrErr := p.collectIACR(ctx, cfg)
		arxivResult, arxivErr := p.collectArXiv(ctx, cfg)
		if iacrErr != nil && arxivErr != nil {
			return result, core.E("collect.Papers.Collect", "all sources failed", iacrErr)
		}
		merged := MergeResults(p.Name(), iacrResult, arxivResult)
		if iacrErr != nil {
			merged.Errors++
		}
		if arxivErr != nil {
			merged.Errors++
		}
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitComplete(p.Name(), fmt.Sprintf("Collected %d papers", merged.Items), merged)
		}
		return merged, nil
	default:
		return result, core.E("collect.Papers.Collect",
			fmt.Sprintf("unknown source: %s (use iacr, arxiv, or all)", p.Source), nil)
	}
}

// collectIACR fetches papers from the IACR ePrint archive.
func (p *PapersCollector) collectIACR(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:iacr"}
	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "iacr"); err != nil {
			return result, err
		}
	}
	searchURL := fmt.Sprintf("https://eprint.iacr.org/search?q=%s", url.QueryEscape(p.Query))
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")
	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectIACR",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to parse HTML", err)
	}
	papers := extractIACRPapers(doc)
	baseDir := filepath.Join(cfg.OutputDir, "papers", "iacr")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create output directory", err)
	}
	for _, ppr := range papers {
		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)
		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}
		result.Items++
		result.Files = append(result.Files, filePath)
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}
	return result, nil
}
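
// For reference: a query of "lattice" produces a request to
// https://eprint.iacr.org/search?q=lattice, and each parsed paper is written
// to <OutputDir>/papers/iacr/<ID>.md.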

// arxivFeed represents the Atom feed returned by the arXiv API.
type arxivFeed struct {
	XMLName xml.Name     `xml:"feed"`
	Entries []arxivEntry `xml:"entry"`
}

type arxivEntry struct {
	ID        string        `xml:"id"`
	Title     string        `xml:"title"`
	Summary   string        `xml:"summary"`
	Published string        `xml:"published"`
	Authors   []arxivAuthor `xml:"author"`
	Links     []arxivLink   `xml:"link"`
}

type arxivAuthor struct {
	Name string `xml:"name"`
}

type arxivLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}
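
// A typical arXiv Atom entry maps onto the structs above roughly like this
// (an illustrative sketch; real responses carry additional fields):
//
//	<entry>
//		<id>http://arxiv.org/abs/2101.00001v1</id>
//		<title>Example Title</title>
//		<summary>Example abstract…</summary>
//		<published>2021-01-01T00:00:00Z</published>
//		<author><name>Jane Doe</name></author>
//		<link href="http://arxiv.org/abs/2101.00001v1" rel="alternate" type="text/html"/>
//	</entry>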

// collectArXiv fetches papers from the arXiv API.
func (p *PapersCollector) collectArXiv(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:arxiv"}
	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "arxiv"); err != nil {
			return result, err
		}
	}
	query := url.QueryEscape(p.Query)
	if p.Category != "" {
		query = fmt.Sprintf("cat:%s+AND+%s", url.QueryEscape(p.Category), query)
	}
	searchURL := fmt.Sprintf("https://export.arxiv.org/api/query?search_query=%s&max_results=50", query)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")
	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectArXiv",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}
	var feed arxivFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to parse XML", err)
	}
	baseDir := filepath.Join(cfg.OutputDir, "papers", "arxiv")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create output directory", err)
	}
	for _, entry := range feed.Entries {
		ppr := arxivEntryToPaper(entry)
		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)
		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}
		result.Items++
		result.Files = append(result.Files, filePath)
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}
	return result, nil
}
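
// For reference: Category "cs.CR" and Query "zero knowledge" produce a
// request like
// https://export.arxiv.org/api/query?search_query=cat:cs.CR+AND+zero+knowledge&max_results=50
// (url.QueryEscape encodes spaces as "+", which the arXiv API treats as
// separators in search_query).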

// arxivEntryToPaper converts an arXiv Atom entry to a paper.
func arxivEntryToPaper(entry arxivEntry) paper {
	authors := make([]string, len(entry.Authors))
	for i, a := range entry.Authors {
		authors[i] = a.Name
	}
	// Extract the arXiv ID from the URL
	id := entry.ID
	if idx := strings.LastIndex(id, "/abs/"); idx != -1 {
		id = id[idx+5:]
	}
	// Replace characters that are not valid in file names
	id = strings.ReplaceAll(id, "/", "-")
	id = strings.ReplaceAll(id, ":", "-")
	paperURL := entry.ID
	for _, link := range entry.Links {
		if link.Rel == "alternate" {
			paperURL = link.Href
			break
		}
	}
	return paper{
		ID:       id,
		Title:    strings.TrimSpace(entry.Title),
		Authors:  authors,
		Abstract: strings.TrimSpace(entry.Summary),
		Date:     entry.Published,
		URL:      paperURL,
		Source:   "arxiv",
	}
}
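
// For reference: an entry ID such as http://arxiv.org/abs/2101.00001v2
// becomes the file-safe ID "2101.00001v2", and an old-style ID such as
// http://arxiv.org/abs/cs/9901001v1 becomes "cs-9901001v1".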

// extractIACRPapers extracts paper metadata from an IACR search results page.
func extractIACRPapers(doc *html.Node) []paper {
	var papers []paper
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "div" {
			for _, attr := range n.Attr {
				if attr.Key == "class" && strings.Contains(attr.Val, "paperentry") {
					ppr := parseIACREntry(n)
					if ppr.Title != "" {
						papers = append(papers, ppr)
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return papers
}

// parseIACREntry extracts paper data from an IACR paper entry div.
func parseIACREntry(node *html.Node) paper {
	ppr := paper{Source: "iacr"}
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "a":
				for _, attr := range n.Attr {
					if attr.Key == "href" && strings.Contains(attr.Val, "/eprint/") {
						ppr.URL = "https://eprint.iacr.org" + attr.Val
						// Extract ID from URL
						parts := strings.Split(attr.Val, "/")
						if len(parts) >= 2 {
							ppr.ID = parts[len(parts)-2] + "-" + parts[len(parts)-1]
						}
					}
				}
				if ppr.Title == "" {
					ppr.Title = strings.TrimSpace(extractText(n))
				}
			case "span":
				for _, attr := range n.Attr {
					if attr.Key == "class" {
						switch {
						case strings.Contains(attr.Val, "author"):
							author := strings.TrimSpace(extractText(n))
							if author != "" {
								ppr.Authors = append(ppr.Authors, author)
							}
						case strings.Contains(attr.Val, "date"):
							ppr.Date = strings.TrimSpace(extractText(n))
						}
					}
				}
			case "p":
				for _, attr := range n.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "abstract") {
						ppr.Abstract = strings.TrimSpace(extractText(n))
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(node)
	return ppr
}
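
// The two IACR walkers above assume search-result markup shaped roughly like
// the following (an illustrative sketch of the expected classes, not a
// verbatim extract of the live page):
//
//	<div class="paperentry">
//		<a href="/eprint/2024/123">Paper title</a>
//		<span class="author">Jane Doe</span>
//		<span class="date">2024-01-15</span>
//		<p class="abstract">Abstract text…</p>
//	</div>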

// formatPaperMarkdown formats a paper as markdown.
func formatPaperMarkdown(ppr paper) string {
	var b strings.Builder
	fmt.Fprintf(&b, "# %s\n\n", ppr.Title)
	if len(ppr.Authors) > 0 {
		fmt.Fprintf(&b, "- **Authors:** %s\n", strings.Join(ppr.Authors, ", "))
	}
	if ppr.Date != "" {
		fmt.Fprintf(&b, "- **Published:** %s\n", ppr.Date)
	}
	if ppr.URL != "" {
		fmt.Fprintf(&b, "- **URL:** %s\n", ppr.URL)
	}
	if ppr.Source != "" {
		fmt.Fprintf(&b, "- **Source:** %s\n", ppr.Source)
	}
	if ppr.Abstract != "" {
		fmt.Fprintf(&b, "\n## Abstract\n\n%s\n", ppr.Abstract)
	}
	return b.String()
}
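
// For reference, the generated markdown looks like:
//
//	# Paper title
//
//	- **Authors:** Jane Doe, John Smith
//	- **Published:** 2024-01-15
//	- **URL:** https://eprint.iacr.org/2024/123
//	- **Source:** iacr
//
//	## Abstract
//
//	Abstract text…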

// FormatPaperMarkdown is exported for testing.
func FormatPaperMarkdown(title string, authors []string, date, paperURL, source, abstract string) string {
	return formatPaperMarkdown(paper{
		Title:    title,
		Authors:  authors,
		Date:     date,
		URL:      paperURL,
		Source:   source,
		Abstract: abstract,
	})
}
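
// A minimal sketch of how a test might exercise the exported helper (the
// test name and assertion are hypothetical; a real test file would import
// "strings" and "testing"):
//
//	func TestFormatPaperMarkdown(t *testing.T) {
//		got := FormatPaperMarkdown("Title", []string{"Jane Doe"}, "2024-01-15",
//			"https://example.org/paper", "iacr", "Abstract text")
//		if !strings.Contains(got, "# Title") {
//			t.Fatalf("missing title heading: %q", got)
//		}
//	}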