cli/pkg/collect/papers.go

package collect

import (
	"context"
	"encoding/xml"
	"fmt"
	"net/http"
	"net/url"
	"path/filepath"
	"strings"

	core "github.com/host-uk/core/pkg/framework/core"
	"golang.org/x/net/html"
)

// Paper source identifiers.
const (
	PaperSourceIACR  = "iacr"
	PaperSourceArXiv = "arxiv"
	PaperSourceAll   = "all"
)

// PapersCollector collects papers from IACR and arXiv.
type PapersCollector struct {
	// Source is one of PaperSourceIACR, PaperSourceArXiv, or PaperSourceAll.
	Source string
	// Category is the arXiv category (e.g. "cs.CR" for cryptography).
	Category string
	// Query is the search query string.
	Query string
}

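// exampleCollect is an illustrative sketch, not part of the original file:
// it shows a minimal invocation of the collector, with ctx and cfg assumed
// to be supplied by the caller. With Source set to PaperSourceAll, Collect
// queries both archives, merges their results, and counts each failed
// source in Result.Errors.
func exampleCollect(ctx context.Context, cfg *Config) (*Result, error) {
	collector := &PapersCollector{
		Source:   PaperSourceAll,
		Category: "cs.CR", // arXiv-only filter; the IACR path ignores it
		Query:    "post-quantum signatures",
	}
	return collector.Collect(ctx, cfg)
}
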
// Name returns the collector name.
func (p *PapersCollector) Name() string {
	return fmt.Sprintf("papers:%s", p.Source)
}

// paper represents a parsed academic paper.
type paper struct {
	ID       string
	Title    string
	Authors  []string
	Abstract string
	Date     string
	URL      string
	Source   string
}

// Collect gathers papers from the configured sources.
func (p *PapersCollector) Collect(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: p.Name()}
	if p.Query == "" {
		return result, core.E("collect.Papers.Collect", "query is required", nil)
	}
	if cfg.Dispatcher != nil {
		cfg.Dispatcher.EmitStart(p.Name(), fmt.Sprintf("Starting paper collection for %q", p.Query))
	}
	if cfg.DryRun {
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitProgress(p.Name(), fmt.Sprintf("[dry-run] Would search papers for %q", p.Query), nil)
		}
		return result, nil
	}
	switch p.Source {
	case PaperSourceIACR:
		return p.collectIACR(ctx, cfg)
	case PaperSourceArXiv:
		return p.collectArXiv(ctx, cfg)
	case PaperSourceAll:
		iacrResult, iacrErr := p.collectIACR(ctx, cfg)
		arxivResult, arxivErr := p.collectArXiv(ctx, cfg)
		if iacrErr != nil && arxivErr != nil {
			return result, core.E("collect.Papers.Collect", "all sources failed", iacrErr)
		}
		merged := MergeResults(p.Name(), iacrResult, arxivResult)
		if iacrErr != nil {
			merged.Errors++
		}
		if arxivErr != nil {
			merged.Errors++
		}
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitComplete(p.Name(), fmt.Sprintf("Collected %d papers", merged.Items), merged)
		}
		return merged, nil
	default:
		return result, core.E("collect.Papers.Collect",
			fmt.Sprintf("unknown source: %s (use iacr, arxiv, or all)", p.Source), nil)
	}
}

// collectIACR fetches papers from the IACR ePrint archive.
func (p *PapersCollector) collectIACR(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:iacr"}
	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "iacr"); err != nil {
			return result, err
		}
	}
	searchURL := fmt.Sprintf("https://eprint.iacr.org/search?q=%s", url.QueryEscape(p.Query))
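	// Illustrative example (not from the original source): a Query of
	// "zero knowledge" yields
	// https://eprint.iacr.org/search?q=zero+knowledge.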
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")
	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectIACR",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to parse HTML", err)
	}
	papers := extractIACRPapers(doc)
	baseDir := filepath.Join(cfg.OutputDir, "papers", "iacr")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create output directory", err)
	}
	for _, ppr := range papers {
		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)
		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}
		result.Items++
		result.Files = append(result.Files, filePath)
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}
	return result, nil
}

// arxivFeed represents the Atom feed returned by the arXiv API.
type arxivFeed struct {
	XMLName xml.Name     `xml:"feed"`
	Entries []arxivEntry `xml:"entry"`
}

type arxivEntry struct {
	ID        string        `xml:"id"`
	Title     string        `xml:"title"`
	Summary   string        `xml:"summary"`
	Published string        `xml:"published"`
	Authors   []arxivAuthor `xml:"author"`
	Links     []arxivLink   `xml:"link"`
}

type arxivAuthor struct {
	Name string `xml:"name"`
}

type arxivLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}

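// exampleDecodeFeed is an illustrative sketch, not part of the original
// file: it decodes a minimal Atom document into arxivFeed, showing how the
// struct tags above map onto the feed the arXiv API returns.
func exampleDecodeFeed() (arxivFeed, error) {
	const atom = `<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2401.12345v1</id>
    <title>A Sample Paper</title>
    <summary>An abstract.</summary>
    <published>2024-01-15T00:00:00Z</published>
    <author><name>Alice</name></author>
    <link href="http://arxiv.org/abs/2401.12345v1" rel="alternate" type="text/html"/>
  </entry>
</feed>`
	var feed arxivFeed
	err := xml.Unmarshal([]byte(atom), &feed)
	return feed, err
}
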
// collectArXiv fetches papers from the arXiv API.
func (p *PapersCollector) collectArXiv(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:arxiv"}
	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "arxiv"); err != nil {
			return result, err
		}
	}
	query := url.QueryEscape(p.Query)
	if p.Category != "" {
		query = fmt.Sprintf("cat:%s+AND+%s", url.QueryEscape(p.Category), query)
	}
	searchURL := fmt.Sprintf("https://export.arxiv.org/api/query?search_query=%s&max_results=50", query)
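	// Illustrative example (not from the original source): Category "cs.CR"
	// with Query "zero knowledge" yields
	// https://export.arxiv.org/api/query?search_query=cat:cs.CR+AND+zero+knowledge&max_results=50.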
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")
	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectArXiv",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}
	var feed arxivFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to parse XML", err)
	}
	baseDir := filepath.Join(cfg.OutputDir, "papers", "arxiv")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create output directory", err)
	}
	for _, entry := range feed.Entries {
		ppr := arxivEntryToPaper(entry)
		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)
		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}
		result.Items++
		result.Files = append(result.Files, filePath)
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}
	return result, nil
}

// arxivEntryToPaper converts an arXiv Atom entry to a paper.
func arxivEntryToPaper(entry arxivEntry) paper {
	authors := make([]string, len(entry.Authors))
	for i, a := range entry.Authors {
		authors[i] = a.Name
	}
	// Extract the arXiv ID from the URL.
	id := entry.ID
	if idx := strings.LastIndex(id, "/abs/"); idx != -1 {
		id = id[idx+5:]
	}
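	// Illustrative values (not from the original source):
	// "http://arxiv.org/abs/2401.12345v2" yields "2401.12345v2", while an
	// old-style ID such as "http://arxiv.org/abs/cs/0112017" yields
	// "cs/0112017", which the replacements below turn into "cs-0112017".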
	// Replace characters that are not valid in file names.
	id = strings.ReplaceAll(id, "/", "-")
	id = strings.ReplaceAll(id, ":", "-")
	paperURL := entry.ID
	for _, link := range entry.Links {
		if link.Rel == "alternate" {
			paperURL = link.Href
			break
		}
	}
	return paper{
		ID:       id,
		Title:    strings.TrimSpace(entry.Title),
		Authors:  authors,
		Abstract: strings.TrimSpace(entry.Summary),
		Date:     entry.Published,
		URL:      paperURL,
		Source:   "arxiv",
	}
}

// extractIACRPapers extracts paper metadata from an IACR search results page.
func extractIACRPapers(doc *html.Node) []paper {
	var papers []paper
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "div" {
			for _, attr := range n.Attr {
				if attr.Key == "class" && strings.Contains(attr.Val, "paperentry") {
					ppr := parseIACREntry(n)
					if ppr.Title != "" {
						papers = append(papers, ppr)
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return papers
}

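// exampleExtractIACR is an illustrative sketch, not part of the original
// file: it runs extractIACRPapers over a minimal fragment shaped like the
// markup this parser expects. The class names and URL layout mirror the
// assumptions baked into parseIACREntry below.
func exampleExtractIACR() ([]paper, error) {
	const page = `<div class="paperentry">
  <a href="/eprint/2024/123">A Sample Paper</a>
  <span class="author">Alice</span>
  <span class="date">2024-01-15</span>
  <p class="abstract">An abstract.</p>
</div>`
	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		return nil, err
	}
	return extractIACRPapers(doc), nil
}
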
// parseIACREntry extracts paper data from an IACR paper entry div.
func parseIACREntry(node *html.Node) paper {
	ppr := paper{Source: "iacr"}
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "a":
				for _, attr := range n.Attr {
					if attr.Key == "href" && strings.Contains(attr.Val, "/eprint/") {
						ppr.URL = "https://eprint.iacr.org" + attr.Val
						// Extract the ID from the URL.
						parts := strings.Split(attr.Val, "/")
						if len(parts) >= 2 {
							ppr.ID = parts[len(parts)-2] + "-" + parts[len(parts)-1]
						}
					}
				}
				if ppr.Title == "" {
					ppr.Title = strings.TrimSpace(extractText(n))
				}
			case "span":
				for _, attr := range n.Attr {
					if attr.Key == "class" {
						switch {
						case strings.Contains(attr.Val, "author"):
							author := strings.TrimSpace(extractText(n))
							if author != "" {
								ppr.Authors = append(ppr.Authors, author)
							}
						case strings.Contains(attr.Val, "date"):
							ppr.Date = strings.TrimSpace(extractText(n))
						}
					}
				}
			case "p":
				for _, attr := range n.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "abstract") {
						ppr.Abstract = strings.TrimSpace(extractText(n))
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(node)
	return ppr
}

// formatPaperMarkdown formats a paper as markdown.
func formatPaperMarkdown(ppr paper) string {
	var b strings.Builder
	fmt.Fprintf(&b, "# %s\n\n", ppr.Title)
	if len(ppr.Authors) > 0 {
		fmt.Fprintf(&b, "- **Authors:** %s\n", strings.Join(ppr.Authors, ", "))
	}
	if ppr.Date != "" {
		fmt.Fprintf(&b, "- **Published:** %s\n", ppr.Date)
	}
	if ppr.URL != "" {
		fmt.Fprintf(&b, "- **URL:** %s\n", ppr.URL)
	}
	if ppr.Source != "" {
		fmt.Fprintf(&b, "- **Source:** %s\n", ppr.Source)
	}
	if ppr.Abstract != "" {
		fmt.Fprintf(&b, "\n## Abstract\n\n%s\n", ppr.Abstract)
	}
	return b.String()
}

// FormatPaperMarkdown is exported for testing.
func FormatPaperMarkdown(title string, authors []string, date, paperURL, source, abstract string) string {
	return formatPaperMarkdown(paper{
		Title:    title,
		Authors:  authors,
		Date:     date,
		URL:      paperURL,
		Source:   source,
		Abstract: abstract,
	})
}

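// exampleFormatPaper is an illustrative sketch, not part of the original
// file: it renders a paper through the exported helper. The output begins
// with "# A Sample Paper", followed by the metadata bullet list and an
// "## Abstract" section.
func exampleFormatPaper() string {
	return FormatPaperMarkdown(
		"A Sample Paper",
		[]string{"Alice", "Bob"},
		"2024-01-15",
		"https://example.org/paper",
		"iacr",
		"An abstract.",
	)
}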