package collect

import (
	"context"
	"encoding/xml"
	"fmt"
	"net/http"
	"net/url"
	"path/filepath"
	"strings"

	core "forge.lthn.ai/core/go/pkg/framework/core"
	"golang.org/x/net/html"
)

// Paper source identifiers.
const (
	PaperSourceIACR  = "iacr"
	PaperSourceArXiv = "arxiv"
	PaperSourceAll   = "all"
)

// PapersCollector collects papers from IACR and arXiv.
type PapersCollector struct {
	// Source is one of PaperSourceIACR, PaperSourceArXiv, or PaperSourceAll.
	Source string

	// Category is the arXiv category (e.g. "cs.CR" for Cryptography and Security).
	Category string

	// Query is the search query string.
	Query string
}

// Name returns the collector name.
func (p *PapersCollector) Name() string {
	return fmt.Sprintf("papers:%s", p.Source)
}
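
// Minimal usage sketch (assumes a Config wired up elsewhere in this
// package; ctx and cfg are placeholders):
//
//	pc := &PapersCollector{Source: PaperSourceAll, Category: "cs.CR", Query: "lattice"}
//	res, err := pc.Collect(ctx, cfg)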

// paper represents a parsed academic paper.
type paper struct {
	ID       string
	Title    string
	Authors  []string
	Abstract string
	Date     string
	URL      string
	Source   string
}

// Collect gathers papers from the configured sources.
func (p *PapersCollector) Collect(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: p.Name()}

	if p.Query == "" {
		return result, core.E("collect.Papers.Collect", "query is required", nil)
	}

	if cfg.Dispatcher != nil {
		cfg.Dispatcher.EmitStart(p.Name(), fmt.Sprintf("Starting paper collection for %q", p.Query))
	}

	if cfg.DryRun {
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitProgress(p.Name(), fmt.Sprintf("[dry-run] Would search papers for %q", p.Query), nil)
		}
		return result, nil
	}

	switch p.Source {
	case PaperSourceIACR:
		return p.collectIACR(ctx, cfg)
	case PaperSourceArXiv:
		return p.collectArXiv(ctx, cfg)
	case PaperSourceAll:
		iacrResult, iacrErr := p.collectIACR(ctx, cfg)
		arxivResult, arxivErr := p.collectArXiv(ctx, cfg)

		if iacrErr != nil && arxivErr != nil {
			// Surface both failures rather than silently dropping the arXiv error.
			return result, core.E("collect.Papers.Collect",
				fmt.Sprintf("all sources failed (arxiv: %v)", arxivErr), iacrErr)
		}

		merged := MergeResults(p.Name(), iacrResult, arxivResult)
		if iacrErr != nil {
			merged.Errors++
		}
		if arxivErr != nil {
			merged.Errors++
		}

		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitComplete(p.Name(), fmt.Sprintf("Collected %d papers", merged.Items), merged)
		}

		return merged, nil
	default:
		return result, core.E("collect.Papers.Collect",
			fmt.Sprintf("unknown source: %s (use iacr, arxiv, or all)", p.Source), nil)
	}
}

// collectIACR fetches papers from the IACR ePrint archive.
func (p *PapersCollector) collectIACR(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:iacr"}

	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "iacr"); err != nil {
			return result, err
		}
	}

	searchURL := fmt.Sprintf("https://eprint.iacr.org/search?q=%s", url.QueryEscape(p.Query))

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")

	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectIACR",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to parse HTML", err)
	}

	papers := extractIACRPapers(doc)

	baseDir := filepath.Join(cfg.OutputDir, "papers", "iacr")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create output directory", err)
	}

	for _, ppr := range papers {
		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)

		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}

		result.Items++
		result.Files = append(result.Files, filePath)

		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}

	return result, nil
}

// arxivFeed represents the Atom feed returned by the arXiv API.
type arxivFeed struct {
	XMLName xml.Name     `xml:"feed"`
	Entries []arxivEntry `xml:"entry"`
}

type arxivEntry struct {
	ID        string        `xml:"id"`
	Title     string        `xml:"title"`
	Summary   string        `xml:"summary"`
	Published string        `xml:"published"`
	Authors   []arxivAuthor `xml:"author"`
	Links     []arxivLink   `xml:"link"`
}

type arxivAuthor struct {
	Name string `xml:"name"`
}

type arxivLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}
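
// The structs above map onto an Atom payload of roughly this shape
// (abridged sketch for illustration, not a verbatim arXiv response):
//
//	<feed xmlns="http://www.w3.org/2005/Atom">
//	  <entry>
//	    <id>http://arxiv.org/abs/2401.01234v1</id>
//	    <title>Example Title</title>
//	    <summary>Example abstract...</summary>
//	    <published>2024-01-02T00:00:00Z</published>
//	    <author><name>Jane Doe</name></author>
//	    <link href="http://arxiv.org/abs/2401.01234v1" rel="alternate" type="text/html"/>
//	  </entry>
//	</feed>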

// collectArXiv fetches papers from the arXiv API.
func (p *PapersCollector) collectArXiv(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:arxiv"}

	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "arxiv"); err != nil {
			return result, err
		}
	}

	// arXiv's query syntax uses "+" for spaces and AND as a boolean
	// operator, so the operator is joined literally rather than escaped.
	query := url.QueryEscape(p.Query)
	if p.Category != "" {
		query = fmt.Sprintf("cat:%s+AND+%s", url.QueryEscape(p.Category), query)
	}

	searchURL := fmt.Sprintf("https://export.arxiv.org/api/query?search_query=%s&max_results=50", query)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")

	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectArXiv",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}

	var feed arxivFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to parse XML", err)
	}

	baseDir := filepath.Join(cfg.OutputDir, "papers", "arxiv")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create output directory", err)
	}

	for _, entry := range feed.Entries {
		ppr := arxivEntryToPaper(entry)

		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)

		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}

		result.Items++
		result.Files = append(result.Files, filePath)

		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}

	return result, nil
}

// arxivEntryToPaper converts an arXiv Atom entry to a paper.
func arxivEntryToPaper(entry arxivEntry) paper {
	authors := make([]string, len(entry.Authors))
	for i, a := range entry.Authors {
		authors[i] = a.Name
	}

	// Extract the arXiv ID from the URL.
	id := entry.ID
	if idx := strings.LastIndex(id, "/abs/"); idx != -1 {
		id = id[idx+5:]
	}
	// Replace characters that are not valid in file names.
	id = strings.ReplaceAll(id, "/", "-")
	id = strings.ReplaceAll(id, ":", "-")

	paperURL := entry.ID
	for _, link := range entry.Links {
		if link.Rel == "alternate" {
			paperURL = link.Href
			break
		}
	}

	return paper{
		ID:       id,
		Title:    strings.TrimSpace(entry.Title),
		Authors:  authors,
		Abstract: strings.TrimSpace(entry.Summary),
		Date:     entry.Published,
		URL:      paperURL,
		Source:   "arxiv",
	}
}
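
// For example (illustrative values), an entry ID of
// "http://arxiv.org/abs/2401.01234v1" yields the file-safe ID
// "2401.01234v1", and a legacy-style ID such as
// "http://arxiv.org/abs/cs/0112017v1" yields "cs-0112017v1".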

// extractIACRPapers extracts paper metadata from an IACR search results page.
func extractIACRPapers(doc *html.Node) []paper {
	var papers []paper
	var walk func(*html.Node)

	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "div" {
			for _, attr := range n.Attr {
				if attr.Key == "class" && strings.Contains(attr.Val, "paperentry") {
					ppr := parseIACREntry(n)
					if ppr.Title != "" {
						papers = append(papers, ppr)
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}

	walk(doc)
	return papers
}

// parseIACREntry extracts paper data from an IACR paper entry div.
func parseIACREntry(node *html.Node) paper {
	ppr := paper{Source: "iacr"}
	var walk func(*html.Node)

	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "a":
				for _, attr := range n.Attr {
					if attr.Key == "href" && strings.Contains(attr.Val, "/eprint/") {
						ppr.URL = "https://eprint.iacr.org" + attr.Val
						// Extract the ID from the URL path, e.g.
						// "/eprint/2024/123" becomes "2024-123".
						parts := strings.Split(attr.Val, "/")
						if len(parts) >= 2 {
							ppr.ID = parts[len(parts)-2] + "-" + parts[len(parts)-1]
						}
					}
				}
				// Treat the first anchor's text in the entry as the paper title.
				if ppr.Title == "" {
					ppr.Title = strings.TrimSpace(extractText(n))
				}
			case "span":
				for _, attr := range n.Attr {
					if attr.Key == "class" {
						switch {
						case strings.Contains(attr.Val, "author"):
							author := strings.TrimSpace(extractText(n))
							if author != "" {
								ppr.Authors = append(ppr.Authors, author)
							}
						case strings.Contains(attr.Val, "date"):
							ppr.Date = strings.TrimSpace(extractText(n))
						}
					}
				}
			case "p":
				for _, attr := range n.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "abstract") {
						ppr.Abstract = strings.TrimSpace(extractText(n))
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}

	walk(node)
	return ppr
}
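
// A minimal sketch of the search-result markup this parser assumes
// (hypothetical, reconstructed from the selectors above; the live
// IACR page may differ):
//
//	<div class="paperentry">
//	  <a href="/eprint/2024/123">Paper Title</a>
//	  <span class="author">Jane Doe</span>
//	  <span class="date">2024-01-02</span>
//	  <p class="abstract">Abstract text...</p>
//	</div>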

// formatPaperMarkdown formats a paper as markdown.
func formatPaperMarkdown(ppr paper) string {
	var b strings.Builder
	fmt.Fprintf(&b, "# %s\n\n", ppr.Title)

	if len(ppr.Authors) > 0 {
		fmt.Fprintf(&b, "- **Authors:** %s\n", strings.Join(ppr.Authors, ", "))
	}
	if ppr.Date != "" {
		fmt.Fprintf(&b, "- **Published:** %s\n", ppr.Date)
	}
	if ppr.URL != "" {
		fmt.Fprintf(&b, "- **URL:** %s\n", ppr.URL)
	}
	if ppr.Source != "" {
		fmt.Fprintf(&b, "- **Source:** %s\n", ppr.Source)
	}

	if ppr.Abstract != "" {
		fmt.Fprintf(&b, "\n## Abstract\n\n%s\n", ppr.Abstract)
	}

	return b.String()
}
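
// The generated markdown has this shape (illustrative values):
//
//	# Paper Title
//
//	- **Authors:** Jane Doe, John Roe
//	- **Published:** 2024-01-02
//	- **URL:** https://example.org/paper
//	- **Source:** arxiv
//
//	## Abstract
//
//	Abstract text...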

// FormatPaperMarkdown is exported for testing.
func FormatPaperMarkdown(title string, authors []string, date, paperURL, source, abstract string) string {
	return formatPaperMarkdown(paper{
		Title:    title,
		Authors:  authors,
		Date:     date,
		URL:      paperURL,
		Source:   source,
		Abstract: abstract,
	})
}