cli/pkg/collect/papers.go

package collect

import (
	"context"
	"encoding/xml"
	"fmt"
	"net/http"
	"net/url"
	"path/filepath"
	"strings"

	core "github.com/host-uk/core/pkg/framework/core"
	"golang.org/x/net/html"
)

// Paper source identifiers.
const (
	PaperSourceIACR  = "iacr"
	PaperSourceArXiv = "arxiv"
	PaperSourceAll   = "all"
)

// PapersCollector collects papers from IACR and arXiv.
type PapersCollector struct {
	// Source is one of PaperSourceIACR, PaperSourceArXiv, or PaperSourceAll.
	Source string
	// Category is the arXiv category (e.g. "cs.CR" for cryptography).
	Category string
	// Query is the search query string.
	Query string
}

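// exampleCollect is an illustrative sketch, not part of the original file:
// it shows a minimal invocation of the collector, with ctx and cfg assumed
// to be supplied by the caller. With Source set to PaperSourceAll, Collect
// queries both archives, merges their results, and counts each failed
// source in Result.Errors.
func exampleCollect(ctx context.Context, cfg *Config) (*Result, error) {
	collector := &PapersCollector{
		Source:   PaperSourceAll,
		Category: "cs.CR", // arXiv-only filter; the IACR path ignores it
		Query:    "post-quantum signatures",
	}
	return collector.Collect(ctx, cfg)
}
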
// Name returns the collector name.
func (p *PapersCollector) Name() string {
	return fmt.Sprintf("papers:%s", p.Source)
}

// paper represents a parsed academic paper.
type paper struct {
	ID       string
	Title    string
	Authors  []string
	Abstract string
	Date     string
	URL      string
	Source   string
}

// Collect gathers papers from the configured sources.
func (p *PapersCollector) Collect(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: p.Name()}
	if p.Query == "" {
		return result, core.E("collect.Papers.Collect", "query is required", nil)
	}
	if cfg.Dispatcher != nil {
		cfg.Dispatcher.EmitStart(p.Name(), fmt.Sprintf("Starting paper collection for %q", p.Query))
	}
	if cfg.DryRun {
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitProgress(p.Name(), fmt.Sprintf("[dry-run] Would search papers for %q", p.Query), nil)
		}
		return result, nil
	}
	switch p.Source {
	case PaperSourceIACR:
		return p.collectIACR(ctx, cfg)
	case PaperSourceArXiv:
		return p.collectArXiv(ctx, cfg)
	case PaperSourceAll:
		iacrResult, iacrErr := p.collectIACR(ctx, cfg)
		arxivResult, arxivErr := p.collectArXiv(ctx, cfg)
		if iacrErr != nil && arxivErr != nil {
			return result, core.E("collect.Papers.Collect", "all sources failed", iacrErr)
		}
		merged := MergeResults(p.Name(), iacrResult, arxivResult)
		if iacrErr != nil {
			merged.Errors++
		}
		if arxivErr != nil {
			merged.Errors++
		}
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitComplete(p.Name(), fmt.Sprintf("Collected %d papers", merged.Items), merged)
		}
		return merged, nil
	default:
		return result, core.E("collect.Papers.Collect",
			fmt.Sprintf("unknown source: %s (use iacr, arxiv, or all)", p.Source), nil)
	}
}

// collectIACR fetches papers from the IACR ePrint archive.
func (p *PapersCollector) collectIACR(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:iacr"}
	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "iacr"); err != nil {
			return result, err
		}
	}
	searchURL := fmt.Sprintf("https://eprint.iacr.org/search?q=%s", url.QueryEscape(p.Query))
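	// Illustrative example (not from the original source): a Query of
	// "zero knowledge" yields
	// https://eprint.iacr.org/search?q=zero+knowledge.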
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")
	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectIACR",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to parse HTML", err)
	}
	papers := extractIACRPapers(doc)
	baseDir := filepath.Join(cfg.OutputDir, "papers", "iacr")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create output directory", err)
	}
	for _, ppr := range papers {
		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)
		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}
		result.Items++
		result.Files = append(result.Files, filePath)
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}
	return result, nil
}

// arxivFeed represents the Atom feed returned by the arXiv API.
type arxivFeed struct {
	XMLName xml.Name     `xml:"feed"`
	Entries []arxivEntry `xml:"entry"`
}

type arxivEntry struct {
	ID        string        `xml:"id"`
	Title     string        `xml:"title"`
	Summary   string        `xml:"summary"`
	Published string        `xml:"published"`
	Authors   []arxivAuthor `xml:"author"`
	Links     []arxivLink   `xml:"link"`
}

type arxivAuthor struct {
	Name string `xml:"name"`
}

type arxivLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}

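// exampleDecodeFeed is an illustrative sketch, not part of the original
// file: it decodes a minimal Atom document into arxivFeed, showing how the
// struct tags above map onto the feed the arXiv API returns.
func exampleDecodeFeed() (arxivFeed, error) {
	const atom = `<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2401.12345v1</id>
    <title>A Sample Paper</title>
    <summary>An abstract.</summary>
    <published>2024-01-15T00:00:00Z</published>
    <author><name>Alice</name></author>
    <link href="http://arxiv.org/abs/2401.12345v1" rel="alternate" type="text/html"/>
  </entry>
</feed>`
	var feed arxivFeed
	err := xml.Unmarshal([]byte(atom), &feed)
	return feed, err
}
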
// collectArXiv fetches papers from the arXiv API.
func (p *PapersCollector) collectArXiv(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:arxiv"}
	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "arxiv"); err != nil {
			return result, err
		}
	}
	query := url.QueryEscape(p.Query)
	if p.Category != "" {
		query = fmt.Sprintf("cat:%s+AND+%s", url.QueryEscape(p.Category), query)
	}
	searchURL := fmt.Sprintf("https://export.arxiv.org/api/query?search_query=%s&max_results=50", query)
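	// Illustrative example (not from the original source): Category "cs.CR"
	// with Query "zero knowledge" yields
	// https://export.arxiv.org/api/query?search_query=cat:cs.CR+AND+zero+knowledge&max_results=50.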
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")
	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()
	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectArXiv",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}
	var feed arxivFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to parse XML", err)
	}
	baseDir := filepath.Join(cfg.OutputDir, "papers", "arxiv")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create output directory", err)
	}
	for _, entry := range feed.Entries {
		ppr := arxivEntryToPaper(entry)
		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)
		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}
		result.Items++
		result.Files = append(result.Files, filePath)
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}
	return result, nil
}

// arxivEntryToPaper converts an arXiv Atom entry to a paper.
func arxivEntryToPaper(entry arxivEntry) paper {
	authors := make([]string, len(entry.Authors))
	for i, a := range entry.Authors {
		authors[i] = a.Name
	}
	// Extract the arXiv ID from the URL.
	id := entry.ID
	if idx := strings.LastIndex(id, "/abs/"); idx != -1 {
		id = id[idx+5:]
	}
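	// Illustrative values (not from the original source):
	// "http://arxiv.org/abs/2401.12345v2" yields "2401.12345v2", while an
	// old-style ID such as "http://arxiv.org/abs/cs/0112017" yields
	// "cs/0112017", which the replacements below turn into "cs-0112017".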
	// Replace characters that are not valid in file names.
	id = strings.ReplaceAll(id, "/", "-")
	id = strings.ReplaceAll(id, ":", "-")
	paperURL := entry.ID
	for _, link := range entry.Links {
		if link.Rel == "alternate" {
			paperURL = link.Href
			break
		}
	}
	return paper{
		ID:       id,
		Title:    strings.TrimSpace(entry.Title),
		Authors:  authors,
		Abstract: strings.TrimSpace(entry.Summary),
		Date:     entry.Published,
		URL:      paperURL,
		Source:   "arxiv",
	}
}

// extractIACRPapers extracts paper metadata from an IACR search results page.
func extractIACRPapers(doc *html.Node) []paper {
	var papers []paper
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "div" {
			for _, attr := range n.Attr {
				if attr.Key == "class" && strings.Contains(attr.Val, "paperentry") {
					ppr := parseIACREntry(n)
					if ppr.Title != "" {
						papers = append(papers, ppr)
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return papers
}

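// exampleExtractIACR is an illustrative sketch, not part of the original
// file: it runs extractIACRPapers over a minimal fragment shaped like the
// markup this parser expects. The class names and URL layout mirror the
// assumptions baked into parseIACREntry below.
func exampleExtractIACR() ([]paper, error) {
	const page = `<div class="paperentry">
  <a href="/eprint/2024/123">A Sample Paper</a>
  <span class="author">Alice</span>
  <span class="date">2024-01-15</span>
  <p class="abstract">An abstract.</p>
</div>`
	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		return nil, err
	}
	return extractIACRPapers(doc), nil
}
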
// parseIACREntry extracts paper data from an IACR paper entry div.
func parseIACREntry(node *html.Node) paper {
	ppr := paper{Source: "iacr"}
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "a":
				for _, attr := range n.Attr {
					if attr.Key == "href" && strings.Contains(attr.Val, "/eprint/") {
						ppr.URL = "https://eprint.iacr.org" + attr.Val
						// Extract the ID from the URL.
						parts := strings.Split(attr.Val, "/")
						if len(parts) >= 2 {
							ppr.ID = parts[len(parts)-2] + "-" + parts[len(parts)-1]
						}
					}
				}
				if ppr.Title == "" {
					ppr.Title = strings.TrimSpace(extractText(n))
				}
			case "span":
				for _, attr := range n.Attr {
					if attr.Key == "class" {
						switch {
						case strings.Contains(attr.Val, "author"):
							author := strings.TrimSpace(extractText(n))
							if author != "" {
								ppr.Authors = append(ppr.Authors, author)
							}
						case strings.Contains(attr.Val, "date"):
							ppr.Date = strings.TrimSpace(extractText(n))
						}
					}
				}
			case "p":
				for _, attr := range n.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "abstract") {
						ppr.Abstract = strings.TrimSpace(extractText(n))
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(node)
	return ppr
}

// formatPaperMarkdown formats a paper as markdown.
func formatPaperMarkdown(ppr paper) string {
	var b strings.Builder
	fmt.Fprintf(&b, "# %s\n\n", ppr.Title)
	if len(ppr.Authors) > 0 {
		fmt.Fprintf(&b, "- **Authors:** %s\n", strings.Join(ppr.Authors, ", "))
	}
	if ppr.Date != "" {
		fmt.Fprintf(&b, "- **Published:** %s\n", ppr.Date)
	}
	if ppr.URL != "" {
		fmt.Fprintf(&b, "- **URL:** %s\n", ppr.URL)
	}
	if ppr.Source != "" {
		fmt.Fprintf(&b, "- **Source:** %s\n", ppr.Source)
	}
	if ppr.Abstract != "" {
		fmt.Fprintf(&b, "\n## Abstract\n\n%s\n", ppr.Abstract)
	}
	return b.String()
}

// FormatPaperMarkdown is exported for testing.
func FormatPaperMarkdown(title string, authors []string, date, paperURL, source, abstract string) string {
	return formatPaperMarkdown(paper{
		Title:    title,
		Authors:  authors,
		Date:     date,
		URL:      paperURL,
		Source:   source,
		Abstract: abstract,
	})
}

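// exampleFormatPaper is an illustrative sketch, not part of the original
// file: it renders a paper through the exported helper. The output begins
// with "# A Sample Paper", followed by the metadata bullet list and an
// "## Abstract" section.
func exampleFormatPaper() string {
	return FormatPaperMarkdown(
		"A Sample Paper",
		[]string{"Alice", "Bob"},
		"2024-01-15",
		"https://example.org/paper",
		"iacr",
		"An abstract.",
	)
}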