* ci: consolidate duplicate workflows and merge CodeQL configs

Remove 17 duplicate workflow files that were split copies of the combined originals. Each family (CI, CodeQL, Coverage, PR Build, Alpha Release) had the same job duplicated across separate push/pull_request/schedule/manual trigger files.

Merge codeql.yml and codescan.yml into a single codeql.yml with a language matrix covering go, javascript-typescript, python, and actions, matching the previous default setup coverage.

Remaining workflows (one per family):

- ci.yml (push + PR + manual)
- codeql.yml (push + PR + schedule, all languages)
- coverage.yml (push + PR + manual)
- alpha-release.yml (push + manual)
- pr-build.yml (PR + manual)
- release.yml (tag push)
- agent-verify.yml, auto-label.yml, auto-project.yml

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat: add collect, config, crypt, plugin packages and fix all lint issues

Add four new infrastructure packages with CLI commands:

- pkg/config: layered configuration (defaults → file → env → flags; see the sketch after this message)
- pkg/crypt: crypto primitives (Argon2id, AES-GCM, ChaCha20, HMAC, checksums)
- pkg/plugin: plugin system with GitHub-based install/update/remove
- pkg/collect: collection subsystem (GitHub, BitcoinTalk, market, papers, excavate)

Fix all golangci-lint issues across the entire codebase (~100 errcheck, staticcheck SA1012/SA1019/ST1005, unused, ineffassign fixes) so that `core go qa` passes with 0 issues.

Closes #167, #168, #170, #250, #251, #252, #253, #254, #255, #256

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
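A minimal sketch of the layered lookup described for pkg/config (hypothetical names and signature; the real package API may differ):

	// resolve returns the highest-precedence value for key:
	// flags override env, env overrides file, file overrides defaults.
	func resolve(key string, defaults, file, env, flags map[string]string) string {
		for _, layer := range []map[string]string{flags, env, file, defaults} {
			if v, ok := layer[key]; ok {
				return v
			}
		}
		return ""
	}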
402 lines · 10 KiB · Go
package collect

import (
	"context"
	"encoding/xml"
	"fmt"
	"net/http"
	"net/url"
	"path/filepath"
	"strings"

	core "github.com/host-uk/core/pkg/framework/core"
	"golang.org/x/net/html"
)

// Paper source identifiers.
const (
	PaperSourceIACR  = "iacr"
	PaperSourceArXiv = "arxiv"
	PaperSourceAll   = "all"
)

// PapersCollector collects papers from IACR and arXiv.
type PapersCollector struct {
	// Source is one of PaperSourceIACR, PaperSourceArXiv, or PaperSourceAll.
	Source string

	// Category is the arXiv category (e.g. "cs.CR", Cryptography and Security).
	Category string

	// Query is the search query string.
	Query string
}

// Name returns the collector name.
func (p *PapersCollector) Name() string {
	return fmt.Sprintf("papers:%s", p.Source)
}
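
// Example use (a sketch; assumes a context.Context ctx and a *Config cfg
// prepared by the caller, with cfg.Output and cfg.OutputDir set up as
// elsewhere in this package):
//
//	pc := &PapersCollector{
//		Source:   PaperSourceArXiv,
//		Category: "cs.CR",
//		Query:    "zero-knowledge proofs",
//	}
//	res, err := pc.Collect(ctx, cfg)
//	if err != nil {
//		return err
//	}
//	fmt.Printf("collected %d paper(s) into %d file(s)\n", res.Items, len(res.Files))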

// paper represents a parsed academic paper.
type paper struct {
	ID       string
	Title    string
	Authors  []string
	Abstract string
	Date     string
	URL      string
	Source   string
}

// Collect gathers papers from the configured sources.
func (p *PapersCollector) Collect(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: p.Name()}

	if p.Query == "" {
		return result, core.E("collect.Papers.Collect", "query is required", nil)
	}

	if cfg.Dispatcher != nil {
		cfg.Dispatcher.EmitStart(p.Name(), fmt.Sprintf("Starting paper collection for %q", p.Query))
	}

	if cfg.DryRun {
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitProgress(p.Name(), fmt.Sprintf("[dry-run] Would search papers for %q", p.Query), nil)
		}
		return result, nil
	}

	switch p.Source {
	case PaperSourceIACR:
		return p.collectIACR(ctx, cfg)
	case PaperSourceArXiv:
		return p.collectArXiv(ctx, cfg)
	case PaperSourceAll:
		iacrResult, iacrErr := p.collectIACR(ctx, cfg)
		arxivResult, arxivErr := p.collectArXiv(ctx, cfg)

		if iacrErr != nil && arxivErr != nil {
			return result, core.E("collect.Papers.Collect",
				fmt.Sprintf("all sources failed (arxiv: %v)", arxivErr), iacrErr)
		}

		merged := MergeResults(p.Name(), iacrResult, arxivResult)
		if iacrErr != nil {
			merged.Errors++
		}
		if arxivErr != nil {
			merged.Errors++
		}

		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitComplete(p.Name(), fmt.Sprintf("Collected %d papers", merged.Items), merged)
		}

		return merged, nil
	default:
		return result, core.E("collect.Papers.Collect",
			fmt.Sprintf("unknown source: %s (use iacr, arxiv, or all)", p.Source), nil)
	}
}
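
// Note: with Source == PaperSourceAll, a single failing source does not fail
// the collection; the failure is counted in merged.Errors and the papers from
// the surviving source are still written and returned.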

// collectIACR fetches papers from the IACR ePrint archive.
func (p *PapersCollector) collectIACR(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:iacr"}

	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "iacr"); err != nil {
			return result, err
		}
	}

	searchURL := fmt.Sprintf("https://eprint.iacr.org/search?q=%s", url.QueryEscape(p.Query))

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")

	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectIACR",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to parse HTML", err)
	}

	papers := extractIACRPapers(doc)

	baseDir := filepath.Join(cfg.OutputDir, "papers", "iacr")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectIACR", "failed to create output directory", err)
	}

	for _, ppr := range papers {
		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)

		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}

		result.Items++
		result.Files = append(result.Files, filePath)

		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}

	return result, nil
}

// arxivFeed represents the Atom feed returned by the arXiv API.
type arxivFeed struct {
	XMLName xml.Name     `xml:"feed"`
	Entries []arxivEntry `xml:"entry"`
}

type arxivEntry struct {
	ID        string        `xml:"id"`
	Title     string        `xml:"title"`
	Summary   string        `xml:"summary"`
	Published string        `xml:"published"`
	Authors   []arxivAuthor `xml:"author"`
	Links     []arxivLink   `xml:"link"`
}

type arxivAuthor struct {
	Name string `xml:"name"`
}

type arxivLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
	Type string `xml:"type,attr"`
}
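
// The structs above decode the Atom feed returned by the arXiv API. A
// representative entry looks roughly like this (abbreviated, not verbatim
// API output):
//
//	<feed xmlns="http://www.w3.org/2005/Atom">
//	  <entry>
//	    <id>http://arxiv.org/abs/2401.01234v1</id>
//	    <title>Example Title</title>
//	    <summary>Example abstract text.</summary>
//	    <published>2024-01-02T00:00:00Z</published>
//	    <author><name>A. Author</name></author>
//	    <link href="http://arxiv.org/abs/2401.01234v1" rel="alternate" type="text/html"/>
//	  </entry>
//	</feed>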

// collectArXiv fetches papers from the arXiv API.
func (p *PapersCollector) collectArXiv(ctx context.Context, cfg *Config) (*Result, error) {
	result := &Result{Source: "papers:arxiv"}

	if cfg.Limiter != nil {
		if err := cfg.Limiter.Wait(ctx, "arxiv"); err != nil {
			return result, err
		}
	}

	query := url.QueryEscape(p.Query)
	if p.Category != "" {
		query = fmt.Sprintf("cat:%s+AND+%s", url.QueryEscape(p.Category), query)
	}

	searchURL := fmt.Sprintf("https://export.arxiv.org/api/query?search_query=%s&max_results=50", query)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create request", err)
	}
	req.Header.Set("User-Agent", "CoreCollector/1.0")

	resp, err := httpClient.Do(req)
	if err != nil {
		return result, core.E("collect.Papers.collectArXiv", "request failed", err)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return result, core.E("collect.Papers.collectArXiv",
			fmt.Sprintf("unexpected status code: %d", resp.StatusCode), nil)
	}

	var feed arxivFeed
	if err := xml.NewDecoder(resp.Body).Decode(&feed); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to parse XML", err)
	}

	baseDir := filepath.Join(cfg.OutputDir, "papers", "arxiv")
	if err := cfg.Output.EnsureDir(baseDir); err != nil {
		return result, core.E("collect.Papers.collectArXiv", "failed to create output directory", err)
	}

	for _, entry := range feed.Entries {
		ppr := arxivEntryToPaper(entry)

		filePath := filepath.Join(baseDir, ppr.ID+".md")
		content := formatPaperMarkdown(ppr)

		if err := cfg.Output.Write(filePath, content); err != nil {
			result.Errors++
			continue
		}

		result.Items++
		result.Files = append(result.Files, filePath)

		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Paper: %s", ppr.Title), nil)
		}
	}

	return result, nil
}

// arxivEntryToPaper converts an arXiv Atom entry to a paper.
func arxivEntryToPaper(entry arxivEntry) paper {
	authors := make([]string, len(entry.Authors))
	for i, a := range entry.Authors {
		authors[i] = a.Name
	}

	// Extract the arXiv ID from the URL.
	id := entry.ID
	if idx := strings.LastIndex(id, "/abs/"); idx != -1 {
		id = id[idx+5:]
	}
	// Replace characters that are not valid in file names.
	id = strings.ReplaceAll(id, "/", "-")
	id = strings.ReplaceAll(id, ":", "-")

	paperURL := entry.ID
	for _, link := range entry.Links {
		if link.Rel == "alternate" {
			paperURL = link.Href
			break
		}
	}

	return paper{
		ID:       id,
		Title:    strings.TrimSpace(entry.Title),
		Authors:  authors,
		Abstract: strings.TrimSpace(entry.Summary),
		Date:     entry.Published,
		URL:      paperURL,
		Source:   "arxiv",
	}
}
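
// For example, an entry.ID of "http://arxiv.org/abs/cs/9901002v1" yields the
// file-safe ID "cs-9901002v1", while "http://arxiv.org/abs/2401.01234v1"
// yields "2401.01234v1" unchanged.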

// extractIACRPapers extracts paper metadata from an IACR search results page.
func extractIACRPapers(doc *html.Node) []paper {
	var papers []paper
	var walk func(*html.Node)

	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "div" {
			for _, attr := range n.Attr {
				if attr.Key == "class" && strings.Contains(attr.Val, "paperentry") {
					ppr := parseIACREntry(n)
					if ppr.Title != "" {
						papers = append(papers, ppr)
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}

	walk(doc)
	return papers
}
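
// extractIACRPapers (above) and parseIACREntry (below) assume search-result
// markup roughly like the following (illustrative, not the exact IACR page
// source):
//
//	<div class="paperentry">
//	  <a href="/eprint/2024/123">Paper Title</a>
//	  <span class="author">Alice Example</span>
//	  <span class="date">2024-01-02</span>
//	  <p class="abstract">Abstract text.</p>
//	</div>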

// parseIACREntry extracts paper data from an IACR paper entry div.
func parseIACREntry(node *html.Node) paper {
	ppr := paper{Source: "iacr"}
	var walk func(*html.Node)

	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "a":
				for _, attr := range n.Attr {
					if attr.Key == "href" && strings.Contains(attr.Val, "/eprint/") {
						ppr.URL = "https://eprint.iacr.org" + attr.Val
						// Extract the ID from the URL, e.g. /eprint/2024/123 -> 2024-123.
						parts := strings.Split(attr.Val, "/")
						if len(parts) >= 2 {
							ppr.ID = parts[len(parts)-2] + "-" + parts[len(parts)-1]
						}
					}
				}
				if ppr.Title == "" {
					ppr.Title = strings.TrimSpace(extractText(n))
				}
			case "span":
				for _, attr := range n.Attr {
					if attr.Key == "class" {
						switch {
						case strings.Contains(attr.Val, "author"):
							author := strings.TrimSpace(extractText(n))
							if author != "" {
								ppr.Authors = append(ppr.Authors, author)
							}
						case strings.Contains(attr.Val, "date"):
							ppr.Date = strings.TrimSpace(extractText(n))
						}
					}
				}
			case "p":
				for _, attr := range n.Attr {
					if attr.Key == "class" && strings.Contains(attr.Val, "abstract") {
						ppr.Abstract = strings.TrimSpace(extractText(n))
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}

	walk(node)
	return ppr
}

// formatPaperMarkdown formats a paper as markdown.
func formatPaperMarkdown(ppr paper) string {
	var b strings.Builder
	fmt.Fprintf(&b, "# %s\n\n", ppr.Title)

	if len(ppr.Authors) > 0 {
		fmt.Fprintf(&b, "- **Authors:** %s\n", strings.Join(ppr.Authors, ", "))
	}
	if ppr.Date != "" {
		fmt.Fprintf(&b, "- **Published:** %s\n", ppr.Date)
	}
	if ppr.URL != "" {
		fmt.Fprintf(&b, "- **URL:** %s\n", ppr.URL)
	}
	if ppr.Source != "" {
		fmt.Fprintf(&b, "- **Source:** %s\n", ppr.Source)
	}

	if ppr.Abstract != "" {
		fmt.Fprintf(&b, "\n## Abstract\n\n%s\n", ppr.Abstract)
	}

	return b.String()
}
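
// A fully populated paper renders as:
//
//	# Paper Title
//
//	- **Authors:** Alice, Bob
//	- **Published:** 2024-01-02
//	- **URL:** https://eprint.iacr.org/2024/123
//	- **Source:** iacr
//
//	## Abstract
//
//	Abstract text.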

// FormatPaperMarkdown is exported for testing.
func FormatPaperMarkdown(title string, authors []string, date, paperURL, source, abstract string) string {
	return formatPaperMarkdown(paper{
		Title:    title,
		Authors:  authors,
		Date:     date,
		URL:      paperURL,
		Source:   source,
		Abstract: abstract,
	})
}
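
// Typical use from an external test package (a sketch; values illustrative):
//
//	got := FormatPaperMarkdown(
//		"Paper Title",
//		[]string{"Alice", "Bob"},
//		"2024-01-02",
//		"https://eprint.iacr.org/2024/123",
//		"iacr",
//		"Abstract text.",
//	)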