go-scm/collect/papers_http_test.go
Claude b4e3d0555a
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).

Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00

313 lines
9.7 KiB
Go

package collect
import (
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"forge.lthn.ai/core/go/pkg/io"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/net/html"
)
// Canned upstream payloads served by the httptest servers in this file.
const (
	// sampleIACRHTML is an HTML page with two "paperentry" divs: the first
	// lists two authors, the second one. Each entry carries a link, a date
	// span and an abstract paragraph.
	sampleIACRHTML = `<html><body>
<div class="paperentry">
<a href="/eprint/2025/001">Zero-Knowledge Proofs</a>
<span class="author">Alice</span>
<span class="author">Bob</span>
<span class="date">2025-01-15</span>
<p class="abstract">We present a novel construction for zero-knowledge proofs.</p>
</div>
<div class="paperentry">
<a href="/eprint/2025/002">Lattice Cryptography</a>
<span class="author">Charlie</span>
<span class="date">2025-01-20</span>
<p class="abstract">A survey of lattice-based cryptography.</p>
</div>
</body></html>`

	// sampleArXivXML is an Atom feed with two entries: the first has two
	// authors, the second one. Each entry carries an id, title, summary,
	// published timestamp and an alternate link.
	sampleArXivXML = `<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<id>http://arxiv.org/abs/2501.12345v1</id>
<title>Ring Signatures Revisited</title>
<summary>We propose an efficient ring signature scheme.</summary>
<published>2025-01-10T00:00:00Z</published>
<author><name>Alice</name></author>
<author><name>David</name></author>
<link href="http://arxiv.org/abs/2501.12345v1" rel="alternate"/>
</entry>
<entry>
<id>http://arxiv.org/abs/2501.67890v1</id>
<title>Post-Quantum Signatures</title>
<summary>A new approach to post-quantum digital signatures.</summary>
<published>2025-01-12T00:00:00Z</published>
<author><name>Eve</name></author>
<link href="http://arxiv.org/abs/2501.67890v1" rel="alternate"/>
</entry>
</feed>`
)
// TestPapersCollector_CollectIACR_Good serves sampleIACRHTML from a local
// test server and expects an IACR collection to report two items and two
// files, with the first paper's title present in the written output.
func TestPapersCollector_CollectIACR_Good(t *testing.T) {
	handler := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(sampleIACRHTML))
	})
	server := httptest.NewServer(handler)
	defer server.Close()

	// Install a client whose transport targets the test server; the
	// package-level client is put back when the test returns.
	savedClient := httpClient
	defer func() { httpClient = savedClient }()
	httpClient = &http.Client{
		Transport: &rewriteTransport{base: server.Client().Transport, target: server.URL},
	}

	medium := io.NewMockMedium()
	cfg := NewConfigWithMedium(medium, "/output")
	cfg.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceIACR, Query: "zero knowledge"}
	result, err := collector.Collect(context.Background(), cfg)
	require.NoError(t, err)
	assert.Equal(t, 2, result.Items)
	assert.Len(t, result.Files, 2)

	// The first paper must have been written to the mock medium.
	written, err := medium.Read("/output/papers/iacr/2025-001.md")
	require.NoError(t, err)
	assert.Contains(t, written, "Zero-Knowledge Proofs")
}
// TestPapersCollector_CollectArXiv_Good serves sampleArXivXML from a local
// test server and expects an arXiv collection to report two items and two
// files, with the first paper's title and an author present in the output.
func TestPapersCollector_CollectArXiv_Good(t *testing.T) {
	handler := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/xml")
		_, _ = w.Write([]byte(sampleArXivXML))
	})
	server := httptest.NewServer(handler)
	defer server.Close()

	// Install a client whose transport targets the test server; the
	// package-level client is put back when the test returns.
	savedClient := httpClient
	defer func() { httpClient = savedClient }()
	httpClient = &http.Client{
		Transport: &rewriteTransport{base: server.Client().Transport, target: server.URL},
	}

	medium := io.NewMockMedium()
	cfg := NewConfigWithMedium(medium, "/output")
	cfg.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceArXiv, Query: "ring signatures"}
	result, err := collector.Collect(context.Background(), cfg)
	require.NoError(t, err)
	assert.Equal(t, 2, result.Items)
	assert.Len(t, result.Files, 2)

	// Spot-check the file written for the first feed entry.
	written, err := medium.Read("/output/papers/arxiv/2501.12345v1.md")
	require.NoError(t, err)
	assert.Contains(t, written, "Ring Signatures Revisited")
	assert.Contains(t, written, "Alice")
}
// TestPapersCollector_CollectArXiv_Good_WithCategory records the raw query
// string the test server receives and expects it to mention "cat" when the
// collector is configured with a Category.
func TestPapersCollector_CollectArXiv_Good_WithCategory(t *testing.T) {
	var seenQuery string
	handler := http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
		seenQuery = req.URL.RawQuery
		w.Header().Set("Content-Type", "application/xml")
		_, _ = w.Write([]byte(sampleArXivXML))
	})
	server := httptest.NewServer(handler)
	defer server.Close()

	// Install a client whose transport targets the test server; the
	// package-level client is put back when the test returns.
	savedClient := httpClient
	defer func() { httpClient = savedClient }()
	httpClient = &http.Client{
		Transport: &rewriteTransport{base: server.Client().Transport, target: server.URL},
	}

	medium := io.NewMockMedium()
	cfg := NewConfigWithMedium(medium, "/output")
	cfg.Limiter = nil

	collector := &PapersCollector{
		Source:   PaperSourceArXiv,
		Query:    "crypto",
		Category: "cs.CR",
	}
	_, err := collector.Collect(context.Background(), cfg)
	require.NoError(t, err)
	assert.Contains(t, seenQuery, "cat")
}
func TestPapersCollector_CollectAll_Good(t *testing.T) {
callCount := 0
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
callCount++
if callCount == 1 {
// First call is IACR
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(sampleIACRHTML))
} else {
// Second call is arXiv
w.Header().Set("Content-Type", "application/xml")
_, _ = w.Write([]byte(sampleArXivXML))
}
}))
defer srv.Close()
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
old := httpClient
httpClient = &http.Client{Transport: transport}
defer func() { httpClient = old }()
m := io.NewMockMedium()
cfg := NewConfigWithMedium(m, "/output")
cfg.Limiter = nil
p := &PapersCollector{Source: PaperSourceAll, Query: "cryptography"}
result, err := p.Collect(context.Background(), cfg)
require.NoError(t, err)
assert.Equal(t, 4, result.Items) // 2 IACR + 2 arXiv
}
func TestPapersCollector_CollectIACR_Bad_ServerError(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
}))
defer srv.Close()
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
old := httpClient
httpClient = &http.Client{Transport: transport}
defer func() { httpClient = old }()
m := io.NewMockMedium()
cfg := NewConfigWithMedium(m, "/output")
cfg.Limiter = nil
p := &PapersCollector{Source: PaperSourceIACR, Query: "test"}
_, err := p.Collect(context.Background(), cfg)
assert.Error(t, err)
}
func TestPapersCollector_CollectArXiv_Bad_ServerError(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusServiceUnavailable)
}))
defer srv.Close()
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
old := httpClient
httpClient = &http.Client{Transport: transport}
defer func() { httpClient = old }()
m := io.NewMockMedium()
cfg := NewConfigWithMedium(m, "/output")
cfg.Limiter = nil
p := &PapersCollector{Source: PaperSourceArXiv, Query: "test"}
_, err := p.Collect(context.Background(), cfg)
assert.Error(t, err)
}
func TestPapersCollector_CollectArXiv_Bad_InvalidXML(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/xml")
_, _ = w.Write([]byte(`not xml at all`))
}))
defer srv.Close()
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
old := httpClient
httpClient = &http.Client{Transport: transport}
defer func() { httpClient = old }()
m := io.NewMockMedium()
cfg := NewConfigWithMedium(m, "/output")
cfg.Limiter = nil
p := &PapersCollector{Source: PaperSourceArXiv, Query: "test"}
_, err := p.Collect(context.Background(), cfg)
assert.Error(t, err)
}
func TestPapersCollector_CollectAll_Bad_BothFail(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
}))
defer srv.Close()
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
old := httpClient
httpClient = &http.Client{Transport: transport}
defer func() { httpClient = old }()
m := io.NewMockMedium()
cfg := NewConfigWithMedium(m, "/output")
cfg.Limiter = nil
p := &PapersCollector{Source: PaperSourceAll, Query: "test"}
_, err := p.Collect(context.Background(), cfg)
assert.Error(t, err)
}
// TestPapersCollector_CollectAll_Good_OneFails answers the first request with
// 500 and later requests with the arXiv feed, then expects PaperSourceAll to
// succeed with two items and one recorded error.
func TestPapersCollector_CollectAll_Good_OneFails(t *testing.T) {
	requests := 0
	handler := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		requests++
		if requests == 1 {
			// The first request is the IACR call: make it fail.
			w.WriteHeader(http.StatusInternalServerError)
			return
		}
		// Anything after that is served the arXiv feed.
		w.Header().Set("Content-Type", "application/xml")
		_, _ = w.Write([]byte(sampleArXivXML))
	})
	server := httptest.NewServer(handler)
	defer server.Close()

	// Install a client whose transport targets the test server; the
	// package-level client is put back when the test returns.
	savedClient := httpClient
	defer func() { httpClient = savedClient }()
	httpClient = &http.Client{
		Transport: &rewriteTransport{base: server.Client().Transport, target: server.URL},
	}

	cfg := NewConfigWithMedium(io.NewMockMedium(), "/output")
	cfg.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceAll, Query: "test"}
	result, err := collector.Collect(context.Background(), cfg)
	require.NoError(t, err)
	assert.Equal(t, 2, result.Items)
	assert.Equal(t, 1, result.Errors) // IACR failure counted
}
// TestExtractIACRPapers_Good parses sampleIACRHTML and checks that
// extractIACRPapers returns both entries, with the expected title, authors,
// date, abstract and source on the first and the expected title on the second.
func TestExtractIACRPapers_Good(t *testing.T) {
	doc, err := html.Parse(strings.NewReader(sampleIACRHTML))
	require.NoError(t, err)

	papers := extractIACRPapers(doc)
	// require rather than assert: the indexing below would panic on a short
	// slice and bury the real failure under an index-out-of-range.
	require.Len(t, papers, 2)

	assert.Equal(t, "Zero-Knowledge Proofs", papers[0].Title)
	assert.Contains(t, papers[0].Authors, "Alice")
	assert.Contains(t, papers[0].Authors, "Bob")
	assert.Equal(t, "2025-01-15", papers[0].Date)
	assert.Contains(t, papers[0].Abstract, "zero-knowledge proofs")
	assert.Equal(t, "iacr", papers[0].Source)

	assert.Equal(t, "Lattice Cryptography", papers[1].Title)
}
// TestExtractIACRPapers_Good_Empty expects no papers from a page whose body
// is empty.
func TestExtractIACRPapers_Good_Empty(t *testing.T) {
	const emptyPage = `<html><body></body></html>`

	root, parseErr := html.Parse(strings.NewReader(emptyPage))
	require.NoError(t, parseErr)

	assert.Empty(t, extractIACRPapers(root))
}
// TestExtractIACRPapers_Good_NoTitle expects a "paperentry" div with no
// content, and therefore no title, to yield no papers.
func TestExtractIACRPapers_Good_NoTitle(t *testing.T) {
	const untitledEntry = `<html><body><div class="paperentry"></div></body></html>`

	root, parseErr := html.Parse(strings.NewReader(untitledEntry))
	require.NoError(t, parseErr)

	// The title-less entry must be dropped rather than returned.
	assert.Empty(t, extractIACRPapers(root))
}