test(collect): push coverage from 57.3% to 83.0%

Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).

Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Claude 2026-02-20 01:55:18 +00:00
parent 4de0356880
commit b4e3d0555a
No known key found for this signature in database
GPG key ID: AF404715446AEB41
6 changed files with 1207 additions and 0 deletions

View file

@ -0,0 +1,256 @@
package collect
import (
	"context"
	"fmt"
	"net/http"
	"net/http/httptest"
	"net/url"
	"testing"

	"forge.lthn.ai/core/go/pkg/io"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
// sampleBTCTalkPage returns HTML resembling a BitcoinTalk topic page with the
// given number of posts. Post i (zero-based) is attributed to "user<i>", dated
// "January <i+1>, 2009" (day zero-padded to two digits) and carries the body
// "Post content number <i>.". A count of zero or less yields an empty <body>.
//
// If count is fewer than the collector's posts-per-page the caller can infer
// that it is the last page.
func sampleBTCTalkPage(count int) string {
	// Append into a single buffer rather than rebuilding the whole page string
	// on every iteration.
	page := []byte(`<html><body>`)
	for i := 0; i < count; i++ {
		page = fmt.Appendf(page, `
<div class="post">
<div class="poster_info">user%d</div>
<div class="headerandpost">
<div class="smalltext">January %02d, 2009</div>
</div>
<div class="inner">Post content number %d.</div>
</div>`, i, i+1, i)
	}
	page = append(page, `</body></html>`...)
	return string(page)
}
// TestBitcoinTalkCollector_Collect_Good_OnePage runs Collect against a server
// that returns one short page and checks the reported counts, the source label
// and the markdown file written for each post.
func TestBitcoinTalkCollector_Collect_Good_OnePage(t *testing.T) {
	// Serve a single page with 5 posts (< 20, so collection stops after one page).
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(sampleBTCTalkPage(5)))
	}))
	defer srv.Close()

	// Swap the package-level HTTP client exactly once, for a client whose
	// transport redirects every request to the test server whatever URL the
	// collector asks for.
	oldClient := httpClient
	httpClient = &http.Client{
		Transport: &rewriteTransport{base: srv.Client().Transport, target: srv.URL},
	}
	defer func() { httpClient = oldClient }()

	m := io.NewMockMedium()
	cfg := NewConfigWithMedium(m, "/output")
	cfg.Limiter = nil // disable rate limiting for tests

	b := &BitcoinTalkCollector{TopicID: "12345"}
	result, err := b.Collect(context.Background(), cfg)
	require.NoError(t, err)
	assert.Equal(t, 5, result.Items)
	assert.Equal(t, 0, result.Errors)
	assert.Len(t, result.Files, 5)
	assert.Equal(t, "bitcointalk:12345", result.Source)

	// Verify files were written.
	for i := 1; i <= 5; i++ {
		path := fmt.Sprintf("/output/bitcointalk/12345/posts/%d.md", i)
		content, readErr := m.Read(path)
		require.NoError(t, readErr, "file %s should exist", path)
		assert.Contains(t, content, fmt.Sprintf("Post %d by", i))
	}
}
// TestBitcoinTalkCollector_Collect_Good_PageLimit checks that Collect stops
// after the configured number of pages even though every page served is full.
func TestBitcoinTalkCollector_Collect_Good_PageLimit(t *testing.T) {
	var served int
	handler := func(w http.ResponseWriter, _ *http.Request) {
		served++
		w.Header().Set("Content-Type", "text/html")
		// Always answer with a full page (20 posts); only the Pages limit
		// can end the collection.
		_, _ = w.Write([]byte(sampleBTCTalkPage(20)))
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &BitcoinTalkCollector{TopicID: "99999", Pages: 2}
	res, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 40, res.Items) // 2 pages * 20 posts
	assert.Equal(t, 2, served)
}
// TestBitcoinTalkCollector_Collect_Good_CancelledContext checks that Collect
// returns an error when it is handed a context that is already cancelled.
func TestBitcoinTalkCollector_Collect_Good_CancelledContext(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(sampleBTCTalkPage(5)))
	}))
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	prev := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = prev }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	// Cancel before Collect ever sees the context.
	cancelledCtx, stop := context.WithCancel(context.Background())
	stop()

	collector := &BitcoinTalkCollector{TopicID: "12345"}
	_, err := collector.Collect(cancelledCtx, conf)
	assert.Error(t, err)
}
// TestBitcoinTalkCollector_Collect_Bad_ServerError checks that an HTTP 500
// from the server is reported through result.Errors rather than as a returned
// error.
func TestBitcoinTalkCollector_Collect_Bad_ServerError(t *testing.T) {
	failing := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
	})
	server := httptest.NewServer(failing)
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &BitcoinTalkCollector{TopicID: "12345"}
	res, err := collector.Collect(context.Background(), conf)

	// fetchPage error causes break with Errors incremented.
	require.NoError(t, err)
	assert.Equal(t, 0, res.Items)
	assert.Equal(t, 1, res.Errors)
}
// TestBitcoinTalkCollector_Collect_Good_EmitsEvents counts the start, item and
// complete events dispatched while collecting a two-post page.
func TestBitcoinTalkCollector_Collect_Good_EmitsEvents(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(sampleBTCTalkPage(2)))
	}))
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	prev := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = prev }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	// One counter per event type the test cares about.
	var (
		startCount    int
		itemCount     int
		completeCount int
	)
	conf.Dispatcher.On(EventStart, func(Event) { startCount++ })
	conf.Dispatcher.On(EventItem, func(Event) { itemCount++ })
	conf.Dispatcher.On(EventComplete, func(Event) { completeCount++ })

	collector := &BitcoinTalkCollector{TopicID: "12345"}
	res, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 2, res.Items)
	assert.Equal(t, 1, startCount)
	assert.Equal(t, 2, itemCount)
	assert.Equal(t, 1, completeCount)
}
// TestSetHTTPClient_Good checks that SetHTTPClient installs exactly the client
// it is given as the package-level httpClient. The previous client is restored
// when the test returns.
func TestSetHTTPClient_Good(t *testing.T) {
	old := httpClient
	defer func() { httpClient = old }()

	custom := &http.Client{}
	SetHTTPClient(custom)

	// Same checks pointer identity. Equal would compare the pointed-to
	// structs and accept any other zero-valued client as well.
	assert.Same(t, custom, httpClient)
}
// TestFetchPage_Good calls fetchPage directly on the test server URL and
// expects one parsed post per post in the served page.
func TestFetchPage_Good(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(sampleBTCTalkPage(3)))
	}))
	defer server.Close()

	saved := httpClient
	httpClient = server.Client()
	defer func() { httpClient = saved }()

	collector := &BitcoinTalkCollector{TopicID: "12345"}
	parsed, err := collector.fetchPage(context.Background(), server.URL)
	require.NoError(t, err)
	assert.Len(t, parsed, 3)
}
// TestFetchPage_Bad_StatusCode expects fetchPage to return an error when the
// server answers 403 Forbidden.
func TestFetchPage_Bad_StatusCode(t *testing.T) {
	forbidden := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusForbidden)
	})
	server := httptest.NewServer(forbidden)
	defer server.Close()

	saved := httpClient
	httpClient = server.Client()
	defer func() { httpClient = saved }()

	collector := &BitcoinTalkCollector{TopicID: "12345"}
	_, err := collector.fetchPage(context.Background(), server.URL)
	assert.Error(t, err)
}
// TestFetchPage_Bad_InvalidHTML serves a page without any posts and expects
// fetchPage to succeed with an empty result.
func TestFetchPage_Bad_InvalidHTML(t *testing.T) {
	// html.Parse is very forgiving, so serve an empty page.
	const emptyPage = `<html><body></body></html>`
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(emptyPage))
	}))
	defer server.Close()

	saved := httpClient
	httpClient = server.Client()
	defer func() { httpClient = saved }()

	collector := &BitcoinTalkCollector{TopicID: "12345"}
	parsed, err := collector.fetchPage(context.Background(), server.URL)
	require.NoError(t, err)
	assert.Empty(t, parsed)
}
// rewriteTransport is a test http.RoundTripper that rewrites all request URLs
// to point at a single target server while keeping each request's path and
// query.
type rewriteTransport struct {
	// base performs the actual round trip; http.DefaultTransport is used
	// when it is nil.
	base http.RoundTripper
	// target is the absolute base URL of the server that receives every
	// request, e.g. the URL field of an httptest.Server. Its scheme and host
	// replace those of the outgoing request.
	target string
}

// RoundTrip sends a copy of req to the target server and returns the base
// transport's response. The caller's request is left unmodified. It returns
// an error, instead of panicking, when target is not an absolute URL.
func (t *rewriteTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	dest, err := url.Parse(t.target)
	if err == nil && (dest.Scheme == "" || dest.Host == "") {
		err = fmt.Errorf("missing scheme or host")
	}
	if err != nil {
		// A RoundTripper is responsible for closing the body even on failure.
		if req.Body != nil {
			_ = req.Body.Close()
		}
		return nil, fmt.Errorf("rewriteTransport: invalid target %q: %w", t.target, err)
	}

	// Work on a clone: a RoundTripper must not modify the caller's request.
	out := req.Clone(req.Context())
	out.URL.Scheme = dest.Scheme
	out.URL.Host = dest.Host

	base := t.base
	if base == nil {
		base = http.DefaultTransport
	}
	return base.RoundTrip(out)
}

View file

@ -0,0 +1,127 @@
package collect
import (
"context"
"testing"
"time"
"forge.lthn.ai/core/go/pkg/io"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestExcavator_Run_Good_ResumeSkipsCompleted checks that, with Resume set, a
// collector whose state entry already records items is not called and is
// counted as skipped, while the other collector still runs.
func TestExcavator_Run_Good_ResumeSkipsCompleted(t *testing.T) {
	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	// Pre-populate state so source-a looks completed.
	done := &StateEntry{
		Source:  "source-a",
		LastRun: time.Now().Add(-1 * time.Hour),
		Items:   10,
	}
	conf.State.Set("source-a", done)

	completed := &mockCollector{name: "source-a", items: 10}
	pending := &mockCollector{name: "source-b", items: 5}

	excavator := &Excavator{
		Collectors: []Collector{completed, pending},
		Resume:     true,
	}
	res, err := excavator.Run(context.Background(), conf)
	require.NoError(t, err)

	assert.False(t, completed.called, "source-a should be skipped (already completed)")
	assert.True(t, pending.called, "source-b should run")
	assert.Equal(t, 5, res.Items)
	assert.Equal(t, 1, res.Skipped)
}
// TestExcavator_Run_Good_ResumeRunsIncomplete checks that, with Resume set, a
// collector whose state entry records zero items is run again.
func TestExcavator_Run_Good_ResumeRunsIncomplete(t *testing.T) {
	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	// Pre-populate state with 0 items (incomplete).
	unfinished := &StateEntry{
		Source:  "source-a",
		LastRun: time.Now(),
		Items:   0,
	}
	conf.State.Set("source-a", unfinished)

	collector := &mockCollector{name: "source-a", items: 5}
	excavator := &Excavator{
		Collectors: []Collector{collector},
		Resume:     true,
	}

	res, err := excavator.Run(context.Background(), conf)
	require.NoError(t, err)
	assert.True(t, collector.called, "source-a should run (0 items in previous run)")
	assert.Equal(t, 5, res.Items)
}
// TestExcavator_Run_Good_NilState checks that Run succeeds and reports the
// collector's items when the config carries no State.
func TestExcavator_Run_Good_NilState(t *testing.T) {
	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.State = nil
	conf.Limiter = nil

	excavator := &Excavator{
		Collectors: []Collector{
			&mockCollector{name: "source-a", items: 3},
		},
	}

	res, err := excavator.Run(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 3, res.Items)
}
// TestExcavator_Run_Good_NilDispatcher checks that Run succeeds and reports the
// collector's items when the config carries no Dispatcher.
func TestExcavator_Run_Good_NilDispatcher(t *testing.T) {
	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Dispatcher = nil
	conf.Limiter = nil

	excavator := &Excavator{
		Collectors: []Collector{
			&mockCollector{name: "source-a", items: 2},
		},
	}

	res, err := excavator.Run(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 2, res.Items)
}
// TestExcavator_Run_Good_ProgressEvents runs two collectors and checks that one
// progress event is emitted per collector, carrying "1/2" and "2/2" in its
// message respectively.
func TestExcavator_Run_Good_ProgressEvents(t *testing.T) {
	m := io.NewMockMedium()
	cfg := NewConfigWithMedium(m, "/output")
	cfg.Limiter = nil

	var progressMsgs []string
	cfg.Dispatcher.On(EventProgress, func(e Event) {
		progressMsgs = append(progressMsgs, e.Message)
	})

	c1 := &mockCollector{name: "source-a", items: 1}
	c2 := &mockCollector{name: "source-b", items: 1}
	e := &Excavator{
		Collectors: []Collector{c1, c2},
	}

	_, err := e.Run(context.Background(), cfg)
	require.NoError(t, err)

	// require (not assert) so a wrong count fails cleanly instead of the
	// indexing below panicking with an out-of-range error.
	require.Len(t, progressMsgs, 2)
	assert.Contains(t, progressMsgs[0], "1/2")
	assert.Contains(t, progressMsgs[1], "2/2")
}

View file

@ -0,0 +1,242 @@
package collect
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"forge.lthn.ai/core/go/pkg/io"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestMarketCollector_Collect_Good_HistoricalWithFromDate collects with
// Historical enabled and a valid FromDate, checks that the second request
// carries a "days=" query parameter, and expects three items in the result.
func TestMarketCollector_Collect_Good_HistoricalWithFromDate(t *testing.T) {
	requests := 0
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		requests++
		w.Header().Set("Content-Type", "application/json")

		// The first request is answered with the current coin data.
		if requests == 1 {
			current := coinData{
				ID:     "lethean",
				Symbol: "lthn",
				Name:   "Lethean",
				MarketData: marketData{
					CurrentPrice: map[string]float64{"usd": 0.001},
				},
			}
			_ = json.NewEncoder(w).Encode(current)
			return
		}

		// Historical data with FromDate param.
		assert.Contains(t, r.URL.RawQuery, "days=")
		history := historicalData{
			Prices:       [][]float64{{1705305600000, 0.001}},
			MarketCaps:   [][]float64{{1705305600000, 10000}},
			TotalVolumes: [][]float64{{1705305600000, 500}},
		}
		_ = json.NewEncoder(w).Encode(history)
	}))
	defer server.Close()

	savedURL := coinGeckoBaseURL
	coinGeckoBaseURL = server.URL
	defer func() { coinGeckoBaseURL = savedURL }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &MarketCollector{CoinID: "lethean", Historical: true, FromDate: "2025-01-01"}
	res, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 3, res.Items)
}
// TestMarketCollector_Collect_Good_HistoricalInvalidDate collects with an
// unparseable FromDate and checks that the historical request asks for
// "days=365" and that collection still yields three items.
func TestMarketCollector_Collect_Good_HistoricalInvalidDate(t *testing.T) {
	requests := 0
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		requests++
		w.Header().Set("Content-Type", "application/json")

		// The first request is answered with the current coin data.
		if requests == 1 {
			current := coinData{
				ID:     "test",
				Symbol: "tst",
				Name:   "Test",
				MarketData: marketData{
					CurrentPrice: map[string]float64{"usd": 1.0},
				},
			}
			_ = json.NewEncoder(w).Encode(current)
			return
		}

		// Should fall back to 365 days with invalid date.
		assert.Contains(t, r.URL.RawQuery, "days=365")
		history := historicalData{
			Prices: [][]float64{{1705305600000, 1.0}},
		}
		_ = json.NewEncoder(w).Encode(history)
	}))
	defer server.Close()

	savedURL := coinGeckoBaseURL
	coinGeckoBaseURL = server.URL
	defer func() { coinGeckoBaseURL = savedURL }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &MarketCollector{CoinID: "test", Historical: true, FromDate: "not-a-date"}
	res, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 3, res.Items)
}
// TestMarketCollector_Collect_Bad_HistoricalServerError lets the first request
// succeed and every later one fail with 429, then checks that Collect still
// returns no error but reports two items and one error.
func TestMarketCollector_Collect_Bad_HistoricalServerError(t *testing.T) {
	requests := 0
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		requests++
		w.Header().Set("Content-Type", "application/json")

		if requests != 1 {
			// Historical endpoint fails.
			w.WriteHeader(http.StatusTooManyRequests)
			return
		}

		current := coinData{
			ID:     "test",
			Symbol: "tst",
			Name:   "Test",
			MarketData: marketData{
				CurrentPrice: map[string]float64{"usd": 1.0},
			},
		}
		_ = json.NewEncoder(w).Encode(current)
	}))
	defer server.Close()

	savedURL := coinGeckoBaseURL
	coinGeckoBaseURL = server.URL
	defer func() { coinGeckoBaseURL = savedURL }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &MarketCollector{CoinID: "test", Historical: true}
	res, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 2, res.Items)  // current.json + summary.md
	assert.Equal(t, 1, res.Errors) // historical failed
}
// TestMarketCollector_Collect_Good_EmitsEvents checks that a successful
// non-historical collection dispatches exactly one start and one complete
// event.
func TestMarketCollector_Collect_Good_EmitsEvents(t *testing.T) {
	bitcoin := coinData{
		ID:     "bitcoin",
		Symbol: "btc",
		Name:   "Bitcoin",
		MarketData: marketData{
			CurrentPrice: map[string]float64{"usd": 50000},
		},
	}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(bitcoin)
	}))
	defer server.Close()

	savedURL := coinGeckoBaseURL
	coinGeckoBaseURL = server.URL
	defer func() { coinGeckoBaseURL = savedURL }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	var startCount, completeCount int
	conf.Dispatcher.On(EventStart, func(Event) { startCount++ })
	conf.Dispatcher.On(EventComplete, func(Event) { completeCount++ })

	collector := &MarketCollector{CoinID: "bitcoin"}
	_, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 1, startCount)
	assert.Equal(t, 1, completeCount)
}
// TestMarketCollector_Collect_Good_CancelledContext calls Collect with an
// already-cancelled context and expects no returned error but one error
// recorded in the result.
func TestMarketCollector_Collect_Good_CancelledContext(t *testing.T) {
	okHandler := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	})
	server := httptest.NewServer(okHandler)
	defer server.Close()

	savedURL := coinGeckoBaseURL
	coinGeckoBaseURL = server.URL
	defer func() { coinGeckoBaseURL = savedURL }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	cancelledCtx, stop := context.WithCancel(context.Background())
	stop()

	collector := &MarketCollector{CoinID: "bitcoin"}
	res, err := collector.Collect(cancelledCtx, conf)

	// Context cancellation causes error in fetchJSON.
	require.NoError(t, err) // outer Collect doesn't return errors from currentData fetch
	assert.Equal(t, 1, res.Errors)
}
func TestFormatMarketSummary_Good_AllFields(t *testing.T) {
data := &coinData{
Name: "Lethean",
Symbol: "lthn",
MarketData: marketData{
CurrentPrice: map[string]float64{"usd": 0.001},
MarketCap: map[string]float64{"usd": 100000},
TotalVolume: map[string]float64{"usd": 5000},
High24h: map[string]float64{"usd": 0.0015},
Low24h: map[string]float64{"usd": 0.0005},
PriceChange24h: 0.0002,
PriceChangePct24h: 5.5,
MarketCapRank: 500,
CirculatingSupply: 1000000000,
TotalSupply: 2000000000,
LastUpdated: "2025-01-15T12:00:00Z",
},
}
summary := FormatMarketSummary(data)
assert.Contains(t, summary, "# Lethean (LTHN)")
assert.Contains(t, summary, "24h Volume")
assert.Contains(t, summary, "24h High")
assert.Contains(t, summary, "24h Low")
assert.Contains(t, summary, "24h Price Change")
assert.Contains(t, summary, "#500")
assert.Contains(t, summary, "Circulating Supply")
assert.Contains(t, summary, "Total Supply")
assert.Contains(t, summary, "Last updated")
}
// TestFormatMarketSummary_Good_Minimal formats a coinData that has only a name
// and symbol and checks that the heading is present while the market-cap-rank
// line is not.
func TestFormatMarketSummary_Good_Minimal(t *testing.T) {
	coin := &coinData{Name: "Unknown", Symbol: "ukn"}

	got := FormatMarketSummary(coin)

	assert.Contains(t, got, "# Unknown (UKN)")
	// No price data, so these should be absent.
	assert.NotContains(t, got, "Market Cap Rank")
}

313
collect/papers_http_test.go Normal file
View file

@ -0,0 +1,313 @@
package collect
import (
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"forge.lthn.ai/core/go/pkg/io"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/net/html"
)
// sampleIACRHTML is an HTML fixture with two "paperentry" blocks. Each block
// holds a title link, one or more author spans, a date span and an abstract
// paragraph. The first entry (/eprint/2025/001) has two authors and the second
// (/eprint/2025/002) has one.
const sampleIACRHTML = `<html><body>
<div class="paperentry">
<a href="/eprint/2025/001">Zero-Knowledge Proofs</a>
<span class="author">Alice</span>
<span class="author">Bob</span>
<span class="date">2025-01-15</span>
<p class="abstract">We present a novel construction for zero-knowledge proofs.</p>
</div>
<div class="paperentry">
<a href="/eprint/2025/002">Lattice Cryptography</a>
<span class="author">Charlie</span>
<span class="date">2025-01-20</span>
<p class="abstract">A survey of lattice-based cryptography.</p>
</div>
</body></html>`
// sampleArXivXML is an Atom feed fixture with two entries. Each entry has an
// id, title, summary, published timestamp, one or more authors and an
// alternate link. The first entry (2501.12345v1) has two authors and the
// second (2501.67890v1) has one.
const sampleArXivXML = `<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<id>http://arxiv.org/abs/2501.12345v1</id>
<title>Ring Signatures Revisited</title>
<summary>We propose an efficient ring signature scheme.</summary>
<published>2025-01-10T00:00:00Z</published>
<author><name>Alice</name></author>
<author><name>David</name></author>
<link href="http://arxiv.org/abs/2501.12345v1" rel="alternate"/>
</entry>
<entry>
<id>http://arxiv.org/abs/2501.67890v1</id>
<title>Post-Quantum Signatures</title>
<summary>A new approach to post-quantum digital signatures.</summary>
<published>2025-01-12T00:00:00Z</published>
<author><name>Eve</name></author>
<link href="http://arxiv.org/abs/2501.67890v1" rel="alternate"/>
</entry>
</feed>`
// TestPapersCollector_CollectIACR_Good collects from the IACR source against a
// server returning the HTML fixture and checks the item/file counts and the
// content of the first written paper.
func TestPapersCollector_CollectIACR_Good(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(sampleIACRHTML))
	}))
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceIACR, Query: "zero knowledge"}
	res, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 2, res.Items)
	assert.Len(t, res.Files, 2)

	// Verify content was written.
	written, err := medium.Read("/output/papers/iacr/2025-001.md")
	require.NoError(t, err)
	assert.Contains(t, written, "Zero-Knowledge Proofs")
}
// TestPapersCollector_CollectArXiv_Good collects from the arXiv source against
// a server returning the Atom fixture and checks the item/file counts and the
// content of one written paper.
func TestPapersCollector_CollectArXiv_Good(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/xml")
		_, _ = w.Write([]byte(sampleArXivXML))
	}))
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceArXiv, Query: "ring signatures"}
	res, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 2, res.Items)
	assert.Len(t, res.Files, 2)

	// Verify one of the papers.
	written, err := medium.Read("/output/papers/arxiv/2501.12345v1.md")
	require.NoError(t, err)
	for _, fragment := range []string{"Ring Signatures Revisited", "Alice"} {
		assert.Contains(t, written, fragment)
	}
}
// TestPapersCollector_CollectArXiv_Good_WithCategory records the raw query
// string of the arXiv request and checks that it contains "cat" when a
// Category is set on the collector.
func TestPapersCollector_CollectArXiv_Good_WithCategory(t *testing.T) {
	var rawQuery string
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		rawQuery = r.URL.RawQuery
		w.Header().Set("Content-Type", "application/xml")
		_, _ = w.Write([]byte(sampleArXivXML))
	}))
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceArXiv, Query: "crypto", Category: "cs.CR"}
	_, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Contains(t, rawQuery, "cat")
}
// TestPapersCollector_CollectAll_Good collects with PaperSourceAll, answering
// the first request with the IACR fixture and later ones with the arXiv
// fixture, and expects the items from both to be summed.
func TestPapersCollector_CollectAll_Good(t *testing.T) {
	requests := 0
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		requests++
		switch requests {
		case 1:
			// First call is IACR
			w.Header().Set("Content-Type", "text/html")
			_, _ = w.Write([]byte(sampleIACRHTML))
		default:
			// Second call is arXiv
			w.Header().Set("Content-Type", "application/xml")
			_, _ = w.Write([]byte(sampleArXivXML))
		}
	}))
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceAll, Query: "cryptography"}
	res, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 4, res.Items) // 2 IACR + 2 arXiv
}
// TestPapersCollector_CollectIACR_Bad_ServerError expects Collect to return an
// error when the IACR request is answered with HTTP 500.
func TestPapersCollector_CollectIACR_Bad_ServerError(t *testing.T) {
	failing := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
	})
	server := httptest.NewServer(failing)
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceIACR, Query: "test"}
	_, err := collector.Collect(context.Background(), conf)
	assert.Error(t, err)
}
// TestPapersCollector_CollectArXiv_Bad_ServerError expects Collect to return an
// error when the arXiv request is answered with HTTP 503.
func TestPapersCollector_CollectArXiv_Bad_ServerError(t *testing.T) {
	unavailable := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusServiceUnavailable)
	})
	server := httptest.NewServer(unavailable)
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceArXiv, Query: "test"}
	_, err := collector.Collect(context.Background(), conf)
	assert.Error(t, err)
}
// TestPapersCollector_CollectArXiv_Bad_InvalidXML expects Collect to return an
// error when the arXiv response body is not XML.
func TestPapersCollector_CollectArXiv_Bad_InvalidXML(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/xml")
		_, _ = w.Write([]byte(`not xml at all`))
	}))
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceArXiv, Query: "test"}
	_, err := collector.Collect(context.Background(), conf)
	assert.Error(t, err)
}
// TestPapersCollector_CollectAll_Bad_BothFail expects Collect with
// PaperSourceAll to return an error when every request is answered with
// HTTP 500.
func TestPapersCollector_CollectAll_Bad_BothFail(t *testing.T) {
	failing := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
	})
	server := httptest.NewServer(failing)
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceAll, Query: "test"}
	_, err := collector.Collect(context.Background(), conf)
	assert.Error(t, err)
}
// TestPapersCollector_CollectAll_Good_OneFails fails the first request and
// serves the arXiv fixture afterwards, then checks that Collect succeeds with
// two items and one recorded error.
func TestPapersCollector_CollectAll_Good_OneFails(t *testing.T) {
	requests := 0
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		requests++
		if requests == 1 {
			// IACR fails
			w.WriteHeader(http.StatusInternalServerError)
			return
		}
		// ArXiv succeeds
		w.Header().Set("Content-Type", "application/xml")
		_, _ = w.Write([]byte(sampleArXivXML))
	}))
	defer server.Close()

	redirect := &rewriteTransport{base: server.Client().Transport, target: server.URL}
	saved := httpClient
	httpClient = &http.Client{Transport: redirect}
	defer func() { httpClient = saved }()

	medium := io.NewMockMedium()
	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	collector := &PapersCollector{Source: PaperSourceAll, Query: "test"}
	res, err := collector.Collect(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 2, res.Items)
	assert.Equal(t, 1, res.Errors) // IACR failure counted
}
// TestExtractIACRPapers_Good parses the IACR HTML fixture and checks the fields
// extracted for the first paper and the title of the second.
func TestExtractIACRPapers_Good(t *testing.T) {
	doc, err := html.Parse(strings.NewReader(sampleIACRHTML))
	require.NoError(t, err)

	papers := extractIACRPapers(doc)

	// require (not assert) so a wrong count fails cleanly instead of the
	// indexing below panicking with an out-of-range error.
	require.Len(t, papers, 2)

	assert.Equal(t, "Zero-Knowledge Proofs", papers[0].Title)
	assert.Contains(t, papers[0].Authors, "Alice")
	assert.Contains(t, papers[0].Authors, "Bob")
	assert.Equal(t, "2025-01-15", papers[0].Date)
	assert.Contains(t, papers[0].Abstract, "zero-knowledge proofs")
	assert.Equal(t, "iacr", papers[0].Source)

	assert.Equal(t, "Lattice Cryptography", papers[1].Title)
}
// TestExtractIACRPapers_Good_Empty expects no papers from a document with an
// empty body.
func TestExtractIACRPapers_Good_Empty(t *testing.T) {
	const emptyDoc = `<html><body></body></html>`

	root, err := html.Parse(strings.NewReader(emptyDoc))
	require.NoError(t, err)

	assert.Empty(t, extractIACRPapers(root))
}
// TestExtractIACRPapers_Good_NoTitle expects a "paperentry" block without a
// title link to produce no paper.
func TestExtractIACRPapers_Good_NoTitle(t *testing.T) {
	const untitled = `<html><body><div class="paperentry"></div></body></html>`

	root, err := html.Parse(strings.NewReader(untitled))
	require.NoError(t, err)

	// Entry with no title should be excluded by the Title check.
	assert.Empty(t, extractIACRPapers(root))
}

View file

@ -0,0 +1,193 @@
package collect
import (
"context"
"testing"
"forge.lthn.ai/core/go/pkg/io"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestHTMLToMarkdown_Good_OrderedList checks that <ol> items are rendered as a
// numbered list.
func TestHTMLToMarkdown_Good_OrderedList(t *testing.T) {
	md, err := HTMLToMarkdown(`<ol><li>First</li><li>Second</li><li>Third</li></ol>`)
	require.NoError(t, err)

	for _, line := range []string{"1. First", "2. Second", "3. Third"} {
		assert.Contains(t, md, line)
	}
}
// TestHTMLToMarkdown_Good_UnorderedList checks that <ul> items are rendered as
// dash-prefixed bullets.
func TestHTMLToMarkdown_Good_UnorderedList(t *testing.T) {
	md, err := HTMLToMarkdown(`<ul><li>Alpha</li><li>Beta</li></ul>`)
	require.NoError(t, err)

	for _, bullet := range []string{"- Alpha", "- Beta"} {
		assert.Contains(t, md, bullet)
	}
}
// TestHTMLToMarkdown_Good_Blockquote checks that a <blockquote> is rendered
// with a "> " prefix.
func TestHTMLToMarkdown_Good_Blockquote(t *testing.T) {
	const quoted = `<blockquote>A wise quote</blockquote>`

	md, err := HTMLToMarkdown(quoted)
	require.NoError(t, err)
	assert.Contains(t, md, "> A wise quote")
}
// TestHTMLToMarkdown_Good_HorizontalRule checks that <hr/> produces a "---"
// rule in the output.
func TestHTMLToMarkdown_Good_HorizontalRule(t *testing.T) {
	const withRule = `<p>Before</p><hr/><p>After</p>`

	md, err := HTMLToMarkdown(withRule)
	require.NoError(t, err)
	assert.Contains(t, md, "---")
}
// TestHTMLToMarkdown_Good_LinkWithoutHref checks that an <a> without href
// keeps its text but is not rendered with markdown link brackets.
func TestHTMLToMarkdown_Good_LinkWithoutHref(t *testing.T) {
	const anchor = `<a>bare link text</a>`

	md, err := HTMLToMarkdown(anchor)
	require.NoError(t, err)
	assert.Contains(t, md, "bare link text")
	assert.NotContains(t, md, "[")
}
// TestHTMLToMarkdown_Good_H4H5H6 checks that <h4>, <h5> and <h6> map to four,
// five and six leading '#' characters.
func TestHTMLToMarkdown_Good_H4H5H6(t *testing.T) {
	md, err := HTMLToMarkdown(`<h4>H4</h4><h5>H5</h5><h6>H6</h6>`)
	require.NoError(t, err)

	for _, heading := range []string{"#### H4", "##### H5", "###### H6"} {
		assert.Contains(t, md, heading)
	}
}
// TestHTMLToMarkdown_Good_StripsStyle checks that the contents of a <style>
// element do not leak into the output while body text is kept.
func TestHTMLToMarkdown_Good_StripsStyle(t *testing.T) {
	const styled = `<html><head><style>.foo{color:red}</style></head><body><p>Clean</p></body></html>`

	md, err := HTMLToMarkdown(styled)
	require.NoError(t, err)
	assert.Contains(t, md, "Clean")
	assert.NotContains(t, md, "color")
}
// TestHTMLToMarkdown_Good_LineBreak checks that the text on both sides of a
// <br/> survives conversion.
func TestHTMLToMarkdown_Good_LineBreak(t *testing.T) {
	md, err := HTMLToMarkdown(`<p>Line one<br/>Line two</p>`)
	require.NoError(t, err)

	for _, line := range []string{"Line one", "Line two"} {
		assert.Contains(t, md, line)
	}
}
// TestHTMLToMarkdown_Good_NestedBoldItalic checks that <b> becomes **bold** and
// <i> becomes *italic*.
func TestHTMLToMarkdown_Good_NestedBoldItalic(t *testing.T) {
	const emphasised = `<b>bold text</b> and <i>italic text</i>`

	md, err := HTMLToMarkdown(emphasised)
	require.NoError(t, err)
	assert.Contains(t, md, "**bold text**")
	assert.Contains(t, md, "*italic text*")
}
// TestJSONToMarkdown_Good_NestedObject checks that both the outer key and the
// nested key/value pair of a nested object appear in the output.
func TestJSONToMarkdown_Good_NestedObject(t *testing.T) {
	const nested = `{"outer": {"inner_key": "inner_value"}}`

	md, err := JSONToMarkdown(nested)
	require.NoError(t, err)
	assert.Contains(t, md, "**outer:**")
	assert.Contains(t, md, "**inner_key:** inner_value")
}
// TestJSONToMarkdown_Good_NestedArray checks that an array of arrays is
// rendered under a "# Data" heading and that its elements appear.
func TestJSONToMarkdown_Good_NestedArray(t *testing.T) {
	md, err := JSONToMarkdown(`[["a", "b"], ["c"]]`)
	require.NoError(t, err)

	for _, fragment := range []string{"# Data", "a", "b"} {
		assert.Contains(t, md, fragment)
	}
}
// TestJSONToMarkdown_Good_ScalarValue checks that a bare JSON number is
// accepted and appears in the output.
func TestJSONToMarkdown_Good_ScalarValue(t *testing.T) {
	const scalar = `42`

	md, err := JSONToMarkdown(scalar)
	require.NoError(t, err)
	assert.Contains(t, md, "42")
}
// TestJSONToMarkdown_Good_ArrayOfObjects checks that each object in an array
// is rendered under a numbered "Item N" label together with its value.
func TestJSONToMarkdown_Good_ArrayOfObjects(t *testing.T) {
	md, err := JSONToMarkdown(`[{"name": "Alice"}, {"name": "Bob"}]`)
	require.NoError(t, err)

	for _, fragment := range []string{"Item 1", "Alice", "Item 2", "Bob"} {
		assert.Contains(t, md, fragment)
	}
}
// TestProcessor_Process_Good_CancelledContext expects Process to return an
// error when called with an already-cancelled context.
func TestProcessor_Process_Good_CancelledContext(t *testing.T) {
	medium := io.NewMockMedium()
	medium.Dirs["/input"] = true
	medium.Files["/input/file.html"] = `<h1>Test</h1>`

	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	cancelledCtx, stop := context.WithCancel(context.Background())
	stop()

	processor := &Processor{Source: "test", Dir: "/input"}
	_, err := processor.Process(cancelledCtx, conf)
	assert.Error(t, err)
}
// TestProcessor_Process_Good_EmitsEvents processes one HTML and one JSON file
// and counts the start, item and complete events dispatched.
func TestProcessor_Process_Good_EmitsEvents(t *testing.T) {
	medium := io.NewMockMedium()
	medium.Dirs["/input"] = true
	medium.Files["/input/a.html"] = `<h1>Title</h1>`
	medium.Files["/input/b.json"] = `{"key": "value"}`

	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	var (
		startCount    int
		itemCount     int
		completeCount int
	)
	conf.Dispatcher.On(EventStart, func(Event) { startCount++ })
	conf.Dispatcher.On(EventItem, func(Event) { itemCount++ })
	conf.Dispatcher.On(EventComplete, func(Event) { completeCount++ })

	processor := &Processor{Source: "test", Dir: "/input"}
	res, err := processor.Process(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 2, res.Items)
	assert.Equal(t, 1, startCount)
	assert.Equal(t, 2, itemCount)
	assert.Equal(t, 1, completeCount)
}
// TestProcessor_Process_Good_BadHTML checks that an HTML file with unclosed
// tags is still processed and counted as one item.
func TestProcessor_Process_Good_BadHTML(t *testing.T) {
	medium := io.NewMockMedium()
	medium.Dirs["/input"] = true
	// html.Parse is very tolerant, so even bad HTML will parse. But we test
	// that the pipeline handles it gracefully.
	medium.Files["/input/bad.html"] = `<html><body><p>Still valid enough</p>`

	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	processor := &Processor{Source: "test", Dir: "/input"}
	res, err := processor.Process(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 1, res.Items)
}
// TestProcessor_Process_Good_BadJSON checks that an unparseable JSON file does
// not fail Process but is reported as one error, both in the result and as an
// error event.
func TestProcessor_Process_Good_BadJSON(t *testing.T) {
	medium := io.NewMockMedium()
	medium.Dirs["/input"] = true
	medium.Files["/input/bad.json"] = `not valid json`

	conf := NewConfigWithMedium(medium, "/output")
	conf.Limiter = nil

	errorEvents := 0
	conf.Dispatcher.On(EventError, func(Event) { errorEvents++ })

	processor := &Processor{Source: "test", Dir: "/input"}
	res, err := processor.Process(context.Background(), conf)
	require.NoError(t, err)
	assert.Equal(t, 0, res.Items)
	assert.Equal(t, 1, res.Errors)
	assert.Equal(t, 1, errorEvents)
}

View file

@ -0,0 +1,76 @@
package collect
import (
"testing"
"forge.lthn.ai/core/go/pkg/io"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestState_Get_Good_ReturnsCopy(t *testing.T) {
m := io.NewMockMedium()
s := NewState(m, "/state.json")
s.Set("test", &StateEntry{Source: "test", Items: 5})
// Get returns a copy, so mutating it shouldn't affect internal state.
got, ok := s.Get("test")
require.True(t, ok)
got.Items = 999
again, ok := s.Get("test")
require.True(t, ok)
assert.Equal(t, 5, again.Items, "internal state should not be mutated")
}
// TestState_Save_Good_WritesJSON saves a state with one entry and checks that
// the file written to the medium mentions the entry's key and LastID.
func TestState_Save_Good_WritesJSON(t *testing.T) {
	medium := io.NewMockMedium()
	state := NewState(medium, "/data/state.json")
	state.Set("src-a", &StateEntry{Source: "src-a", Items: 10, LastID: "abc"})

	require.NoError(t, state.Save())

	// Verify the raw JSON was written.
	raw, err := medium.Read("/data/state.json")
	require.NoError(t, err)
	for _, fragment := range []string{`"src-a"`, `"abc"`} {
		assert.Contains(t, raw, fragment)
	}
}
// TestState_Load_Good_NullJSON loads a state file containing the JSON literal
// null and checks that Load succeeds and no entry is found afterwards.
func TestState_Load_Good_NullJSON(t *testing.T) {
	medium := io.NewMockMedium()
	medium.Files["/state.json"] = "null"

	state := NewState(medium, "/state.json")
	require.NoError(t, state.Load())

	// Null JSON should result in empty entries.
	_, found := state.Get("anything")
	assert.False(t, found)
}
// TestState_SaveLoad_Good_WithCursor saves an entry carrying a Cursor, loads
// it through a second State on the same medium and checks the Cursor value
// round-trips.
func TestState_SaveLoad_Good_WithCursor(t *testing.T) {
	medium := io.NewMockMedium()

	writer := NewState(medium, "/state.json")
	writer.Set("paginated", &StateEntry{
		Source: "paginated",
		Items:  50,
		Cursor: "page_token_abc123",
	})
	require.NoError(t, writer.Save())

	reader := NewState(medium, "/state.json")
	require.NoError(t, reader.Load())

	loaded, found := reader.Get("paginated")
	require.True(t, found)
	assert.Equal(t, "page_token_abc123", loaded.Cursor)
}