test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server), papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market (historical with FromDate, invalid date, server errors), process (ordered lists, blockquotes, h4-h6, nested objects, cancelled context), excavate (resume skips completed, progress events), and state (copy safety, cursor round-trip, null JSON). Uses httptest.Server with rewriteTransport to intercept external HTTP calls without touching the production code. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4de0356880
commit
b4e3d0555a
6 changed files with 1207 additions and 0 deletions
256
collect/bitcointalk_http_test.go
Normal file
256
collect/bitcointalk_http_test.go
Normal file
|
|
@ -0,0 +1,256 @@
|
|||
package collect
|
||||
|
||||
import (
	"context"
	"fmt"
	"net/http"
	"net/http/httptest"
	"net/url"
	"testing"

	"forge.lthn.ai/core/go/pkg/io"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
|
||||
|
||||
// sampleBTCTalkPage returns HTML resembling a BitcoinTalk topic page with the
// given number of posts. If fewer than postsPerPage the caller can infer that
// it is the last page.
//
// Post i (zero-based) is authored by "user<i>", dated "January <i+1>, 2009"
// and has the body "Post content number <i>.". A count of zero or less yields
// an empty <body>.
func sampleBTCTalkPage(count int) string {
	const postTmpl = `
<div class="post">
<div class="poster_info">user%d</div>
<div class="headerandpost">
<div class="smalltext">January %02d, 2009</div>
</div>
<div class="inner">Post content number %d.</div>
</div>`

	// Append into one buffer instead of re-allocating the whole page string
	// on every iteration.
	page := []byte(`<html><body>`)
	for i := 0; i < count; i++ {
		page = fmt.Appendf(page, postTmpl, i, i+1, i)
	}
	page = append(page, `</body></html>`...)
	return string(page)
}
|
||||
|
||||
func TestBitcoinTalkCollector_Collect_Good_OnePage(t *testing.T) {
|
||||
// Serve a single page with 5 posts (< 20, so collection stops after one page).
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(sampleBTCTalkPage(5)))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
// Override the package-level HTTP client so requests go to our test server.
|
||||
oldClient := httpClient
|
||||
httpClient = srv.Client()
|
||||
defer func() { httpClient = oldClient }()
|
||||
|
||||
// We also need to redirect the URL that fetchPage constructs.
|
||||
// The easiest approach: use SetHTTPClient with a custom transport.
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil // disable rate limiting for tests
|
||||
|
||||
b := &BitcoinTalkCollector{TopicID: "12345"}
|
||||
result, err := b.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 5, result.Items)
|
||||
assert.Equal(t, 0, result.Errors)
|
||||
assert.Len(t, result.Files, 5)
|
||||
assert.Equal(t, "bitcointalk:12345", result.Source)
|
||||
|
||||
// Verify files were written.
|
||||
for i := 1; i <= 5; i++ {
|
||||
path := fmt.Sprintf("/output/bitcointalk/12345/posts/%d.md", i)
|
||||
content, err := m.Read(path)
|
||||
require.NoError(t, err, "file %s should exist", path)
|
||||
assert.Contains(t, content, fmt.Sprintf("Post %d by", i))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBitcoinTalkCollector_Collect_Good_PageLimit(t *testing.T) {
|
||||
pageCount := 0
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
pageCount++
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
// Return a full page (20 posts) each time so collection would continue
|
||||
// indefinitely without a Pages limit.
|
||||
_, _ = w.Write([]byte(sampleBTCTalkPage(20)))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
b := &BitcoinTalkCollector{TopicID: "99999", Pages: 2}
|
||||
result, err := b.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 40, result.Items) // 2 pages * 20 posts
|
||||
assert.Equal(t, 2, pageCount)
|
||||
}
|
||||
|
||||
func TestBitcoinTalkCollector_Collect_Good_CancelledContext(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(sampleBTCTalkPage(5)))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel() // Cancel immediately.
|
||||
|
||||
b := &BitcoinTalkCollector{TopicID: "12345"}
|
||||
_, err := b.Collect(ctx, cfg)
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestBitcoinTalkCollector_Collect_Bad_ServerError(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
b := &BitcoinTalkCollector{TopicID: "12345"}
|
||||
result, err := b.Collect(context.Background(), cfg)
|
||||
|
||||
// fetchPage error causes break with Errors incremented.
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 0, result.Items)
|
||||
assert.Equal(t, 1, result.Errors)
|
||||
}
|
||||
|
||||
func TestBitcoinTalkCollector_Collect_Good_EmitsEvents(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(sampleBTCTalkPage(2)))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
var starts, items, completes int
|
||||
cfg.Dispatcher.On(EventStart, func(e Event) { starts++ })
|
||||
cfg.Dispatcher.On(EventItem, func(e Event) { items++ })
|
||||
cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ })
|
||||
|
||||
b := &BitcoinTalkCollector{TopicID: "12345"}
|
||||
result, err := b.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 2, result.Items)
|
||||
assert.Equal(t, 1, starts)
|
||||
assert.Equal(t, 2, items)
|
||||
assert.Equal(t, 1, completes)
|
||||
}
|
||||
|
||||
func TestSetHTTPClient_Good(t *testing.T) {
|
||||
old := httpClient
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
custom := &http.Client{}
|
||||
SetHTTPClient(custom)
|
||||
assert.Equal(t, custom, httpClient)
|
||||
}
|
||||
|
||||
func TestFetchPage_Good(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(sampleBTCTalkPage(3)))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
old := httpClient
|
||||
httpClient = srv.Client()
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
b := &BitcoinTalkCollector{TopicID: "12345"}
|
||||
posts, err := b.fetchPage(context.Background(), srv.URL)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, posts, 3)
|
||||
}
|
||||
|
||||
func TestFetchPage_Bad_StatusCode(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusForbidden)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
old := httpClient
|
||||
httpClient = srv.Client()
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
b := &BitcoinTalkCollector{TopicID: "12345"}
|
||||
_, err := b.fetchPage(context.Background(), srv.URL)
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestFetchPage_Bad_InvalidHTML(t *testing.T) {
|
||||
// html.Parse is very forgiving, so serve an empty page.
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><body></body></html>`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
old := httpClient
|
||||
httpClient = srv.Client()
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
b := &BitcoinTalkCollector{TopicID: "12345"}
|
||||
posts, err := b.fetchPage(context.Background(), srv.URL)
|
||||
require.NoError(t, err)
|
||||
assert.Empty(t, posts)
|
||||
}
|
||||
|
||||
// rewriteTransport rewrites all request URLs to point at the test server.
// Only the scheme and host are replaced; path and query are forwarded as-is.
type rewriteTransport struct {
	base   http.RoundTripper // transport that performs the request; http.DefaultTransport when nil
	target string            // absolute base URL of the test server, e.g. "http://127.0.0.1:1234"
}

// RoundTrip sends a clone of req to the host named by t.target, leaving the
// caller's request untouched.
//
// The target is parsed as a URL, so both http and https targets work. A
// target that does not parse as an absolute URL with a host produces an error
// instead of a slice-bounds panic.
func (t *rewriteTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	target, err := url.Parse(t.target)
	if err == nil && (target.Scheme == "" || target.Host == "") {
		err = fmt.Errorf("not an absolute URL")
	}
	if err != nil {
		// RoundTrippers are expected to close the request body on errors too.
		if req.Body != nil {
			_ = req.Body.Close()
		}
		return nil, fmt.Errorf("rewriteTransport: invalid target %q: %w", t.target, err)
	}

	out := req.Clone(req.Context())
	out.URL.Scheme = target.Scheme
	out.URL.Host = target.Host

	base := t.base
	if base == nil {
		base = http.DefaultTransport
	}
	return base.RoundTrip(out)
}
|
||||
127
collect/excavate_extra_test.go
Normal file
127
collect/excavate_extra_test.go
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
package collect
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"forge.lthn.ai/core/go/pkg/io"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestExcavator_Run_Good_ResumeSkipsCompleted(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
// Pre-populate state so source-a looks completed.
|
||||
cfg.State.Set("source-a", &StateEntry{
|
||||
Source: "source-a",
|
||||
LastRun: time.Now().Add(-1 * time.Hour),
|
||||
Items: 10,
|
||||
})
|
||||
|
||||
c1 := &mockCollector{name: "source-a", items: 10}
|
||||
c2 := &mockCollector{name: "source-b", items: 5}
|
||||
|
||||
e := &Excavator{
|
||||
Collectors: []Collector{c1, c2},
|
||||
Resume: true,
|
||||
}
|
||||
|
||||
result, err := e.Run(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.False(t, c1.called, "source-a should be skipped (already completed)")
|
||||
assert.True(t, c2.called, "source-b should run")
|
||||
assert.Equal(t, 5, result.Items)
|
||||
assert.Equal(t, 1, result.Skipped)
|
||||
}
|
||||
|
||||
func TestExcavator_Run_Good_ResumeRunsIncomplete(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
// Pre-populate state with 0 items (incomplete).
|
||||
cfg.State.Set("source-a", &StateEntry{
|
||||
Source: "source-a",
|
||||
LastRun: time.Now(),
|
||||
Items: 0,
|
||||
})
|
||||
|
||||
c1 := &mockCollector{name: "source-a", items: 5}
|
||||
|
||||
e := &Excavator{
|
||||
Collectors: []Collector{c1},
|
||||
Resume: true,
|
||||
}
|
||||
|
||||
result, err := e.Run(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.True(t, c1.called, "source-a should run (0 items in previous run)")
|
||||
assert.Equal(t, 5, result.Items)
|
||||
}
|
||||
|
||||
func TestExcavator_Run_Good_NilState(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.State = nil
|
||||
cfg.Limiter = nil
|
||||
|
||||
c1 := &mockCollector{name: "source-a", items: 3}
|
||||
|
||||
e := &Excavator{
|
||||
Collectors: []Collector{c1},
|
||||
}
|
||||
|
||||
result, err := e.Run(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 3, result.Items)
|
||||
}
|
||||
|
||||
func TestExcavator_Run_Good_NilDispatcher(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Dispatcher = nil
|
||||
cfg.Limiter = nil
|
||||
|
||||
c1 := &mockCollector{name: "source-a", items: 2}
|
||||
|
||||
e := &Excavator{
|
||||
Collectors: []Collector{c1},
|
||||
}
|
||||
|
||||
result, err := e.Run(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 2, result.Items)
|
||||
}
|
||||
|
||||
func TestExcavator_Run_Good_ProgressEvents(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
var progressMsgs []string
|
||||
cfg.Dispatcher.On(EventProgress, func(e Event) {
|
||||
progressMsgs = append(progressMsgs, e.Message)
|
||||
})
|
||||
|
||||
c1 := &mockCollector{name: "source-a", items: 1}
|
||||
c2 := &mockCollector{name: "source-b", items: 1}
|
||||
|
||||
e := &Excavator{
|
||||
Collectors: []Collector{c1, c2},
|
||||
}
|
||||
|
||||
_, err := e.Run(context.Background(), cfg)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Len(t, progressMsgs, 2)
|
||||
assert.Contains(t, progressMsgs[0], "1/2")
|
||||
assert.Contains(t, progressMsgs[1], "2/2")
|
||||
}
|
||||
242
collect/market_extra_test.go
Normal file
242
collect/market_extra_test.go
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
package collect
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"forge.lthn.ai/core/go/pkg/io"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestMarketCollector_Collect_Good_HistoricalWithFromDate(t *testing.T) {
|
||||
callCount := 0
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
callCount++
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
||||
if callCount == 1 {
|
||||
data := coinData{
|
||||
ID: "lethean",
|
||||
Symbol: "lthn",
|
||||
Name: "Lethean",
|
||||
MarketData: marketData{
|
||||
CurrentPrice: map[string]float64{"usd": 0.001},
|
||||
},
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(data)
|
||||
} else {
|
||||
// Historical data with FromDate param.
|
||||
assert.Contains(t, r.URL.RawQuery, "days=")
|
||||
data := historicalData{
|
||||
Prices: [][]float64{{1705305600000, 0.001}},
|
||||
MarketCaps: [][]float64{{1705305600000, 10000}},
|
||||
TotalVolumes: [][]float64{{1705305600000, 500}},
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(data)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
oldURL := coinGeckoBaseURL
|
||||
coinGeckoBaseURL = srv.URL
|
||||
defer func() { coinGeckoBaseURL = oldURL }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
mc := &MarketCollector{CoinID: "lethean", Historical: true, FromDate: "2025-01-01"}
|
||||
result, err := mc.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 3, result.Items)
|
||||
}
|
||||
|
||||
func TestMarketCollector_Collect_Good_HistoricalInvalidDate(t *testing.T) {
|
||||
callCount := 0
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
callCount++
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
||||
if callCount == 1 {
|
||||
data := coinData{
|
||||
ID: "test",
|
||||
Symbol: "tst",
|
||||
Name: "Test",
|
||||
MarketData: marketData{
|
||||
CurrentPrice: map[string]float64{"usd": 1.0},
|
||||
},
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(data)
|
||||
} else {
|
||||
// Should fall back to 365 days with invalid date.
|
||||
assert.Contains(t, r.URL.RawQuery, "days=365")
|
||||
data := historicalData{
|
||||
Prices: [][]float64{{1705305600000, 1.0}},
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(data)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
oldURL := coinGeckoBaseURL
|
||||
coinGeckoBaseURL = srv.URL
|
||||
defer func() { coinGeckoBaseURL = oldURL }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
mc := &MarketCollector{CoinID: "test", Historical: true, FromDate: "not-a-date"}
|
||||
result, err := mc.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 3, result.Items)
|
||||
}
|
||||
|
||||
func TestMarketCollector_Collect_Bad_HistoricalServerError(t *testing.T) {
|
||||
callCount := 0
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
callCount++
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
||||
if callCount == 1 {
|
||||
data := coinData{
|
||||
ID: "test",
|
||||
Symbol: "tst",
|
||||
Name: "Test",
|
||||
MarketData: marketData{
|
||||
CurrentPrice: map[string]float64{"usd": 1.0},
|
||||
},
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(data)
|
||||
} else {
|
||||
// Historical endpoint fails.
|
||||
w.WriteHeader(http.StatusTooManyRequests)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
oldURL := coinGeckoBaseURL
|
||||
coinGeckoBaseURL = srv.URL
|
||||
defer func() { coinGeckoBaseURL = oldURL }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
mc := &MarketCollector{CoinID: "test", Historical: true}
|
||||
result, err := mc.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 2, result.Items) // current.json + summary.md
|
||||
assert.Equal(t, 1, result.Errors) // historical failed
|
||||
}
|
||||
|
||||
func TestMarketCollector_Collect_Good_EmitsEvents(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
data := coinData{
|
||||
ID: "bitcoin",
|
||||
Symbol: "btc",
|
||||
Name: "Bitcoin",
|
||||
MarketData: marketData{
|
||||
CurrentPrice: map[string]float64{"usd": 50000},
|
||||
},
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(data)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
oldURL := coinGeckoBaseURL
|
||||
coinGeckoBaseURL = srv.URL
|
||||
defer func() { coinGeckoBaseURL = oldURL }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
var starts, completes int
|
||||
cfg.Dispatcher.On(EventStart, func(e Event) { starts++ })
|
||||
cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ })
|
||||
|
||||
mc := &MarketCollector{CoinID: "bitcoin"}
|
||||
_, err := mc.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 1, starts)
|
||||
assert.Equal(t, 1, completes)
|
||||
}
|
||||
|
||||
func TestMarketCollector_Collect_Good_CancelledContext(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
oldURL := coinGeckoBaseURL
|
||||
coinGeckoBaseURL = srv.URL
|
||||
defer func() { coinGeckoBaseURL = oldURL }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
mc := &MarketCollector{CoinID: "bitcoin"}
|
||||
result, err := mc.Collect(ctx, cfg)
|
||||
|
||||
// Context cancellation causes error in fetchJSON.
|
||||
require.NoError(t, err) // outer Collect doesn't return errors from currentData fetch
|
||||
assert.Equal(t, 1, result.Errors)
|
||||
}
|
||||
|
||||
func TestFormatMarketSummary_Good_AllFields(t *testing.T) {
|
||||
data := &coinData{
|
||||
Name: "Lethean",
|
||||
Symbol: "lthn",
|
||||
MarketData: marketData{
|
||||
CurrentPrice: map[string]float64{"usd": 0.001},
|
||||
MarketCap: map[string]float64{"usd": 100000},
|
||||
TotalVolume: map[string]float64{"usd": 5000},
|
||||
High24h: map[string]float64{"usd": 0.0015},
|
||||
Low24h: map[string]float64{"usd": 0.0005},
|
||||
PriceChange24h: 0.0002,
|
||||
PriceChangePct24h: 5.5,
|
||||
MarketCapRank: 500,
|
||||
CirculatingSupply: 1000000000,
|
||||
TotalSupply: 2000000000,
|
||||
LastUpdated: "2025-01-15T12:00:00Z",
|
||||
},
|
||||
}
|
||||
|
||||
summary := FormatMarketSummary(data)
|
||||
|
||||
assert.Contains(t, summary, "# Lethean (LTHN)")
|
||||
assert.Contains(t, summary, "24h Volume")
|
||||
assert.Contains(t, summary, "24h High")
|
||||
assert.Contains(t, summary, "24h Low")
|
||||
assert.Contains(t, summary, "24h Price Change")
|
||||
assert.Contains(t, summary, "#500")
|
||||
assert.Contains(t, summary, "Circulating Supply")
|
||||
assert.Contains(t, summary, "Total Supply")
|
||||
assert.Contains(t, summary, "Last updated")
|
||||
}
|
||||
|
||||
func TestFormatMarketSummary_Good_Minimal(t *testing.T) {
|
||||
data := &coinData{
|
||||
Name: "Unknown",
|
||||
Symbol: "ukn",
|
||||
}
|
||||
|
||||
summary := FormatMarketSummary(data)
|
||||
assert.Contains(t, summary, "# Unknown (UKN)")
|
||||
// No price data, so these should be absent.
|
||||
assert.NotContains(t, summary, "Market Cap Rank")
|
||||
}
|
||||
313
collect/papers_http_test.go
Normal file
313
collect/papers_http_test.go
Normal file
|
|
@ -0,0 +1,313 @@
|
|||
package collect
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"forge.lthn.ai/core/go/pkg/io"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// sampleIACRHTML is a minimal IACR ePrint search-result page used as a test
// fixture. It holds two "paperentry" blocks: the first with two authors, the
// second with one; each has a link, a date and an abstract.
const sampleIACRHTML = `<html><body>
<div class="paperentry">
<a href="/eprint/2025/001">Zero-Knowledge Proofs</a>
<span class="author">Alice</span>
<span class="author">Bob</span>
<span class="date">2025-01-15</span>
<p class="abstract">We present a novel construction for zero-knowledge proofs.</p>
</div>
<div class="paperentry">
<a href="/eprint/2025/002">Lattice Cryptography</a>
<span class="author">Charlie</span>
<span class="date">2025-01-20</span>
<p class="abstract">A survey of lattice-based cryptography.</p>
</div>
</body></html>`
|
||||
|
||||
// sampleArXivXML is a minimal arXiv Atom feed used as a test fixture. It
// holds two entries: 2501.12345v1 with two authors and 2501.67890v1 with one.
const sampleArXivXML = `<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<id>http://arxiv.org/abs/2501.12345v1</id>
<title>Ring Signatures Revisited</title>
<summary>We propose an efficient ring signature scheme.</summary>
<published>2025-01-10T00:00:00Z</published>
<author><name>Alice</name></author>
<author><name>David</name></author>
<link href="http://arxiv.org/abs/2501.12345v1" rel="alternate"/>
</entry>
<entry>
<id>http://arxiv.org/abs/2501.67890v1</id>
<title>Post-Quantum Signatures</title>
<summary>A new approach to post-quantum digital signatures.</summary>
<published>2025-01-12T00:00:00Z</published>
<author><name>Eve</name></author>
<link href="http://arxiv.org/abs/2501.67890v1" rel="alternate"/>
</entry>
</feed>`
|
||||
|
||||
func TestPapersCollector_CollectIACR_Good(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(sampleIACRHTML))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &PapersCollector{Source: PaperSourceIACR, Query: "zero knowledge"}
|
||||
result, err := p.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 2, result.Items)
|
||||
assert.Len(t, result.Files, 2)
|
||||
|
||||
// Verify content was written.
|
||||
content, err := m.Read("/output/papers/iacr/2025-001.md")
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, content, "Zero-Knowledge Proofs")
|
||||
}
|
||||
|
||||
func TestPapersCollector_CollectArXiv_Good(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/xml")
|
||||
_, _ = w.Write([]byte(sampleArXivXML))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &PapersCollector{Source: PaperSourceArXiv, Query: "ring signatures"}
|
||||
result, err := p.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 2, result.Items)
|
||||
assert.Len(t, result.Files, 2)
|
||||
|
||||
// Verify one of the papers.
|
||||
content, err := m.Read("/output/papers/arxiv/2501.12345v1.md")
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, content, "Ring Signatures Revisited")
|
||||
assert.Contains(t, content, "Alice")
|
||||
}
|
||||
|
||||
func TestPapersCollector_CollectArXiv_Good_WithCategory(t *testing.T) {
|
||||
var capturedQuery string
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
capturedQuery = r.URL.RawQuery
|
||||
w.Header().Set("Content-Type", "application/xml")
|
||||
_, _ = w.Write([]byte(sampleArXivXML))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &PapersCollector{Source: PaperSourceArXiv, Query: "crypto", Category: "cs.CR"}
|
||||
_, err := p.Collect(context.Background(), cfg)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, capturedQuery, "cat")
|
||||
}
|
||||
|
||||
func TestPapersCollector_CollectAll_Good(t *testing.T) {
|
||||
callCount := 0
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
callCount++
|
||||
if callCount == 1 {
|
||||
// First call is IACR
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(sampleIACRHTML))
|
||||
} else {
|
||||
// Second call is arXiv
|
||||
w.Header().Set("Content-Type", "application/xml")
|
||||
_, _ = w.Write([]byte(sampleArXivXML))
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &PapersCollector{Source: PaperSourceAll, Query: "cryptography"}
|
||||
result, err := p.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 4, result.Items) // 2 IACR + 2 arXiv
|
||||
}
|
||||
|
||||
func TestPapersCollector_CollectIACR_Bad_ServerError(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &PapersCollector{Source: PaperSourceIACR, Query: "test"}
|
||||
_, err := p.Collect(context.Background(), cfg)
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestPapersCollector_CollectArXiv_Bad_ServerError(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &PapersCollector{Source: PaperSourceArXiv, Query: "test"}
|
||||
_, err := p.Collect(context.Background(), cfg)
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestPapersCollector_CollectArXiv_Bad_InvalidXML(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/xml")
|
||||
_, _ = w.Write([]byte(`not xml at all`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &PapersCollector{Source: PaperSourceArXiv, Query: "test"}
|
||||
_, err := p.Collect(context.Background(), cfg)
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestPapersCollector_CollectAll_Bad_BothFail(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &PapersCollector{Source: PaperSourceAll, Query: "test"}
|
||||
_, err := p.Collect(context.Background(), cfg)
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestPapersCollector_CollectAll_Good_OneFails(t *testing.T) {
|
||||
callCount := 0
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
callCount++
|
||||
if callCount == 1 {
|
||||
// IACR fails
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
} else {
|
||||
// ArXiv succeeds
|
||||
w.Header().Set("Content-Type", "application/xml")
|
||||
_, _ = w.Write([]byte(sampleArXivXML))
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
||||
old := httpClient
|
||||
httpClient = &http.Client{Transport: transport}
|
||||
defer func() { httpClient = old }()
|
||||
|
||||
m := io.NewMockMedium()
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &PapersCollector{Source: PaperSourceAll, Query: "test"}
|
||||
result, err := p.Collect(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 2, result.Items)
|
||||
assert.Equal(t, 1, result.Errors) // IACR failure counted
|
||||
}
|
||||
|
||||
func TestExtractIACRPapers_Good(t *testing.T) {
|
||||
doc, err := html.Parse(strings.NewReader(sampleIACRHTML))
|
||||
require.NoError(t, err)
|
||||
|
||||
papers := extractIACRPapers(doc)
|
||||
assert.Len(t, papers, 2)
|
||||
|
||||
assert.Equal(t, "Zero-Knowledge Proofs", papers[0].Title)
|
||||
assert.Contains(t, papers[0].Authors, "Alice")
|
||||
assert.Contains(t, papers[0].Authors, "Bob")
|
||||
assert.Equal(t, "2025-01-15", papers[0].Date)
|
||||
assert.Contains(t, papers[0].Abstract, "zero-knowledge proofs")
|
||||
assert.Equal(t, "iacr", papers[0].Source)
|
||||
|
||||
assert.Equal(t, "Lattice Cryptography", papers[1].Title)
|
||||
}
|
||||
|
||||
func TestExtractIACRPapers_Good_Empty(t *testing.T) {
|
||||
doc, err := html.Parse(strings.NewReader(`<html><body></body></html>`))
|
||||
require.NoError(t, err)
|
||||
|
||||
papers := extractIACRPapers(doc)
|
||||
assert.Empty(t, papers)
|
||||
}
|
||||
|
||||
func TestExtractIACRPapers_Good_NoTitle(t *testing.T) {
|
||||
doc, err := html.Parse(strings.NewReader(`<html><body><div class="paperentry"></div></body></html>`))
|
||||
require.NoError(t, err)
|
||||
|
||||
papers := extractIACRPapers(doc)
|
||||
// Entry with no title should be excluded by the Title check.
|
||||
assert.Empty(t, papers)
|
||||
}
|
||||
193
collect/process_extra_test.go
Normal file
193
collect/process_extra_test.go
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
package collect
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"forge.lthn.ai/core/go/pkg/io"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestHTMLToMarkdown_Good_OrderedList(t *testing.T) {
|
||||
input := `<ol><li>First</li><li>Second</li><li>Third</li></ol>`
|
||||
result, err := HTMLToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "1. First")
|
||||
assert.Contains(t, result, "2. Second")
|
||||
assert.Contains(t, result, "3. Third")
|
||||
}
|
||||
|
||||
func TestHTMLToMarkdown_Good_UnorderedList(t *testing.T) {
|
||||
input := `<ul><li>Alpha</li><li>Beta</li></ul>`
|
||||
result, err := HTMLToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "- Alpha")
|
||||
assert.Contains(t, result, "- Beta")
|
||||
}
|
||||
|
||||
func TestHTMLToMarkdown_Good_Blockquote(t *testing.T) {
|
||||
input := `<blockquote>A wise quote</blockquote>`
|
||||
result, err := HTMLToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "> A wise quote")
|
||||
}
|
||||
|
||||
func TestHTMLToMarkdown_Good_HorizontalRule(t *testing.T) {
|
||||
input := `<p>Before</p><hr/><p>After</p>`
|
||||
result, err := HTMLToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "---")
|
||||
}
|
||||
|
||||
func TestHTMLToMarkdown_Good_LinkWithoutHref(t *testing.T) {
|
||||
input := `<a>bare link text</a>`
|
||||
result, err := HTMLToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "bare link text")
|
||||
assert.NotContains(t, result, "[")
|
||||
}
|
||||
|
||||
func TestHTMLToMarkdown_Good_H4H5H6(t *testing.T) {
|
||||
input := `<h4>H4</h4><h5>H5</h5><h6>H6</h6>`
|
||||
result, err := HTMLToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "#### H4")
|
||||
assert.Contains(t, result, "##### H5")
|
||||
assert.Contains(t, result, "###### H6")
|
||||
}
|
||||
|
||||
func TestHTMLToMarkdown_Good_StripsStyle(t *testing.T) {
|
||||
input := `<html><head><style>.foo{color:red}</style></head><body><p>Clean</p></body></html>`
|
||||
result, err := HTMLToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "Clean")
|
||||
assert.NotContains(t, result, "color")
|
||||
}
|
||||
|
||||
func TestHTMLToMarkdown_Good_LineBreak(t *testing.T) {
|
||||
input := `<p>Line one<br/>Line two</p>`
|
||||
result, err := HTMLToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "Line one")
|
||||
assert.Contains(t, result, "Line two")
|
||||
}
|
||||
|
||||
func TestHTMLToMarkdown_Good_NestedBoldItalic(t *testing.T) {
|
||||
input := `<b>bold text</b> and <i>italic text</i>`
|
||||
result, err := HTMLToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "**bold text**")
|
||||
assert.Contains(t, result, "*italic text*")
|
||||
}
|
||||
|
||||
func TestJSONToMarkdown_Good_NestedObject(t *testing.T) {
|
||||
input := `{"outer": {"inner_key": "inner_value"}}`
|
||||
result, err := JSONToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "**outer:**")
|
||||
assert.Contains(t, result, "**inner_key:** inner_value")
|
||||
}
|
||||
|
||||
func TestJSONToMarkdown_Good_NestedArray(t *testing.T) {
|
||||
input := `[["a", "b"], ["c"]]`
|
||||
result, err := JSONToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "# Data")
|
||||
assert.Contains(t, result, "a")
|
||||
assert.Contains(t, result, "b")
|
||||
}
|
||||
|
||||
func TestJSONToMarkdown_Good_ScalarValue(t *testing.T) {
|
||||
input := `42`
|
||||
result, err := JSONToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "42")
|
||||
}
|
||||
|
||||
func TestJSONToMarkdown_Good_ArrayOfObjects(t *testing.T) {
|
||||
input := `[{"name": "Alice"}, {"name": "Bob"}]`
|
||||
result, err := JSONToMarkdown(input)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, result, "Item 1")
|
||||
assert.Contains(t, result, "Alice")
|
||||
assert.Contains(t, result, "Item 2")
|
||||
assert.Contains(t, result, "Bob")
|
||||
}
|
||||
|
||||
func TestProcessor_Process_Good_CancelledContext(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
m.Dirs["/input"] = true
|
||||
m.Files["/input/file.html"] = `<h1>Test</h1>`
|
||||
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
p := &Processor{Source: "test", Dir: "/input"}
|
||||
_, err := p.Process(ctx, cfg)
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestProcessor_Process_Good_EmitsEvents(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
m.Dirs["/input"] = true
|
||||
m.Files["/input/a.html"] = `<h1>Title</h1>`
|
||||
m.Files["/input/b.json"] = `{"key": "value"}`
|
||||
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
var starts, items, completes int
|
||||
cfg.Dispatcher.On(EventStart, func(e Event) { starts++ })
|
||||
cfg.Dispatcher.On(EventItem, func(e Event) { items++ })
|
||||
cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ })
|
||||
|
||||
p := &Processor{Source: "test", Dir: "/input"}
|
||||
result, err := p.Process(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 2, result.Items)
|
||||
assert.Equal(t, 1, starts)
|
||||
assert.Equal(t, 2, items)
|
||||
assert.Equal(t, 1, completes)
|
||||
}
|
||||
|
||||
func TestProcessor_Process_Good_BadHTML(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
m.Dirs["/input"] = true
|
||||
// html.Parse is very tolerant, so even bad HTML will parse. But we test
|
||||
// that the pipeline handles it gracefully.
|
||||
m.Files["/input/bad.html"] = `<html><body><p>Still valid enough</p>`
|
||||
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
p := &Processor{Source: "test", Dir: "/input"}
|
||||
result, err := p.Process(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 1, result.Items)
|
||||
}
|
||||
|
||||
func TestProcessor_Process_Good_BadJSON(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
m.Dirs["/input"] = true
|
||||
m.Files["/input/bad.json"] = `not valid json`
|
||||
|
||||
cfg := NewConfigWithMedium(m, "/output")
|
||||
cfg.Limiter = nil
|
||||
|
||||
var errors int
|
||||
cfg.Dispatcher.On(EventError, func(e Event) { errors++ })
|
||||
|
||||
p := &Processor{Source: "test", Dir: "/input"}
|
||||
result, err := p.Process(context.Background(), cfg)
|
||||
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 0, result.Items)
|
||||
assert.Equal(t, 1, result.Errors)
|
||||
assert.Equal(t, 1, errors)
|
||||
}
|
||||
76
collect/state_extra_test.go
Normal file
76
collect/state_extra_test.go
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
package collect
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"forge.lthn.ai/core/go/pkg/io"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestState_Get_Good_ReturnsCopy(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
s := NewState(m, "/state.json")
|
||||
|
||||
s.Set("test", &StateEntry{Source: "test", Items: 5})
|
||||
|
||||
// Get returns a copy, so mutating it shouldn't affect internal state.
|
||||
got, ok := s.Get("test")
|
||||
require.True(t, ok)
|
||||
got.Items = 999
|
||||
|
||||
again, ok := s.Get("test")
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, 5, again.Items, "internal state should not be mutated")
|
||||
}
|
||||
|
||||
func TestState_Save_Good_WritesJSON(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
s := NewState(m, "/data/state.json")
|
||||
|
||||
s.Set("src-a", &StateEntry{Source: "src-a", Items: 10, LastID: "abc"})
|
||||
|
||||
err := s.Save()
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify the raw JSON was written.
|
||||
content, err := m.Read("/data/state.json")
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, content, `"src-a"`)
|
||||
assert.Contains(t, content, `"abc"`)
|
||||
}
|
||||
|
||||
func TestState_Load_Good_NullJSON(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
m.Files["/state.json"] = "null"
|
||||
|
||||
s := NewState(m, "/state.json")
|
||||
err := s.Load()
|
||||
require.NoError(t, err)
|
||||
|
||||
// Null JSON should result in empty entries.
|
||||
_, ok := s.Get("anything")
|
||||
assert.False(t, ok)
|
||||
}
|
||||
|
||||
func TestState_SaveLoad_Good_WithCursor(t *testing.T) {
|
||||
m := io.NewMockMedium()
|
||||
s := NewState(m, "/state.json")
|
||||
|
||||
s.Set("paginated", &StateEntry{
|
||||
Source: "paginated",
|
||||
Items: 50,
|
||||
Cursor: "page_token_abc123",
|
||||
})
|
||||
|
||||
err := s.Save()
|
||||
require.NoError(t, err)
|
||||
|
||||
s2 := NewState(m, "/state.json")
|
||||
err = s2.Load()
|
||||
require.NoError(t, err)
|
||||
|
||||
entry, ok := s2.Get("paginated")
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "page_token_abc123", entry.Cursor)
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue