From b4e3d0555a0b74958eb7b8f6a4a475e2b2c3f663 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 20 Feb 2026 01:55:18 +0000 Subject: [PATCH] test(collect): push coverage from 57.3% to 83.0% Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server), papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market (historical with FromDate, invalid date, server errors), process (ordered lists, blockquotes, h4-h6, nested objects, cancelled context), excavate (resume skips completed, progress events), and state (copy safety, cursor round-trip, null JSON). Uses httptest.Server with rewriteTransport to intercept external HTTP calls without touching the production code. Co-Authored-By: Claude Opus 4.6 --- collect/bitcointalk_http_test.go | 256 +++++++++++++++++++++++++ collect/excavate_extra_test.go | 127 +++++++++++++ collect/market_extra_test.go | 242 ++++++++++++++++++++++++ collect/papers_http_test.go | 313 +++++++++++++++++++++++++++++++ collect/process_extra_test.go | 193 +++++++++++++++++++ collect/state_extra_test.go | 76 ++++++++ 6 files changed, 1207 insertions(+) create mode 100644 collect/bitcointalk_http_test.go create mode 100644 collect/excavate_extra_test.go create mode 100644 collect/market_extra_test.go create mode 100644 collect/papers_http_test.go create mode 100644 collect/process_extra_test.go create mode 100644 collect/state_extra_test.go diff --git a/collect/bitcointalk_http_test.go b/collect/bitcointalk_http_test.go new file mode 100644 index 0000000..f045ebe --- /dev/null +++ b/collect/bitcointalk_http_test.go @@ -0,0 +1,256 @@ +package collect + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// sampleBTCTalkPage returns HTML resembling a BitcoinTalk topic page with the +// given number of posts. If fewer than postsPerPage the caller can infer that +// it is the last page. +func sampleBTCTalkPage(count int) string { + page := `` + for i := 0; i < count; i++ { + page += fmt.Sprintf(` +
+
user%d
+
+
January %02d, 2009
+
+
Post content number %d.
+
`, i, i+1, i) + } + page += `` + return page +} + +func TestBitcoinTalkCollector_Collect_Good_OnePage(t *testing.T) { + // Serve a single page with 5 posts (< 20, so collection stops after one page). + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleBTCTalkPage(5))) + })) + defer srv.Close() + + // Override the package-level HTTP client so requests go to our test server. + oldClient := httpClient + httpClient = srv.Client() + defer func() { httpClient = oldClient }() + + // We also need to redirect the URL that fetchPage constructs. + // The easiest approach: use SetHTTPClient with a custom transport. + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + httpClient = &http.Client{Transport: transport} + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil // disable rate limiting for tests + + b := &BitcoinTalkCollector{TopicID: "12345"} + result, err := b.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 5, result.Items) + assert.Equal(t, 0, result.Errors) + assert.Len(t, result.Files, 5) + assert.Equal(t, "bitcointalk:12345", result.Source) + + // Verify files were written. + for i := 1; i <= 5; i++ { + path := fmt.Sprintf("/output/bitcointalk/12345/posts/%d.md", i) + content, err := m.Read(path) + require.NoError(t, err, "file %s should exist", path) + assert.Contains(t, content, fmt.Sprintf("Post %d by", i)) + } +} + +func TestBitcoinTalkCollector_Collect_Good_PageLimit(t *testing.T) { + pageCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + pageCount++ + w.Header().Set("Content-Type", "text/html") + // Return a full page (20 posts) each time so collection would continue + // indefinitely without a Pages limit. + _, _ = w.Write([]byte(sampleBTCTalkPage(20))) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + b := &BitcoinTalkCollector{TopicID: "99999", Pages: 2} + result, err := b.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 40, result.Items) // 2 pages * 20 posts + assert.Equal(t, 2, pageCount) +} + +func TestBitcoinTalkCollector_Collect_Good_CancelledContext(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleBTCTalkPage(5))) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel immediately. + + b := &BitcoinTalkCollector{TopicID: "12345"} + _, err := b.Collect(ctx, cfg) + assert.Error(t, err) +} + +func TestBitcoinTalkCollector_Collect_Bad_ServerError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + b := &BitcoinTalkCollector{TopicID: "12345"} + result, err := b.Collect(context.Background(), cfg) + + // fetchPage error causes break with Errors incremented. + require.NoError(t, err) + assert.Equal(t, 0, result.Items) + assert.Equal(t, 1, result.Errors) +} + +func TestBitcoinTalkCollector_Collect_Good_EmitsEvents(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleBTCTalkPage(2))) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var starts, items, completes int + cfg.Dispatcher.On(EventStart, func(e Event) { starts++ }) + cfg.Dispatcher.On(EventItem, func(e Event) { items++ }) + cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ }) + + b := &BitcoinTalkCollector{TopicID: "12345"} + result, err := b.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Equal(t, 1, starts) + assert.Equal(t, 2, items) + assert.Equal(t, 1, completes) +} + +func TestSetHTTPClient_Good(t *testing.T) { + old := httpClient + defer func() { httpClient = old }() + + custom := &http.Client{} + SetHTTPClient(custom) + assert.Equal(t, custom, httpClient) +} + +func TestFetchPage_Good(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleBTCTalkPage(3))) + })) + defer srv.Close() + + old := httpClient + httpClient = srv.Client() + defer func() { httpClient = old }() + + b := &BitcoinTalkCollector{TopicID: "12345"} + posts, err := b.fetchPage(context.Background(), srv.URL) + + require.NoError(t, err) + assert.Len(t, posts, 3) +} + +func TestFetchPage_Bad_StatusCode(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusForbidden) + })) + defer srv.Close() + + old := httpClient + httpClient = srv.Client() + defer func() { httpClient = old }() + + b := &BitcoinTalkCollector{TopicID: "12345"} + _, err := b.fetchPage(context.Background(), srv.URL) + assert.Error(t, err) +} + +func TestFetchPage_Bad_InvalidHTML(t *testing.T) { + // html.Parse is very forgiving, so serve an empty page. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(``)) + })) + defer srv.Close() + + old := httpClient + httpClient = srv.Client() + defer func() { httpClient = old }() + + b := &BitcoinTalkCollector{TopicID: "12345"} + posts, err := b.fetchPage(context.Background(), srv.URL) + require.NoError(t, err) + assert.Empty(t, posts) +} + +// rewriteTransport rewrites all request URLs to point at the test server. +type rewriteTransport struct { + base http.RoundTripper + target string +} + +func (t *rewriteTransport) RoundTrip(req *http.Request) (*http.Response, error) { + req = req.Clone(req.Context()) + req.URL.Scheme = "http" + req.URL.Host = t.target[len("http://"):] + base := t.base + if base == nil { + base = http.DefaultTransport + } + return base.RoundTrip(req) +} diff --git a/collect/excavate_extra_test.go b/collect/excavate_extra_test.go new file mode 100644 index 0000000..6620983 --- /dev/null +++ b/collect/excavate_extra_test.go @@ -0,0 +1,127 @@ +package collect + +import ( + "context" + "testing" + "time" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestExcavator_Run_Good_ResumeSkipsCompleted(t *testing.T) { + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + // Pre-populate state so source-a looks completed. + cfg.State.Set("source-a", &StateEntry{ + Source: "source-a", + LastRun: time.Now().Add(-1 * time.Hour), + Items: 10, + }) + + c1 := &mockCollector{name: "source-a", items: 10} + c2 := &mockCollector{name: "source-b", items: 5} + + e := &Excavator{ + Collectors: []Collector{c1, c2}, + Resume: true, + } + + result, err := e.Run(context.Background(), cfg) + + require.NoError(t, err) + assert.False(t, c1.called, "source-a should be skipped (already completed)") + assert.True(t, c2.called, "source-b should run") + assert.Equal(t, 5, result.Items) + assert.Equal(t, 1, result.Skipped) +} + +func TestExcavator_Run_Good_ResumeRunsIncomplete(t *testing.T) { + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + // Pre-populate state with 0 items (incomplete). + cfg.State.Set("source-a", &StateEntry{ + Source: "source-a", + LastRun: time.Now(), + Items: 0, + }) + + c1 := &mockCollector{name: "source-a", items: 5} + + e := &Excavator{ + Collectors: []Collector{c1}, + Resume: true, + } + + result, err := e.Run(context.Background(), cfg) + + require.NoError(t, err) + assert.True(t, c1.called, "source-a should run (0 items in previous run)") + assert.Equal(t, 5, result.Items) +} + +func TestExcavator_Run_Good_NilState(t *testing.T) { + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.State = nil + cfg.Limiter = nil + + c1 := &mockCollector{name: "source-a", items: 3} + + e := &Excavator{ + Collectors: []Collector{c1}, + } + + result, err := e.Run(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 3, result.Items) +} + +func TestExcavator_Run_Good_NilDispatcher(t *testing.T) { + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Dispatcher = nil + cfg.Limiter = nil + + c1 := &mockCollector{name: "source-a", items: 2} + + e := &Excavator{ + Collectors: []Collector{c1}, + } + + result, err := e.Run(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) +} + +func TestExcavator_Run_Good_ProgressEvents(t *testing.T) { + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var progressMsgs []string + cfg.Dispatcher.On(EventProgress, func(e Event) { + progressMsgs = append(progressMsgs, e.Message) + }) + + c1 := &mockCollector{name: "source-a", items: 1} + c2 := &mockCollector{name: "source-b", items: 1} + + e := &Excavator{ + Collectors: []Collector{c1, c2}, + } + + _, err := e.Run(context.Background(), cfg) + require.NoError(t, err) + + assert.Len(t, progressMsgs, 2) + assert.Contains(t, progressMsgs[0], "1/2") + assert.Contains(t, progressMsgs[1], "2/2") +} diff --git a/collect/market_extra_test.go b/collect/market_extra_test.go new file mode 100644 index 0000000..c0ec14d --- /dev/null +++ b/collect/market_extra_test.go @@ -0,0 +1,242 @@ +package collect + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestMarketCollector_Collect_Good_HistoricalWithFromDate(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + w.Header().Set("Content-Type", "application/json") + + if callCount == 1 { + data := coinData{ + ID: "lethean", + Symbol: "lthn", + Name: "Lethean", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 0.001}, + }, + } + _ = json.NewEncoder(w).Encode(data) + } else { + // Historical data with FromDate param. + assert.Contains(t, r.URL.RawQuery, "days=") + data := historicalData{ + Prices: [][]float64{{1705305600000, 0.001}}, + MarketCaps: [][]float64{{1705305600000, 10000}}, + TotalVolumes: [][]float64{{1705305600000, 500}}, + } + _ = json.NewEncoder(w).Encode(data) + } + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + mc := &MarketCollector{CoinID: "lethean", Historical: true, FromDate: "2025-01-01"} + result, err := mc.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 3, result.Items) +} + +func TestMarketCollector_Collect_Good_HistoricalInvalidDate(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + w.Header().Set("Content-Type", "application/json") + + if callCount == 1 { + data := coinData{ + ID: "test", + Symbol: "tst", + Name: "Test", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 1.0}, + }, + } + _ = json.NewEncoder(w).Encode(data) + } else { + // Should fall back to 365 days with invalid date. + assert.Contains(t, r.URL.RawQuery, "days=365") + data := historicalData{ + Prices: [][]float64{{1705305600000, 1.0}}, + } + _ = json.NewEncoder(w).Encode(data) + } + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + mc := &MarketCollector{CoinID: "test", Historical: true, FromDate: "not-a-date"} + result, err := mc.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 3, result.Items) +} + +func TestMarketCollector_Collect_Bad_HistoricalServerError(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + w.Header().Set("Content-Type", "application/json") + + if callCount == 1 { + data := coinData{ + ID: "test", + Symbol: "tst", + Name: "Test", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 1.0}, + }, + } + _ = json.NewEncoder(w).Encode(data) + } else { + // Historical endpoint fails. + w.WriteHeader(http.StatusTooManyRequests) + } + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + mc := &MarketCollector{CoinID: "test", Historical: true} + result, err := mc.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) // current.json + summary.md + assert.Equal(t, 1, result.Errors) // historical failed +} + +func TestMarketCollector_Collect_Good_EmitsEvents(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + data := coinData{ + ID: "bitcoin", + Symbol: "btc", + Name: "Bitcoin", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 50000}, + }, + } + _ = json.NewEncoder(w).Encode(data) + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var starts, completes int + cfg.Dispatcher.On(EventStart, func(e Event) { starts++ }) + cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ }) + + mc := &MarketCollector{CoinID: "bitcoin"} + _, err := mc.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 1, starts) + assert.Equal(t, 1, completes) +} + +func TestMarketCollector_Collect_Good_CancelledContext(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + mc := &MarketCollector{CoinID: "bitcoin"} + result, err := mc.Collect(ctx, cfg) + + // Context cancellation causes error in fetchJSON. + require.NoError(t, err) // outer Collect doesn't return errors from currentData fetch + assert.Equal(t, 1, result.Errors) +} + +func TestFormatMarketSummary_Good_AllFields(t *testing.T) { + data := &coinData{ + Name: "Lethean", + Symbol: "lthn", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 0.001}, + MarketCap: map[string]float64{"usd": 100000}, + TotalVolume: map[string]float64{"usd": 5000}, + High24h: map[string]float64{"usd": 0.0015}, + Low24h: map[string]float64{"usd": 0.0005}, + PriceChange24h: 0.0002, + PriceChangePct24h: 5.5, + MarketCapRank: 500, + CirculatingSupply: 1000000000, + TotalSupply: 2000000000, + LastUpdated: "2025-01-15T12:00:00Z", + }, + } + + summary := FormatMarketSummary(data) + + assert.Contains(t, summary, "# Lethean (LTHN)") + assert.Contains(t, summary, "24h Volume") + assert.Contains(t, summary, "24h High") + assert.Contains(t, summary, "24h Low") + assert.Contains(t, summary, "24h Price Change") + assert.Contains(t, summary, "#500") + assert.Contains(t, summary, "Circulating Supply") + assert.Contains(t, summary, "Total Supply") + assert.Contains(t, summary, "Last updated") +} + +func TestFormatMarketSummary_Good_Minimal(t *testing.T) { + data := &coinData{ + Name: "Unknown", + Symbol: "ukn", + } + + summary := FormatMarketSummary(data) + assert.Contains(t, summary, "# Unknown (UKN)") + // No price data, so these should be absent. + assert.NotContains(t, summary, "Market Cap Rank") +} diff --git a/collect/papers_http_test.go b/collect/papers_http_test.go new file mode 100644 index 0000000..4ebf256 --- /dev/null +++ b/collect/papers_http_test.go @@ -0,0 +1,313 @@ +package collect + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/net/html" +) + +const sampleIACRHTML = ` +
+ Zero-Knowledge Proofs + Alice + Bob + 2025-01-15 +

We present a novel construction for zero-knowledge proofs.

+
+
+ Lattice Cryptography + Charlie + 2025-01-20 +

A survey of lattice-based cryptography.

+
+` + +const sampleArXivXML = ` + + + http://arxiv.org/abs/2501.12345v1 + Ring Signatures Revisited + We propose an efficient ring signature scheme. + 2025-01-10T00:00:00Z + Alice + David + + + + http://arxiv.org/abs/2501.67890v1 + Post-Quantum Signatures + A new approach to post-quantum digital signatures. + 2025-01-12T00:00:00Z + Eve + + +` + +func TestPapersCollector_CollectIACR_Good(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleIACRHTML)) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceIACR, Query: "zero knowledge"} + result, err := p.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Len(t, result.Files, 2) + + // Verify content was written. + content, err := m.Read("/output/papers/iacr/2025-001.md") + require.NoError(t, err) + assert.Contains(t, content, "Zero-Knowledge Proofs") +} + +func TestPapersCollector_CollectArXiv_Good(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(sampleArXivXML)) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceArXiv, Query: "ring signatures"} + result, err := p.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Len(t, result.Files, 2) + + // Verify one of the papers. + content, err := m.Read("/output/papers/arxiv/2501.12345v1.md") + require.NoError(t, err) + assert.Contains(t, content, "Ring Signatures Revisited") + assert.Contains(t, content, "Alice") +} + +func TestPapersCollector_CollectArXiv_Good_WithCategory(t *testing.T) { + var capturedQuery string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedQuery = r.URL.RawQuery + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(sampleArXivXML)) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceArXiv, Query: "crypto", Category: "cs.CR"} + _, err := p.Collect(context.Background(), cfg) + require.NoError(t, err) + assert.Contains(t, capturedQuery, "cat") +} + +func TestPapersCollector_CollectAll_Good(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + if callCount == 1 { + // First call is IACR + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleIACRHTML)) + } else { + // Second call is arXiv + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(sampleArXivXML)) + } + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceAll, Query: "cryptography"} + result, err := p.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 4, result.Items) // 2 IACR + 2 arXiv +} + +func TestPapersCollector_CollectIACR_Bad_ServerError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceIACR, Query: "test"} + _, err := p.Collect(context.Background(), cfg) + assert.Error(t, err) +} + +func TestPapersCollector_CollectArXiv_Bad_ServerError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceArXiv, Query: "test"} + _, err := p.Collect(context.Background(), cfg) + assert.Error(t, err) +} + +func TestPapersCollector_CollectArXiv_Bad_InvalidXML(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(`not xml at all`)) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceArXiv, Query: "test"} + _, err := p.Collect(context.Background(), cfg) + assert.Error(t, err) +} + +func TestPapersCollector_CollectAll_Bad_BothFail(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceAll, Query: "test"} + _, err := p.Collect(context.Background(), cfg) + assert.Error(t, err) +} + +func TestPapersCollector_CollectAll_Good_OneFails(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + if callCount == 1 { + // IACR fails + w.WriteHeader(http.StatusInternalServerError) + } else { + // ArXiv succeeds + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(sampleArXivXML)) + } + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceAll, Query: "test"} + result, err := p.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Equal(t, 1, result.Errors) // IACR failure counted +} + +func TestExtractIACRPapers_Good(t *testing.T) { + doc, err := html.Parse(strings.NewReader(sampleIACRHTML)) + require.NoError(t, err) + + papers := extractIACRPapers(doc) + assert.Len(t, papers, 2) + + assert.Equal(t, "Zero-Knowledge Proofs", papers[0].Title) + assert.Contains(t, papers[0].Authors, "Alice") + assert.Contains(t, papers[0].Authors, "Bob") + assert.Equal(t, "2025-01-15", papers[0].Date) + assert.Contains(t, papers[0].Abstract, "zero-knowledge proofs") + assert.Equal(t, "iacr", papers[0].Source) + + assert.Equal(t, "Lattice Cryptography", papers[1].Title) +} + +func TestExtractIACRPapers_Good_Empty(t *testing.T) { + doc, err := html.Parse(strings.NewReader(``)) + require.NoError(t, err) + + papers := extractIACRPapers(doc) + assert.Empty(t, papers) +} + +func TestExtractIACRPapers_Good_NoTitle(t *testing.T) { + doc, err := html.Parse(strings.NewReader(`
`)) + require.NoError(t, err) + + papers := extractIACRPapers(doc) + // Entry with no title should be excluded by the Title check. + assert.Empty(t, papers) +} diff --git a/collect/process_extra_test.go b/collect/process_extra_test.go new file mode 100644 index 0000000..1b42680 --- /dev/null +++ b/collect/process_extra_test.go @@ -0,0 +1,193 @@ +package collect + +import ( + "context" + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestHTMLToMarkdown_Good_OrderedList(t *testing.T) { + input := `
  1. First
  2. Second
  3. Third
` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "1. First") + assert.Contains(t, result, "2. Second") + assert.Contains(t, result, "3. Third") +} + +func TestHTMLToMarkdown_Good_UnorderedList(t *testing.T) { + input := `` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "- Alpha") + assert.Contains(t, result, "- Beta") +} + +func TestHTMLToMarkdown_Good_Blockquote(t *testing.T) { + input := `
A wise quote
` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "> A wise quote") +} + +func TestHTMLToMarkdown_Good_HorizontalRule(t *testing.T) { + input := `

Before


After

` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "---") +} + +func TestHTMLToMarkdown_Good_LinkWithoutHref(t *testing.T) { + input := `bare link text` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "bare link text") + assert.NotContains(t, result, "[") +} + +func TestHTMLToMarkdown_Good_H4H5H6(t *testing.T) { + input := `

H4

H5
H6
` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "#### H4") + assert.Contains(t, result, "##### H5") + assert.Contains(t, result, "###### H6") +} + +func TestHTMLToMarkdown_Good_StripsStyle(t *testing.T) { + input := `

Clean

` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "Clean") + assert.NotContains(t, result, "color") +} + +func TestHTMLToMarkdown_Good_LineBreak(t *testing.T) { + input := `

Line one
Line two

` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "Line one") + assert.Contains(t, result, "Line two") +} + +func TestHTMLToMarkdown_Good_NestedBoldItalic(t *testing.T) { + input := `bold text and italic text` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "**bold text**") + assert.Contains(t, result, "*italic text*") +} + +func TestJSONToMarkdown_Good_NestedObject(t *testing.T) { + input := `{"outer": {"inner_key": "inner_value"}}` + result, err := JSONToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "**outer:**") + assert.Contains(t, result, "**inner_key:** inner_value") +} + +func TestJSONToMarkdown_Good_NestedArray(t *testing.T) { + input := `[["a", "b"], ["c"]]` + result, err := JSONToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "# Data") + assert.Contains(t, result, "a") + assert.Contains(t, result, "b") +} + +func TestJSONToMarkdown_Good_ScalarValue(t *testing.T) { + input := `42` + result, err := JSONToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "42") +} + +func TestJSONToMarkdown_Good_ArrayOfObjects(t *testing.T) { + input := `[{"name": "Alice"}, {"name": "Bob"}]` + result, err := JSONToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "Item 1") + assert.Contains(t, result, "Alice") + assert.Contains(t, result, "Item 2") + assert.Contains(t, result, "Bob") +} + +func TestProcessor_Process_Good_CancelledContext(t *testing.T) { + m := io.NewMockMedium() + m.Dirs["/input"] = true + m.Files["/input/file.html"] = `

Test

` + + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + p := &Processor{Source: "test", Dir: "/input"} + _, err := p.Process(ctx, cfg) + assert.Error(t, err) +} + +func TestProcessor_Process_Good_EmitsEvents(t *testing.T) { + m := io.NewMockMedium() + m.Dirs["/input"] = true + m.Files["/input/a.html"] = `

Title

` + m.Files["/input/b.json"] = `{"key": "value"}` + + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var starts, items, completes int + cfg.Dispatcher.On(EventStart, func(e Event) { starts++ }) + cfg.Dispatcher.On(EventItem, func(e Event) { items++ }) + cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ }) + + p := &Processor{Source: "test", Dir: "/input"} + result, err := p.Process(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Equal(t, 1, starts) + assert.Equal(t, 2, items) + assert.Equal(t, 1, completes) +} + +func TestProcessor_Process_Good_BadHTML(t *testing.T) { + m := io.NewMockMedium() + m.Dirs["/input"] = true + // html.Parse is very tolerant, so even bad HTML will parse. But we test + // that the pipeline handles it gracefully. + m.Files["/input/bad.html"] = `

Still valid enough

` + + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &Processor{Source: "test", Dir: "/input"} + result, err := p.Process(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 1, result.Items) +} + +func TestProcessor_Process_Good_BadJSON(t *testing.T) { + m := io.NewMockMedium() + m.Dirs["/input"] = true + m.Files["/input/bad.json"] = `not valid json` + + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var errors int + cfg.Dispatcher.On(EventError, func(e Event) { errors++ }) + + p := &Processor{Source: "test", Dir: "/input"} + result, err := p.Process(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 0, result.Items) + assert.Equal(t, 1, result.Errors) + assert.Equal(t, 1, errors) +} diff --git a/collect/state_extra_test.go b/collect/state_extra_test.go new file mode 100644 index 0000000..0bdce71 --- /dev/null +++ b/collect/state_extra_test.go @@ -0,0 +1,76 @@ +package collect + +import ( + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestState_Get_Good_ReturnsCopy(t *testing.T) { + m := io.NewMockMedium() + s := NewState(m, "/state.json") + + s.Set("test", &StateEntry{Source: "test", Items: 5}) + + // Get returns a copy, so mutating it shouldn't affect internal state. + got, ok := s.Get("test") + require.True(t, ok) + got.Items = 999 + + again, ok := s.Get("test") + require.True(t, ok) + assert.Equal(t, 5, again.Items, "internal state should not be mutated") +} + +func TestState_Save_Good_WritesJSON(t *testing.T) { + m := io.NewMockMedium() + s := NewState(m, "/data/state.json") + + s.Set("src-a", &StateEntry{Source: "src-a", Items: 10, LastID: "abc"}) + + err := s.Save() + require.NoError(t, err) + + // Verify the raw JSON was written. + content, err := m.Read("/data/state.json") + require.NoError(t, err) + assert.Contains(t, content, `"src-a"`) + assert.Contains(t, content, `"abc"`) +} + +func TestState_Load_Good_NullJSON(t *testing.T) { + m := io.NewMockMedium() + m.Files["/state.json"] = "null" + + s := NewState(m, "/state.json") + err := s.Load() + require.NoError(t, err) + + // Null JSON should result in empty entries. + _, ok := s.Get("anything") + assert.False(t, ok) +} + +func TestState_SaveLoad_Good_WithCursor(t *testing.T) { + m := io.NewMockMedium() + s := NewState(m, "/state.json") + + s.Set("paginated", &StateEntry{ + Source: "paginated", + Items: 50, + Cursor: "page_token_abc123", + }) + + err := s.Save() + require.NoError(t, err) + + s2 := NewState(m, "/state.json") + err = s2.Load() + require.NoError(t, err) + + entry, ok := s2.Get("paginated") + require.True(t, ok) + assert.Equal(t, "page_token_abc123", entry.Cursor) +}