diff --git a/collect/bitcointalk_http_test.go b/collect/bitcointalk_http_test.go new file mode 100644 index 0000000..f045ebe --- /dev/null +++ b/collect/bitcointalk_http_test.go @@ -0,0 +1,256 @@ +package collect + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// sampleBTCTalkPage returns HTML resembling a BitcoinTalk topic page with the +// given number of posts. If fewer than postsPerPage the caller can infer that +// it is the last page. +func sampleBTCTalkPage(count int) string { + page := `` + for i := 0; i < count; i++ { + page += fmt.Sprintf(` +
+
user%d
+
+
January %02d, 2009
+
+
Post content number %d.
+
`, i, i+1, i) + } + page += `` + return page +} + +func TestBitcoinTalkCollector_Collect_Good_OnePage(t *testing.T) { + // Serve a single page with 5 posts (< 20, so collection stops after one page). + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleBTCTalkPage(5))) + })) + defer srv.Close() + + // Override the package-level HTTP client so requests go to our test server. + oldClient := httpClient + httpClient = srv.Client() + defer func() { httpClient = oldClient }() + + // We also need to redirect the URL that fetchPage constructs. + // The easiest approach: use SetHTTPClient with a custom transport. + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + httpClient = &http.Client{Transport: transport} + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil // disable rate limiting for tests + + b := &BitcoinTalkCollector{TopicID: "12345"} + result, err := b.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 5, result.Items) + assert.Equal(t, 0, result.Errors) + assert.Len(t, result.Files, 5) + assert.Equal(t, "bitcointalk:12345", result.Source) + + // Verify files were written. + for i := 1; i <= 5; i++ { + path := fmt.Sprintf("/output/bitcointalk/12345/posts/%d.md", i) + content, err := m.Read(path) + require.NoError(t, err, "file %s should exist", path) + assert.Contains(t, content, fmt.Sprintf("Post %d by", i)) + } +} + +func TestBitcoinTalkCollector_Collect_Good_PageLimit(t *testing.T) { + pageCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + pageCount++ + w.Header().Set("Content-Type", "text/html") + // Return a full page (20 posts) each time so collection would continue + // indefinitely without a Pages limit. 
+ _, _ = w.Write([]byte(sampleBTCTalkPage(20))) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + b := &BitcoinTalkCollector{TopicID: "99999", Pages: 2} + result, err := b.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 40, result.Items) // 2 pages * 20 posts + assert.Equal(t, 2, pageCount) +} + +func TestBitcoinTalkCollector_Collect_Good_CancelledContext(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleBTCTalkPage(5))) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel immediately. 
+ + b := &BitcoinTalkCollector{TopicID: "12345"} + _, err := b.Collect(ctx, cfg) + assert.Error(t, err) +} + +func TestBitcoinTalkCollector_Collect_Bad_ServerError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + b := &BitcoinTalkCollector{TopicID: "12345"} + result, err := b.Collect(context.Background(), cfg) + + // fetchPage error causes break with Errors incremented. + require.NoError(t, err) + assert.Equal(t, 0, result.Items) + assert.Equal(t, 1, result.Errors) +} + +func TestBitcoinTalkCollector_Collect_Good_EmitsEvents(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleBTCTalkPage(2))) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var starts, items, completes int + cfg.Dispatcher.On(EventStart, func(e Event) { starts++ }) + cfg.Dispatcher.On(EventItem, func(e Event) { items++ }) + cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ }) + + b := &BitcoinTalkCollector{TopicID: "12345"} + result, err := b.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Equal(t, 1, starts) + assert.Equal(t, 2, items) + assert.Equal(t, 1, completes) +} + +func TestSetHTTPClient_Good(t *testing.T) { + old := 
httpClient + defer func() { httpClient = old }() + + custom := &http.Client{} + SetHTTPClient(custom) + assert.Equal(t, custom, httpClient) +} + +func TestFetchPage_Good(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleBTCTalkPage(3))) + })) + defer srv.Close() + + old := httpClient + httpClient = srv.Client() + defer func() { httpClient = old }() + + b := &BitcoinTalkCollector{TopicID: "12345"} + posts, err := b.fetchPage(context.Background(), srv.URL) + + require.NoError(t, err) + assert.Len(t, posts, 3) +} + +func TestFetchPage_Bad_StatusCode(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusForbidden) + })) + defer srv.Close() + + old := httpClient + httpClient = srv.Client() + defer func() { httpClient = old }() + + b := &BitcoinTalkCollector{TopicID: "12345"} + _, err := b.fetchPage(context.Background(), srv.URL) + assert.Error(t, err) +} + +func TestFetchPage_Bad_InvalidHTML(t *testing.T) { + // html.Parse is very forgiving, so serve an empty page. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(``)) + })) + defer srv.Close() + + old := httpClient + httpClient = srv.Client() + defer func() { httpClient = old }() + + b := &BitcoinTalkCollector{TopicID: "12345"} + posts, err := b.fetchPage(context.Background(), srv.URL) + require.NoError(t, err) + assert.Empty(t, posts) +} + +// rewriteTransport rewrites all request URLs to point at the test server. 
// rewriteTransport is an http.RoundTripper that sends every request to a
// single test server, whatever host the caller originally addressed.
type rewriteTransport struct {
	// base performs the real round trip; http.DefaultTransport is used when nil.
	base http.RoundTripper
	// target is the test server address, normally httptest.Server.URL
	// ("http://127.0.0.1:port"). A bare "host:port" is accepted as well.
	target string
}

// RoundTrip sends a clone of req to the target host over plain HTTP and
// returns the base transport's response. Only the scheme and host of the
// clone's URL are rewritten, so the path and query reach the test server
// unchanged and req itself is left untouched.
//
// An error is returned, rather than a panic, when target is empty or holds
// nothing but the "http://" prefix.
func (t *rewriteTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	const scheme = "http://"

	// Strip the scheme only when it is actually present: slicing
	// unconditionally panics on a target shorter than the prefix and chops
	// the front off a bare "host:port".
	host := t.target
	if len(host) >= len(scheme) && host[:len(scheme)] == scheme {
		host = host[len(scheme):]
	}
	if host == "" {
		return nil, fmt.Errorf("rewriteTransport: target %q has no host", t.target)
	}

	out := req.Clone(req.Context())
	out.URL.Scheme = "http"
	out.URL.Host = host

	base := t.base
	if base == nil {
		base = http.DefaultTransport
	}
	return base.RoundTrip(out)
}
+ cfg.State.Set("source-a", &StateEntry{ + Source: "source-a", + LastRun: time.Now(), + Items: 0, + }) + + c1 := &mockCollector{name: "source-a", items: 5} + + e := &Excavator{ + Collectors: []Collector{c1}, + Resume: true, + } + + result, err := e.Run(context.Background(), cfg) + + require.NoError(t, err) + assert.True(t, c1.called, "source-a should run (0 items in previous run)") + assert.Equal(t, 5, result.Items) +} + +func TestExcavator_Run_Good_NilState(t *testing.T) { + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.State = nil + cfg.Limiter = nil + + c1 := &mockCollector{name: "source-a", items: 3} + + e := &Excavator{ + Collectors: []Collector{c1}, + } + + result, err := e.Run(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 3, result.Items) +} + +func TestExcavator_Run_Good_NilDispatcher(t *testing.T) { + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Dispatcher = nil + cfg.Limiter = nil + + c1 := &mockCollector{name: "source-a", items: 2} + + e := &Excavator{ + Collectors: []Collector{c1}, + } + + result, err := e.Run(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) +} + +func TestExcavator_Run_Good_ProgressEvents(t *testing.T) { + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var progressMsgs []string + cfg.Dispatcher.On(EventProgress, func(e Event) { + progressMsgs = append(progressMsgs, e.Message) + }) + + c1 := &mockCollector{name: "source-a", items: 1} + c2 := &mockCollector{name: "source-b", items: 1} + + e := &Excavator{ + Collectors: []Collector{c1, c2}, + } + + _, err := e.Run(context.Background(), cfg) + require.NoError(t, err) + + assert.Len(t, progressMsgs, 2) + assert.Contains(t, progressMsgs[0], "1/2") + assert.Contains(t, progressMsgs[1], "2/2") +} diff --git a/collect/market_extra_test.go b/collect/market_extra_test.go new file mode 100644 index 0000000..c0ec14d --- /dev/null +++ 
b/collect/market_extra_test.go @@ -0,0 +1,242 @@ +package collect + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestMarketCollector_Collect_Good_HistoricalWithFromDate(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + w.Header().Set("Content-Type", "application/json") + + if callCount == 1 { + data := coinData{ + ID: "lethean", + Symbol: "lthn", + Name: "Lethean", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 0.001}, + }, + } + _ = json.NewEncoder(w).Encode(data) + } else { + // Historical data with FromDate param. + assert.Contains(t, r.URL.RawQuery, "days=") + data := historicalData{ + Prices: [][]float64{{1705305600000, 0.001}}, + MarketCaps: [][]float64{{1705305600000, 10000}}, + TotalVolumes: [][]float64{{1705305600000, 500}}, + } + _ = json.NewEncoder(w).Encode(data) + } + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + mc := &MarketCollector{CoinID: "lethean", Historical: true, FromDate: "2025-01-01"} + result, err := mc.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 3, result.Items) +} + +func TestMarketCollector_Collect_Good_HistoricalInvalidDate(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + w.Header().Set("Content-Type", "application/json") + + if callCount == 1 { + data := coinData{ + ID: "test", + Symbol: "tst", + Name: "Test", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 1.0}, + }, + } + _ = json.NewEncoder(w).Encode(data) + } else { + // Should fall 
back to 365 days with invalid date. + assert.Contains(t, r.URL.RawQuery, "days=365") + data := historicalData{ + Prices: [][]float64{{1705305600000, 1.0}}, + } + _ = json.NewEncoder(w).Encode(data) + } + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + mc := &MarketCollector{CoinID: "test", Historical: true, FromDate: "not-a-date"} + result, err := mc.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 3, result.Items) +} + +func TestMarketCollector_Collect_Bad_HistoricalServerError(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + w.Header().Set("Content-Type", "application/json") + + if callCount == 1 { + data := coinData{ + ID: "test", + Symbol: "tst", + Name: "Test", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 1.0}, + }, + } + _ = json.NewEncoder(w).Encode(data) + } else { + // Historical endpoint fails. 
+ w.WriteHeader(http.StatusTooManyRequests) + } + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + mc := &MarketCollector{CoinID: "test", Historical: true} + result, err := mc.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) // current.json + summary.md + assert.Equal(t, 1, result.Errors) // historical failed +} + +func TestMarketCollector_Collect_Good_EmitsEvents(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + data := coinData{ + ID: "bitcoin", + Symbol: "btc", + Name: "Bitcoin", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 50000}, + }, + } + _ = json.NewEncoder(w).Encode(data) + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var starts, completes int + cfg.Dispatcher.On(EventStart, func(e Event) { starts++ }) + cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ }) + + mc := &MarketCollector{CoinID: "bitcoin"} + _, err := mc.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 1, starts) + assert.Equal(t, 1, completes) +} + +func TestMarketCollector_Collect_Good_CancelledContext(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + oldURL := coinGeckoBaseURL + coinGeckoBaseURL = srv.URL + defer func() { coinGeckoBaseURL = oldURL }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + ctx, cancel := context.WithCancel(context.Background()) 
+ cancel() + + mc := &MarketCollector{CoinID: "bitcoin"} + result, err := mc.Collect(ctx, cfg) + + // Context cancellation causes error in fetchJSON. + require.NoError(t, err) // outer Collect doesn't return errors from currentData fetch + assert.Equal(t, 1, result.Errors) +} + +func TestFormatMarketSummary_Good_AllFields(t *testing.T) { + data := &coinData{ + Name: "Lethean", + Symbol: "lthn", + MarketData: marketData{ + CurrentPrice: map[string]float64{"usd": 0.001}, + MarketCap: map[string]float64{"usd": 100000}, + TotalVolume: map[string]float64{"usd": 5000}, + High24h: map[string]float64{"usd": 0.0015}, + Low24h: map[string]float64{"usd": 0.0005}, + PriceChange24h: 0.0002, + PriceChangePct24h: 5.5, + MarketCapRank: 500, + CirculatingSupply: 1000000000, + TotalSupply: 2000000000, + LastUpdated: "2025-01-15T12:00:00Z", + }, + } + + summary := FormatMarketSummary(data) + + assert.Contains(t, summary, "# Lethean (LTHN)") + assert.Contains(t, summary, "24h Volume") + assert.Contains(t, summary, "24h High") + assert.Contains(t, summary, "24h Low") + assert.Contains(t, summary, "24h Price Change") + assert.Contains(t, summary, "#500") + assert.Contains(t, summary, "Circulating Supply") + assert.Contains(t, summary, "Total Supply") + assert.Contains(t, summary, "Last updated") +} + +func TestFormatMarketSummary_Good_Minimal(t *testing.T) { + data := &coinData{ + Name: "Unknown", + Symbol: "ukn", + } + + summary := FormatMarketSummary(data) + assert.Contains(t, summary, "# Unknown (UKN)") + // No price data, so these should be absent. 
+ assert.NotContains(t, summary, "Market Cap Rank") +} diff --git a/collect/papers_http_test.go b/collect/papers_http_test.go new file mode 100644 index 0000000..4ebf256 --- /dev/null +++ b/collect/papers_http_test.go @@ -0,0 +1,313 @@ +package collect + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/net/html" +) + +const sampleIACRHTML = ` +
+ Zero-Knowledge Proofs + Alice + Bob + 2025-01-15 +

We present a novel construction for zero-knowledge proofs.

+
+
+ Lattice Cryptography + Charlie + 2025-01-20 +

A survey of lattice-based cryptography.

+
+` + +const sampleArXivXML = ` + + + http://arxiv.org/abs/2501.12345v1 + Ring Signatures Revisited + We propose an efficient ring signature scheme. + 2025-01-10T00:00:00Z + Alice + David + + + + http://arxiv.org/abs/2501.67890v1 + Post-Quantum Signatures + A new approach to post-quantum digital signatures. + 2025-01-12T00:00:00Z + Eve + + +` + +func TestPapersCollector_CollectIACR_Good(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleIACRHTML)) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceIACR, Query: "zero knowledge"} + result, err := p.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Len(t, result.Files, 2) + + // Verify content was written. 
+ content, err := m.Read("/output/papers/iacr/2025-001.md") + require.NoError(t, err) + assert.Contains(t, content, "Zero-Knowledge Proofs") +} + +func TestPapersCollector_CollectArXiv_Good(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(sampleArXivXML)) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceArXiv, Query: "ring signatures"} + result, err := p.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Len(t, result.Files, 2) + + // Verify one of the papers. + content, err := m.Read("/output/papers/arxiv/2501.12345v1.md") + require.NoError(t, err) + assert.Contains(t, content, "Ring Signatures Revisited") + assert.Contains(t, content, "Alice") +} + +func TestPapersCollector_CollectArXiv_Good_WithCategory(t *testing.T) { + var capturedQuery string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedQuery = r.URL.RawQuery + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(sampleArXivXML)) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceArXiv, Query: "crypto", Category: "cs.CR"} + _, err := p.Collect(context.Background(), cfg) + require.NoError(t, err) + assert.Contains(t, capturedQuery, "cat") +} + +func 
TestPapersCollector_CollectAll_Good(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + if callCount == 1 { + // First call is IACR + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(sampleIACRHTML)) + } else { + // Second call is arXiv + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(sampleArXivXML)) + } + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceAll, Query: "cryptography"} + result, err := p.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 4, result.Items) // 2 IACR + 2 arXiv +} + +func TestPapersCollector_CollectIACR_Bad_ServerError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceIACR, Query: "test"} + _, err := p.Collect(context.Background(), cfg) + assert.Error(t, err) +} + +func TestPapersCollector_CollectArXiv_Bad_ServerError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = 
&http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceArXiv, Query: "test"} + _, err := p.Collect(context.Background(), cfg) + assert.Error(t, err) +} + +func TestPapersCollector_CollectArXiv_Bad_InvalidXML(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(`not xml at all`)) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceArXiv, Query: "test"} + _, err := p.Collect(context.Background(), cfg) + assert.Error(t, err) +} + +func TestPapersCollector_CollectAll_Bad_BothFail(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceAll, Query: "test"} + _, err := p.Collect(context.Background(), cfg) + assert.Error(t, err) +} + +func TestPapersCollector_CollectAll_Good_OneFails(t *testing.T) { + callCount := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + callCount++ + if callCount == 1 { + // IACR fails + w.WriteHeader(http.StatusInternalServerError) + } else { + // ArXiv succeeds + 
w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(sampleArXivXML)) + } + })) + defer srv.Close() + + transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL} + old := httpClient + httpClient = &http.Client{Transport: transport} + defer func() { httpClient = old }() + + m := io.NewMockMedium() + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &PapersCollector{Source: PaperSourceAll, Query: "test"} + result, err := p.Collect(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Equal(t, 1, result.Errors) // IACR failure counted +} + +func TestExtractIACRPapers_Good(t *testing.T) { + doc, err := html.Parse(strings.NewReader(sampleIACRHTML)) + require.NoError(t, err) + + papers := extractIACRPapers(doc) + assert.Len(t, papers, 2) + + assert.Equal(t, "Zero-Knowledge Proofs", papers[0].Title) + assert.Contains(t, papers[0].Authors, "Alice") + assert.Contains(t, papers[0].Authors, "Bob") + assert.Equal(t, "2025-01-15", papers[0].Date) + assert.Contains(t, papers[0].Abstract, "zero-knowledge proofs") + assert.Equal(t, "iacr", papers[0].Source) + + assert.Equal(t, "Lattice Cryptography", papers[1].Title) +} + +func TestExtractIACRPapers_Good_Empty(t *testing.T) { + doc, err := html.Parse(strings.NewReader(``)) + require.NoError(t, err) + + papers := extractIACRPapers(doc) + assert.Empty(t, papers) +} + +func TestExtractIACRPapers_Good_NoTitle(t *testing.T) { + doc, err := html.Parse(strings.NewReader(`
`)) + require.NoError(t, err) + + papers := extractIACRPapers(doc) + // Entry with no title should be excluded by the Title check. + assert.Empty(t, papers) +} diff --git a/collect/process_extra_test.go b/collect/process_extra_test.go new file mode 100644 index 0000000..1b42680 --- /dev/null +++ b/collect/process_extra_test.go @@ -0,0 +1,193 @@ +package collect + +import ( + "context" + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestHTMLToMarkdown_Good_OrderedList(t *testing.T) { + input := `
  1. First
  2. Second
  3. Third
` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "1. First") + assert.Contains(t, result, "2. Second") + assert.Contains(t, result, "3. Third") +} + +func TestHTMLToMarkdown_Good_UnorderedList(t *testing.T) { + input := `` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "- Alpha") + assert.Contains(t, result, "- Beta") +} + +func TestHTMLToMarkdown_Good_Blockquote(t *testing.T) { + input := `
A wise quote
` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "> A wise quote") +} + +func TestHTMLToMarkdown_Good_HorizontalRule(t *testing.T) { + input := `

Before


After

` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "---") +} + +func TestHTMLToMarkdown_Good_LinkWithoutHref(t *testing.T) { + input := `bare link text` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "bare link text") + assert.NotContains(t, result, "[") +} + +func TestHTMLToMarkdown_Good_H4H5H6(t *testing.T) { + input := `

H4

H5
H6
` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "#### H4") + assert.Contains(t, result, "##### H5") + assert.Contains(t, result, "###### H6") +} + +func TestHTMLToMarkdown_Good_StripsStyle(t *testing.T) { + input := `

Clean

` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "Clean") + assert.NotContains(t, result, "color") +} + +func TestHTMLToMarkdown_Good_LineBreak(t *testing.T) { + input := `

Line one
Line two

` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "Line one") + assert.Contains(t, result, "Line two") +} + +func TestHTMLToMarkdown_Good_NestedBoldItalic(t *testing.T) { + input := `bold text and italic text` + result, err := HTMLToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "**bold text**") + assert.Contains(t, result, "*italic text*") +} + +func TestJSONToMarkdown_Good_NestedObject(t *testing.T) { + input := `{"outer": {"inner_key": "inner_value"}}` + result, err := JSONToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "**outer:**") + assert.Contains(t, result, "**inner_key:** inner_value") +} + +func TestJSONToMarkdown_Good_NestedArray(t *testing.T) { + input := `[["a", "b"], ["c"]]` + result, err := JSONToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "# Data") + assert.Contains(t, result, "a") + assert.Contains(t, result, "b") +} + +func TestJSONToMarkdown_Good_ScalarValue(t *testing.T) { + input := `42` + result, err := JSONToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "42") +} + +func TestJSONToMarkdown_Good_ArrayOfObjects(t *testing.T) { + input := `[{"name": "Alice"}, {"name": "Bob"}]` + result, err := JSONToMarkdown(input) + require.NoError(t, err) + assert.Contains(t, result, "Item 1") + assert.Contains(t, result, "Alice") + assert.Contains(t, result, "Item 2") + assert.Contains(t, result, "Bob") +} + +func TestProcessor_Process_Good_CancelledContext(t *testing.T) { + m := io.NewMockMedium() + m.Dirs["/input"] = true + m.Files["/input/file.html"] = `

Test

` + + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + p := &Processor{Source: "test", Dir: "/input"} + _, err := p.Process(ctx, cfg) + assert.Error(t, err) +} + +func TestProcessor_Process_Good_EmitsEvents(t *testing.T) { + m := io.NewMockMedium() + m.Dirs["/input"] = true + m.Files["/input/a.html"] = `

Title

` + m.Files["/input/b.json"] = `{"key": "value"}` + + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var starts, items, completes int + cfg.Dispatcher.On(EventStart, func(e Event) { starts++ }) + cfg.Dispatcher.On(EventItem, func(e Event) { items++ }) + cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ }) + + p := &Processor{Source: "test", Dir: "/input"} + result, err := p.Process(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 2, result.Items) + assert.Equal(t, 1, starts) + assert.Equal(t, 2, items) + assert.Equal(t, 1, completes) +} + +func TestProcessor_Process_Good_BadHTML(t *testing.T) { + m := io.NewMockMedium() + m.Dirs["/input"] = true + // html.Parse is very tolerant, so even bad HTML will parse. But we test + // that the pipeline handles it gracefully. + m.Files["/input/bad.html"] = `

Still valid enough

` + + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + p := &Processor{Source: "test", Dir: "/input"} + result, err := p.Process(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 1, result.Items) +} + +func TestProcessor_Process_Good_BadJSON(t *testing.T) { + m := io.NewMockMedium() + m.Dirs["/input"] = true + m.Files["/input/bad.json"] = `not valid json` + + cfg := NewConfigWithMedium(m, "/output") + cfg.Limiter = nil + + var errors int + cfg.Dispatcher.On(EventError, func(e Event) { errors++ }) + + p := &Processor{Source: "test", Dir: "/input"} + result, err := p.Process(context.Background(), cfg) + + require.NoError(t, err) + assert.Equal(t, 0, result.Items) + assert.Equal(t, 1, result.Errors) + assert.Equal(t, 1, errors) +} diff --git a/collect/state_extra_test.go b/collect/state_extra_test.go new file mode 100644 index 0000000..0bdce71 --- /dev/null +++ b/collect/state_extra_test.go @@ -0,0 +1,76 @@ +package collect + +import ( + "testing" + + "forge.lthn.ai/core/go/pkg/io" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestState_Get_Good_ReturnsCopy(t *testing.T) { + m := io.NewMockMedium() + s := NewState(m, "/state.json") + + s.Set("test", &StateEntry{Source: "test", Items: 5}) + + // Get returns a copy, so mutating it shouldn't affect internal state. + got, ok := s.Get("test") + require.True(t, ok) + got.Items = 999 + + again, ok := s.Get("test") + require.True(t, ok) + assert.Equal(t, 5, again.Items, "internal state should not be mutated") +} + +func TestState_Save_Good_WritesJSON(t *testing.T) { + m := io.NewMockMedium() + s := NewState(m, "/data/state.json") + + s.Set("src-a", &StateEntry{Source: "src-a", Items: 10, LastID: "abc"}) + + err := s.Save() + require.NoError(t, err) + + // Verify the raw JSON was written. 
+ content, err := m.Read("/data/state.json") + require.NoError(t, err) + assert.Contains(t, content, `"src-a"`) + assert.Contains(t, content, `"abc"`) +} + +func TestState_Load_Good_NullJSON(t *testing.T) { + m := io.NewMockMedium() + m.Files["/state.json"] = "null" + + s := NewState(m, "/state.json") + err := s.Load() + require.NoError(t, err) + + // Null JSON should result in empty entries. + _, ok := s.Get("anything") + assert.False(t, ok) +} + +func TestState_SaveLoad_Good_WithCursor(t *testing.T) { + m := io.NewMockMedium() + s := NewState(m, "/state.json") + + s.Set("paginated", &StateEntry{ + Source: "paginated", + Items: 50, + Cursor: "page_token_abc123", + }) + + err := s.Save() + require.NoError(t, err) + + s2 := NewState(m, "/state.json") + err = s2.Load() + require.NoError(t, err) + + entry, ok := s2.Get("paginated") + require.True(t, ok) + assert.Equal(t, "page_token_abc123", entry.Cursor) +}