2026-03-30 00:54:20 +00:00
|
|
|
// SPDX-License-Identifier: EUPL-1.2
|
2026-03-30 00:19:43 +00:00
|
|
|
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
package collect
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
2026-03-29 23:59:48 +00:00
|
|
|
fmt "dappco.re/go/core/scm/internal/ax/fmtx"
|
|
|
|
|
strings "dappco.re/go/core/scm/internal/ax/stringsx"
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
"net/http"
|
|
|
|
|
"net/http/httptest"
|
|
|
|
|
"testing"
|
|
|
|
|
|
2026-03-21 23:54:23 +00:00
|
|
|
"dappco.re/go/core/io"
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
"github.com/stretchr/testify/assert"
|
|
|
|
|
"github.com/stretchr/testify/require"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// sampleBTCTalkPage returns HTML resembling a BitcoinTalk topic page with the
|
|
|
|
|
// given number of posts. If fewer than postsPerPage the caller can infer that
|
|
|
|
|
// it is the last page.
|
|
|
|
|
func sampleBTCTalkPage(count int) string {
|
2026-02-22 21:00:17 +00:00
|
|
|
var page strings.Builder
|
|
|
|
|
page.WriteString(`<html><body>`)
|
|
|
|
|
for i := range count {
|
|
|
|
|
page.WriteString(fmt.Sprintf(`
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
<div class="post">
|
|
|
|
|
<div class="poster_info">user%d</div>
|
|
|
|
|
<div class="headerandpost">
|
|
|
|
|
<div class="smalltext">January %02d, 2009</div>
|
|
|
|
|
</div>
|
|
|
|
|
<div class="inner">Post content number %d.</div>
|
2026-02-22 21:00:17 +00:00
|
|
|
</div>`, i, i+1, i))
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
}
|
2026-02-22 21:00:17 +00:00
|
|
|
page.WriteString(`</body></html>`)
|
|
|
|
|
return page.String()
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestBitcoinTalkCollector_Collect_Good_OnePage_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
// Serve a single page with 5 posts (< 20, so collection stops after one page).
|
|
|
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
w.Header().Set("Content-Type", "text/html")
|
|
|
|
|
_, _ = w.Write([]byte(sampleBTCTalkPage(5)))
|
|
|
|
|
}))
|
|
|
|
|
defer srv.Close()
|
|
|
|
|
|
|
|
|
|
// Override the package-level HTTP client so requests go to our test server.
|
|
|
|
|
oldClient := httpClient
|
|
|
|
|
httpClient = srv.Client()
|
|
|
|
|
defer func() { httpClient = oldClient }()
|
|
|
|
|
|
|
|
|
|
// We also need to redirect the URL that fetchPage constructs.
|
|
|
|
|
// The easiest approach: use SetHTTPClient with a custom transport.
|
|
|
|
|
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
|
|
|
|
httpClient = &http.Client{Transport: transport}
|
|
|
|
|
|
|
|
|
|
m := io.NewMockMedium()
|
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
|
|
|
cfg.Limiter = nil // disable rate limiting for tests
|
|
|
|
|
|
|
|
|
|
b := &BitcoinTalkCollector{TopicID: "12345"}
|
|
|
|
|
result, err := b.Collect(context.Background(), cfg)
|
|
|
|
|
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Equal(t, 5, result.Items)
|
|
|
|
|
assert.Equal(t, 0, result.Errors)
|
|
|
|
|
assert.Len(t, result.Files, 5)
|
|
|
|
|
assert.Equal(t, "bitcointalk:12345", result.Source)
|
|
|
|
|
|
|
|
|
|
// Verify files were written.
|
|
|
|
|
for i := 1; i <= 5; i++ {
|
|
|
|
|
path := fmt.Sprintf("/output/bitcointalk/12345/posts/%d.md", i)
|
|
|
|
|
content, err := m.Read(path)
|
|
|
|
|
require.NoError(t, err, "file %s should exist", path)
|
|
|
|
|
assert.Contains(t, content, fmt.Sprintf("Post %d by", i))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestBitcoinTalkCollector_Collect_Good_PageLimit_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
pageCount := 0
|
|
|
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
pageCount++
|
|
|
|
|
w.Header().Set("Content-Type", "text/html")
|
|
|
|
|
// Return a full page (20 posts) each time so collection would continue
|
|
|
|
|
// indefinitely without a Pages limit.
|
|
|
|
|
_, _ = w.Write([]byte(sampleBTCTalkPage(20)))
|
|
|
|
|
}))
|
|
|
|
|
defer srv.Close()
|
|
|
|
|
|
|
|
|
|
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
|
|
|
|
old := httpClient
|
|
|
|
|
httpClient = &http.Client{Transport: transport}
|
|
|
|
|
defer func() { httpClient = old }()
|
|
|
|
|
|
|
|
|
|
m := io.NewMockMedium()
|
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
|
|
|
cfg.Limiter = nil
|
|
|
|
|
|
|
|
|
|
b := &BitcoinTalkCollector{TopicID: "99999", Pages: 2}
|
|
|
|
|
result, err := b.Collect(context.Background(), cfg)
|
|
|
|
|
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Equal(t, 40, result.Items) // 2 pages * 20 posts
|
|
|
|
|
assert.Equal(t, 2, pageCount)
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestBitcoinTalkCollector_Collect_Good_CancelledContext_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
w.Header().Set("Content-Type", "text/html")
|
|
|
|
|
_, _ = w.Write([]byte(sampleBTCTalkPage(5)))
|
|
|
|
|
}))
|
|
|
|
|
defer srv.Close()
|
|
|
|
|
|
|
|
|
|
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
|
|
|
|
old := httpClient
|
|
|
|
|
httpClient = &http.Client{Transport: transport}
|
|
|
|
|
defer func() { httpClient = old }()
|
|
|
|
|
|
|
|
|
|
m := io.NewMockMedium()
|
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
|
|
|
cfg.Limiter = nil
|
|
|
|
|
|
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
|
cancel() // Cancel immediately.
|
|
|
|
|
|
|
|
|
|
b := &BitcoinTalkCollector{TopicID: "12345"}
|
|
|
|
|
_, err := b.Collect(ctx, cfg)
|
|
|
|
|
assert.Error(t, err)
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestBitcoinTalkCollector_Collect_Bad_ServerError_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
w.WriteHeader(http.StatusInternalServerError)
|
|
|
|
|
}))
|
|
|
|
|
defer srv.Close()
|
|
|
|
|
|
|
|
|
|
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
|
|
|
|
old := httpClient
|
|
|
|
|
httpClient = &http.Client{Transport: transport}
|
|
|
|
|
defer func() { httpClient = old }()
|
|
|
|
|
|
|
|
|
|
m := io.NewMockMedium()
|
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
|
|
|
cfg.Limiter = nil
|
|
|
|
|
|
|
|
|
|
b := &BitcoinTalkCollector{TopicID: "12345"}
|
|
|
|
|
result, err := b.Collect(context.Background(), cfg)
|
|
|
|
|
|
|
|
|
|
// fetchPage error causes break with Errors incremented.
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Equal(t, 0, result.Items)
|
|
|
|
|
assert.Equal(t, 1, result.Errors)
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestBitcoinTalkCollector_Collect_Good_EmitsEvents_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
w.Header().Set("Content-Type", "text/html")
|
|
|
|
|
_, _ = w.Write([]byte(sampleBTCTalkPage(2)))
|
|
|
|
|
}))
|
|
|
|
|
defer srv.Close()
|
|
|
|
|
|
|
|
|
|
transport := &rewriteTransport{base: srv.Client().Transport, target: srv.URL}
|
|
|
|
|
old := httpClient
|
|
|
|
|
httpClient = &http.Client{Transport: transport}
|
|
|
|
|
defer func() { httpClient = old }()
|
|
|
|
|
|
|
|
|
|
m := io.NewMockMedium()
|
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
|
|
|
cfg.Limiter = nil
|
|
|
|
|
|
|
|
|
|
var starts, items, completes int
|
|
|
|
|
cfg.Dispatcher.On(EventStart, func(e Event) { starts++ })
|
|
|
|
|
cfg.Dispatcher.On(EventItem, func(e Event) { items++ })
|
|
|
|
|
cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ })
|
|
|
|
|
|
|
|
|
|
b := &BitcoinTalkCollector{TopicID: "12345"}
|
|
|
|
|
result, err := b.Collect(context.Background(), cfg)
|
|
|
|
|
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Equal(t, 2, result.Items)
|
|
|
|
|
assert.Equal(t, 1, starts)
|
|
|
|
|
assert.Equal(t, 2, items)
|
|
|
|
|
assert.Equal(t, 1, completes)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func TestSetHTTPClient_Good(t *testing.T) {
|
|
|
|
|
old := httpClient
|
|
|
|
|
defer func() { httpClient = old }()
|
|
|
|
|
|
|
|
|
|
custom := &http.Client{}
|
|
|
|
|
SetHTTPClient(custom)
|
|
|
|
|
assert.Equal(t, custom, httpClient)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func TestFetchPage_Good(t *testing.T) {
|
|
|
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
w.Header().Set("Content-Type", "text/html")
|
|
|
|
|
_, _ = w.Write([]byte(sampleBTCTalkPage(3)))
|
|
|
|
|
}))
|
|
|
|
|
defer srv.Close()
|
|
|
|
|
|
|
|
|
|
old := httpClient
|
|
|
|
|
httpClient = srv.Client()
|
|
|
|
|
defer func() { httpClient = old }()
|
|
|
|
|
|
|
|
|
|
b := &BitcoinTalkCollector{TopicID: "12345"}
|
|
|
|
|
posts, err := b.fetchPage(context.Background(), srv.URL)
|
|
|
|
|
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Len(t, posts, 3)
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestFetchPage_Bad_StatusCode_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
w.WriteHeader(http.StatusForbidden)
|
|
|
|
|
}))
|
|
|
|
|
defer srv.Close()
|
|
|
|
|
|
|
|
|
|
old := httpClient
|
|
|
|
|
httpClient = srv.Client()
|
|
|
|
|
defer func() { httpClient = old }()
|
|
|
|
|
|
|
|
|
|
b := &BitcoinTalkCollector{TopicID: "12345"}
|
|
|
|
|
_, err := b.fetchPage(context.Background(), srv.URL)
|
|
|
|
|
assert.Error(t, err)
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestFetchPage_Bad_InvalidHTML_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
// html.Parse is very forgiving, so serve an empty page.
|
|
|
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
|
|
|
w.Header().Set("Content-Type", "text/html")
|
|
|
|
|
_, _ = w.Write([]byte(`<html><body></body></html>`))
|
|
|
|
|
}))
|
|
|
|
|
defer srv.Close()
|
|
|
|
|
|
|
|
|
|
old := httpClient
|
|
|
|
|
httpClient = srv.Client()
|
|
|
|
|
defer func() { httpClient = old }()
|
|
|
|
|
|
|
|
|
|
b := &BitcoinTalkCollector{TopicID: "12345"}
|
|
|
|
|
posts, err := b.fetchPage(context.Background(), srv.URL)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Empty(t, posts)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// rewriteTransport rewrites all request URLs to point at the test server.
|
|
|
|
|
type rewriteTransport struct {
|
|
|
|
|
base http.RoundTripper
|
|
|
|
|
target string
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (t *rewriteTransport) RoundTrip(req *http.Request) (*http.Response, error) {
|
|
|
|
|
req = req.Clone(req.Context())
|
|
|
|
|
req.URL.Scheme = "http"
|
|
|
|
|
req.URL.Host = t.target[len("http://"):]
|
|
|
|
|
base := t.base
|
|
|
|
|
if base == nil {
|
|
|
|
|
base = http.DefaultTransport
|
|
|
|
|
}
|
|
|
|
|
return base.RoundTrip(req)
|
|
|
|
|
}
|