Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server), papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market (historical with FromDate, invalid date, server errors), process (ordered lists, blockquotes, h4-h6, nested objects, cancelled context), excavate (resume skips completed, progress events), and state (copy safety, cursor round-trip, null JSON). Uses httptest.Server with rewriteTransport to intercept external HTTP calls without touching the production code. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
193 lines
5.5 KiB
Go
193 lines
5.5 KiB
Go
package collect
|
|
|
|
import (
|
|
"context"
|
|
"testing"
|
|
|
|
"forge.lthn.ai/core/go/pkg/io"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
func TestHTMLToMarkdown_Good_OrderedList(t *testing.T) {
|
|
input := `<ol><li>First</li><li>Second</li><li>Third</li></ol>`
|
|
result, err := HTMLToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "1. First")
|
|
assert.Contains(t, result, "2. Second")
|
|
assert.Contains(t, result, "3. Third")
|
|
}
|
|
|
|
func TestHTMLToMarkdown_Good_UnorderedList(t *testing.T) {
|
|
input := `<ul><li>Alpha</li><li>Beta</li></ul>`
|
|
result, err := HTMLToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "- Alpha")
|
|
assert.Contains(t, result, "- Beta")
|
|
}
|
|
|
|
func TestHTMLToMarkdown_Good_Blockquote(t *testing.T) {
|
|
input := `<blockquote>A wise quote</blockquote>`
|
|
result, err := HTMLToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "> A wise quote")
|
|
}
|
|
|
|
func TestHTMLToMarkdown_Good_HorizontalRule(t *testing.T) {
|
|
input := `<p>Before</p><hr/><p>After</p>`
|
|
result, err := HTMLToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "---")
|
|
}
|
|
|
|
func TestHTMLToMarkdown_Good_LinkWithoutHref(t *testing.T) {
|
|
input := `<a>bare link text</a>`
|
|
result, err := HTMLToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "bare link text")
|
|
assert.NotContains(t, result, "[")
|
|
}
|
|
|
|
func TestHTMLToMarkdown_Good_H4H5H6(t *testing.T) {
|
|
input := `<h4>H4</h4><h5>H5</h5><h6>H6</h6>`
|
|
result, err := HTMLToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "#### H4")
|
|
assert.Contains(t, result, "##### H5")
|
|
assert.Contains(t, result, "###### H6")
|
|
}
|
|
|
|
func TestHTMLToMarkdown_Good_StripsStyle(t *testing.T) {
|
|
input := `<html><head><style>.foo{color:red}</style></head><body><p>Clean</p></body></html>`
|
|
result, err := HTMLToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "Clean")
|
|
assert.NotContains(t, result, "color")
|
|
}
|
|
|
|
func TestHTMLToMarkdown_Good_LineBreak(t *testing.T) {
|
|
input := `<p>Line one<br/>Line two</p>`
|
|
result, err := HTMLToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "Line one")
|
|
assert.Contains(t, result, "Line two")
|
|
}
|
|
|
|
func TestHTMLToMarkdown_Good_NestedBoldItalic(t *testing.T) {
|
|
input := `<b>bold text</b> and <i>italic text</i>`
|
|
result, err := HTMLToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "**bold text**")
|
|
assert.Contains(t, result, "*italic text*")
|
|
}
|
|
|
|
func TestJSONToMarkdown_Good_NestedObject(t *testing.T) {
|
|
input := `{"outer": {"inner_key": "inner_value"}}`
|
|
result, err := JSONToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "**outer:**")
|
|
assert.Contains(t, result, "**inner_key:** inner_value")
|
|
}
|
|
|
|
func TestJSONToMarkdown_Good_NestedArray(t *testing.T) {
|
|
input := `[["a", "b"], ["c"]]`
|
|
result, err := JSONToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "# Data")
|
|
assert.Contains(t, result, "a")
|
|
assert.Contains(t, result, "b")
|
|
}
|
|
|
|
func TestJSONToMarkdown_Good_ScalarValue(t *testing.T) {
|
|
input := `42`
|
|
result, err := JSONToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "42")
|
|
}
|
|
|
|
func TestJSONToMarkdown_Good_ArrayOfObjects(t *testing.T) {
|
|
input := `[{"name": "Alice"}, {"name": "Bob"}]`
|
|
result, err := JSONToMarkdown(input)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, result, "Item 1")
|
|
assert.Contains(t, result, "Alice")
|
|
assert.Contains(t, result, "Item 2")
|
|
assert.Contains(t, result, "Bob")
|
|
}
|
|
|
|
func TestProcessor_Process_Good_CancelledContext(t *testing.T) {
|
|
m := io.NewMockMedium()
|
|
m.Dirs["/input"] = true
|
|
m.Files["/input/file.html"] = `<h1>Test</h1>`
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
cfg.Limiter = nil
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
cancel()
|
|
|
|
p := &Processor{Source: "test", Dir: "/input"}
|
|
_, err := p.Process(ctx, cfg)
|
|
assert.Error(t, err)
|
|
}
|
|
|
|
func TestProcessor_Process_Good_EmitsEvents(t *testing.T) {
|
|
m := io.NewMockMedium()
|
|
m.Dirs["/input"] = true
|
|
m.Files["/input/a.html"] = `<h1>Title</h1>`
|
|
m.Files["/input/b.json"] = `{"key": "value"}`
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
cfg.Limiter = nil
|
|
|
|
var starts, items, completes int
|
|
cfg.Dispatcher.On(EventStart, func(e Event) { starts++ })
|
|
cfg.Dispatcher.On(EventItem, func(e Event) { items++ })
|
|
cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ })
|
|
|
|
p := &Processor{Source: "test", Dir: "/input"}
|
|
result, err := p.Process(context.Background(), cfg)
|
|
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 2, result.Items)
|
|
assert.Equal(t, 1, starts)
|
|
assert.Equal(t, 2, items)
|
|
assert.Equal(t, 1, completes)
|
|
}
|
|
|
|
func TestProcessor_Process_Good_BadHTML(t *testing.T) {
|
|
m := io.NewMockMedium()
|
|
m.Dirs["/input"] = true
|
|
// html.Parse is very tolerant, so even bad HTML will parse. But we test
|
|
// that the pipeline handles it gracefully.
|
|
m.Files["/input/bad.html"] = `<html><body><p>Still valid enough</p>`
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
cfg.Limiter = nil
|
|
|
|
p := &Processor{Source: "test", Dir: "/input"}
|
|
result, err := p.Process(context.Background(), cfg)
|
|
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, result.Items)
|
|
}
|
|
|
|
func TestProcessor_Process_Good_BadJSON(t *testing.T) {
|
|
m := io.NewMockMedium()
|
|
m.Dirs["/input"] = true
|
|
m.Files["/input/bad.json"] = `not valid json`
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
cfg.Limiter = nil
|
|
|
|
var errors int
|
|
cfg.Dispatcher.On(EventError, func(e Event) { errors++ })
|
|
|
|
p := &Processor{Source: "test", Dir: "/input"}
|
|
result, err := p.Process(context.Background(), cfg)
|
|
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, result.Items)
|
|
assert.Equal(t, 1, result.Errors)
|
|
assert.Equal(t, 1, errors)
|
|
}
|