2026-03-30 00:54:20 +00:00
|
|
|
// SPDX-License-Identifier: EUPL-1.2
|
2026-03-30 00:19:43 +00:00
|
|
|
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
package collect
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"testing"
|
|
|
|
|
|
2026-03-21 23:54:23 +00:00
|
|
|
"dappco.re/go/core/io"
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
"github.com/stretchr/testify/assert"
|
|
|
|
|
"github.com/stretchr/testify/require"
|
|
|
|
|
)
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestHTMLToMarkdown_Good_OrderedList_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `<ol><li>First</li><li>Second</li><li>Third</li></ol>`
|
|
|
|
|
result, err := HTMLToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "1. First")
|
|
|
|
|
assert.Contains(t, result, "2. Second")
|
|
|
|
|
assert.Contains(t, result, "3. Third")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestHTMLToMarkdown_Good_UnorderedList_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `<ul><li>Alpha</li><li>Beta</li></ul>`
|
|
|
|
|
result, err := HTMLToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "- Alpha")
|
|
|
|
|
assert.Contains(t, result, "- Beta")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestHTMLToMarkdown_Good_Blockquote_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `<blockquote>A wise quote</blockquote>`
|
|
|
|
|
result, err := HTMLToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "> A wise quote")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestHTMLToMarkdown_Good_HorizontalRule_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `<p>Before</p><hr/><p>After</p>`
|
|
|
|
|
result, err := HTMLToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "---")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestHTMLToMarkdown_Good_LinkWithoutHref_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `<a>bare link text</a>`
|
|
|
|
|
result, err := HTMLToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "bare link text")
|
|
|
|
|
assert.NotContains(t, result, "[")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestHTMLToMarkdown_Good_H4H5H6_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `<h4>H4</h4><h5>H5</h5><h6>H6</h6>`
|
|
|
|
|
result, err := HTMLToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "#### H4")
|
|
|
|
|
assert.Contains(t, result, "##### H5")
|
|
|
|
|
assert.Contains(t, result, "###### H6")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestHTMLToMarkdown_Good_StripsStyle_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `<html><head><style>.foo{color:red}</style></head><body><p>Clean</p></body></html>`
|
|
|
|
|
result, err := HTMLToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "Clean")
|
|
|
|
|
assert.NotContains(t, result, "color")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestHTMLToMarkdown_Good_LineBreak_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `<p>Line one<br/>Line two</p>`
|
|
|
|
|
result, err := HTMLToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "Line one")
|
|
|
|
|
assert.Contains(t, result, "Line two")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestHTMLToMarkdown_Good_NestedBoldItalic_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `<b>bold text</b> and <i>italic text</i>`
|
|
|
|
|
result, err := HTMLToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "**bold text**")
|
|
|
|
|
assert.Contains(t, result, "*italic text*")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestJSONToMarkdown_Good_NestedObject_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `{"outer": {"inner_key": "inner_value"}}`
|
|
|
|
|
result, err := JSONToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "**outer:**")
|
|
|
|
|
assert.Contains(t, result, "**inner_key:** inner_value")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestJSONToMarkdown_Good_NestedArray_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `[["a", "b"], ["c"]]`
|
|
|
|
|
result, err := JSONToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "# Data")
|
|
|
|
|
assert.Contains(t, result, "a")
|
|
|
|
|
assert.Contains(t, result, "b")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestJSONToMarkdown_Good_ScalarValue_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `42`
|
|
|
|
|
result, err := JSONToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "42")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestJSONToMarkdown_Good_ArrayOfObjects_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
input := `[{"name": "Alice"}, {"name": "Bob"}]`
|
|
|
|
|
result, err := JSONToMarkdown(input)
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Contains(t, result, "Item 1")
|
|
|
|
|
assert.Contains(t, result, "Alice")
|
|
|
|
|
assert.Contains(t, result, "Item 2")
|
|
|
|
|
assert.Contains(t, result, "Bob")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestProcessor_Process_Good_CancelledContext_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
m := io.NewMockMedium()
|
|
|
|
|
m.Dirs["/input"] = true
|
|
|
|
|
m.Files["/input/file.html"] = `<h1>Test</h1>`
|
|
|
|
|
|
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
|
|
|
cfg.Limiter = nil
|
|
|
|
|
|
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
|
cancel()
|
|
|
|
|
|
|
|
|
|
p := &Processor{Source: "test", Dir: "/input"}
|
|
|
|
|
_, err := p.Process(ctx, cfg)
|
|
|
|
|
assert.Error(t, err)
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestProcessor_Process_Good_EmitsEvents_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
m := io.NewMockMedium()
|
|
|
|
|
m.Dirs["/input"] = true
|
|
|
|
|
m.Files["/input/a.html"] = `<h1>Title</h1>`
|
|
|
|
|
m.Files["/input/b.json"] = `{"key": "value"}`
|
|
|
|
|
|
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
|
|
|
cfg.Limiter = nil
|
|
|
|
|
|
|
|
|
|
var starts, items, completes int
|
|
|
|
|
cfg.Dispatcher.On(EventStart, func(e Event) { starts++ })
|
|
|
|
|
cfg.Dispatcher.On(EventItem, func(e Event) { items++ })
|
|
|
|
|
cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ })
|
|
|
|
|
|
|
|
|
|
p := &Processor{Source: "test", Dir: "/input"}
|
|
|
|
|
result, err := p.Process(context.Background(), cfg)
|
|
|
|
|
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Equal(t, 2, result.Items)
|
|
|
|
|
assert.Equal(t, 1, starts)
|
|
|
|
|
assert.Equal(t, 2, items)
|
|
|
|
|
assert.Equal(t, 1, completes)
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestProcessor_Process_Good_BadHTML_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
m := io.NewMockMedium()
|
|
|
|
|
m.Dirs["/input"] = true
|
|
|
|
|
// html.Parse is very tolerant, so even bad HTML will parse. But we test
|
|
|
|
|
// that the pipeline handles it gracefully.
|
|
|
|
|
m.Files["/input/bad.html"] = `<html><body><p>Still valid enough</p>`
|
|
|
|
|
|
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
|
|
|
cfg.Limiter = nil
|
|
|
|
|
|
|
|
|
|
p := &Processor{Source: "test", Dir: "/input"}
|
|
|
|
|
result, err := p.Process(context.Background(), cfg)
|
|
|
|
|
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Equal(t, 1, result.Items)
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-30 06:37:20 +00:00
|
|
|
func TestProcessor_Process_Good_BadJSON_Good(t *testing.T) {
|
test(collect): push coverage from 57.3% to 83.0%
Add HTTP mock tests for BitcoinTalk (fetchPage, Collect with server),
papers (IACR HTML parsing, arXiv XML parsing, PaperSourceAll), market
(historical with FromDate, invalid date, server errors), process
(ordered lists, blockquotes, h4-h6, nested objects, cancelled context),
excavate (resume skips completed, progress events), and state (copy
safety, cursor round-trip, null JSON).
Uses httptest.Server with rewriteTransport to intercept external HTTP
calls without touching the production code.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 01:55:18 +00:00
|
|
|
m := io.NewMockMedium()
|
|
|
|
|
m.Dirs["/input"] = true
|
|
|
|
|
m.Files["/input/bad.json"] = `not valid json`
|
|
|
|
|
|
|
|
|
|
cfg := NewConfigWithMedium(m, "/output")
|
|
|
|
|
cfg.Limiter = nil
|
|
|
|
|
|
|
|
|
|
var errors int
|
|
|
|
|
cfg.Dispatcher.On(EventError, func(e Event) { errors++ })
|
|
|
|
|
|
|
|
|
|
p := &Processor{Source: "test", Dir: "/input"}
|
|
|
|
|
result, err := p.Process(context.Background(), cfg)
|
|
|
|
|
|
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
assert.Equal(t, 0, result.Items)
|
|
|
|
|
assert.Equal(t, 1, result.Errors)
|
|
|
|
|
assert.Equal(t, 1, errors)
|
|
|
|
|
}
|