package collect
import (
"context"
"testing"
"forge.lthn.ai/core/go/pkg/io"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestHTMLToMarkdown_Good_OrderedList(t *testing.T) {
input := `
- First
- Second
- Third
`
result, err := HTMLToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "1. First")
assert.Contains(t, result, "2. Second")
assert.Contains(t, result, "3. Third")
}
func TestHTMLToMarkdown_Good_UnorderedList(t *testing.T) {
input := ``
result, err := HTMLToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "- Alpha")
assert.Contains(t, result, "- Beta")
}
func TestHTMLToMarkdown_Good_Blockquote(t *testing.T) {
input := `A wise quote
`
result, err := HTMLToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "> A wise quote")
}
func TestHTMLToMarkdown_Good_HorizontalRule(t *testing.T) {
input := `Before
After
`
result, err := HTMLToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "---")
}
func TestHTMLToMarkdown_Good_LinkWithoutHref(t *testing.T) {
input := `bare link text`
result, err := HTMLToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "bare link text")
assert.NotContains(t, result, "[")
}
func TestHTMLToMarkdown_Good_H4H5H6(t *testing.T) {
input := `H4
H5
H6
`
result, err := HTMLToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "#### H4")
assert.Contains(t, result, "##### H5")
assert.Contains(t, result, "###### H6")
}
func TestHTMLToMarkdown_Good_StripsStyle(t *testing.T) {
input := `Clean
`
result, err := HTMLToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "Clean")
assert.NotContains(t, result, "color")
}
func TestHTMLToMarkdown_Good_LineBreak(t *testing.T) {
input := `Line one
Line two
`
result, err := HTMLToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "Line one")
assert.Contains(t, result, "Line two")
}
func TestHTMLToMarkdown_Good_NestedBoldItalic(t *testing.T) {
input := `bold text and italic text`
result, err := HTMLToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "**bold text**")
assert.Contains(t, result, "*italic text*")
}
func TestJSONToMarkdown_Good_NestedObject(t *testing.T) {
input := `{"outer": {"inner_key": "inner_value"}}`
result, err := JSONToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "**outer:**")
assert.Contains(t, result, "**inner_key:** inner_value")
}
func TestJSONToMarkdown_Good_NestedArray(t *testing.T) {
input := `[["a", "b"], ["c"]]`
result, err := JSONToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "# Data")
assert.Contains(t, result, "a")
assert.Contains(t, result, "b")
}
func TestJSONToMarkdown_Good_ScalarValue(t *testing.T) {
input := `42`
result, err := JSONToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "42")
}
func TestJSONToMarkdown_Good_ArrayOfObjects(t *testing.T) {
input := `[{"name": "Alice"}, {"name": "Bob"}]`
result, err := JSONToMarkdown(input)
require.NoError(t, err)
assert.Contains(t, result, "Item 1")
assert.Contains(t, result, "Alice")
assert.Contains(t, result, "Item 2")
assert.Contains(t, result, "Bob")
}
func TestProcessor_Process_Good_CancelledContext(t *testing.T) {
m := io.NewMockMedium()
m.Dirs["/input"] = true
m.Files["/input/file.html"] = `Test
`
cfg := NewConfigWithMedium(m, "/output")
cfg.Limiter = nil
ctx, cancel := context.WithCancel(context.Background())
cancel()
p := &Processor{Source: "test", Dir: "/input"}
_, err := p.Process(ctx, cfg)
assert.Error(t, err)
}
func TestProcessor_Process_Good_EmitsEvents(t *testing.T) {
m := io.NewMockMedium()
m.Dirs["/input"] = true
m.Files["/input/a.html"] = `Title
`
m.Files["/input/b.json"] = `{"key": "value"}`
cfg := NewConfigWithMedium(m, "/output")
cfg.Limiter = nil
var starts, items, completes int
cfg.Dispatcher.On(EventStart, func(e Event) { starts++ })
cfg.Dispatcher.On(EventItem, func(e Event) { items++ })
cfg.Dispatcher.On(EventComplete, func(e Event) { completes++ })
p := &Processor{Source: "test", Dir: "/input"}
result, err := p.Process(context.Background(), cfg)
require.NoError(t, err)
assert.Equal(t, 2, result.Items)
assert.Equal(t, 1, starts)
assert.Equal(t, 2, items)
assert.Equal(t, 1, completes)
}
func TestProcessor_Process_Good_BadHTML(t *testing.T) {
m := io.NewMockMedium()
m.Dirs["/input"] = true
// html.Parse is very tolerant, so even bad HTML will parse. But we test
// that the pipeline handles it gracefully.
m.Files["/input/bad.html"] = `Still valid enough
`
cfg := NewConfigWithMedium(m, "/output")
cfg.Limiter = nil
p := &Processor{Source: "test", Dir: "/input"}
result, err := p.Process(context.Background(), cfg)
require.NoError(t, err)
assert.Equal(t, 1, result.Items)
}
func TestProcessor_Process_Good_BadJSON(t *testing.T) {
m := io.NewMockMedium()
m.Dirs["/input"] = true
m.Files["/input/bad.json"] = `not valid json`
cfg := NewConfigWithMedium(m, "/output")
cfg.Limiter = nil
var errors int
cfg.Dispatcher.On(EventError, func(e Event) { errors++ })
p := &Processor{Source: "test", Dir: "/input"}
result, err := p.Process(context.Background(), cfg)
require.NoError(t, err)
assert.Equal(t, 0, result.Items)
assert.Equal(t, 1, result.Errors)
assert.Equal(t, 1, errors)
}