346 lines
8.3 KiB
Go
346 lines
8.3 KiB
Go
|
|
package collect
|
||
|
|
|
||
|
|
import (
	"context"
	"encoding/json"
	"fmt"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	core "github.com/host-uk/core/pkg/framework/core"
	"golang.org/x/net/html"
)
|
||
|
|
|
||
|
|
// Processor turns previously collected files into clean markdown documents.
type Processor struct {
	// Source names the data source being processed. It is embedded in the
	// processor name ("process:<Source>") and in the output path
	// ("<OutputDir>/processed/<Source>").
	Source string

	// Dir is the directory whose files are read and converted. Process
	// rejects an empty value.
	Dir string
}
|
||
|
|
|
||
|
|
// Name returns the processor name.
|
||
|
|
func (p *Processor) Name() string {
|
||
|
|
return fmt.Sprintf("process:%s", p.Source)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Process reads files from the source directory, converts HTML or JSON
|
||
|
|
// to clean markdown, and writes the results to the output directory.
|
||
|
|
func (p *Processor) Process(ctx context.Context, cfg *Config) (*Result, error) {
|
||
|
|
result := &Result{Source: p.Name()}
|
||
|
|
|
||
|
|
if p.Dir == "" {
|
||
|
|
return result, core.E("collect.Processor.Process", "directory is required", nil)
|
||
|
|
}
|
||
|
|
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitStart(p.Name(), fmt.Sprintf("Processing files in %s", p.Dir))
|
||
|
|
}
|
||
|
|
|
||
|
|
if cfg.DryRun {
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitProgress(p.Name(), fmt.Sprintf("[dry-run] Would process files in %s", p.Dir), nil)
|
||
|
|
}
|
||
|
|
return result, nil
|
||
|
|
}
|
||
|
|
|
||
|
|
entries, err := cfg.Output.List(p.Dir)
|
||
|
|
if err != nil {
|
||
|
|
return result, core.E("collect.Processor.Process", "failed to list directory", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
outputDir := filepath.Join(cfg.OutputDir, "processed", p.Source)
|
||
|
|
if err := cfg.Output.EnsureDir(outputDir); err != nil {
|
||
|
|
return result, core.E("collect.Processor.Process", "failed to create output directory", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
for _, entry := range entries {
|
||
|
|
if ctx.Err() != nil {
|
||
|
|
return result, core.E("collect.Processor.Process", "context cancelled", ctx.Err())
|
||
|
|
}
|
||
|
|
|
||
|
|
if entry.IsDir() {
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
name := entry.Name()
|
||
|
|
srcPath := filepath.Join(p.Dir, name)
|
||
|
|
|
||
|
|
content, err := cfg.Output.Read(srcPath)
|
||
|
|
if err != nil {
|
||
|
|
result.Errors++
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
var processed string
|
||
|
|
ext := strings.ToLower(filepath.Ext(name))
|
||
|
|
|
||
|
|
switch ext {
|
||
|
|
case ".html", ".htm":
|
||
|
|
processed, err = htmlToMarkdown(content)
|
||
|
|
if err != nil {
|
||
|
|
result.Errors++
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitError(p.Name(), fmt.Sprintf("Failed to convert %s: %v", name, err), nil)
|
||
|
|
}
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
case ".json":
|
||
|
|
processed, err = jsonToMarkdown(content)
|
||
|
|
if err != nil {
|
||
|
|
result.Errors++
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitError(p.Name(), fmt.Sprintf("Failed to convert %s: %v", name, err), nil)
|
||
|
|
}
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
case ".md":
|
||
|
|
// Already markdown, just clean up
|
||
|
|
processed = strings.TrimSpace(content)
|
||
|
|
default:
|
||
|
|
result.Skipped++
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
// Write with .md extension
|
||
|
|
outName := strings.TrimSuffix(name, ext) + ".md"
|
||
|
|
outPath := filepath.Join(outputDir, outName)
|
||
|
|
|
||
|
|
if err := cfg.Output.Write(outPath, processed); err != nil {
|
||
|
|
result.Errors++
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
result.Items++
|
||
|
|
result.Files = append(result.Files, outPath)
|
||
|
|
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Processed: %s", name), nil)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if cfg.Dispatcher != nil {
|
||
|
|
cfg.Dispatcher.EmitComplete(p.Name(), fmt.Sprintf("Processed %d files", result.Items), result)
|
||
|
|
}
|
||
|
|
|
||
|
|
return result, nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// htmlToMarkdown converts HTML content to clean markdown.
|
||
|
|
func htmlToMarkdown(content string) (string, error) {
|
||
|
|
doc, err := html.Parse(strings.NewReader(content))
|
||
|
|
if err != nil {
|
||
|
|
return "", core.E("collect.htmlToMarkdown", "failed to parse HTML", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
var b strings.Builder
|
||
|
|
nodeToMarkdown(&b, doc, 0)
|
||
|
|
return strings.TrimSpace(b.String()), nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// nodeToMarkdown recursively converts an HTML node tree to markdown.
|
||
|
|
func nodeToMarkdown(b *strings.Builder, n *html.Node, depth int) {
|
||
|
|
switch n.Type {
|
||
|
|
case html.TextNode:
|
||
|
|
text := n.Data
|
||
|
|
if strings.TrimSpace(text) != "" {
|
||
|
|
b.WriteString(text)
|
||
|
|
}
|
||
|
|
case html.ElementNode:
|
||
|
|
switch n.Data {
|
||
|
|
case "h1":
|
||
|
|
b.WriteString("\n# ")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("\n\n")
|
||
|
|
return
|
||
|
|
case "h2":
|
||
|
|
b.WriteString("\n## ")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("\n\n")
|
||
|
|
return
|
||
|
|
case "h3":
|
||
|
|
b.WriteString("\n### ")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("\n\n")
|
||
|
|
return
|
||
|
|
case "h4":
|
||
|
|
b.WriteString("\n#### ")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("\n\n")
|
||
|
|
return
|
||
|
|
case "h5":
|
||
|
|
b.WriteString("\n##### ")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("\n\n")
|
||
|
|
return
|
||
|
|
case "h6":
|
||
|
|
b.WriteString("\n###### ")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("\n\n")
|
||
|
|
return
|
||
|
|
case "p":
|
||
|
|
b.WriteString("\n")
|
||
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
|
nodeToMarkdown(b, c, depth)
|
||
|
|
}
|
||
|
|
b.WriteString("\n")
|
||
|
|
return
|
||
|
|
case "br":
|
||
|
|
b.WriteString("\n")
|
||
|
|
return
|
||
|
|
case "strong", "b":
|
||
|
|
b.WriteString("**")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("**")
|
||
|
|
return
|
||
|
|
case "em", "i":
|
||
|
|
b.WriteString("*")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("*")
|
||
|
|
return
|
||
|
|
case "code":
|
||
|
|
b.WriteString("`")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("`")
|
||
|
|
return
|
||
|
|
case "pre":
|
||
|
|
b.WriteString("\n```\n")
|
||
|
|
writeChildrenText(b, n)
|
||
|
|
b.WriteString("\n```\n")
|
||
|
|
return
|
||
|
|
case "a":
|
||
|
|
var href string
|
||
|
|
for _, attr := range n.Attr {
|
||
|
|
if attr.Key == "href" {
|
||
|
|
href = attr.Val
|
||
|
|
}
|
||
|
|
}
|
||
|
|
text := getChildrenText(n)
|
||
|
|
if href != "" {
|
||
|
|
fmt.Fprintf(b, "[%s](%s)", text, href)
|
||
|
|
} else {
|
||
|
|
b.WriteString(text)
|
||
|
|
}
|
||
|
|
return
|
||
|
|
case "ul":
|
||
|
|
b.WriteString("\n")
|
||
|
|
case "ol":
|
||
|
|
b.WriteString("\n")
|
||
|
|
counter := 1
|
||
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
|
if c.Type == html.ElementNode && c.Data == "li" {
|
||
|
|
fmt.Fprintf(b, "%d. ", counter)
|
||
|
|
for gc := c.FirstChild; gc != nil; gc = gc.NextSibling {
|
||
|
|
nodeToMarkdown(b, gc, depth+1)
|
||
|
|
}
|
||
|
|
b.WriteString("\n")
|
||
|
|
counter++
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return
|
||
|
|
case "li":
|
||
|
|
b.WriteString("- ")
|
||
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
|
nodeToMarkdown(b, c, depth+1)
|
||
|
|
}
|
||
|
|
b.WriteString("\n")
|
||
|
|
return
|
||
|
|
case "blockquote":
|
||
|
|
b.WriteString("\n> ")
|
||
|
|
text := getChildrenText(n)
|
||
|
|
b.WriteString(strings.ReplaceAll(text, "\n", "\n> "))
|
||
|
|
b.WriteString("\n")
|
||
|
|
return
|
||
|
|
case "hr":
|
||
|
|
b.WriteString("\n---\n")
|
||
|
|
return
|
||
|
|
case "script", "style", "head":
|
||
|
|
return
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
|
nodeToMarkdown(b, c, depth)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// writeChildrenText writes the text content of all children.
|
||
|
|
func writeChildrenText(b *strings.Builder, n *html.Node) {
|
||
|
|
b.WriteString(getChildrenText(n))
|
||
|
|
}
|
||
|
|
|
||
|
|
// getChildrenText returns the concatenated text content of all children.
|
||
|
|
func getChildrenText(n *html.Node) string {
|
||
|
|
var b strings.Builder
|
||
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
|
if c.Type == html.TextNode {
|
||
|
|
b.WriteString(c.Data)
|
||
|
|
} else {
|
||
|
|
b.WriteString(getChildrenText(c))
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return b.String()
|
||
|
|
}
|
||
|
|
|
||
|
|
// jsonToMarkdown converts JSON content to a formatted markdown document.
|
||
|
|
func jsonToMarkdown(content string) (string, error) {
|
||
|
|
var data any
|
||
|
|
if err := json.Unmarshal([]byte(content), &data); err != nil {
|
||
|
|
return "", core.E("collect.jsonToMarkdown", "failed to parse JSON", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
var b strings.Builder
|
||
|
|
b.WriteString("# Data\n\n")
|
||
|
|
jsonValueToMarkdown(&b, data, 0)
|
||
|
|
return strings.TrimSpace(b.String()), nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// jsonValueToMarkdown appends a markdown rendering of a decoded JSON value to b.
//
// data is expected to be what encoding/json produces when decoding into any:
// map[string]any, []any, string, float64, bool or nil. Objects become bullet
// lists of "**key:** value" entries in sorted key order, arrays become bullet
// lists (nested containers are labelled "Item N:"), and containers nested
// inside either are rendered one level deeper. A scalar at the top level is
// written on a line of its own.
//
// depth is the current nesting level. Each level indents by two spaces, the
// smallest indent at which markdown treats a "- " bullet as a child of the
// bullet above it rather than as a sibling.
func jsonValueToMarkdown(b *strings.Builder, data any, depth int) {
	indent := strings.Repeat("  ", depth)

	switch v := data.(type) {
	case map[string]any:
		// Map iteration order is random; sort for deterministic output.
		keys := make([]string, 0, len(v))
		for key := range v {
			keys = append(keys, key)
		}
		sort.Strings(keys)

		for _, key := range keys {
			switch child := v[key].(type) {
			case map[string]any, []any:
				fmt.Fprintf(b, "%s- **%s:**\n", indent, key)
				jsonValueToMarkdown(b, child, depth+1)
			default:
				fmt.Fprintf(b, "%s- **%s:** %s\n", indent, key, formatJSONScalar(child))
			}
		}

	case []any:
		for i, item := range v {
			switch child := item.(type) {
			case map[string]any, []any:
				fmt.Fprintf(b, "%s- Item %d:\n", indent, i+1)
				jsonValueToMarkdown(b, child, depth+1)
			default:
				fmt.Fprintf(b, "%s- %s\n", indent, formatJSONScalar(child))
			}
		}

	default:
		fmt.Fprintf(b, "%s%s\n", indent, formatJSONScalar(data))
	}
}

// formatJSONScalar renders a non-container JSON value as display text.
// Numbers are written in plain decimal notation, because the default %v
// formatting of float64 switches to exponent form for values such as 1000000
// ("1e+06"). A JSON null is written as "null" instead of Go's "<nil>".
func formatJSONScalar(v any) string {
	switch s := v.(type) {
	case nil:
		return "null"
	case float64:
		return strconv.FormatFloat(s, 'f', -1, 64)
	default:
		return fmt.Sprint(s)
	}
}
|
||
|
|
|
||
|
|
// HTMLToMarkdown is exported for testing.
|
||
|
|
func HTMLToMarkdown(content string) (string, error) {
|
||
|
|
return htmlToMarkdown(content)
|
||
|
|
}
|
||
|
|
|
||
|
|
// JSONToMarkdown is exported for testing.
|
||
|
|
func JSONToMarkdown(content string) (string, error) {
|
||
|
|
return jsonToMarkdown(content)
|
||
|
|
}
|