go/pkg/collect/process.go
Claude 52d358daa2 refactor: rename module from github.com/host-uk/core to forge.lthn.ai/core/cli
Move module identity to our own Forgejo instance. All import paths
updated across 434 Go files, sub-module go.mod files, and go.work.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 05:53:52 +00:00

345 lines
8.3 KiB
Go

package collect
import (
"context"
"encoding/json"
"fmt"
"path/filepath"
"sort"
"strings"
core "forge.lthn.ai/core/cli/pkg/framework/core"
"golang.org/x/net/html"
)
// Processor converts collected data to clean markdown.
type Processor struct {
// Source identifies the data source directory to process.
Source string
// Dir is the directory containing files to process.
Dir string
}
// Name returns the processor name.
func (p *Processor) Name() string {
return fmt.Sprintf("process:%s", p.Source)
}
// Process reads files from the source directory, converts HTML or JSON
// to clean markdown, and writes the results to the output directory.
func (p *Processor) Process(ctx context.Context, cfg *Config) (*Result, error) {
result := &Result{Source: p.Name()}
if p.Dir == "" {
return result, core.E("collect.Processor.Process", "directory is required", nil)
}
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitStart(p.Name(), fmt.Sprintf("Processing files in %s", p.Dir))
}
if cfg.DryRun {
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitProgress(p.Name(), fmt.Sprintf("[dry-run] Would process files in %s", p.Dir), nil)
}
return result, nil
}
entries, err := cfg.Output.List(p.Dir)
if err != nil {
return result, core.E("collect.Processor.Process", "failed to list directory", err)
}
outputDir := filepath.Join(cfg.OutputDir, "processed", p.Source)
if err := cfg.Output.EnsureDir(outputDir); err != nil {
return result, core.E("collect.Processor.Process", "failed to create output directory", err)
}
for _, entry := range entries {
if ctx.Err() != nil {
return result, core.E("collect.Processor.Process", "context cancelled", ctx.Err())
}
if entry.IsDir() {
continue
}
name := entry.Name()
srcPath := filepath.Join(p.Dir, name)
content, err := cfg.Output.Read(srcPath)
if err != nil {
result.Errors++
continue
}
var processed string
ext := strings.ToLower(filepath.Ext(name))
switch ext {
case ".html", ".htm":
processed, err = htmlToMarkdown(content)
if err != nil {
result.Errors++
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitError(p.Name(), fmt.Sprintf("Failed to convert %s: %v", name, err), nil)
}
continue
}
case ".json":
processed, err = jsonToMarkdown(content)
if err != nil {
result.Errors++
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitError(p.Name(), fmt.Sprintf("Failed to convert %s: %v", name, err), nil)
}
continue
}
case ".md":
// Already markdown, just clean up
processed = strings.TrimSpace(content)
default:
result.Skipped++
continue
}
// Write with .md extension
outName := strings.TrimSuffix(name, ext) + ".md"
outPath := filepath.Join(outputDir, outName)
if err := cfg.Output.Write(outPath, processed); err != nil {
result.Errors++
continue
}
result.Items++
result.Files = append(result.Files, outPath)
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitItem(p.Name(), fmt.Sprintf("Processed: %s", name), nil)
}
}
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitComplete(p.Name(), fmt.Sprintf("Processed %d files", result.Items), result)
}
return result, nil
}
// htmlToMarkdown converts HTML content to clean markdown.
func htmlToMarkdown(content string) (string, error) {
doc, err := html.Parse(strings.NewReader(content))
if err != nil {
return "", core.E("collect.htmlToMarkdown", "failed to parse HTML", err)
}
var b strings.Builder
nodeToMarkdown(&b, doc, 0)
return strings.TrimSpace(b.String()), nil
}
// nodeToMarkdown recursively converts an HTML node tree to markdown.
func nodeToMarkdown(b *strings.Builder, n *html.Node, depth int) {
switch n.Type {
case html.TextNode:
text := n.Data
if strings.TrimSpace(text) != "" {
b.WriteString(text)
}
case html.ElementNode:
switch n.Data {
case "h1":
b.WriteString("\n# ")
writeChildrenText(b, n)
b.WriteString("\n\n")
return
case "h2":
b.WriteString("\n## ")
writeChildrenText(b, n)
b.WriteString("\n\n")
return
case "h3":
b.WriteString("\n### ")
writeChildrenText(b, n)
b.WriteString("\n\n")
return
case "h4":
b.WriteString("\n#### ")
writeChildrenText(b, n)
b.WriteString("\n\n")
return
case "h5":
b.WriteString("\n##### ")
writeChildrenText(b, n)
b.WriteString("\n\n")
return
case "h6":
b.WriteString("\n###### ")
writeChildrenText(b, n)
b.WriteString("\n\n")
return
case "p":
b.WriteString("\n")
for c := n.FirstChild; c != nil; c = c.NextSibling {
nodeToMarkdown(b, c, depth)
}
b.WriteString("\n")
return
case "br":
b.WriteString("\n")
return
case "strong", "b":
b.WriteString("**")
writeChildrenText(b, n)
b.WriteString("**")
return
case "em", "i":
b.WriteString("*")
writeChildrenText(b, n)
b.WriteString("*")
return
case "code":
b.WriteString("`")
writeChildrenText(b, n)
b.WriteString("`")
return
case "pre":
b.WriteString("\n```\n")
writeChildrenText(b, n)
b.WriteString("\n```\n")
return
case "a":
var href string
for _, attr := range n.Attr {
if attr.Key == "href" {
href = attr.Val
}
}
text := getChildrenText(n)
if href != "" {
fmt.Fprintf(b, "[%s](%s)", text, href)
} else {
b.WriteString(text)
}
return
case "ul":
b.WriteString("\n")
case "ol":
b.WriteString("\n")
counter := 1
for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type == html.ElementNode && c.Data == "li" {
fmt.Fprintf(b, "%d. ", counter)
for gc := c.FirstChild; gc != nil; gc = gc.NextSibling {
nodeToMarkdown(b, gc, depth+1)
}
b.WriteString("\n")
counter++
}
}
return
case "li":
b.WriteString("- ")
for c := n.FirstChild; c != nil; c = c.NextSibling {
nodeToMarkdown(b, c, depth+1)
}
b.WriteString("\n")
return
case "blockquote":
b.WriteString("\n> ")
text := getChildrenText(n)
b.WriteString(strings.ReplaceAll(text, "\n", "\n> "))
b.WriteString("\n")
return
case "hr":
b.WriteString("\n---\n")
return
case "script", "style", "head":
return
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
nodeToMarkdown(b, c, depth)
}
}
// writeChildrenText writes the text content of all children.
func writeChildrenText(b *strings.Builder, n *html.Node) {
b.WriteString(getChildrenText(n))
}
// getChildrenText returns the concatenated text content of all children.
func getChildrenText(n *html.Node) string {
var b strings.Builder
for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type == html.TextNode {
b.WriteString(c.Data)
} else {
b.WriteString(getChildrenText(c))
}
}
return b.String()
}
// jsonToMarkdown converts JSON content to a formatted markdown document.
func jsonToMarkdown(content string) (string, error) {
var data any
if err := json.Unmarshal([]byte(content), &data); err != nil {
return "", core.E("collect.jsonToMarkdown", "failed to parse JSON", err)
}
var b strings.Builder
b.WriteString("# Data\n\n")
jsonValueToMarkdown(&b, data, 0)
return strings.TrimSpace(b.String()), nil
}
// jsonValueToMarkdown recursively formats a JSON value as markdown.
func jsonValueToMarkdown(b *strings.Builder, data any, depth int) {
switch v := data.(type) {
case map[string]any:
keys := make([]string, 0, len(v))
for key := range v {
keys = append(keys, key)
}
sort.Strings(keys)
for _, key := range keys {
val := v[key]
indent := strings.Repeat(" ", depth)
switch child := val.(type) {
case map[string]any, []any:
fmt.Fprintf(b, "%s- **%s:**\n", indent, key)
jsonValueToMarkdown(b, child, depth+1)
default:
fmt.Fprintf(b, "%s- **%s:** %v\n", indent, key, val)
}
}
case []any:
for i, item := range v {
indent := strings.Repeat(" ", depth)
switch child := item.(type) {
case map[string]any, []any:
fmt.Fprintf(b, "%s- Item %d:\n", indent, i+1)
jsonValueToMarkdown(b, child, depth+1)
default:
fmt.Fprintf(b, "%s- %v\n", indent, item)
}
}
default:
indent := strings.Repeat(" ", depth)
fmt.Fprintf(b, "%s%v\n", indent, data)
}
}
// HTMLToMarkdown is exported for testing.
func HTMLToMarkdown(content string) (string, error) {
return htmlToMarkdown(content)
}
// JSONToMarkdown is exported for testing.
func JSONToMarkdown(content string) (string, error) {
return jsonToMarkdown(content)
}