cli/pkg/collect/process.go
Snider f2bc912ebe feat: infrastructure packages and lint cleanup (#281)
* ci: consolidate duplicate workflows and merge CodeQL configs

Remove 17 duplicate workflow files that were split copies of the
combined originals. Each family (CI, CodeQL, Coverage, PR Build,
Alpha Release) had the same job duplicated across separate
push/pull_request/schedule/manual trigger files.

Merge codeql.yml and codescan.yml into a single codeql.yml with
a language matrix covering go, javascript-typescript, python,
and actions — matching the previous default setup coverage.

Remaining workflows (one per family):
- ci.yml (push + PR + manual)
- codeql.yml (push + PR + schedule, all languages)
- coverage.yml (push + PR + manual)
- alpha-release.yml (push + manual)
- pr-build.yml (PR + manual)
- release.yml (tag push)
- agent-verify.yml, auto-label.yml, auto-project.yml

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat: add collect, config, crypt, plugin packages and fix all lint issues

Add four new infrastructure packages with CLI commands:
- pkg/config: layered configuration (defaults → file → env → flags)
- pkg/crypt: crypto primitives (Argon2id, AES-GCM, ChaCha20, HMAC, checksums)
- pkg/plugin: plugin system with GitHub-based install/update/remove
- pkg/collect: collection subsystem (GitHub, BitcoinTalk, market, papers, excavate)

Fix all golangci-lint issues across the entire codebase (~100 errcheck,
staticcheck SA1012/SA1019/ST1005, unused, ineffassign fixes) so that
`core go qa` passes with 0 issues.

Closes #167, #168, #170, #250, #251, #252, #253, #254, #255, #256

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 11:34:43 +00:00

345 lines
8.3 KiB
Go

package collect
import (
"context"
"encoding/json"
"fmt"
"path/filepath"
"sort"
"strings"
core "github.com/host-uk/core/pkg/framework/core"
"golang.org/x/net/html"
)
// Processor converts previously collected files (HTML, JSON, or markdown)
// into clean markdown documents.
type Processor struct {
// Source names the data source being processed. It is embedded in the
// processor name ("process:<Source>") and selects the output
// subdirectory "processed/<Source>" under the configured output directory.
Source string
// Dir is the directory whose files are read and converted. Process
// returns an error when it is empty.
Dir string
}
// Name reports the identifier of this processor: the literal prefix
// "process:" followed by the configured Source.
func (p *Processor) Name() string {
	return "process:" + p.Source
}
// Process converts every supported file directly inside p.Dir to markdown and
// writes the results under <cfg.OutputDir>/processed/<p.Source>.
//
// Files are dispatched on their lower-cased extension: .html/.htm go through
// htmlToMarkdown, .json through jsonToMarkdown, and .md files are only
// whitespace-trimmed. Anything else is counted in Result.Skipped, and
// subdirectories are ignored. Each output keeps the source file name with its
// extension (in whatever case it was written) replaced by ".md".
//
// Per-file read, conversion, and write failures are counted in Result.Errors
// and reported through cfg.Dispatcher (when set) without aborting the run. A
// non-nil error is returned only when p.Dir is empty, the directory cannot be
// listed, the output directory cannot be created, or ctx is cancelled; the
// result accumulated so far is returned alongside it. In dry-run mode nothing
// is listed, read, or written.
//
// NOTE(review): sources that differ only by extension (a.html and a.json) map
// to the same output path, so the later one overwrites the earlier one.
func (p *Processor) Process(ctx context.Context, cfg *Config) (*Result, error) {
	const op = "collect.Processor.Process"

	source := p.Name()
	result := &Result{Source: source}

	if p.Dir == "" {
		return result, core.E(op, "directory is required", nil)
	}

	if cfg.Dispatcher != nil {
		cfg.Dispatcher.EmitStart(source, fmt.Sprintf("Processing files in %s", p.Dir))
	}

	if cfg.DryRun {
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitProgress(source, fmt.Sprintf("[dry-run] Would process files in %s", p.Dir), nil)
		}
		return result, nil
	}

	entries, err := cfg.Output.List(p.Dir)
	if err != nil {
		return result, core.E(op, "failed to list directory", err)
	}

	outputDir := filepath.Join(cfg.OutputDir, "processed", p.Source)
	if err := cfg.Output.EnsureDir(outputDir); err != nil {
		return result, core.E(op, "failed to create output directory", err)
	}

	// fail records a per-file failure and surfaces it to the dispatcher so
	// read, convert, and write problems are all reported the same way.
	fail := func(action, name string, err error) {
		result.Errors++
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitError(source, fmt.Sprintf("Failed to %s %s: %v", action, name, err), nil)
		}
	}

	for _, entry := range entries {
		if err := ctx.Err(); err != nil {
			return result, core.E(op, "context cancelled", err)
		}
		if entry.IsDir() {
			continue
		}

		name := entry.Name()
		content, err := cfg.Output.Read(filepath.Join(p.Dir, name))
		if err != nil {
			fail("read", name, err)
			continue
		}

		// Keep the extension exactly as it appears in the file name. The
		// lower-cased form is only for matching; trimming with it would
		// leave "PAGE.HTML" untouched and produce "PAGE.HTML.md".
		rawExt := filepath.Ext(name)

		var processed string
		switch strings.ToLower(rawExt) {
		case ".html", ".htm":
			processed, err = htmlToMarkdown(content)
		case ".json":
			processed, err = jsonToMarkdown(content)
		case ".md":
			// Already markdown, just clean up.
			processed = strings.TrimSpace(content)
		default:
			result.Skipped++
			continue
		}
		if err != nil {
			fail("convert", name, err)
			continue
		}

		// Write with .md extension.
		outPath := filepath.Join(outputDir, strings.TrimSuffix(name, rawExt)+".md")
		if err := cfg.Output.Write(outPath, processed); err != nil {
			fail("write", name, err)
			continue
		}

		result.Items++
		result.Files = append(result.Files, outPath)
		if cfg.Dispatcher != nil {
			cfg.Dispatcher.EmitItem(source, fmt.Sprintf("Processed: %s", name), nil)
		}
	}

	if cfg.Dispatcher != nil {
		cfg.Dispatcher.EmitComplete(source, fmt.Sprintf("Processed %d files", result.Items), result)
	}
	return result, nil
}
// htmlToMarkdown parses content as an HTML document, renders the resulting
// node tree with nodeToMarkdown, and returns the output with surrounding
// whitespace trimmed. A parse failure is wrapped and returned with an empty
// string.
func htmlToMarkdown(content string) (string, error) {
	root, parseErr := html.Parse(strings.NewReader(content))
	if parseErr != nil {
		return "", core.E("collect.htmlToMarkdown", "failed to parse HTML", parseErr)
	}

	out := new(strings.Builder)
	nodeToMarkdown(out, root, 0)

	markdown := strings.TrimSpace(out.String())
	return markdown, nil
}
// nodeToMarkdown appends the markdown rendering of n and its descendants to b.
//
// Text nodes are written verbatim unless they consist only of whitespace.
// Headings, emphasis, code, pre, links, and blockquotes are rendered from the
// flattened text of their subtree, so markup nested inside them is dropped.
// Paragraphs and list items render their children recursively. The contents
// of script, style, and head elements are discarded. Any other node is
// transparent: only its children are rendered.
//
// depth is incremented when descending into list items and passed along
// unchanged otherwise; it does not currently influence the output.
func nodeToMarkdown(b *strings.Builder, n *html.Node, depth int) {
	// renderChildren walks every direct child of parent at depth d.
	renderChildren := func(parent *html.Node, d int) {
		for child := parent.FirstChild; child != nil; child = child.NextSibling {
			nodeToMarkdown(b, child, d)
		}
	}
	// wrap writes the flattened text of n between two markers.
	wrap := func(opening, closing string) {
		b.WriteString(opening)
		writeChildrenText(b, n)
		b.WriteString(closing)
	}

	if n.Type == html.TextNode && strings.TrimSpace(n.Data) != "" {
		b.WriteString(n.Data)
	}

	if n.Type == html.ElementNode {
		switch tag := n.Data; tag {
		case "h1", "h2", "h3", "h4", "h5", "h6":
			// The digit in the tag name is the number of '#' markers.
			level := int(tag[1] - '0')
			wrap("\n"+strings.Repeat("#", level)+" ", "\n\n")
			return

		case "p":
			b.WriteString("\n")
			renderChildren(n, depth)
			b.WriteString("\n")
			return

		case "br":
			b.WriteString("\n")
			return

		case "strong", "b":
			wrap("**", "**")
			return

		case "em", "i":
			wrap("*", "*")
			return

		case "code":
			wrap("`", "`")
			return

		case "pre":
			wrap("\n```\n", "\n```\n")
			return

		case "a":
			// When href appears more than once, the last occurrence wins.
			target := ""
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					target = attr.Val
				}
			}
			label := getChildrenText(n)
			if target == "" {
				b.WriteString(label)
			} else {
				fmt.Fprintf(b, "[%s](%s)", label, target)
			}
			return

		case "ul":
			// No return here: the <li> children are rendered by the
			// generic traversal at the bottom of the function.
			b.WriteString("\n")

		case "ol":
			b.WriteString("\n")
			number := 0
			for item := n.FirstChild; item != nil; item = item.NextSibling {
				if item.Type != html.ElementNode || item.Data != "li" {
					continue
				}
				number++
				fmt.Fprintf(b, "%d. ", number)
				renderChildren(item, depth+1)
				b.WriteString("\n")
			}
			return

		case "li":
			b.WriteString("- ")
			renderChildren(n, depth+1)
			b.WriteString("\n")
			return

		case "blockquote":
			// Prefix every line of the quoted text with the quote marker.
			quoted := strings.ReplaceAll(getChildrenText(n), "\n", "\n> ")
			b.WriteString("\n> ")
			b.WriteString(quoted)
			b.WriteString("\n")
			return

		case "hr":
			b.WriteString("\n---\n")
			return

		case "script", "style", "head":
			return
		}
	}

	renderChildren(n, depth)
}
// writeChildrenText appends the flattened text of n's subtree, as produced by
// getChildrenText, to b.
func writeChildrenText(b *strings.Builder, n *html.Node) {
	text := getChildrenText(n)
	b.WriteString(text)
}
// getChildrenText returns the data of every text node below n, concatenated
// in document order. Non-text nodes contribute only the text found in their
// own descendants; n's own data is not included.
func getChildrenText(n *html.Node) string {
	var text strings.Builder

	// collect appends the text beneath parent, depth-first.
	var collect func(parent *html.Node)
	collect = func(parent *html.Node) {
		for child := parent.FirstChild; child != nil; child = child.NextSibling {
			if child.Type == html.TextNode {
				text.WriteString(child.Data)
				continue
			}
			collect(child)
		}
	}

	collect(n)
	return text.String()
}
// jsonToMarkdown decodes content as a single JSON value and renders it as a
// markdown document: a "# Data" heading followed by the bullet list produced
// by jsonValueToMarkdown, with surrounding whitespace trimmed. Invalid JSON
// yields an empty string and a wrapped error.
func jsonToMarkdown(content string) (string, error) {
	var decoded any
	err := json.Unmarshal([]byte(content), &decoded)
	if err != nil {
		return "", core.E("collect.jsonToMarkdown", "failed to parse JSON", err)
	}

	out := new(strings.Builder)
	out.WriteString("# Data\n\n")
	jsonValueToMarkdown(out, decoded, 0)

	return strings.TrimSpace(out.String()), nil
}
// jsonValueToMarkdown recursively formats a decoded JSON value as a markdown
// bullet list and appends it to b.
//
// Objects become "- **key:** value" entries in sorted key order so the output
// is deterministic. Arrays become "- value" entries, with nested objects and
// arrays introduced by "- Item N:" (1-based). A bare scalar is written on its
// own line.
//
// Each nesting level is indented by two spaces: that is the least CommonMark
// accepts for a bullet to be a child of the preceding "- " item rather than a
// sibling of it.
//
// depth is the current nesting level and must not be negative.
func jsonValueToMarkdown(b *strings.Builder, data any, depth int) {
	indent := strings.Repeat("  ", depth)

	switch v := data.(type) {
	case map[string]any:
		keys := make([]string, 0, len(v))
		for key := range v {
			keys = append(keys, key)
		}
		sort.Strings(keys)

		for _, key := range keys {
			switch child := v[key].(type) {
			case map[string]any, []any:
				fmt.Fprintf(b, "%s- **%s:**\n", indent, key)
				jsonValueToMarkdown(b, child, depth+1)
			default:
				fmt.Fprintf(b, "%s- **%s:** %s\n", indent, key, formatJSONScalar(child))
			}
		}

	case []any:
		for i, item := range v {
			switch child := item.(type) {
			case map[string]any, []any:
				fmt.Fprintf(b, "%s- Item %d:\n", indent, i+1)
				jsonValueToMarkdown(b, child, depth+1)
			default:
				fmt.Fprintf(b, "%s- %s\n", indent, formatJSONScalar(child))
			}
		}

	default:
		fmt.Fprintf(b, "%s%s\n", indent, formatJSONScalar(data))
	}
}

// formatJSONScalar renders a non-container JSON value as text.
//
// float64 is special-cased because fmt's %v switches to exponent notation
// very early (1000000 prints as "1e+06"), which mangles IDs, timestamps, and
// amounts. json.Marshal keeps plain decimal notation for ordinary magnitudes.
// Values it rejects (NaN, ±Inf) and all other types fall back to %v.
func formatJSONScalar(v any) string {
	if f, ok := v.(float64); ok {
		if out, err := json.Marshal(f); err == nil {
			return string(out)
		}
	}
	return fmt.Sprintf("%v", v)
}
// HTMLToMarkdown exposes htmlToMarkdown outside the package so tests can
// exercise the HTML conversion directly.
func HTMLToMarkdown(content string) (string, error) {
	markdown, err := htmlToMarkdown(content)
	return markdown, err
}
// JSONToMarkdown exposes jsonToMarkdown outside the package so tests can
// exercise the JSON conversion directly.
func JSONToMarkdown(content string) (string, error) {
	markdown, err := jsonToMarkdown(content)
	return markdown, err
}