go-scm/collect/github.go
Claude 3e883f6976
feat: extract SCM/forge integration packages from core/go
Forgejo and Gitea SDK wrappers, multi-repo git utilities, AgentCI
dispatch, distributed job orchestrator, and data collection pipelines.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 15:25:58 +00:00

289 lines
7.8 KiB
Go

package collect
import (
"context"
"encoding/json"
"fmt"
"os/exec"
"path/filepath"
"strings"
"time"
core "forge.lthn.ai/core/go/pkg/framework/core"
)
// ghIssue represents a GitHub issue or pull request as returned by the gh CLI.
type ghIssue struct {
Number int `json:"number"`
Title string `json:"title"`
State string `json:"state"`
Author ghAuthor `json:"author"`
Body string `json:"body"`
CreatedAt time.Time `json:"createdAt"`
Labels []ghLabel `json:"labels"`
URL string `json:"url"`
}
type ghAuthor struct {
Login string `json:"login"`
}
type ghLabel struct {
Name string `json:"name"`
}
// ghRepo represents a GitHub repository as returned by the gh CLI.
type ghRepo struct {
Name string `json:"name"`
}
// GitHubCollector collects issues and PRs from GitHub repositories.
type GitHubCollector struct {
// Org is the GitHub organisation.
Org string
// Repo is the repository name. If empty and Org is set, all repos are collected.
Repo string
// IssuesOnly limits collection to issues (excludes PRs).
IssuesOnly bool
// PRsOnly limits collection to PRs (excludes issues).
PRsOnly bool
}
// Name returns the collector name.
func (g *GitHubCollector) Name() string {
if g.Repo != "" {
return fmt.Sprintf("github:%s/%s", g.Org, g.Repo)
}
return fmt.Sprintf("github:%s", g.Org)
}
// Collect gathers issues and/or PRs from GitHub repositories.
func (g *GitHubCollector) Collect(ctx context.Context, cfg *Config) (*Result, error) {
result := &Result{Source: g.Name()}
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitStart(g.Name(), "Starting GitHub collection")
}
// If no specific repo, list all repos in the org
repos := []string{g.Repo}
if g.Repo == "" {
var err error
repos, err = g.listOrgRepos(ctx)
if err != nil {
return result, err
}
}
for _, repo := range repos {
if ctx.Err() != nil {
return result, core.E("collect.GitHub.Collect", "context cancelled", ctx.Err())
}
if !g.PRsOnly {
issueResult, err := g.collectIssues(ctx, cfg, repo)
if err != nil {
result.Errors++
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitError(g.Name(), fmt.Sprintf("Error collecting issues for %s: %v", repo, err), nil)
}
} else {
result.Items += issueResult.Items
result.Skipped += issueResult.Skipped
result.Files = append(result.Files, issueResult.Files...)
}
}
if !g.IssuesOnly {
prResult, err := g.collectPRs(ctx, cfg, repo)
if err != nil {
result.Errors++
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitError(g.Name(), fmt.Sprintf("Error collecting PRs for %s: %v", repo, err), nil)
}
} else {
result.Items += prResult.Items
result.Skipped += prResult.Skipped
result.Files = append(result.Files, prResult.Files...)
}
}
}
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitComplete(g.Name(), fmt.Sprintf("Collected %d items", result.Items), result)
}
return result, nil
}
// listOrgRepos returns all repository names for the configured org.
func (g *GitHubCollector) listOrgRepos(ctx context.Context) ([]string, error) {
cmd := exec.CommandContext(ctx, "gh", "repo", "list", g.Org,
"--json", "name",
"--limit", "1000",
)
out, err := cmd.Output()
if err != nil {
return nil, core.E("collect.GitHub.listOrgRepos", "failed to list repos", err)
}
var repos []ghRepo
if err := json.Unmarshal(out, &repos); err != nil {
return nil, core.E("collect.GitHub.listOrgRepos", "failed to parse repo list", err)
}
names := make([]string, len(repos))
for i, r := range repos {
names[i] = r.Name
}
return names, nil
}
// collectIssues collects issues for a single repository.
func (g *GitHubCollector) collectIssues(ctx context.Context, cfg *Config, repo string) (*Result, error) {
result := &Result{Source: fmt.Sprintf("github:%s/%s/issues", g.Org, repo)}
if cfg.DryRun {
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitProgress(g.Name(), fmt.Sprintf("[dry-run] Would collect issues for %s/%s", g.Org, repo), nil)
}
return result, nil
}
if cfg.Limiter != nil {
if err := cfg.Limiter.Wait(ctx, "github"); err != nil {
return result, err
}
}
repoRef := fmt.Sprintf("%s/%s", g.Org, repo)
cmd := exec.CommandContext(ctx, "gh", "issue", "list",
"--repo", repoRef,
"--json", "number,title,state,author,body,createdAt,labels,url",
"--limit", "100",
"--state", "all",
)
out, err := cmd.Output()
if err != nil {
return result, core.E("collect.GitHub.collectIssues", "gh issue list failed for "+repoRef, err)
}
var issues []ghIssue
if err := json.Unmarshal(out, &issues); err != nil {
return result, core.E("collect.GitHub.collectIssues", "failed to parse issues", err)
}
baseDir := filepath.Join(cfg.OutputDir, "github", g.Org, repo, "issues")
if err := cfg.Output.EnsureDir(baseDir); err != nil {
return result, core.E("collect.GitHub.collectIssues", "failed to create output directory", err)
}
for _, issue := range issues {
filePath := filepath.Join(baseDir, fmt.Sprintf("%d.md", issue.Number))
content := formatIssueMarkdown(issue)
if err := cfg.Output.Write(filePath, content); err != nil {
result.Errors++
continue
}
result.Items++
result.Files = append(result.Files, filePath)
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitItem(g.Name(), fmt.Sprintf("Issue #%d: %s", issue.Number, issue.Title), nil)
}
}
return result, nil
}
// collectPRs collects pull requests for a single repository.
func (g *GitHubCollector) collectPRs(ctx context.Context, cfg *Config, repo string) (*Result, error) {
result := &Result{Source: fmt.Sprintf("github:%s/%s/pulls", g.Org, repo)}
if cfg.DryRun {
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitProgress(g.Name(), fmt.Sprintf("[dry-run] Would collect PRs for %s/%s", g.Org, repo), nil)
}
return result, nil
}
if cfg.Limiter != nil {
if err := cfg.Limiter.Wait(ctx, "github"); err != nil {
return result, err
}
}
repoRef := fmt.Sprintf("%s/%s", g.Org, repo)
cmd := exec.CommandContext(ctx, "gh", "pr", "list",
"--repo", repoRef,
"--json", "number,title,state,author,body,createdAt,labels,url",
"--limit", "100",
"--state", "all",
)
out, err := cmd.Output()
if err != nil {
return result, core.E("collect.GitHub.collectPRs", "gh pr list failed for "+repoRef, err)
}
var prs []ghIssue
if err := json.Unmarshal(out, &prs); err != nil {
return result, core.E("collect.GitHub.collectPRs", "failed to parse pull requests", err)
}
baseDir := filepath.Join(cfg.OutputDir, "github", g.Org, repo, "pulls")
if err := cfg.Output.EnsureDir(baseDir); err != nil {
return result, core.E("collect.GitHub.collectPRs", "failed to create output directory", err)
}
for _, pr := range prs {
filePath := filepath.Join(baseDir, fmt.Sprintf("%d.md", pr.Number))
content := formatIssueMarkdown(pr)
if err := cfg.Output.Write(filePath, content); err != nil {
result.Errors++
continue
}
result.Items++
result.Files = append(result.Files, filePath)
if cfg.Dispatcher != nil {
cfg.Dispatcher.EmitItem(g.Name(), fmt.Sprintf("PR #%d: %s", pr.Number, pr.Title), nil)
}
}
return result, nil
}
// formatIssueMarkdown formats a GitHub issue or PR as markdown.
func formatIssueMarkdown(issue ghIssue) string {
var b strings.Builder
fmt.Fprintf(&b, "# %s\n\n", issue.Title)
fmt.Fprintf(&b, "- **Number:** #%d\n", issue.Number)
fmt.Fprintf(&b, "- **State:** %s\n", issue.State)
fmt.Fprintf(&b, "- **Author:** %s\n", issue.Author.Login)
fmt.Fprintf(&b, "- **Created:** %s\n", issue.CreatedAt.Format(time.RFC3339))
if len(issue.Labels) > 0 {
labels := make([]string, len(issue.Labels))
for i, l := range issue.Labels {
labels[i] = l.Name
}
fmt.Fprintf(&b, "- **Labels:** %s\n", strings.Join(labels, ", "))
}
if issue.URL != "" {
fmt.Fprintf(&b, "- **URL:** %s\n", issue.URL)
}
if issue.Body != "" {
fmt.Fprintf(&b, "\n%s\n", issue.Body)
}
return b.String()
}