cli/internal/cmd/rag/cmd_ingest.go
Snider f2bc912ebe feat: infrastructure packages and lint cleanup (#281)
* ci: consolidate duplicate workflows and merge CodeQL configs

Remove 17 duplicate workflow files that were split copies of the
combined originals. Each family (CI, CodeQL, Coverage, PR Build,
Alpha Release) had the same job duplicated across separate
push/pull_request/schedule/manual trigger files.

Merge codeql.yml and codescan.yml into a single codeql.yml with
a language matrix covering go, javascript-typescript, python,
and actions — matching the previous default setup coverage.

Remaining workflows (one per family):
- ci.yml (push + PR + manual)
- codeql.yml (push + PR + schedule, all languages)
- coverage.yml (push + PR + manual)
- alpha-release.yml (push + manual)
- pr-build.yml (PR + manual)
- release.yml (tag push)
- agent-verify.yml, auto-label.yml, auto-project.yml

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat: add collect, config, crypt, plugin packages and fix all lint issues

Add four new infrastructure packages with CLI commands:
- pkg/config: layered configuration (defaults → file → env → flags)
- pkg/crypt: crypto primitives (Argon2id, AES-GCM, ChaCha20, HMAC, checksums)
- pkg/plugin: plugin system with GitHub-based install/update/remove
- pkg/collect: collection subsystem (GitHub, BitcoinTalk, market, papers, excavate)

Fix all golangci-lint issues across the entire codebase (~100 errcheck,
staticcheck SA1012/SA1019/ST1005, unused, ineffassign fixes) so that
`core go qa` passes with 0 issues.

Closes #167, #168, #170, #250, #251, #252, #253, #254, #255, #256

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 11:34:43 +00:00

171 lines
4.2 KiB
Go

package rag
import (
"context"
"fmt"
"github.com/host-uk/core/pkg/cli"
"github.com/host-uk/core/pkg/i18n"
"github.com/host-uk/core/pkg/rag"
"github.com/spf13/cobra"
)
var (
collection string
recreate bool
chunkSize int
chunkOverlap int
)
var ingestCmd = &cobra.Command{
Use: "ingest [directory]",
Short: i18n.T("cmd.rag.ingest.short"),
Long: i18n.T("cmd.rag.ingest.long"),
Args: cobra.MaximumNArgs(1),
RunE: runIngest,
}
func runIngest(cmd *cobra.Command, args []string) error {
directory := "."
if len(args) > 0 {
directory = args[0]
}
ctx := context.Background()
// Connect to Qdrant
fmt.Printf("Connecting to Qdrant at %s:%d...\n", qdrantHost, qdrantPort)
qdrantClient, err := rag.NewQdrantClient(rag.QdrantConfig{
Host: qdrantHost,
Port: qdrantPort,
UseTLS: false,
})
if err != nil {
return fmt.Errorf("failed to connect to Qdrant: %w", err)
}
defer func() { _ = qdrantClient.Close() }()
if err := qdrantClient.HealthCheck(ctx); err != nil {
return fmt.Errorf("qdrant health check failed: %w", err)
}
// Connect to Ollama
fmt.Printf("Using embedding model: %s (via %s:%d)\n", model, ollamaHost, ollamaPort)
ollamaClient, err := rag.NewOllamaClient(rag.OllamaConfig{
Host: ollamaHost,
Port: ollamaPort,
Model: model,
})
if err != nil {
return fmt.Errorf("failed to connect to Ollama: %w", err)
}
if err := ollamaClient.VerifyModel(ctx); err != nil {
return err
}
// Configure ingestion
if chunkSize <= 0 {
return fmt.Errorf("chunk-size must be > 0")
}
if chunkOverlap < 0 || chunkOverlap >= chunkSize {
return fmt.Errorf("chunk-overlap must be >= 0 and < chunk-size")
}
cfg := rag.IngestConfig{
Directory: directory,
Collection: collection,
Recreate: recreate,
Verbose: verbose,
BatchSize: 100,
Chunk: rag.ChunkConfig{
Size: chunkSize,
Overlap: chunkOverlap,
},
}
// Progress callback
progress := func(file string, chunks int, total int) {
if verbose {
fmt.Printf(" Processed: %s (%d chunks total)\n", file, chunks)
} else {
fmt.Printf("\r %s (%d chunks) ", cli.DimStyle.Render(file), chunks)
}
}
// Run ingestion
fmt.Printf("\nIngesting from: %s\n", directory)
if recreate {
fmt.Printf(" (recreating collection: %s)\n", collection)
}
stats, err := rag.Ingest(ctx, qdrantClient, ollamaClient, cfg, progress)
if err != nil {
return err
}
// Summary
fmt.Printf("\n\n%s\n", cli.TitleStyle.Render("Ingestion complete!"))
fmt.Printf(" Files processed: %d\n", stats.Files)
fmt.Printf(" Chunks created: %d\n", stats.Chunks)
if stats.Errors > 0 {
fmt.Printf(" Errors: %s\n", cli.ErrorStyle.Render(fmt.Sprintf("%d", stats.Errors)))
}
fmt.Printf(" Collection: %s\n", collection)
return nil
}
// IngestDirectory is exported for use by other packages (e.g., MCP).
func IngestDirectory(ctx context.Context, directory, collectionName string, recreateCollection bool) error {
qdrantClient, err := rag.NewQdrantClient(rag.DefaultQdrantConfig())
if err != nil {
return err
}
defer func() { _ = qdrantClient.Close() }()
if err := qdrantClient.HealthCheck(ctx); err != nil {
return fmt.Errorf("qdrant health check failed: %w", err)
}
ollamaClient, err := rag.NewOllamaClient(rag.DefaultOllamaConfig())
if err != nil {
return err
}
if err := ollamaClient.VerifyModel(ctx); err != nil {
return err
}
cfg := rag.DefaultIngestConfig()
cfg.Directory = directory
cfg.Collection = collectionName
cfg.Recreate = recreateCollection
_, err = rag.Ingest(ctx, qdrantClient, ollamaClient, cfg, nil)
return err
}
// IngestFile is exported for use by other packages (e.g., MCP).
func IngestFile(ctx context.Context, filePath, collectionName string) (int, error) {
qdrantClient, err := rag.NewQdrantClient(rag.DefaultQdrantConfig())
if err != nil {
return 0, err
}
defer func() { _ = qdrantClient.Close() }()
if err := qdrantClient.HealthCheck(ctx); err != nil {
return 0, fmt.Errorf("qdrant health check failed: %w", err)
}
ollamaClient, err := rag.NewOllamaClient(rag.DefaultOllamaConfig())
if err != nil {
return 0, err
}
if err := ollamaClient.VerifyModel(ctx); err != nil {
return 0, err
}
return rag.IngestFile(ctx, qdrantClient, ollamaClient, collectionName, filePath, rag.DefaultChunkConfig())
}