cli/pkg/rag/ingest.go
Claude 23b82482f2 refactor: rename module from github.com/host-uk/core to forge.lthn.ai/core/cli
Move module identity to our own Forgejo instance. All import paths
updated across 434 Go files, sub-module go.mod files, and go.work.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 05:53:52 +00:00

216 lines
5.2 KiB
Go

package rag
import (
"context"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"forge.lthn.ai/core/cli/pkg/log"
)
// IngestConfig bundles the settings that drive a directory ingestion run.
type IngestConfig struct {
	// Directory is the folder Ingest walks for documents; Collection is the
	// name of the Qdrant collection the resulting points are written to.
	Directory, Collection string
	// Recreate makes Ingest delete an already existing collection before
	// ingesting; Verbose enables printing of per-chunk embedding errors.
	Recreate, Verbose bool
	// BatchSize is the number of points sent per upsert call. Ingest
	// substitutes 100 when the value is not positive.
	BatchSize int
	// Chunk is the chunking configuration handed to ChunkMarkdown.
	Chunk ChunkConfig
}
// DefaultIngestConfig builds the baseline ingestion configuration: the
// "hostuk-docs" collection, batches of 100 points and the chunk settings
// returned by DefaultChunkConfig. Every other field keeps its zero value.
func DefaultIngestConfig() IngestConfig {
	var cfg IngestConfig
	cfg.Collection = "hostuk-docs"
	cfg.BatchSize = 100
	cfg.Chunk = DefaultChunkConfig()
	return cfg
}
// IngestStats summarises the outcome of an ingestion run.
type IngestStats struct {
	// Files counts the files that were processed, Chunks the chunks that
	// were embedded and queued for storage, and Errors the files or chunks
	// that were skipped because of a failure.
	Files, Chunks, Errors int
}
// IngestProgress is the callback signature Ingest uses to report progress.
// Ingest invokes it once per processed file with that file's path relative to
// the ingested directory, the running count of chunks embedded so far, and
// the total number of files selected for ingestion.
type IngestProgress func(file string, chunks, total int)
// Ingest processes a directory of documents and stores them in Qdrant.
//
// It walks cfg.Directory recursively, keeps every file accepted by
// ShouldProcess, splits each one with ChunkMarkdown, embeds every chunk via
// ollama and finally upserts the resulting points into cfg.Collection in
// batches of cfg.BatchSize. The collection is created when it does not exist,
// and is deleted and recreated first when cfg.Recreate is set.
//
// Failures that affect a single file or chunk (relative-path resolution, read
// or embedding errors) are tallied in IngestStats.Errors and skipped; they are
// also printed when cfg.Verbose is set. Setup failures, an empty file
// selection, a done ctx and upsert failures abort the run. When progress is
// non-nil it is called after each processed file with the relative path, the
// running chunk total and the number of files selected.
//
// The returned stats are nil when the run fails before any file is handled,
// and hold the counts gathered so far when it is interrupted later on.
func Ingest(ctx context.Context, qdrant *QdrantClient, ollama *OllamaClient, cfg IngestConfig, progress IngestProgress) (*IngestStats, error) {
	const op = "rag.Ingest"
	stats := &IngestStats{}

	// A non-positive batch size would keep the upsert loop from advancing.
	if cfg.BatchSize <= 0 {
		cfg.BatchSize = 100 // Safe default
	}

	// Resolve and validate the directory.
	absDir, err := filepath.Abs(cfg.Directory)
	if err != nil {
		return nil, log.E(op, "error resolving directory", err)
	}
	info, err := os.Stat(absDir)
	if err != nil {
		return nil, log.E(op, "error accessing directory", err)
	}
	if !info.IsDir() {
		return nil, log.E(op, fmt.Sprintf("not a directory: %s", absDir), nil)
	}

	// Check/create the collection, dropping it first when asked to.
	exists, err := qdrant.CollectionExists(ctx, cfg.Collection)
	if err != nil {
		return nil, log.E(op, "error checking collection", err)
	}
	if cfg.Recreate && exists {
		if err := qdrant.DeleteCollection(ctx, cfg.Collection); err != nil {
			return nil, log.E(op, "error deleting collection", err)
		}
		exists = false
	}
	if !exists {
		vectorDim := ollama.EmbedDimension()
		if err := qdrant.CreateCollection(ctx, cfg.Collection, vectorDim); err != nil {
			return nil, log.E(op, "error creating collection", err)
		}
	}

	// Collect the files to ingest.
	var files []string
	err = filepath.WalkDir(absDir, func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if !d.IsDir() && ShouldProcess(path) {
			files = append(files, path)
		}
		return nil
	})
	if err != nil {
		return nil, log.E(op, "error walking directory", err)
	}
	if len(files) == 0 {
		return nil, log.E(op, fmt.Sprintf("no markdown files found in %s", absDir), nil)
	}

	// Chunk and embed every file, accumulating the points to store.
	var points []Point
	for _, filePath := range files {
		// Stop promptly once the caller has given up instead of reading
		// (and failing to embed) every remaining file.
		if err := ctx.Err(); err != nil {
			return stats, log.E(op, "ingestion interrupted", err)
		}

		relPath, err := filepath.Rel(absDir, filePath)
		if err != nil {
			stats.Errors++
			if cfg.Verbose {
				fmt.Printf(" Error resolving %s: %v\n", filePath, err)
			}
			continue
		}
		content, err := os.ReadFile(filePath)
		if err != nil {
			stats.Errors++
			if cfg.Verbose {
				fmt.Printf(" Error reading %s: %v\n", relPath, err)
			}
			continue
		}
		text := string(content)
		if strings.TrimSpace(text) == "" {
			// Blank files are neither counted nor reported.
			continue
		}

		category := Category(relPath)
		for _, chunk := range ChunkMarkdown(text, cfg.Chunk) {
			embedding, err := ollama.Embed(ctx, chunk.Text)
			if err != nil {
				stats.Errors++
				if cfg.Verbose {
					fmt.Printf(" Error embedding %s chunk %d: %v\n", relPath, chunk.Index, err)
				}
				continue
			}
			points = append(points, Point{
				ID:     ChunkID(relPath, chunk.Index, chunk.Text),
				Vector: embedding,
				Payload: map[string]any{
					"text":        chunk.Text,
					"source":      relPath,
					"section":     chunk.Section,
					"category":    category,
					"chunk_index": chunk.Index,
				},
			})
			stats.Chunks++
		}

		stats.Files++
		if progress != nil {
			progress(relPath, stats.Chunks, len(files))
		}
	}

	// Batch upsert to Qdrant; the loop body never runs when points is empty.
	for start := 0; start < len(points); start += cfg.BatchSize {
		end := start + cfg.BatchSize
		if end > len(points) {
			end = len(points)
		}
		if err := qdrant.UpsertPoints(ctx, cfg.Collection, points[start:end]); err != nil {
			return stats, log.E(op, fmt.Sprintf("error upserting batch %d", start/cfg.BatchSize+1), err)
		}
	}
	return stats, nil
}
// IngestFile processes a single file and stores it in Qdrant.
//
// The file at filePath is read and split with ChunkMarkdown using chunkCfg.
// Every chunk is embedded through ollama, then all points are upserted into
// collection in a single call. filePath is used verbatim as the "source"
// payload value, as the input to Category and as part of each chunk ID. The
// collection is not checked or created here.
//
// It returns the number of points stored. A file that is empty, contains only
// whitespace or produces no chunks yields (0, nil) without contacting Qdrant,
// matching the empty-points guard in Ingest. Any read, embedding or upsert
// error aborts the call; nothing is upserted when an embedding fails.
func IngestFile(ctx context.Context, qdrant *QdrantClient, ollama *OllamaClient, collection string, filePath string, chunkCfg ChunkConfig) (int, error) {
	const op = "rag.IngestFile"

	content, err := os.ReadFile(filePath)
	if err != nil {
		return 0, log.E(op, "error reading file", err)
	}
	text := string(content)
	if strings.TrimSpace(text) == "" {
		return 0, nil
	}

	category := Category(filePath)
	chunks := ChunkMarkdown(text, chunkCfg)
	points := make([]Point, 0, len(chunks))
	for _, chunk := range chunks {
		embedding, err := ollama.Embed(ctx, chunk.Text)
		if err != nil {
			return 0, log.E(op, fmt.Sprintf("error embedding chunk %d", chunk.Index), err)
		}
		points = append(points, Point{
			ID:     ChunkID(filePath, chunk.Index, chunk.Text),
			Vector: embedding,
			Payload: map[string]any{
				"text":        chunk.Text,
				"source":      filePath,
				"section":     chunk.Section,
				"category":    category,
				"chunk_index": chunk.Index,
			},
		})
	}

	// Nothing to store: skip the round trip rather than send an empty upsert.
	if len(points) == 0 {
		return 0, nil
	}
	if err := qdrant.UpsertPoints(ctx, collection, points); err != nil {
		return 0, log.E(op, "error upserting points", err)
	}
	return len(points), nil
}