From ebce05d6a45ece6b0b8ab23d00e7a8d8856189db Mon Sep 17 00:00:00 2001 From: Snider Date: Sat, 21 Feb 2026 21:16:54 +0000 Subject: [PATCH] feat: add RAG Python tools from CLI Python scripts for RAG ingestion and querying (ingest.py, query.py). Co-Authored-By: Virgil --- tools/rag/README.md | 193 ++++++++++++++++++++++++++++ tools/rag/ingest.py | 254 +++++++++++++++++++++++++++++++++++++ tools/rag/query.py | 196 ++++++++++++++++++++++++++++ tools/rag/requirements.txt | 2 + 4 files changed, 645 insertions(+) create mode 100644 tools/rag/README.md create mode 100644 tools/rag/ingest.py create mode 100644 tools/rag/query.py create mode 100644 tools/rag/requirements.txt diff --git a/tools/rag/README.md b/tools/rag/README.md new file mode 100644 index 0000000..e7a4f5d --- /dev/null +++ b/tools/rag/README.md @@ -0,0 +1,193 @@ +# RAG Pipeline for Host UK Documentation + +Store documentation in a vector database so Claude (and local LLMs) can retrieve relevant context without being reminded every conversation. + +## The Problem This Solves + +> "The amount of times I've had to re-tell you how to make a Flux button is crazy" + +Instead of wasting context window on "remember, Flux buttons work like this...", the RAG system: +1. Stores all documentation in Qdrant +2. Claude queries before answering +3. Relevant docs injected automatically +4. No more re-teaching + +## Prerequisites + +**Already running on your lab:** +- Qdrant: `linux.snider.dev:6333` +- Ollama: `linux.snider.dev:11434` (or local) + +**Install Python deps:** +```bash +pip install -r requirements.txt +``` + +**Ensure embedding model is available:** +```bash +ollama pull nomic-embed-text +``` + +## Quick Start + +### 1. 
Ingest Documentation + +```bash +# Ingest recovered Host UK docs +python ingest.py /Users/snider/Code/host-uk/core/tasks/recovered-hostuk \ + --collection hostuk-docs \ + --recreate + +# Ingest Flux UI docs separately (higher priority) +python ingest.py /path/to/flux-ui-docs \ + --collection flux-ui-docs \ + --recreate +``` + +### 2. Query the Database + +```bash +# Search for Flux button docs +python query.py "how to create a Flux button component" + +# Filter by category +python query.py "path sandboxing" --category architecture + +# Get more results +python query.py "Vi personality" --top 10 + +# Output as JSON +python query.py "brand voice" --format json + +# Output for LLM context injection +python query.py "Flux modal component" --format context +``` + +### 3. List Collections + +```bash +python query.py --list-collections +python query.py --stats --collection flux-ui-docs +``` + +## Collections Strategy + +| Collection | Content | Priority | +|------------|---------|----------| +| `flux-ui-docs` | Flux Pro component docs | High (UI questions) | +| `hostuk-docs` | Recovered implementation docs | Medium | +| `brand-docs` | Vi, brand voice, visual identity | For content generation | +| `lethean-docs` | SASE/dVPN technical docs | Product-specific | + +## Integration with Claude Code + +### Option 1: MCP Server (Best) + +Create an MCP server that Claude can query: + +```go +// In core CLI +func (s *RagServer) Query(query string) ([]Document, error) { + // Query Qdrant + // Return relevant docs +} +``` + +Then Claude can call `rag.query("Flux button")` and get docs automatically. + +### Option 2: CLAUDE.md Instruction + +Add to project CLAUDE.md: + +```markdown +## Before Answering UI Questions + +When asked about Flux UI components, query the RAG database first: +```bash +python /path/to/query.py "your question" --collection flux-ui-docs --format context +``` + +Include the retrieved context in your response. 
+``` + +### Option 3: Claude Code Hook + +Create a hook that auto-injects context for certain queries. + +## Category Taxonomy + +The ingestion automatically categorizes files: + +| Category | Matches | +|----------|---------| +| `ui-component` | flux, ui/component | +| `brand` | brand, mascot | +| `product-brief` | brief | +| `help-doc` | help, draft | +| `task` | task, plan | +| `architecture` | architecture, migration | +| `documentation` | default | + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `QDRANT_HOST` | localhost | Qdrant server (the lab instance is `linux.snider.dev`) | +| `QDRANT_PORT` | 6333 | Qdrant port | +| `EMBEDDING_MODEL` | nomic-embed-text | Ollama model | +| `CHUNK_SIZE` | 500 | Characters per chunk | +| `CHUNK_OVERLAP` | 50 | Overlap between chunks | + +## Training a Model vs RAG + +**RAG** (what this does): +- Model weights unchanged +- Documents retrieved at query time +- Knowledge updates instantly (re-ingest) +- Good for: facts, API docs, current information + +**Fine-tuning** (separate process): +- Model weights updated +- Knowledge baked into model +- Requires retraining to update +- Good for: style, patterns, conventions + +**For Flux UI**: RAG is perfect. The docs change, API changes, you want current info. + +**For Vi's voice**: Fine-tuning is better. Style doesn't change often, should be "baked in". + +## Vector Math (For Understanding) + +```text +"How do I make a Flux button?" + ↓ Embedding +[0.12, -0.45, 0.78, ...768 floats...] + ↓ Cosine similarity search +Find chunks with similar vectors + ↓ Results +1. doc/ui/flux/components/button.md (score: 0.89) +2. doc/ui/flux/forms.md (score: 0.76) +3. doc/ui/flux/components/input.md (score: 0.71) +``` + +The embedding model converts text to "meaning vectors". Similar meanings = similar vectors = found by search. 
def _split_oversized(paragraph: str, chunk_size: int) -> list[str]:
    """Break a paragraph that is longer than chunk_size into smaller pieces.

    Paragraphs that already fit (or a non-positive chunk_size) are returned
    unchanged as a single-element list. Otherwise the paragraph is split on
    line boundaries, and any single line that is still too long is sliced
    into chunk_size-character pieces.
    """
    if chunk_size <= 0 or len(paragraph) <= chunk_size:
        return [paragraph]

    pieces = []
    current = ""
    for line in paragraph.split('\n'):
        # A single line that cannot fit on its own is hard-sliced.
        while len(line) > chunk_size:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(line[:chunk_size])
            line = line[chunk_size:]

        candidate = f"{current}\n{line}" if current else line
        if len(candidate) <= chunk_size:
            current = candidate
        else:
            # `current` is non-empty here: an empty `current` makes
            # candidate == line, which always fits after the loop above.
            pieces.append(current)
            current = line

    if current:
        pieces.append(current)
    return pieces


def chunk_markdown(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> Generator[dict, None, None]:
    """
    Chunk markdown by sections (## headers), then by paragraphs if too long.
    Preserves context with overlap.

    Args:
        text: Markdown source to chunk.
        chunk_size: Target maximum characters per chunk.
        overlap: Number of trailing characters of a finished chunk that are
            repeated at the start of the next chunk of the same section.

    Yields:
        Dicts with "text" (the stripped chunk) and "section" (the text of the
        section's first line with leading '#' removed, or "" if that line is
        not a header).

    Sections that fit in chunk_size are yielded whole. Larger sections are
    packed paragraph by paragraph; a paragraph that is itself longer than
    chunk_size is first broken up by _split_oversized so that no chunk grows
    without bound (previously such a paragraph was emitted as one oversized
    chunk). Because the overlap prefix and a blank-line separator are added
    on top, a chunk can still reach about chunk_size + overlap + 2 characters.
    """
    # Split by ## headers first
    sections = re.split(r'\n(?=## )', text)

    for section in sections:
        if not section.strip():
            continue

        # Extract section title
        lines = section.strip().split('\n')
        title = lines[0].lstrip('#').strip() if lines[0].startswith('#') else ""

        # If section is small enough, yield as-is
        if len(section) <= chunk_size:
            yield {
                "text": section.strip(),
                "section": title,
            }
            continue

        # Otherwise, chunk by paragraphs (oversized paragraphs are pre-split)
        current_chunk = ""

        for para in re.split(r'\n\n+', section):
            for piece in _split_oversized(para, chunk_size):
                if len(current_chunk) + len(piece) <= chunk_size:
                    current_chunk += "\n\n" + piece if current_chunk else piece
                else:
                    if current_chunk:
                        yield {
                            "text": current_chunk.strip(),
                            "section": title,
                        }
                    # Start new chunk with overlap from previous
                    if overlap and current_chunk:
                        overlap_text = current_chunk[-overlap:]
                        current_chunk = overlap_text + "\n\n" + piece
                    else:
                        current_chunk = piece

        # Don't forget the last chunk
        if current_chunk.strip():
            yield {
                "text": current_chunk.strip(),
                "section": title,
            }
def main():
    """Command-line entry point: ingest a directory of markdown into Qdrant.

    Order of operations:
      1. Parse arguments and check that the source path is a directory.
      2. Connect to Qdrant and list the existing collections.
      3. Probe the Ollama embedding model BEFORE modifying any collection, so
         that --recreate never deletes data that could not be re-ingested.
      4. Delete (with --recreate) and/or create the collection, sized to the
         dimension the model actually returned rather than a fixed constant.
      5. Ingest the files and print a summary.

    Exits with status 1 when the directory is missing or the embedding model
    cannot be used.
    """
    parser = argparse.ArgumentParser(description="Ingest markdown docs into Qdrant")
    parser.add_argument("directory", type=Path, help="Directory containing markdown files")
    parser.add_argument("--collection", default="hostuk-docs", help="Qdrant collection name")
    parser.add_argument("--recreate", action="store_true", help="Delete and recreate collection")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--qdrant-host", default=QDRANT_HOST, help="Qdrant host")
    parser.add_argument("--qdrant-port", type=int, default=QDRANT_PORT, help="Qdrant port")

    args = parser.parse_args()

    # is_dir() rather than exists(): a regular file passes exists() but
    # rglob() on it finds nothing, so the run would silently ingest zero files.
    if not args.directory.is_dir():
        print(f"Error: Directory not found: {args.directory}")
        sys.exit(1)

    # Connect to Qdrant
    print(f"Connecting to Qdrant at {args.qdrant_host}:{args.qdrant_port}...")
    client = QdrantClient(host=args.qdrant_host, port=args.qdrant_port)
    collections = [c.name for c in client.get_collections().collections]

    # Verify the Ollama model before any destructive collection change.
    print(f"Using embedding model: {EMBEDDING_MODEL}")
    try:
        probe = ollama.embeddings(model=EMBEDDING_MODEL, prompt="test")["embedding"]
    except Exception as e:
        print(f"Error: Embedding model not available. Run: ollama pull {EMBEDDING_MODEL}")
        # Surface the real cause: it may be a connection problem, not a missing model.
        print(f"  Cause: {e}")
        sys.exit(1)

    if not probe:
        print(f"Error: Embedding model {EMBEDDING_MODEL} returned an empty embedding")
        sys.exit(1)

    # Size the collection from the model's real output so a non-default
    # EMBEDDING_MODEL does not produce a dimension mismatch on upsert.
    vector_dim = len(probe)

    # Create or recreate collection
    if args.recreate and args.collection in collections:
        print(f"Deleting existing collection: {args.collection}")
        client.delete_collection(args.collection)
        collections.remove(args.collection)

    if args.collection not in collections:
        print(f"Creating collection: {args.collection}")
        client.create_collection(
            collection_name=args.collection,
            vectors_config=VectorParams(size=vector_dim, distance=Distance.COSINE)
        )

    # Ingest files
    print(f"\nIngesting from: {args.directory}")
    stats = ingest_directory(args.directory, client, args.collection, args.verbose)

    # Summary
    print(f"\n{'=' * 50}")
    print("Ingestion complete!")
    print(f"  Files processed: {stats['files']}")
    print(f"  Chunks created: {stats['chunks']}")
    print(f"  Errors: {stats['errors']}")
    print(f"  Collection: {args.collection}")
    print(f"{'=' * 50}")
return relevant chunks.""" + + # Generate query embedding + query_embedding = generate_embedding(query) + + # Build filter if category specified + query_filter = None + if category: + query_filter = Filter( + must=[ + FieldCondition(key="category", match=MatchValue(value=category)) + ] + ) + + # Search + results = client.query_points( + collection_name=collection, + query=query_embedding, + query_filter=query_filter, + limit=top_k, + score_threshold=score_threshold, + ).points + + return [ + { + "score": hit.score, + "text": hit.payload["text"], + "source": hit.payload["source"], + "section": hit.payload.get("section", ""), + "category": hit.payload.get("category", ""), + } + for hit in results + ] + + +def format_results(results: list[dict], query: str, format: str = "text") -> str: + """Format results for display.""" + + if format == "json": + return json.dumps(results, indent=2) + + if not results: + return f"No results found for: {query}" + + output = [] + output.append(f"Query: {query}") + output.append(f"Results: {len(results)}") + output.append("=" * 60) + + for i, r in enumerate(results, 1): + output.append(f"\n[{i}] {r['source']} (score: {r['score']:.3f})") + if r['section']: + output.append(f" Section: {r['section']}") + output.append(f" Category: {r['category']}") + output.append("-" * 40) + # Truncate long text for display + text = r['text'] + if len(text) > 500: + text = text[:500] + "..." 
def format_for_context(results: list[dict], query: str) -> str:
    """Format results as a delimited context block for LLM injection.

    Args:
        results: Result dicts; each must have "text" and "source", and may
            have "section" and "category".
        query: The search query that produced the results.

    Returns:
        "" when there are no results. Otherwise an XML-style block: one
        <retrieved_context> element carrying the query, containing one
        <document> element per result with its source, section and category
        as attributes and the chunk text as content.

    All interpolated values go through html.escape so that quotes or angle
    brackets in documents or queries cannot break out of the wrapper markup.
    """

    if not results:
        return ""

    output = []
    output.append(f'<retrieved_context query="{html.escape(query)}">')

    for r in results:
        source = html.escape(str(r['source']))
        section = html.escape(str(r.get('section', '')))
        category = html.escape(str(r.get('category', '')))
        output.append(f'\n<document source="{source}" section="{section}" category="{category}">')
        output.append(html.escape(r['text']))
        output.append("</document>")

    output.append("\n</retrieved_context>")

    return "\n".join(output)
+ # Query required + if not args.query: + parser.print_help() + return + + # Execute query + results = query_rag( + query=args.query, + client=client, + collection=args.collection, + top_k=args.top, + category=args.category, + score_threshold=args.threshold, + ) + + # Format output + if args.format == "context": + print(format_for_context(results, args.query)) + else: + print(format_results(results, args.query, args.format)) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tools/rag/requirements.txt b/tools/rag/requirements.txt new file mode 100644 index 0000000..cd4cc3e --- /dev/null +++ b/tools/rag/requirements.txt @@ -0,0 +1,2 @@ +qdrant-client>=1.12.0,<2.0.0 +ollama>=0.1.0 \ No newline at end of file