From ebce05d6a45ece6b0b8ab23d00e7a8d8856189db Mon Sep 17 00:00:00 2001
From: Snider <snider@host.uk.com>
Date: Sat, 21 Feb 2026 21:16:54 +0000
Subject: [PATCH] feat: add RAG Python tools from CLI

Python scripts for RAG ingestion and querying (ingest.py, query.py).

Co-Authored-By: Virgil <virgil@lethean.io>
---
 tools/rag/README.md        | 193 ++++++++++++++++++++++++++++
 tools/rag/ingest.py        | 254 +++++++++++++++++++++++++++++++++++++
 tools/rag/query.py         | 196 ++++++++++++++++++++++++++++
 tools/rag/requirements.txt |   2 +
 4 files changed, 645 insertions(+)
 create mode 100644 tools/rag/README.md
 create mode 100644 tools/rag/ingest.py
 create mode 100644 tools/rag/query.py
 create mode 100644 tools/rag/requirements.txt

diff --git a/tools/rag/README.md b/tools/rag/README.md
new file mode 100644
index 0000000..e7a4f5d
--- /dev/null
+++ b/tools/rag/README.md
@@ -0,0 +1,193 @@
+# RAG Pipeline for Host UK Documentation
+
+Store documentation in a vector database so Claude (and local LLMs) can retrieve relevant context without being reminded every conversation.
+
+## The Problem This Solves
+
+> "The amount of times I've had to re-tell you how to make a Flux button is crazy"
+
+Instead of wasting context window on "remember, Flux buttons work like this...", the RAG system:
+1. Stores all documentation in Qdrant
+2. Claude queries before answering
+3. Relevant docs injected automatically
+4. No more re-teaching
+
+## Prerequisites
+
+**Already running on your lab:**
+- Qdrant: `linux.snider.dev:6333`
+- Ollama: `linux.snider.dev:11434` (or local)
+
+**Install Python deps:**
+```bash
+pip install -r requirements.txt
+```
+
+**Ensure embedding model is available:**
+```bash
+ollama pull nomic-embed-text
+```
+
+## Quick Start
+
+### 1. Ingest Documentation
+
+```bash
+# Ingest recovered Host UK docs
+python ingest.py /Users/snider/Code/host-uk/core/tasks/recovered-hostuk \
+    --collection hostuk-docs \
+    --recreate
+
+# Ingest Flux UI docs separately (higher priority)
+python ingest.py /path/to/flux-ui-docs \
+    --collection flux-ui-docs \
+    --recreate
+```
+
+### 2. Query the Database
+
+```bash
+# Search for Flux button docs
+python query.py "how to create a Flux button component"
+
+# Filter by category
+python query.py "path sandboxing" --category architecture
+
+# Get more results
+python query.py "Vi personality" --top 10
+
+# Output as JSON
+python query.py "brand voice" --format json
+
+# Output for LLM context injection
+python query.py "Flux modal component" --format context
+```
+
+### 3. List Collections
+
+```bash
+python query.py --list-collections
+python query.py --stats --collection flux-ui-docs
+```
+
+## Collections Strategy
+
+| Collection | Content | Priority |
+|------------|---------|----------|
+| `flux-ui-docs` | Flux Pro component docs | High (UI questions) |
+| `hostuk-docs` | Recovered implementation docs | Medium |
+| `brand-docs` | Vi, brand voice, visual identity | For content generation |
+| `lethean-docs` | SASE/dVPN technical docs | Product-specific |
+
+## Integration with Claude Code
+
+### Option 1: MCP Server (Best)
+
+Create an MCP server that Claude can query:
+
+```go
+// In core CLI
+func (s *RagServer) Query(query string) ([]Document, error) {
+    // Query Qdrant
+    // Return relevant docs
+}
+```
+
+Then Claude can call `rag.query("Flux button")` and get docs automatically.
+
+### Option 2: CLAUDE.md Instruction
+
+Add to project CLAUDE.md:
+
+````markdown
+## Before Answering UI Questions
+
+When asked about Flux UI components, query the RAG database first:
+```bash
+python /path/to/query.py "your question" --collection flux-ui-docs --format context
+```
+
+Include the retrieved context in your response.
+````
+
+### Option 3: Claude Code Hook
+
+Create a hook that auto-injects context for certain queries.
+
+## Category Taxonomy
+
+The ingestion automatically categorizes files:
+
+| Category | Matches |
+|----------|---------|
+| `ui-component` | flux, ui/component |
+| `brand` | brand, mascot |
+| `product-brief` | brief |
+| `help-doc` | help, draft |
+| `task` | task, plan |
+| `architecture` | architecture, migration |
+| `documentation` | default |
+
+## Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `QDRANT_HOST` | localhost | Qdrant server (set to `linux.snider.dev` for the lab instance) |
+| `QDRANT_PORT` | 6333 | Qdrant port |
+| `EMBEDDING_MODEL` | nomic-embed-text | Ollama model |
+| `CHUNK_SIZE` | 500 | Characters per chunk |
+| `CHUNK_OVERLAP` | 50 | Overlap between chunks |
+
+## Training a Model vs RAG
+
+**RAG** (what this does):
+- Model weights unchanged
+- Documents retrieved at query time
+- Knowledge updates instantly (re-ingest)
+- Good for: facts, API docs, current information
+
+**Fine-tuning** (separate process):
+- Model weights updated
+- Knowledge baked into model
+- Requires retraining to update
+- Good for: style, patterns, conventions
+
+**For Flux UI**: RAG is perfect. The docs change, API changes, you want current info.
+
+**For Vi's voice**: Fine-tuning is better. Style doesn't change often, should be "baked in".
+
+## Vector Math (For Understanding)
+
+```text
+"How do I make a Flux button?"
+    ↓ Embedding
+[0.12, -0.45, 0.78, ...768 floats...]
+    ↓ Cosine similarity search
+Find chunks with similar vectors
+    ↓ Results
+1. doc/ui/flux/components/button.md (score: 0.89)
+2. doc/ui/flux/forms.md (score: 0.76)
+3. doc/ui/flux/components/input.md (score: 0.71)
+```
+
+The embedding model converts text to "meaning vectors". Similar meanings = similar vectors = found by search.
+
+## Troubleshooting
+
+**"No results found"**
+- Lower threshold: `--threshold 0.3`
+- Check collection has data: `--stats`
+- Verify Ollama is running: `ollama list`
+
+**"Connection refused"**
+- Check Qdrant is running: `curl http://linux.snider.dev:6333/collections`
+- Check firewall/network
+
+**"Embedding model not available"**
+```bash
+ollama pull nomic-embed-text
+```
+
+---
+
+*Part of the Host UK Core CLI tooling*
diff --git a/tools/rag/ingest.py b/tools/rag/ingest.py
new file mode 100644
index 0000000..7755bc2
--- /dev/null
+++ b/tools/rag/ingest.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python3
+"""
+RAG Ingestion Pipeline for Host UK Documentation
+
+Chunks markdown files, generates embeddings via Ollama, stores in Qdrant.
+
+Usage:
+    python ingest.py /path/to/docs --collection hostuk-docs
+    python ingest.py /path/to/flux-ui --collection flux-ui-docs
+
+Requirements:
+    pip install qdrant-client ollama
+"""
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Generator
+
+try:
+    from qdrant_client import QdrantClient
+    from qdrant_client.models import Distance, VectorParams, PointStruct
+    import ollama
+except ImportError:
+    print("Install dependencies: pip install qdrant-client ollama")
+    sys.exit(1)
+
+
+# Configuration (environment variables override the defaults; VECTOR_DIM is fixed)
+QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")  # Qdrant server host
+QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))  # Qdrant server port
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "nomic-embed-text")  # model passed to ollama.embeddings
+CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "500"))  # chars
+CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "50"))  # chars
+VECTOR_DIM = 768  # nomic-embed-text dimension; not changed when EMBEDDING_MODEL is overridden
+
+
+def chunk_markdown(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> Generator[dict, None, None]:
+    """
+    Yield ``{"text", "section"}`` dicts covering ``text`` in reading order.
+
+    The text is cut before every line that starts with ``## ``. A section of
+    at most ``chunk_size`` characters is yielded whole. A longer section is
+    packed paragraph by paragraph (paragraphs are separated by blank lines);
+    when ``overlap`` is non-zero, a chunk started after a flush begins with
+    the last ``overlap`` characters of the flushed chunk.
+
+    Paragraphs are never split, so one paragraph longer than ``chunk_size``
+    still comes out as a single oversized chunk. Sections that are empty
+    after stripping are skipped.
+    """
+    for block in re.split(r'\n(?=## )', text):
+        body = block.strip()
+        if not body:
+            continue
+
+        # The section title is the first line when that line is a heading.
+        first_line = body.split('\n')[0]
+        heading = first_line.lstrip('#').strip() if first_line.startswith('#') else ""
+
+        # Short sections go out whole (length is measured before stripping).
+        if len(block) <= chunk_size:
+            yield {"text": body, "section": heading}
+            continue
+
+        # Long section: pack paragraphs until the next one would overflow.
+        buffer = ""
+        for paragraph in re.split(r'\n\n+', block):
+            fits = len(buffer) + len(paragraph) <= chunk_size
+            if fits and buffer:
+                buffer = buffer + "\n\n" + paragraph
+            elif fits:
+                buffer = paragraph
+            elif buffer:
+                # Flush the full buffer, then seed the next chunk with its tail.
+                yield {"text": buffer.strip(), "section": heading}
+                if overlap:
+                    buffer = buffer[-overlap:] + "\n\n" + paragraph
+                else:
+                    buffer = paragraph
+            else:
+                # Nothing buffered yet: the paragraph alone exceeds chunk_size.
+                buffer = paragraph
+
+        # Emit whatever is left after the last paragraph.
+        if buffer.strip():
+            yield {"text": buffer.strip(), "section": heading}
+
+
+def generate_embedding(text: str) -> list[float]:
+    """Return the "embedding" field of Ollama's reply for ``text`` (model: EMBEDDING_MODEL)."""
+    reply = ollama.embeddings(prompt=text, model=EMBEDDING_MODEL)
+    return reply["embedding"]
+
+
+def get_file_category(path: str) -> str:
+    """Map a file path to a category label by case-insensitive substring match.
+
+    Rules are tried in order, first hit wins; no hit yields "documentation".
+    """
+    rules = (
+        ("ui-component", ("flux", "ui/component")),
+        ("brand", ("brand", "mascot")),
+        ("product-brief", ("brief",)),
+        ("help-doc", ("help", "draft")),
+        ("task", ("task", "plan")),
+        ("architecture", ("architecture", "migration")),
+    )
+    lowered = path.lower()
+    for label, needles in rules:
+        if any(needle in lowered for needle in needles):
+            return label
+    return "documentation"
+
+
+def ingest_directory(
+    directory: Path,
+    client: QdrantClient,
+    collection: str,
+    verbose: bool = False
+) -> dict:
+    """
+    Embed every ``*.md`` file under ``directory`` and upsert it into ``collection``.
+
+    Files are found recursively and split with ``chunk_markdown``; each chunk
+    is embedded with ``generate_embedding`` and stored with its text, relative
+    source path, section title, category and chunk index as payload. A file's
+    points are staged and only queued once all of its chunks were embedded, so
+    a file that fails part-way is counted as an error and adds no partial data.
+
+    Returns a dict with the counters ``files``, ``chunks`` and ``errors``.
+    """
+    stats = {"files": 0, "chunks": 0, "errors": 0}
+    points = []
+
+    md_files = list(directory.rglob("*.md"))
+    print(f"Found {len(md_files)} markdown files")
+
+    for file_path in md_files:
+        try:
+            rel_path = str(file_path.relative_to(directory))
+
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                content = f.read()
+
+            if not content.strip():
+                continue
+
+            category = get_file_category(rel_path)
+            file_points = []  # staged so a failure discards the whole file
+
+            for i, chunk in enumerate(chunk_markdown(content)):
+                # Id is the MD5 of path, chunk index and the chunk's first 100 chars
+                chunk_id = hashlib.md5(
+                    f"{rel_path}:{i}:{chunk['text'][:100]}".encode()
+                ).hexdigest()
+                file_points.append(PointStruct(
+                    id=chunk_id,
+                    vector=generate_embedding(chunk["text"]),
+                    payload={
+                        "text": chunk["text"],
+                        "source": rel_path,
+                        "section": chunk["section"],
+                        "category": category,
+                        "chunk_index": i,
+                    }
+                ))
+                if verbose:
+                    print(f"  [{category}] {rel_path} chunk {i}: {len(chunk['text'])} chars")
+
+            points.extend(file_points)
+            stats["chunks"] += len(file_points)
+            stats["files"] += 1
+            if not verbose:
+                print(f"  Processed: {rel_path} ({stats['chunks']} chunks total)")
+
+        except Exception as e:
+            print(f"  Error processing {file_path}: {e}")
+            stats["errors"] += 1
+
+    if points:
+        print(f"\nUpserting {len(points)} vectors to Qdrant...")
+        # Upsert in batches of 100
+        batch_size = 100
+        for i in range(0, len(points), batch_size):
+            batch = points[i:i + batch_size]
+            client.upsert(collection_name=collection, points=batch)
+            print(f"  Uploaded batch {i // batch_size + 1}/{(len(points) - 1) // batch_size + 1}")
+
+    return stats
+
+
+def main():
+    """Parse CLI arguments, prepare the Qdrant collection and ingest the docs.
+
+    The path and the embedding model are checked before the collection is
+    touched, so a failed check can no longer follow a ``--recreate`` delete.
+    """
+    parser = argparse.ArgumentParser(description="Ingest markdown docs into Qdrant")
+    parser.add_argument("directory", type=Path, help="Directory containing markdown files")
+    parser.add_argument("--collection", default="hostuk-docs", help="Qdrant collection name")
+    parser.add_argument("--recreate", action="store_true", help="Delete and recreate collection")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    parser.add_argument("--qdrant-host", default=QDRANT_HOST, help="Qdrant host")
+    parser.add_argument("--qdrant-port", type=int, default=QDRANT_PORT, help="Qdrant port")
+    args = parser.parse_args()
+
+    # is_dir(), not exists(): a file path has no markdown files to walk
+    if not args.directory.is_dir():
+        print(f"Error: Directory not found: {args.directory}")
+        sys.exit(1)
+
+    # Verify the embedding model before any destructive collection change
+    print(f"Using embedding model: {EMBEDDING_MODEL}")
+    try:
+        ollama.embeddings(model=EMBEDDING_MODEL, prompt="test")
+    except Exception as e:
+        print(f"Error: Embedding model not available. Run: ollama pull {EMBEDDING_MODEL}")
+        print(f"  Details: {e}")
+        sys.exit(1)
+
+    print(f"Connecting to Qdrant at {args.qdrant_host}:{args.qdrant_port}...")
+    client = QdrantClient(host=args.qdrant_host, port=args.qdrant_port)
+    collections = [c.name for c in client.get_collections().collections]
+
+    if args.recreate and args.collection in collections:
+        print(f"Deleting existing collection: {args.collection}")
+        client.delete_collection(args.collection)
+        collections.remove(args.collection)
+
+    if args.collection not in collections:
+        print(f"Creating collection: {args.collection}")
+        client.create_collection(
+            collection_name=args.collection,
+            vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE)
+        )
+
+    print(f"\nIngesting from: {args.directory}")
+    stats = ingest_directory(args.directory, client, args.collection, args.verbose)
+
+    print(f"\n{'=' * 50}")
+    print("Ingestion complete!")
+    print(f"  Files processed: {stats['files']}")
+    print(f"  Chunks created:  {stats['chunks']}")
+    print(f"  Errors:          {stats['errors']}")
+    print(f"  Collection:      {args.collection}")
+    print(f"{'=' * 50}")
+
+if __name__ == "__main__":  # only run the CLI when executed as a script
+    main()
diff --git a/tools/rag/query.py b/tools/rag/query.py
new file mode 100644
index 0000000..24846d5
--- /dev/null
+++ b/tools/rag/query.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+"""
+RAG Query Tool for Host UK Documentation
+
+Query the vector database and retrieve relevant documentation chunks.
+
+Usage:
+    python query.py "how do I create a Flux button"
+    python query.py "what is Vi's personality" --collection hostuk-docs
+    python query.py "path sandboxing" --top 10 --category architecture
+
+Requirements:
+    pip install qdrant-client ollama
+"""
+
+import argparse
+import html
+import json
+import os
+import sys
+from typing import Optional
+
+try:
+    from qdrant_client import QdrantClient
+    from qdrant_client.models import Filter, FieldCondition, MatchValue
+    import ollama
+except ImportError:
+    print("Install dependencies: pip install qdrant-client ollama")
+    sys.exit(1)
+
+
+# Configuration (environment variables override the defaults)
+QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")  # Qdrant server host
+QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))  # Qdrant server port
+EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "nomic-embed-text")  # presumably must match the model used at ingestion; verify
+
+
+def generate_embedding(text: str) -> list[float]:
+    """Return the "embedding" field of Ollama's reply for ``text`` (model: EMBEDDING_MODEL)."""
+    reply = ollama.embeddings(prompt=text, model=EMBEDDING_MODEL)
+    return reply["embedding"]
+
+
+def query_rag(
+    query: str,
+    client: QdrantClient,
+    collection: str,
+    top_k: int = 5,
+    category: Optional[str] = None,
+    score_threshold: float = 0.5,
+) -> list[dict]:
+    """
+    Embed ``query`` and return the matching chunks from ``collection``.
+
+    ``top_k`` and ``score_threshold`` are passed to ``query_points`` as
+    ``limit`` and ``score_threshold``; a truthy ``category`` adds an exact
+    match filter on the ``category`` payload field. Each result dict holds
+    ``score``, ``text``, ``source``, ``section`` and ``category`` (the last
+    two default to "" when missing from the payload).
+    """
+    query_vector = generate_embedding(query)
+
+    # Only build a filter when a category was requested.
+    conditions = None
+    if category:
+        wanted = MatchValue(value=category)
+        conditions = Filter(must=[FieldCondition(key="category", match=wanted)])
+
+    response = client.query_points(
+        collection_name=collection,
+        query=query_vector,
+        query_filter=conditions,
+        limit=top_k,
+        score_threshold=score_threshold,
+    )
+    chunks = []
+    for hit in response.points:
+        chunks.append({
+            "score": hit.score,
+            "text": hit.payload["text"],
+            "source": hit.payload["source"],
+            "section": hit.payload.get("section", ""),
+            "category": hit.payload.get("category", ""),
+        })
+    return chunks
+
+
+def format_results(results: list[dict], query: str, format: str = "text") -> str:
+    """Render ``results`` as indented JSON or as a plain-text report.
+
+    ``format="json"`` dumps the list as-is, even when it is empty. Any other
+    value gives the text report, or a "No results found" line when the list
+    is empty. Chunk text longer than 500 characters is cut and gets "...".
+    """
+    if format == "json":
+        return json.dumps(results, indent=2)
+
+    if not results:
+        return f"No results found for: {query}"
+
+    # Report header
+    lines = [f"Query: {query}", f"Results: {len(results)}", "=" * 60]
+
+    for rank, item in enumerate(results, 1):
+        lines.append(f"\n[{rank}] {item['source']} (score: {item['score']:.3f})")
+        if item['section']:
+            lines.append(f"    Section: {item['section']}")
+        lines.append(f"    Category: {item['category']}")
+        lines.append("-" * 40)
+        # Keep the report readable: show at most 500 characters per chunk
+        body = item['text']
+        preview = body if len(body) <= 500 else body[:500] + "..."
+        lines.extend([preview, ""])
+
+    return "\n".join(lines)
+
+
+def format_for_context(results: list[dict], query: str) -> str:
+    """Wrap ``results`` in XML-like tags for LLM context ("" when empty).
+
+    Query, source, category and text all go through ``html.escape``.
+    """
+    if not results:
+        return ""
+
+    parts = [f'<retrieved_context query="{html.escape(query)}">']
+    for doc in results:
+        source = html.escape(doc["source"])
+        category = html.escape(doc["category"])
+        opening = f'\n<document source="{source}" category="{category}">'
+        parts += [opening, html.escape(doc['text']), "</document>"]
+
+    parts.append("\n</retrieved_context>")
+    return "\n".join(parts)
+
+def main():
+    """Run the query CLI: list collections, show stats, or run a search.
+
+    A failed ``--stats`` lookup prints the underlying error and exits with 1.
+    """
+    parser = argparse.ArgumentParser(description="Query RAG documentation")
+    parser.add_argument("query", nargs="?", help="Search query")
+    parser.add_argument("--collection", default="hostuk-docs", help="Qdrant collection name")
+    parser.add_argument("--top", "-k", type=int, default=5, help="Number of results")
+    parser.add_argument("--category", "-c", help="Filter by category")
+    parser.add_argument("--threshold", "-t", type=float, default=0.5, help="Score threshold")
+    parser.add_argument("--format", "-f", choices=["text", "json", "context"], default="text")
+    parser.add_argument("--qdrant-host", default=QDRANT_HOST)
+    parser.add_argument("--qdrant-port", type=int, default=QDRANT_PORT)
+    parser.add_argument("--list-collections", action="store_true", help="List available collections")
+    parser.add_argument("--stats", action="store_true", help="Show collection stats")
+    args = parser.parse_args()
+
+    client = QdrantClient(host=args.qdrant_host, port=args.qdrant_port)
+
+    # List collections
+    if args.list_collections:
+        collections = client.get_collections().collections
+        print("Available collections:")
+        for c in collections:
+            info = client.get_collection(c.name)
+            print(f"  - {c.name}: {info.points_count} vectors")
+        return
+
+    # Show stats
+    if args.stats:
+        try:
+            info = client.get_collection(args.collection)
+        except Exception as e:
+            print(f"Collection not found or unreachable: {args.collection} ({e})")
+            sys.exit(1)
+        print(f"Collection: {args.collection}")
+        print(f"  Vectors: {info.points_count}")
+        print(f"  Status: {info.status}")
+        return
+
+    if not args.query:
+        parser.print_help()
+        return
+
+    results = query_rag(
+        query=args.query,
+        client=client,
+        collection=args.collection,
+        top_k=args.top,
+        category=args.category,
+        score_threshold=args.threshold,
+    )
+
+    if args.format == "context":
+        print(format_for_context(results, args.query))
+    else:
+        print(format_results(results, args.query, args.format))
+
+
+if __name__ == "__main__":  # only run the CLI when executed as a script
+    main()
\ No newline at end of file
diff --git a/tools/rag/requirements.txt b/tools/rag/requirements.txt
new file mode 100644
index 0000000..cd4cc3e
--- /dev/null
+++ b/tools/rag/requirements.txt
@@ -0,0 +1,2 @@
+qdrant-client>=1.12.0,<2.0.0
+ollama>=0.1.0
\ No newline at end of file