go-rag/keyword_test.go
Snider d8fd067a8c feat: Phase 3 enhancements — sentence splitting, collection helpers, keyword filter, benchmarks
3.1: Sentence-aware chunk splitting at ". ", "? ", "! " boundaries when
paragraphs exceed ChunkConfig.Size. Overlap now aligns to word boundaries
to avoid mid-word splits.

3.2: VectorStore interface gains ListCollections and CollectionInfo methods.
New collections.go with ListCollections, DeleteCollection, CollectionStats
helpers returning backend-agnostic CollectionInfo. Mock updated accordingly.

3.3: KeywordFilter re-ranks QueryResults by boosting scores for keyword
matches (case-insensitive, +10% per keyword). QueryConfig.Keywords flag
enables automatic extraction and filtering.

3.4: Mock-only benchmarks for chunking, query, ingest, formatting, and
keyword filtering.

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-20 08:02:00 +00:00

216 lines
7.1 KiB
Go

package rag
import (
"context"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// --- KeywordFilter tests ---
// TestKeywordFilter verifies how KeywordFilter boosts and re-orders results.
//
// The expectations pinned here: each keyword found in a result's text
// (compared case-insensitively) raises the score by 10% of its original
// value, results come back ordered by boosted score (highest first), and a
// nil or empty keyword list leaves the scores untouched.
//
// Every scenario supplies the input results, the keywords, and a verify
// callback that receives KeywordFilter's output for that input.
func TestKeywordFilter(t *testing.T) {
	type scenario struct {
		name     string
		input    []QueryResult
		keywords []string
		verify   func(t *testing.T, got []QueryResult)
	}

	scenarios := []scenario{
		{
			name: "no keywords returns results unchanged",
			input: []QueryResult{
				{Text: "Hello world.", Score: 0.9},
				{Text: "Goodbye world.", Score: 0.8},
			},
			keywords: nil,
			verify: func(t *testing.T, got []QueryResult) {
				require.Len(t, got, 2)
				assert.Equal(t, float32(0.9), got[0].Score)
				assert.Equal(t, float32(0.8), got[1].Score)
			},
		},
		{
			name: "empty keywords returns results unchanged",
			input: []QueryResult{
				{Text: "Hello world.", Score: 0.9},
			},
			keywords: []string{},
			verify: func(t *testing.T, got []QueryResult) {
				require.Len(t, got, 1)
				assert.Equal(t, float32(0.9), got[0].Score)
			},
		},
		{
			name: "single keyword boosts matching result",
			input: []QueryResult{
				{Text: "This document is about Go programming.", Score: 0.8},
				{Text: "This document is about Python scripting.", Score: 0.9},
			},
			keywords: []string{"Go"},
			verify: func(t *testing.T, got []QueryResult) {
				require.Len(t, got, 2)
				// The Go entry is boosted to 0.8 * 1.1 = 0.88, which still
				// ranks below the untouched Python entry at 0.9.
				assert.Equal(t, "This document is about Python scripting.", got[0].Text)
				assert.InDelta(t, 0.9, got[0].Score, 0.001)
				assert.Equal(t, "This document is about Go programming.", got[1].Text)
				assert.InDelta(t, 0.88, got[1].Score, 0.001)
			},
		},
		{
			name: "single keyword can reorder results",
			input: []QueryResult{
				{Text: "General information about various topics.", Score: 0.85},
				{Text: "Detailed guide to Kubernetes deployment.", Score: 0.80},
			},
			keywords: []string{"kubernetes"},
			verify: func(t *testing.T, got []QueryResult) {
				require.Len(t, got, 2)
				// 0.80 * 1.1 = 0.88 overtakes the unboosted 0.85.
				assert.Equal(t, "Detailed guide to Kubernetes deployment.", got[0].Text)
				assert.InDelta(t, 0.88, got[0].Score, 0.001)
			},
		},
		{
			name: "multiple keywords compound boost",
			input: []QueryResult{
				{Text: "Go is a programming language for systems.", Score: 0.7},
				{Text: "Python is used for machine learning tasks.", Score: 0.9},
				{Text: "Go and Rust are systems programming languages.", Score: 0.6},
			},
			keywords: []string{"go", "systems"},
			verify: func(t *testing.T, got []QueryResult) {
				require.Len(t, got, 3)
				// Two hits each for the Go entries: 0.7 * 1.2 = 0.84 and
				// 0.6 * 1.2 = 0.72. The Python entry matches nothing and
				// keeps 0.9, so the order is Python, Go, Go+Rust.
				assert.Equal(t, "Python is used for machine learning tasks.", got[0].Text)
				assert.InDelta(t, 0.9, got[0].Score, 0.001)
				assert.Equal(t, "Go is a programming language for systems.", got[1].Text)
				assert.InDelta(t, 0.84, got[1].Score, 0.001)
				assert.Equal(t, "Go and Rust are systems programming languages.", got[2].Text)
				assert.InDelta(t, 0.72, got[2].Score, 0.001)
			},
		},
		{
			name: "case insensitive matching",
			input: []QueryResult{
				{Text: "KUBERNETES is a container orchestration platform.", Score: 0.7},
				{Text: "Docker runs containers.", Score: 0.8},
			},
			keywords: []string{"kubernetes"},
			verify: func(t *testing.T, got []QueryResult) {
				require.Len(t, got, 2)
				// The upper-case text matches the lower-case keyword and is
				// boosted to 0.7 * 1.1 = 0.77; it is expected in second place.
				assert.InDelta(t, 0.77, got[1].Score, 0.001)
				assert.Equal(t, "KUBERNETES is a container orchestration platform.", got[1].Text)
			},
		},
		{
			name: "no matches leaves scores unchanged",
			input: []QueryResult{
				{Text: "This is about cats.", Score: 0.9},
				{Text: "This is about dogs.", Score: 0.8},
			},
			keywords: []string{"elephants"},
			verify: func(t *testing.T, got []QueryResult) {
				require.Len(t, got, 2)
				assert.Equal(t, float32(0.9), got[0].Score)
				assert.Equal(t, float32(0.8), got[1].Score)
				assert.Equal(t, "This is about cats.", got[0].Text)
				assert.Equal(t, "This is about dogs.", got[1].Text)
			},
		},
		{
			name:     "empty results returns empty",
			input:    nil,
			keywords: []string{"test"},
			verify: func(t *testing.T, got []QueryResult) {
				assert.Empty(t, got)
			},
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			sc.verify(t, KeywordFilter(sc.input, sc.keywords))
		})
	}
}
// --- extractKeywords tests ---
// TestExtractKeywords checks the keyword list that extractKeywords derives
// from a query string.
//
// A case either expects an empty result (wantEmpty) or lists the keywords
// that must be present in, and absent from, the returned collection.
func TestExtractKeywords(t *testing.T) {
	cases := []struct {
		name      string
		query     string
		present   []string
		absent    []string
		wantEmpty bool
	}{
		{
			// "do" and "I" are shorter than three characters, so they are
			// expected to be dropped.
			name:    "extracts words 3+ characters",
			query:   "how do I use Go modules",
			present: []string{"how", "use", "modules"},
			absent:  []string{"do", "i"},
		},
		{
			name:      "empty string returns empty",
			query:     "",
			wantEmpty: true,
		},
		{
			name:      "all short words returns empty",
			query:     "I am a",
			wantEmpty: true,
		},
		{
			name:    "keywords are lowercased",
			query:   "Kubernetes Deployment",
			present: []string{"kubernetes", "deployment"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := extractKeywords(tc.query)

			if tc.wantEmpty {
				assert.Empty(t, got)
				return
			}
			for _, kw := range tc.present {
				assert.Contains(t, got, kw)
			}
			for _, kw := range tc.absent {
				assert.NotContains(t, got, kw)
			}
		})
	}
}
// --- Query with Keywords integration ---
// TestQuery_Keywords checks that the QueryConfig.Keywords flag controls
// whether Query applies keyword boosting to its results.
//
// Each case seeds a mock vector store with two points in the "test-col"
// collection, runs Query against it, and asserts which text is ranked first.
func TestQuery_Keywords(t *testing.T) {
	const collection = "test-col"

	cases := []struct {
		name        string
		firstText   string
		secondText  string
		question    string
		useKeywords bool
		wantTop     string
	}{
		{
			// The second result (score 0.9 from the mock) matches two
			// keywords and is boosted to 0.9 * 1.2 = 1.08, so it should
			// move to the front.
			name:        "keywords flag enables keyword boosting",
			firstText:   "General overview of the platform.",
			secondText:  "Guide to deploying with Kubernetes containers.",
			question:    "kubernetes containers",
			useKeywords: true,
			wantTop:     "Guide to deploying with Kubernetes containers.",
		},
		{
			// With boosting disabled the original order is preserved
			// (the first point has the higher score).
			name:        "keywords false does not boost",
			firstText:   "First result text.",
			secondText:  "Second result text with keywords.",
			question:    "keywords",
			useKeywords: false,
			wantTop:     "First result text.",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			store := newMockVectorStore()
			store.points[collection] = []Point{
				{
					ID:     "1",
					Vector: []float32{0.1},
					Payload: map[string]any{
						"text":        tc.firstText,
						"source":      "a.md",
						"section":     "",
						"category":    "docs",
						"chunk_index": 0,
					},
				},
				{
					ID:     "2",
					Vector: []float32{0.1},
					Payload: map[string]any{
						"text":        tc.secondText,
						"source":      "b.md",
						"section":     "",
						"category":    "docs",
						"chunk_index": 1,
					},
				},
			}

			cfg := DefaultQueryConfig()
			cfg.Collection = collection
			cfg.Limit = 10
			cfg.Threshold = 0.0
			cfg.Keywords = tc.useKeywords

			embedder := newMockEmbedder(768)

			results, err := Query(context.Background(), store, embedder, tc.question, cfg)
			require.NoError(t, err)
			require.Len(t, results, 2)
			assert.Equal(t, tc.wantTop, results[0].Text)
		})
	}
}