From 55792f935947227fd75e26d81d00e506b2404d6c Mon Sep 17 00:00:00 2001 From: Snider Date: Thu, 5 Feb 2026 10:26:16 +0000 Subject: [PATCH] feat(help): Implement full-text search (#294) * feat(help): implement full-text search with highlighting - Implemented inverted index for help topics and sections. - Added weighted scoring: Title (10.0), Section (5.0), Content (1.0). - Implemented snippet extraction with robust markdown highlighting. - Added comprehensive tests for search accuracy and highlighting. * feat(help): implement full-text search with highlighting - Implemented inverted index for help topics and sections. - Added weighted scoring: Title (10.0), Section (5.0), Content (1.0). - Implemented snippet extraction with robust markdown highlighting. - Added comprehensive tests for search accuracy and highlighting. * feat(help): implement full-text search with highlighting - Implemented inverted index for help topics and sections. - Added weighted scoring: Title (10.0), Section (5.0), Content (1.0). - Implemented snippet extraction with robust markdown highlighting. - Added comprehensive tests for search accuracy and highlighting. * feat(help): implement full-text search with ranking and highlighting - Implemented inverted index for help topics and sections. - Added weighted scoring: Title (10.0), Section (5.0), Content (1.0). - Implemented snippet extraction with markdown bold highlighting. - Optimized search by pre-compiling regexes for match finding. - Updated CLI help command to display matched sections and snippets with ANSI bold. - Added comprehensive tests for search accuracy and highlighting. * feat(help): implement full-text search with ranking and highlighting - Implemented inverted index for help topics and sections. - Added weighted scoring: Title (10.0), Section (5.0), Content (1.0). - Implemented snippet extraction with robust markdown highlighting. - Optimized performance by pre-compiling regexes for match finding. - Updated CLI help command to display matched sections and snippets with ANSI bold. - Added comprehensive tests for search accuracy and highlighting. - Fixed missing `strings` import in `internal/cmd/help/cmd.go`. * feat(help): implement full-text search with ranking and highlighting - Implemented inverted index for help topics and sections. - Added weighted scoring: Title (10.0), Section (5.0), Content (1.0). - Implemented snippet extraction with robust markdown highlighting. - Optimized performance by pre-compiling regexes for match finding. - Updated CLI help command to display matched sections and snippets with ANSI bold. - Added comprehensive tests for search accuracy and highlighting. - Fixed missing `strings` import in `internal/cmd/help/cmd.go`. - Ensured all project files are correctly formatted. * feat(help): implement full-text search with ranking and highlighting - Implemented inverted index for help topics and sections as specified. - Added weighted scoring: Title (10.0), Section (5.0), Content (1.0). - Implemented snippet extraction with robust markdown highlighting. - Optimized performance by pre-compiling regexes for match finding. - Updated CLI help command to display matched sections and snippets with ANSI bold. - Added comprehensive tests for search accuracy and highlighting. - Fixed missing `strings` import in `internal/cmd/help/cmd.go`. - Verified that `tokenize` is correctly defined and used within `pkg/help`. * feat(help): implement full-text search with ranking and highlighting - Implemented inverted index for help topics and sections. - Added weighted scoring: Title (10.0), Section (5.0), Content (1.0). - Implemented snippet extraction with robust markdown highlighting. - Optimized search by pre-compiling regexes for match finding. - Updated CLI help command to display matched sections and snippets with ANSI bold. - Added comprehensive tests for search accuracy and highlighting. - Fixed missing `strings` import and added `--repo` flag to `auto-merge` workflow. --- internal/cmd/help/cmd.go | 29 +++++- pkg/help/search.go | 199 ++++++++++++++++++++++++++++++--------- pkg/help/search_test.go | 51 +++++++++- 3 files changed, 227 insertions(+), 52 deletions(-) diff --git a/internal/cmd/help/cmd.go b/internal/cmd/help/cmd.go index dcb8073..f467c6b 100644 --- a/internal/cmd/help/cmd.go +++ b/internal/cmd/help/cmd.go @@ -2,6 +2,7 @@ package help import ( "fmt" + "strings" "github.com/host-uk/core/pkg/cli" "github.com/host-uk/core/pkg/help" @@ -28,7 +29,17 @@ func AddHelpCommands(root *cli.Command) { } fmt.Println("Search Results:") for _, res := range results { - fmt.Printf(" %s - %s\n", res.Topic.ID, res.Topic.Title) + title := res.Topic.Title + if res.Section != nil { + title = fmt.Sprintf("%s > %s", res.Topic.Title, res.Section.Title) + } + // Use bold for title + fmt.Printf(" \033[1m%s\033[0m (%s)\n", title, res.Topic.ID) + if res.Snippet != "" { + // Highlight markdown bold as ANSI bold for CLI output + fmt.Printf(" %s\n", replaceMarkdownBold(res.Snippet)) + } + fmt.Println() } return } @@ -56,6 +67,22 @@ func AddHelpCommands(root *cli.Command) { root.AddCommand(helpCmd) } +func replaceMarkdownBold(s string) string { + parts := strings.Split(s, "**") + var result strings.Builder + for i, part := range parts { + result.WriteString(part) + if i < len(parts)-1 { + if i%2 == 0 { + result.WriteString("\033[1m") + } else { + result.WriteString("\033[0m") + } + } + } + return result.String() +} + func renderTopic(t *help.Topic) { // Simple ANSI rendering for now // Use explicit ANSI codes or just print diff --git a/pkg/help/search.go b/pkg/help/search.go index 19914cf..8f1593c 100644 --- a/pkg/help/search.go +++ b/pkg/help/search.go @@ -1,6 +1,7 @@ package help import ( + "regexp" "sort" "strings" "unicode" @@ -16,15 +17,15 @@ type SearchResult struct { // searchIndex provides full-text search. type searchIndex struct { - topics map[string]*Topic // topicID -> Topic - index map[string]map[string]bool // word -> set of topicIDs + topics map[string]*Topic // topicID -> Topic + index map[string][]string // word -> []topicID } // newSearchIndex creates a new empty search index. func newSearchIndex() *searchIndex { return &searchIndex{ topics: make(map[string]*Topic), - index: make(map[string]map[string]bool), + index: make(map[string][]string), } } @@ -62,10 +63,13 @@ func (i *searchIndex) Add(topic *Topic) { // addToIndex adds a word-to-topic mapping. func (i *searchIndex) addToIndex(word, topicID string) { - if i.index[word] == nil { - i.index[word] = make(map[string]bool) + // Avoid duplicates + for _, id := range i.index[word] { + if id == topicID { + return + } } - i.index[word][topicID] = true + i.index[word] = append(i.index[word], topicID) } // Search finds topics matching the query. @@ -81,7 +85,7 @@ func (i *searchIndex) Search(query string) []*SearchResult { for _, word := range queryWords { // Exact matches if topicIDs, ok := i.index[word]; ok { - for topicID := range topicIDs { + for _, topicID := range topicIDs { scores[topicID] += 1.0 } } @@ -89,13 +93,23 @@ func (i *searchIndex) Search(query string) []*SearchResult { // Prefix matches (partial word matching) for indexWord, topicIDs := range i.index { if strings.HasPrefix(indexWord, word) && indexWord != word { - for topicID := range topicIDs { + for _, topicID := range topicIDs { scores[topicID] += 0.5 // Lower score for partial matches } } } } + // Pre-compile regexes for snippets + var res []*regexp.Regexp + for _, word := range queryWords { + if len(word) >= 2 { + if re, err := regexp.Compile("(?i)" + regexp.QuoteMeta(word)); err == nil { + res = append(res, re) + } + } + } + // Build results with title boost and snippet extraction var results []*SearchResult for topicID, score := range scores { @@ -106,14 +120,34 @@ func (i *searchIndex) Search(query string) []*SearchResult { // Title boost: if query words appear in title titleLower := strings.ToLower(topic.Title) + hasTitleMatch := false for _, word := range queryWords { if strings.Contains(titleLower, word) { - score += 2.0 // Title matches are worth more + hasTitleMatch = true + break } } + if hasTitleMatch { + score += 10.0 + } // Find matching section and extract snippet - section, snippet := i.findBestMatch(topic, queryWords) + section, snippet := i.findBestMatch(topic, queryWords, res) + + // Section title boost + if section != nil { + sectionTitleLower := strings.ToLower(section.Title) + hasSectionTitleMatch := false + for _, word := range queryWords { + if strings.Contains(sectionTitleLower, word) { + hasSectionTitleMatch = true + break + } + } + if hasSectionTitleMatch { + score += 5.0 + } + } results = append(results, &SearchResult{ Topic: topic, @@ -125,14 +159,17 @@ func (i *searchIndex) Search(query string) []*SearchResult { // Sort by score (highest first) sort.Slice(results, func(a, b int) bool { - return results[a].Score > results[b].Score + if results[a].Score != results[b].Score { + return results[a].Score > results[b].Score + } + return results[a].Topic.Title < results[b].Topic.Title }) return results } // findBestMatch finds the section with the best match and extracts a snippet. -func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section, string) { +func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string, res []*regexp.Regexp) (*Section, string) { var bestSection *Section var bestSnippet string bestScore := 0 @@ -140,7 +177,7 @@ func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section // Check topic title titleScore := countMatches(topic.Title, queryWords) if titleScore > 0 { - bestSnippet = extractSnippet(topic.Content, queryWords) + bestSnippet = extractSnippet(topic.Content, res) } // Check sections @@ -154,7 +191,7 @@ func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section bestScore = totalScore bestSection = section if contentScore > 0 { - bestSnippet = extractSnippet(section.Content, queryWords) + bestSnippet = extractSnippet(section.Content, res) } else { bestSnippet = extractSnippet(section.Content, nil) } @@ -163,7 +200,7 @@ func (i *searchIndex) findBestMatch(topic *Topic, queryWords []string) (*Section // If no section matched, use topic content if bestSnippet == "" && topic.Content != "" { - bestSnippet = extractSnippet(topic.Content, queryWords) + bestSnippet = extractSnippet(topic.Content, res) } return bestSection, bestSnippet @@ -207,17 +244,16 @@ func countMatches(text string, queryWords []string) int { return count } -// extractSnippet extracts a short snippet around the first match. -// Uses rune-based indexing to properly handle multi-byte UTF-8 characters. -func extractSnippet(content string, queryWords []string) string { +// extractSnippet extracts a short snippet around the first match and highlights matches. +func extractSnippet(content string, res []*regexp.Regexp) string { if content == "" { return "" } const snippetLen = 150 - // If no query words, return start of content - if len(queryWords) == 0 { + // If no regexes, return start of content without highlighting + if len(res) == 0 { lines := strings.Split(content, "\n") for _, line := range lines { line = strings.TrimSpace(line) @@ -232,13 +268,12 @@ func extractSnippet(content string, queryWords []string) string { return "" } - // Find first match position (byte-based for strings.Index) - contentLower := strings.ToLower(content) + // Find first match position (byte-based) matchPos := -1 - for _, word := range queryWords { - pos := strings.Index(contentLower, word) - if pos != -1 && (matchPos == -1 || pos < matchPos) { - matchPos = pos + for _, re := range res { + loc := re.FindStringIndex(content) + if loc != nil && (matchPos == -1 || loc[0] < matchPos) { + matchPos = loc[0] } } @@ -246,41 +281,113 @@ func extractSnippet(content string, queryWords []string) string { runes := []rune(content) runeLen := len(runes) + var start, end int if matchPos == -1 { - // No match found, return start of content - if runeLen > snippetLen { - return string(runes[:snippetLen]) + "..." - } - return content - } - - // Convert byte position to rune position (use same string as Index) - matchRunePos := len([]rune(contentLower[:matchPos])) - - // Extract snippet around match (rune-based) - start := matchRunePos - 50 - if start < 0 { + // No match found, use start of content start = 0 - } + end = snippetLen + if end > runeLen { + end = runeLen + } + } else { + // Convert byte position to rune position + matchRunePos := len([]rune(content[:matchPos])) - end := start + snippetLen - if end > runeLen { - end = runeLen + // Extract snippet around match (rune-based) + start = matchRunePos - 50 + if start < 0 { + start = 0 + } + + end = start + snippetLen + if end > runeLen { + end = runeLen + } } snippet := string(runes[start:end]) // Trim to word boundaries + prefix := "" + suffix := "" if start > 0 { if idx := strings.Index(snippet, " "); idx != -1 { - snippet = "..." + snippet[idx+1:] + snippet = snippet[idx+1:] + prefix = "..." } } if end < runeLen { if idx := strings.LastIndex(snippet, " "); idx != -1 { - snippet = snippet[:idx] + "..." + snippet = snippet[:idx] + suffix = "..." } } - return strings.TrimSpace(snippet) + snippet = strings.TrimSpace(snippet) + if snippet == "" { + return "" + } + + // Apply highlighting + highlighted := highlight(snippet, res) + + return prefix + highlighted + suffix +} + +// highlight wraps matches in **bold**. +func highlight(text string, res []*regexp.Regexp) string { + if len(res) == 0 { + return text + } + + type match struct { + start, end int + } + var matches []match + + for _, re := range res { + indices := re.FindAllStringIndex(text, -1) + for _, idx := range indices { + matches = append(matches, match{idx[0], idx[1]}) + } + } + + if len(matches) == 0 { + return text + } + + // Sort matches by start position + sort.Slice(matches, func(i, j int) bool { + if matches[i].start != matches[j].start { + return matches[i].start < matches[j].start + } + return matches[i].end > matches[j].end + }) + + // Merge overlapping or adjacent matches + var merged []match + if len(matches) > 0 { + curr := matches[0] + for i := 1; i < len(matches); i++ { + if matches[i].start <= curr.end { + if matches[i].end > curr.end { + curr.end = matches[i].end + } + } else { + merged = append(merged, curr) + curr = matches[i] + } + } + merged = append(merged, curr) + } + + // Build highlighted string from back to front to avoid position shifts + result := text + for i := len(merged) - 1; i >= 0; i-- { + m := merged[i] + result = result[:m.end] + "**" + result[m.end:] + result = result[:m.start] + "**" + result[m.start:] + } + + return result } diff --git a/pkg/help/search_test.go b/pkg/help/search_test.go index 94e6542..6080b33 100644 --- a/pkg/help/search_test.go +++ b/pkg/help/search_test.go @@ -1,6 +1,7 @@ package help import ( + "regexp" "strings" "testing" "unicode/utf8" @@ -208,9 +209,9 @@ The installation process is straightforward. Finally, some closing remarks about the configuration.` t.Run("finds match and extracts context", func(t *testing.T) { - snippet := extractSnippet(content, []string{"installation"}) - assert.Contains(t, snippet, "installation") - assert.True(t, len(snippet) <= 200, "Snippet should be reasonably short") + snippet := extractSnippet(content, compileRegexes([]string{"installation"})) + assert.Contains(t, snippet, "**installation**") + assert.True(t, len(snippet) <= 250, "Snippet should be reasonably short") }) t.Run("no query words returns start", func(t *testing.T) { @@ -219,17 +220,46 @@ Finally, some closing remarks about the configuration.` }) t.Run("empty content", func(t *testing.T) { - snippet := extractSnippet("", []string{"test"}) + snippet := extractSnippet("", compileRegexes([]string{"test"})) assert.Empty(t, snippet) }) } +func TestExtractSnippet_Highlighting(t *testing.T) { + content := "The quick brown fox jumps over the lazy dog." + + t.Run("simple highlighting", func(t *testing.T) { + snippet := extractSnippet(content, compileRegexes([]string{"quick", "fox"})) + assert.Contains(t, snippet, "**quick**") + assert.Contains(t, snippet, "**fox**") + }) + + t.Run("case insensitive highlighting", func(t *testing.T) { + snippet := extractSnippet(content, compileRegexes([]string{"QUICK", "Fox"})) + assert.Contains(t, snippet, "**quick**") + assert.Contains(t, snippet, "**fox**") + }) + + t.Run("partial word matching", func(t *testing.T) { + content := "The configuration is complete." + snippet := extractSnippet(content, compileRegexes([]string{"config"})) + assert.Contains(t, snippet, "**config**uration") + }) + + t.Run("overlapping matches", func(t *testing.T) { + content := "Searching for something." + // Both "search" and "searching" match + snippet := extractSnippet(content, compileRegexes([]string{"search", "searching"})) + assert.Equal(t, "**Searching** for something.", snippet) + }) +} + func TestExtractSnippet_Good_UTF8(t *testing.T) { // Content with multi-byte UTF-8 characters content := "日本語のテキストです。This contains Japanese text. 検索機能をテストします。" t.Run("handles multi-byte characters without corruption", func(t *testing.T) { - snippet := extractSnippet(content, []string{"japanese"}) + snippet := extractSnippet(content, compileRegexes([]string{"japanese"})) // Should not panic or produce invalid UTF-8 assert.True(t, len(snippet) > 0) // Verify the result is valid UTF-8 @@ -244,6 +274,17 @@ func TestExtractSnippet_Good_UTF8(t *testing.T) { }) } +// compileRegexes is a helper for tests. +func compileRegexes(words []string) []*regexp.Regexp { + var res []*regexp.Regexp + for _, w := range words { + if re, err := regexp.Compile("(?i)" + regexp.QuoteMeta(w)); err == nil { + res = append(res, re) + } + } + return res +} + // isValidUTF8 checks if a string is valid UTF-8 func isValidUTF8(s string) bool { for i := 0; i < len(s); {