feat(agent/agentic): Poindexter KD-tree clustering replaces exact-key bucketing in runQAWithReport

qa_cluster.go wires QA finding clustering through Poindexter instead
of the old exact (tool, severity, category, rule) bucketing. Hashed
feature vectors built from finding metadata + message text are
indexed in Poindexter KD-trees with cosine + Euclidean distance,
near-neighbours unioned, then emitted as the existing DispatchCluster
shape (so consumers don't break).

Old exact-key grouping kept as fallback if Poindexter tree
construction ever fails.

qa.go updated to describe new similarity-based contract;
runQAWithReport already consumes clusterFindings, picks up the new
grouping automatically.

Tests cover: 3 similar findings + 2 distinct → grouped correctly;
5 distinct findings → 5 clusters; 0 findings → empty/nil no panic;
sample-cap regression preserved.

Note: github.com/Snider/Poindexter added as a direct dep. Per the
non-negotiable migration policy on Snider personal-namespace deps
(see #219), file follow-up to migrate Poindexter to its canonical
core/* home if/when that lands.

Co-authored-by: Codex <noreply@openai.com>
Closes tasks.lthn.sh/view.php?id=164
This commit is contained in:
Snider 2026-04-25 20:25:04 +01:00
parent b42cf5a18c
commit 40728e68d1
5 changed files with 400 additions and 85 deletions

5
go.mod
View file

@ -3,13 +3,13 @@ module dappco.re/go/agent
go 1.26.0
require (
dappco.re/go/core v0.8.0-alpha.1
dappco.re/go/api v0.8.0-alpha.1
dappco.re/go/core v0.8.0-alpha.1
dappco.re/go/forge v0.8.0-alpha.1
dappco.re/go/mcp v0.8.0-alpha.1
dappco.re/go/process v0.8.0-alpha.1
dappco.re/go/store v0.8.0-alpha.1
dappco.re/go/ws v0.8.0-alpha.1
dappco.re/go/mcp v0.8.0-alpha.1
github.com/gin-gonic/gin v1.12.0
github.com/gorilla/websocket v1.5.3
github.com/modelcontextprotocol/go-sdk v1.5.0
@ -25,6 +25,7 @@ require (
dappco.re/go/webview v0.8.0-alpha.1 // indirect
github.com/99designs/gqlgen v0.17.88 // indirect
github.com/KyleBanks/depth v1.2.1 // indirect
github.com/Snider/Poindexter v0.0.0-20260104200422-91146b212a1f
github.com/agnivade/levenshtein v1.2.1 // indirect
github.com/andybalholm/brotli v1.2.0 // indirect
github.com/bahlo/generic-list-go v0.2.0 // indirect

2
go.sum
View file

@ -26,6 +26,8 @@ github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
github.com/Snider/Poindexter v0.0.0-20260104200422-91146b212a1f h1:+EnE414H9wUaBeUVNjyErusrxSbBGnGV6MBhTw/em0k=
github.com/Snider/Poindexter v0.0.0-20260104200422-91146b212a1f/go.mod h1:nhgkbg4zWA4AS2Ga3RmcvdsyiI9TdxvSqe5EVBSb3Hk=
github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM=
github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU=
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ=

View file

@ -97,9 +97,10 @@ type DispatchReport struct {
}
// DispatchCluster groups similar findings together so human reviewers can see
// recurring problem shapes without scanning every raw finding. A cluster keys
// by (tool, severity, category, rule_id) and counts how many findings fell
// into that bucket in the current cycle, with representative samples.
// recurring problem shapes without scanning every raw finding. Clusters are
// built from Poindexter KD-tree similarity over hashed finding features, then
// summarised back into the dominant tool/severity/category/rule identity with
// representative samples.
//
// Usage example: `cluster := DispatchCluster{Tool: "gosec", Severity: "error", Category: "security", Count: 3, RuleID: "G101"}`
type DispatchCluster struct {
@ -694,73 +695,3 @@ func findingsFromJournalPayload(payload map[string]any) []map[string]any {
// escapeJournalLiteral doubles any single quotes in value so it can be
// embedded inside a single-quoted journal query literal without terminating
// the string early (SQL-style quote escaping).
func escapeJournalLiteral(value string) string {
	return core.Replace(value, "'", "''")
}
// clusterFindings groups the current cycle's findings by (tool, severity,
// category, rule_id) so `.meta/report.json` surfaces recurring shapes. The
// cluster count equals the number of findings in the bucket; the sample list
// is capped at `clusterSampleLimit` representative entries so the payload
// stays bounded for chatty linters.
//
// Usage example: `clusters := clusterFindings(report.Findings)`
func clusterFindings(findings []QAFinding) []DispatchCluster {
	// Return nil (not an empty slice) so the clusters key is omitted from the
	// JSON report entirely when there is nothing to group.
	if len(findings) == 0 {
		return nil
	}
	// Pointer values let the loop below mutate Count/Samples in place.
	byKey := make(map[string]*DispatchCluster, len(findings))
	for _, finding := range findings {
		// Exact bucket identity; Code takes precedence over RuleID when both set.
		key := core.Sprintf("%s|%s|%s|%s", finding.Tool, finding.Severity, finding.Category, firstNonEmpty(finding.Code, finding.RuleID))
		cluster, ok := byKey[key]
		if !ok {
			cluster = &DispatchCluster{
				Tool:     finding.Tool,
				Severity: finding.Severity,
				Category: finding.Category,
				RuleID:   firstNonEmpty(finding.Code, finding.RuleID),
			}
			byKey[key] = cluster
		}
		cluster.Count++
		// Cap the sample list so chatty linters cannot bloat the payload.
		if len(cluster.Samples) < clusterSampleLimit {
			cluster.Samples = append(cluster.Samples, DispatchClusterSample{
				File:    finding.File,
				Line:    finding.Line,
				Message: finding.Message,
			})
		}
	}
	// Stable order: highest count first, then by rule identifier so
	// identical-count clusters are deterministic in the report.
	clusters := make([]DispatchCluster, 0, len(byKey))
	for _, cluster := range byKey {
		clusters = append(clusters, *cluster)
	}
	sortDispatchClusters(clusters)
	return clusters
}
// sortDispatchClusters orders clusters by descending Count then ascending
// RuleID so the report is deterministic across runs and `core-agent status`
// always shows the same ordering for identical data.
func sortDispatchClusters(clusters []DispatchCluster) {
	// Insertion sort: cluster counts are small, and the shift-right scheme is
	// stable, so elements that compare equal keep their input order.
	for i := 1; i < len(clusters); i++ {
		candidate := clusters[i]
		j := i - 1
		// Shift greater elements right until candidate's slot is found.
		for j >= 0 && clusterLess(candidate, clusters[j]) {
			clusters[j+1] = clusters[j]
			j--
		}
		clusters[j+1] = candidate
	}
}
// clusterLess reports whether left sorts before right: descending Count first,
// then ascending Tool, then ascending RuleID.
func clusterLess(left, right DispatchCluster) bool {
	if left.Count != right.Count {
		// Larger clusters (more findings) surface first in the report.
		return left.Count > right.Count
	}
	if left.Tool != right.Tool {
		return left.Tool < right.Tool
	}
	return left.RuleID < right.RuleID
}

360
pkg/agentic/qa_cluster.go Normal file
View file

@ -0,0 +1,360 @@
// SPDX-License-Identifier: EUPL-1.2
package agentic
import (
	"hash/fnv"
	"math"
	"sort"
	"unicode"

	core "dappco.re/go/core"
	poindexter "github.com/Snider/Poindexter"
)
const (
	// qaClusterFeatureDimensions keeps the hashed feature vector compact while
	// leaving enough buckets for message/token separation in typical QA runs.
	qaClusterFeatureDimensions = 24
	// RFC §7 uses cosine distance 0.15 for "similar enough" QA findings.
	qaClusterMinimumCosineSimilarity = 0.85
	// Cosine distance threshold derived from the similarity floor above
	// (cosine distance = 1 - cosine similarity).
	qaClusterCosineDistanceThreshold = 1 - qaClusterMinimumCosineSimilarity
)

// qaClusterEuclideanDistanceThreshold is the Euclidean-space equivalent of the
// cosine threshold for unit-length vectors, via ||a-b||^2 = 2 - 2*cos(a,b).
var qaClusterEuclideanDistanceThreshold = math.Sqrt(2 - (2 * qaClusterMinimumCosineSimilarity))

// qaClusterPoint is the payload attached to each KD-tree point; Index points
// back into the findings slice the tree was built from.
type qaClusterPoint struct {
	Index int
}

// qaClusterUnion is a size-weighted union-find (disjoint set) over finding
// indices, used to merge near-neighbour findings into clusters.
type qaClusterUnion struct {
	parent []int
	size   []int
}
// clusterFindings groups the current cycle's findings by similarity so
// `.meta/report.json` surfaces recurring shapes instead of listing every
// repeated failure individually. When Poindexter cannot build the KD-tree, the
// function falls back to the previous exact-key bucketing so reporting keeps
// working.
//
// Usage example: `clusters := clusterFindings(report.Findings)`
// clusterFindings groups the current cycle's findings by similarity so
// `.meta/report.json` surfaces recurring shapes instead of listing every
// repeated failure individually. When Poindexter cannot build the KD-tree, the
// function falls back to the previous exact-key bucketing so reporting keeps
// working.
//
// Usage example: `clusters := clusterFindings(report.Findings)`
func clusterFindings(findings []QAFinding) []DispatchCluster {
	// Return nil (not an empty slice) so the clusters key is omitted entirely.
	if len(findings) == 0 {
		return nil
	}
	points := qaClusterPoints(findings)
	cosineTree, err := poindexter.NewKDTree(points,
		poindexter.WithMetric(poindexter.CosineDistance{}),
	)
	if err != nil {
		// Degrade to exact-key bucketing rather than losing clustering.
		return clusterFindingsFallback(findings)
	}
	euclideanTree, err := poindexter.NewKDTree(points,
		poindexter.WithMetric(poindexter.EuclideanDistance{}),
	)
	if err != nil {
		return clusterFindingsFallback(findings)
	}
	// NOTE(review): feature vectors are unit-normalised (qaClusterNormalise),
	// so Euclidean distance is a monotone function of cosine distance
	// (||a-b||^2 = 2 - 2*cos) and the thresholds are derived from the same
	// similarity floor; both radius queries should select the same neighbour
	// sets, making the second tree redundant — confirm against Poindexter's
	// Radius semantics before removing it.
	union := newQAClusterUnion(len(findings))
	for _, point := range points {
		qaClusterUnionRadius(union, cosineTree, point, findings, qaClusterCosineDistanceThreshold)
		qaClusterUnionRadius(union, euclideanTree, point, findings, qaClusterEuclideanDistanceThreshold)
	}
	// Collapse union-find components into summarised DispatchClusters.
	return qaClusterDispatchClusters(findings, union)
}
// clusterFindingsFallback reproduces the legacy exact-key grouping: findings
// that share the same (tool, severity, category, rule) tuple collapse into a
// single cluster whose Count is the bucket size and whose Samples are capped
// at clusterSampleLimit representative entries.
func clusterFindingsFallback(findings []QAFinding) []DispatchCluster {
	buckets := make(map[string]*DispatchCluster, len(findings))
	for _, item := range findings {
		rule := firstNonEmpty(item.Code, item.RuleID)
		key := core.Sprintf("%s|%s|%s|%s", item.Tool, item.Severity, item.Category, rule)
		bucket := buckets[key]
		if bucket == nil {
			bucket = &DispatchCluster{
				Tool:     item.Tool,
				Severity: item.Severity,
				Category: item.Category,
				RuleID:   rule,
			}
			buckets[key] = bucket
		}
		bucket.Count++
		// Keep the payload bounded even for very chatty linters.
		if len(bucket.Samples) < clusterSampleLimit {
			bucket.Samples = append(bucket.Samples, DispatchClusterSample{
				File:    item.File,
				Line:    item.Line,
				Message: item.Message,
			})
		}
	}
	result := make([]DispatchCluster, 0, len(buckets))
	for _, bucket := range buckets {
		result = append(result, *bucket)
	}
	sortDispatchClusters(result)
	return result
}
// qaClusterPoints converts each finding into a Poindexter KD-tree point whose
// coordinates are the hashed feature vector and whose value records the
// finding's slice index for the later union step.
func qaClusterPoints(findings []QAFinding) []poindexter.KDPoint[qaClusterPoint] {
	points := make([]poindexter.KDPoint[qaClusterPoint], 0, len(findings))
	for position := range findings {
		points = append(points, poindexter.KDPoint[qaClusterPoint]{
			ID:     core.Sprintf("finding-%d", position),
			Coords: qaClusterFeatureVector(findings[position]),
			Value:  qaClusterPoint{Index: position},
		})
	}
	return points
}
// qaClusterFeatureVector hashes a finding's identity fields and message text
// into a fixed-width, unit-normalised feature vector. Identity fields carry
// heavier weights than free text so tool/severity/category dominate placement.
func qaClusterFeatureVector(finding QAFinding) []float64 {
	vector := make([]float64, qaClusterFeatureDimensions)
	ruleIdentifier := firstNonEmpty(finding.Code, finding.RuleID)
	qaClusterAddToken(vector, core.Concat("tool:", core.Lower(finding.Tool)), 4)
	qaClusterAddToken(vector, core.Concat("severity:", core.Lower(finding.Severity)), 3)
	qaClusterAddToken(vector, core.Concat("category:", core.Lower(finding.Category)), 2)
	qaClusterAddToken(vector, core.Concat("rule:", core.Lower(ruleIdentifier)), 2)
	qaClusterAddText(vector, finding.Title, 1.5)
	qaClusterAddText(vector, finding.Message, 1)
	// Guarantee a non-zero vector so normalisation and tree insertion behave.
	if qaClusterVectorZero(vector) {
		qaClusterAddToken(vector, "finding", 1)
	}
	qaClusterNormalise(vector)
	return vector
}
// qaClusterAddText folds a block of free text into the feature vector: each
// token contributes weight, and each adjacent-token bigram contributes a
// slightly larger weight so word order still influences similarity.
func qaClusterAddText(coords []float64, text string, weight float64) {
	previous := ""
	for position, token := range qaClusterTokens(text) {
		qaClusterAddToken(coords, token, weight)
		if position > 0 {
			qaClusterAddToken(coords, core.Concat(previous, "_", token), weight+0.25)
		}
		previous = token
	}
}
// qaClusterTokens lower-cases text, replaces every non-alphanumeric rune with
// a space, and returns the surviving words of three or more characters that
// are not stop words. Blank input returns nil.
func qaClusterTokens(text string) []string {
	if core.Trim(text) == "" {
		return nil
	}
	normalised := make([]rune, 0, len(text))
	for _, r := range text {
		if unicode.IsLetter(r) || unicode.IsDigit(r) {
			normalised = append(normalised, core.ToLower(r))
			continue
		}
		// Punctuation and whitespace both become token separators.
		normalised = append(normalised, ' ')
	}
	parts := core.Split(string(normalised), " ")
	tokens := make([]string, 0, len(parts))
	for _, candidate := range parts {
		candidate = core.Trim(candidate)
		// Very short words and stop words add noise, not signal.
		if len(candidate) >= 3 && !qaClusterStopWord(candidate) {
			tokens = append(tokens, candidate)
		}
	}
	return tokens
}
// qaClusterStopWords is the set of common English filler words excluded from
// feature tokens so clusters key on meaningful message content. A package
// level set is built once and keeps the membership list easy to extend,
// unlike the previous single-line switch case.
var qaClusterStopWords = map[string]struct{}{
	"the": {}, "and": {}, "for": {}, "with": {}, "that": {}, "this": {},
	"from": {}, "into": {}, "your": {}, "you": {}, "are": {}, "was": {},
	"were": {}, "has": {}, "have": {}, "had": {}, "not": {}, "but": {},
	"can": {}, "could": {}, "should": {}, "would": {},
}

// qaClusterStopWord reports whether value is a stop word that should not
// contribute to a finding's feature vector.
//
// Usage example: `if qaClusterStopWord("the") { continue }`
func qaClusterStopWord(value string) bool {
	_, found := qaClusterStopWords[value]
	return found
}
// qaClusterAddToken adds weight to the hashed bucket for token. Empty tokens
// and zero weights are no-ops so callers can pass through unconditionally.
func qaClusterAddToken(coords []float64, token string, weight float64) {
	if token == "" {
		return
	}
	if weight == 0 {
		return
	}
	bucket := qaClusterBucket(token)
	coords[bucket] += weight
}
// qaClusterBucket maps a token to a stable feature-vector index by hashing it
// with 32-bit FNV-1a and reducing modulo the vector width.
func qaClusterBucket(token string) int {
	hasher := fnv.New32a()
	_, _ = hasher.Write([]byte(token)) // fnv hashes never fail to write
	index := hasher.Sum32() % qaClusterFeatureDimensions
	return int(index)
}
// qaClusterVectorZero reports whether every component of coords is exactly
// zero (an empty or nil slice counts as zero).
func qaClusterVectorZero(coords []float64) bool {
	for index := range coords {
		if coords[index] != 0 {
			return false
		}
	}
	return true
}
// qaClusterNormalise scales coords in place to unit Euclidean length so the
// cosine and Euclidean metrics agree on relative distances. A zero vector is
// left untouched to avoid dividing by zero.
func qaClusterNormalise(coords []float64) {
	var sumOfSquares float64
	for index := range coords {
		sumOfSquares += coords[index] * coords[index]
	}
	if sumOfSquares == 0 {
		return
	}
	magnitude := math.Sqrt(sumOfSquares)
	for index := range coords {
		coords[index] /= magnitude
	}
}
// qaClusterUnionRadius queries tree for every point within threshold of point
// and unions each compatible neighbour with it. Nil union or tree is a no-op;
// radius-query errors are deliberately ignored (best-effort clustering).
func qaClusterUnionRadius(union *qaClusterUnion, tree *poindexter.KDTree[qaClusterPoint], point poindexter.KDPoint[qaClusterPoint], findings []QAFinding, threshold float64) {
	if union == nil || tree == nil {
		return
	}
	origin := point.Value.Index
	neighbours, _ := tree.Radius(point.Coords, threshold)
	for _, candidate := range neighbours {
		other := candidate.Value.Index
		// Skip self-matches and pairs whose tool/severity identity conflicts.
		if other == origin || !qaClusterCompatible(findings[origin], findings[other]) {
			continue
		}
		union.Union(origin, other)
	}
}
// qaClusterCompatible reports whether two findings may share a cluster: their
// Tool values must match (or one be blank) and likewise for Severity.
func qaClusterCompatible(left, right QAFinding) bool {
	sameTool := left.Tool == "" || right.Tool == "" || left.Tool == right.Tool
	sameSeverity := left.Severity == "" || right.Severity == "" || left.Severity == right.Severity
	return sameTool && sameSeverity
}
// newQAClusterUnion builds a union-find where every index starts as its own
// singleton set of size one.
func newQAClusterUnion(size int) *qaClusterUnion {
	union := &qaClusterUnion{
		parent: make([]int, size),
		size:   make([]int, size),
	}
	for index := 0; index < size; index++ {
		union.parent[index] = index
		union.size[index] = 1
	}
	return union
}
// Find returns the root representative of index's set, compressing the walked
// path so every visited node points directly at the root afterwards.
func (union *qaClusterUnion) Find(index int) int {
	root := index
	for union.parent[root] != root {
		root = union.parent[root]
	}
	// Second pass: full path compression, same end state as the recursive form.
	for index != root {
		next := union.parent[index]
		union.parent[index] = root
		index = next
	}
	return root
}
// Union merges the sets containing left and right. Already-joined sets are a
// no-op; otherwise the smaller set is attached under the larger to keep the
// trees shallow (union by size).
func (union *qaClusterUnion) Union(left, right int) {
	rootA := union.Find(left)
	rootB := union.Find(right)
	if rootA == rootB {
		return
	}
	if union.size[rootA] < union.size[rootB] {
		rootA, rootB = rootB, rootA
	}
	union.parent[rootB] = rootA
	union.size[rootA] += union.size[rootB]
}
// qaClusterDispatchClusters collects union-find components into member index
// lists, summarises each into a DispatchCluster, and sorts the result into
// the deterministic report order.
func qaClusterDispatchClusters(findings []QAFinding, union *qaClusterUnion) []DispatchCluster {
	membership := make(map[int][]int, len(findings))
	for index := range findings {
		root := union.Find(index)
		membership[root] = append(membership[root], index)
	}
	clusters := make([]DispatchCluster, 0, len(membership))
	for _, members := range membership {
		clusters = append(clusters, qaClusterSummary(findings, members))
	}
	sortDispatchClusters(clusters)
	return clusters
}
// qaClusterSummary reduces a component's member findings to one
// DispatchCluster: identity fields take the dominant value across members,
// Count is the member total, and Samples keep at most clusterSampleLimit
// representative entries in member order.
func qaClusterSummary(findings []QAFinding, members []int) DispatchCluster {
	pickTool := func(finding QAFinding) string { return finding.Tool }
	pickSeverity := func(finding QAFinding) string { return finding.Severity }
	pickCategory := func(finding QAFinding) string { return finding.Category }
	pickRule := func(finding QAFinding) string { return firstNonEmpty(finding.Code, finding.RuleID) }
	cluster := DispatchCluster{
		Tool:     qaClusterDominantValue(findings, members, pickTool),
		Severity: qaClusterDominantValue(findings, members, pickSeverity),
		Category: qaClusterDominantValue(findings, members, pickCategory),
		RuleID:   qaClusterDominantValue(findings, members, pickRule),
		Count:    len(members),
	}
	remaining := clusterSampleLimit
	for _, index := range members {
		if remaining <= 0 {
			break
		}
		remaining--
		cluster.Samples = append(cluster.Samples, DispatchClusterSample{
			File:    findings[index].File,
			Line:    findings[index].Line,
			Message: findings[index].Message,
		})
	}
	return cluster
}
// qaClusterDominantValue returns the most frequent non-empty value that
// extract yields across the member findings, breaking frequency ties in
// favour of the lexicographically smaller value. Returns "" when every
// member's value is empty.
func qaClusterDominantValue(findings []QAFinding, members []int, extract func(QAFinding) string) string {
	tally := make(map[string]int, len(members))
	winner := ""
	winnerCount := 0
	for _, index := range members {
		candidate := extract(findings[index])
		if candidate == "" {
			continue
		}
		tally[candidate]++
		count := tally[candidate]
		switch {
		case count > winnerCount:
			winner = candidate
			winnerCount = count
		case count == winnerCount && (winner == "" || candidate < winner):
			winner = candidate
			winnerCount = count
		}
	}
	return winner
}
// sortDispatchClusters orders clusters by descending Count then ascending
// RuleID so the report is deterministic across runs and `core-agent status`
// always shows the same ordering for identical data.
//
// Uses the standard library's stable sort instead of the previous hand-rolled
// insertion sort: same comparator, same stability, so the output ordering is
// identical, but the O(n^2) worst case is gone and the intent is clearer.
func sortDispatchClusters(clusters []DispatchCluster) {
	sort.SliceStable(clusters, func(i, j int) bool {
		return clusterLess(clusters[i], clusters[j])
	})
}
// clusterLess reports whether left sorts before right: primarily by descending
// Count, then ascending Tool and RuleID. Severity and Category break any
// remaining ties so the ordering is total over distinct identities — without
// them, two clusters sharing Count/Tool/RuleID (possible under similarity
// clustering, e.g. differing only by Category) would keep whatever order the
// map iteration produced, undermining the documented determinism.
func clusterLess(left, right DispatchCluster) bool {
	if left.Count != right.Count {
		// Larger clusters (more findings) surface first in the report.
		return left.Count > right.Count
	}
	if left.Tool != right.Tool {
		return left.Tool < right.Tool
	}
	if left.RuleID != right.RuleID {
		return left.RuleID < right.RuleID
	}
	if left.Severity != right.Severity {
		return left.Severity < right.Severity
	}
	return left.Category < right.Category
}

View file

@ -209,31 +209,52 @@ func TestQa_StringOutput_Ugly(t *testing.T) {
// --- clusterFindings ---
func TestQa_ClusterFindings_Good(t *testing.T) {
// Two G101 findings in the same tool merge into one cluster with count 2.
// Similar gosec findings with different rule IDs should still deduplicate.
findings := []QAFinding{
{Tool: "gosec", Severity: "error", Category: "security", Code: "G101", File: "a.go", Line: 10, Message: "secret"},
{Tool: "gosec", Severity: "error", Category: "security", Code: "G101", File: "b.go", Line: 20, Message: "secret"},
{Tool: "staticcheck", Severity: "warning", Code: "SA1000", File: "c.go", Line: 5},
{Tool: "gosec", Severity: "error", Category: "security", Code: "G101", File: "a.go", Line: 10, Message: "hardcoded password found"},
{Tool: "gosec", Severity: "error", Category: "security", Code: "G401", File: "b.go", Line: 20, Message: "hardcoded password found in config"},
{Tool: "gosec", Severity: "error", Category: "security", RuleID: "HARDCODED-SECRET", File: "c.go", Line: 30, Message: "possible hardcoded password found"},
{Tool: "gosec", Severity: "error", Category: "security", Code: "G304", File: "d.go", Line: 40, Message: "file path built from tainted input"},
{Tool: "staticcheck", Severity: "warning", Code: "SA1000", File: "e.go", Line: 5, Message: "invalid regexp pattern"},
}
clusters := clusterFindings(findings)
if assert.Len(t, clusters, 2) {
assert.Equal(t, 2, clusters[0].Count)
if assert.Len(t, clusters, 3) {
assert.Equal(t, 3, clusters[0].Count)
assert.Equal(t, "gosec", clusters[0].Tool)
assert.Len(t, clusters[0].Samples, 2)
assert.Len(t, clusters[0].Samples, 3)
assert.Equal(t, 1, clusters[1].Count)
assert.Equal(t, 1, clusters[2].Count)
}
}
func TestQa_ClusterFindings_Bad(t *testing.T) {
assert.Nil(t, clusterFindings(nil))
assert.Nil(t, clusterFindings([]QAFinding{}))
assert.NotPanics(t, func() {
assert.Nil(t, clusterFindings(nil))
assert.Nil(t, clusterFindings([]QAFinding{}))
})
}
func TestQa_ClusterFindings_Ugly(t *testing.T) {
findings := []QAFinding{
{Tool: "gosec", Severity: "error", Code: "G101", File: "a.go", Line: 10, Message: "hardcoded password found"},
{Tool: "gosec", Severity: "error", Code: "G304", File: "b.go", Line: 20, Message: "file path built from tainted input"},
{Tool: "staticcheck", Severity: "warning", Code: "SA1000", File: "c.go", Line: 30, Message: "invalid regexp pattern"},
{Tool: "govet", Severity: "warning", Code: "printf", File: "d.go", Line: 40, Message: "printf format mismatch"},
{Tool: "revive", Severity: "info", Code: "var-naming", File: "e.go", Line: 50, Message: "var name should be camelCase"},
}
clusters := clusterFindings(findings)
if assert.Len(t, clusters, 5) {
for _, cluster := range clusters {
assert.Equal(t, 1, cluster.Count)
}
}
}
func TestQa_ClusterFindings_Ugly_SampleLimit(t *testing.T) {
// 10 identical findings should cap samples at clusterSampleLimit.
findings := make([]QAFinding, 10)
for i := range findings {
findings[i] = QAFinding{Tool: "gosec", Code: "G101", File: "same.go", Line: i}
findings[i] = QAFinding{Tool: "gosec", Severity: "error", Code: "G101", File: "same.go", Line: i, Message: "hardcoded password found"}
}
clusters := clusterFindings(findings)
if assert.Len(t, clusters, 1) {