From 40728e68d14220146c535cb1364f617d11bb7eda Mon Sep 17 00:00:00 2001 From: Snider Date: Sat, 25 Apr 2026 20:25:04 +0100 Subject: [PATCH] feat(agent/agentic): Poindexter KD-tree clustering replaces exact-key bucketing in runQAWithReport MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qa_cluster.go wires QA finding clustering through Poindexter instead of the old exact (tool, severity, category, rule) bucketing. Hashed feature vectors built from finding metadata + message text are indexed in Poindexter KD-trees with cosine + Euclidean distance, near-neighbours unioned, then emitted as the existing DispatchCluster shape (so consumers don't break). Old exact-key grouping kept as fallback if Poindexter tree construction ever fails. qa.go updated to describe new similarity-based contract; runQAWithReport already consumes clusterFindings, picks up the new grouping automatically. Tests cover: 3 similar findings + 2 distinct → grouped correctly; 5 distinct findings → 5 clusters; 0 findings → empty/nil no panic; sample-cap regression preserved. Note: github.com/Snider/Poindexter added as a direct dep. Per the non-negotiable migration policy on Snider personal-namespace deps (see #219), file a follow-up to migrate Poindexter to its canonical core/* home if/when that lands. 
Co-authored-by: Codex Closes tasks.lthn.sh/view.php?id=164 --- go.mod | 5 +- go.sum | 2 + pkg/agentic/qa.go | 77 +------- pkg/agentic/qa_cluster.go | 360 ++++++++++++++++++++++++++++++++++++++ pkg/agentic/qa_test.go | 41 +++-- 5 files changed, 400 insertions(+), 85 deletions(-) create mode 100644 pkg/agentic/qa_cluster.go diff --git a/go.mod b/go.mod index 0d97ad2..07b9aea 100644 --- a/go.mod +++ b/go.mod @@ -3,13 +3,13 @@ module dappco.re/go/agent go 1.26.0 require ( - dappco.re/go/core v0.8.0-alpha.1 dappco.re/go/api v0.8.0-alpha.1 + dappco.re/go/core v0.8.0-alpha.1 dappco.re/go/forge v0.8.0-alpha.1 + dappco.re/go/mcp v0.8.0-alpha.1 dappco.re/go/process v0.8.0-alpha.1 dappco.re/go/store v0.8.0-alpha.1 dappco.re/go/ws v0.8.0-alpha.1 - dappco.re/go/mcp v0.8.0-alpha.1 github.com/gin-gonic/gin v1.12.0 github.com/gorilla/websocket v1.5.3 github.com/modelcontextprotocol/go-sdk v1.5.0 @@ -25,6 +25,7 @@ require ( dappco.re/go/webview v0.8.0-alpha.1 // indirect github.com/99designs/gqlgen v0.17.88 // indirect github.com/KyleBanks/depth v1.2.1 // indirect + github.com/Snider/Poindexter v0.0.0-20260104200422-91146b212a1f github.com/agnivade/levenshtein v1.2.1 // indirect github.com/andybalholm/brotli v1.2.0 // indirect github.com/bahlo/generic-list-go v0.2.0 // indirect diff --git a/go.sum b/go.sum index ff21a39..3ec4d14 100644 --- a/go.sum +++ b/go.sum @@ -26,6 +26,8 @@ github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= +github.com/Snider/Poindexter v0.0.0-20260104200422-91146b212a1f h1:+EnE414H9wUaBeUVNjyErusrxSbBGnGV6MBhTw/em0k= +github.com/Snider/Poindexter v0.0.0-20260104200422-91146b212a1f/go.mod h1:nhgkbg4zWA4AS2Ga3RmcvdsyiI9TdxvSqe5EVBSb3Hk= 
github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNgfBlViaCIJKLlCJ6/fmUseuG0wVQ= diff --git a/pkg/agentic/qa.go b/pkg/agentic/qa.go index 7095833..ab00bfe 100644 --- a/pkg/agentic/qa.go +++ b/pkg/agentic/qa.go @@ -97,9 +97,10 @@ type DispatchReport struct { } // DispatchCluster groups similar findings together so human reviewers can see -// recurring problem shapes without scanning every raw finding. A cluster keys -// by (tool, severity, category, rule_id) and counts how many findings fell -// into that bucket in the current cycle, with representative samples. +// recurring problem shapes without scanning every raw finding. Clusters are +// built from Poindexter KD-tree similarity over hashed finding features, then +// summarised back into the dominant tool/severity/category/rule identity with +// representative samples. // // Usage example: `cluster := DispatchCluster{Tool: "gosec", Severity: "error", Category: "security", Count: 3, RuleID: "G101"}` type DispatchCluster struct { @@ -694,73 +695,3 @@ func findingsFromJournalPayload(payload map[string]any) []map[string]any { func escapeJournalLiteral(value string) string { return core.Replace(value, "'", "''") } - -// clusterFindings groups the current cycle's findings by (tool, severity, -// category, rule_id) so `.meta/report.json` surfaces recurring shapes. The -// cluster count equals the number of findings in the bucket; the sample list -// is capped at `clusterSampleLimit` representative entries so the payload -// stays bounded for chatty linters. 
-// -// Usage example: `clusters := clusterFindings(report.Findings)` -func clusterFindings(findings []QAFinding) []DispatchCluster { - if len(findings) == 0 { - return nil - } - - byKey := make(map[string]*DispatchCluster, len(findings)) - for _, finding := range findings { - key := core.Sprintf("%s|%s|%s|%s", finding.Tool, finding.Severity, finding.Category, firstNonEmpty(finding.Code, finding.RuleID)) - cluster, ok := byKey[key] - if !ok { - cluster = &DispatchCluster{ - Tool: finding.Tool, - Severity: finding.Severity, - Category: finding.Category, - RuleID: firstNonEmpty(finding.Code, finding.RuleID), - } - byKey[key] = cluster - } - cluster.Count++ - if len(cluster.Samples) < clusterSampleLimit { - cluster.Samples = append(cluster.Samples, DispatchClusterSample{ - File: finding.File, - Line: finding.Line, - Message: finding.Message, - }) - } - } - - // Stable order: highest count first, then by rule identifier so - // identical-count clusters are deterministic in the report. - clusters := make([]DispatchCluster, 0, len(byKey)) - for _, cluster := range byKey { - clusters = append(clusters, *cluster) - } - sortDispatchClusters(clusters) - return clusters -} - -// sortDispatchClusters orders clusters by descending Count then ascending -// RuleID so the report is deterministic across runs and `core-agent status` -// always shows the same ordering for identical data. 
-func sortDispatchClusters(clusters []DispatchCluster) { - for i := 1; i < len(clusters); i++ { - candidate := clusters[i] - j := i - 1 - for j >= 0 && clusterLess(candidate, clusters[j]) { - clusters[j+1] = clusters[j] - j-- - } - clusters[j+1] = candidate - } -} - -func clusterLess(left, right DispatchCluster) bool { - if left.Count != right.Count { - return left.Count > right.Count - } - if left.Tool != right.Tool { - return left.Tool < right.Tool - } - return left.RuleID < right.RuleID -} diff --git a/pkg/agentic/qa_cluster.go b/pkg/agentic/qa_cluster.go new file mode 100644 index 0000000..99c1bf7 --- /dev/null +++ b/pkg/agentic/qa_cluster.go @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: EUPL-1.2 + +package agentic + +import ( + "hash/fnv" + "math" + "unicode" + + core "dappco.re/go/core" + poindexter "github.com/Snider/Poindexter" +) + +const ( + // qaClusterFeatureDimensions keeps the hashed feature vector compact while + // leaving enough buckets for message/token separation in typical QA runs. + qaClusterFeatureDimensions = 24 + + // RFC §7 uses cosine distance 0.15 for "similar enough" QA findings. + qaClusterMinimumCosineSimilarity = 0.85 + qaClusterCosineDistanceThreshold = 1 - qaClusterMinimumCosineSimilarity +) + +var qaClusterEuclideanDistanceThreshold = math.Sqrt(2 - (2 * qaClusterMinimumCosineSimilarity)) + +type qaClusterPoint struct { + Index int +} + +type qaClusterUnion struct { + parent []int + size []int +} + +// clusterFindings groups the current cycle's findings by similarity so +// `.meta/report.json` surfaces recurring shapes instead of listing every +// repeated failure individually. When Poindexter cannot build the KD-tree, the +// function falls back to the previous exact-key bucketing so reporting keeps +// working. 
+// +// Usage example: `clusters := clusterFindings(report.Findings)` +func clusterFindings(findings []QAFinding) []DispatchCluster { + if len(findings) == 0 { + return nil + } + + points := qaClusterPoints(findings) + cosineTree, err := poindexter.NewKDTree(points, + poindexter.WithMetric(poindexter.CosineDistance{}), + ) + if err != nil { + return clusterFindingsFallback(findings) + } + + euclideanTree, err := poindexter.NewKDTree(points, + poindexter.WithMetric(poindexter.EuclideanDistance{}), + ) + if err != nil { + return clusterFindingsFallback(findings) + } + + union := newQAClusterUnion(len(findings)) + for _, point := range points { + qaClusterUnionRadius(union, cosineTree, point, findings, qaClusterCosineDistanceThreshold) + qaClusterUnionRadius(union, euclideanTree, point, findings, qaClusterEuclideanDistanceThreshold) + } + + return qaClusterDispatchClusters(findings, union) +} + +func clusterFindingsFallback(findings []QAFinding) []DispatchCluster { + byKey := make(map[string]*DispatchCluster, len(findings)) + for _, finding := range findings { + key := core.Sprintf("%s|%s|%s|%s", finding.Tool, finding.Severity, finding.Category, firstNonEmpty(finding.Code, finding.RuleID)) + cluster, ok := byKey[key] + if !ok { + cluster = &DispatchCluster{ + Tool: finding.Tool, + Severity: finding.Severity, + Category: finding.Category, + RuleID: firstNonEmpty(finding.Code, finding.RuleID), + } + byKey[key] = cluster + } + cluster.Count++ + if len(cluster.Samples) < clusterSampleLimit { + cluster.Samples = append(cluster.Samples, DispatchClusterSample{ + File: finding.File, + Line: finding.Line, + Message: finding.Message, + }) + } + } + + clusters := make([]DispatchCluster, 0, len(byKey)) + for _, cluster := range byKey { + clusters = append(clusters, *cluster) + } + sortDispatchClusters(clusters) + return clusters +} + +func qaClusterPoints(findings []QAFinding) []poindexter.KDPoint[qaClusterPoint] { + points := make([]poindexter.KDPoint[qaClusterPoint], 
len(findings)) + for index, finding := range findings { + points[index] = poindexter.KDPoint[qaClusterPoint]{ + ID: core.Sprintf("finding-%d", index), + Coords: qaClusterFeatureVector(finding), + Value: qaClusterPoint{Index: index}, + } + } + return points +} + +func qaClusterFeatureVector(finding QAFinding) []float64 { + coords := make([]float64, qaClusterFeatureDimensions) + + qaClusterAddToken(coords, core.Concat("tool:", core.Lower(finding.Tool)), 4) + qaClusterAddToken(coords, core.Concat("severity:", core.Lower(finding.Severity)), 3) + qaClusterAddToken(coords, core.Concat("category:", core.Lower(finding.Category)), 2) + qaClusterAddToken(coords, core.Concat("rule:", core.Lower(firstNonEmpty(finding.Code, finding.RuleID))), 2) + qaClusterAddText(coords, finding.Title, 1.5) + qaClusterAddText(coords, finding.Message, 1) + + if qaClusterVectorZero(coords) { + qaClusterAddToken(coords, "finding", 1) + } + + qaClusterNormalise(coords) + return coords +} + +func qaClusterAddText(coords []float64, text string, weight float64) { + tokens := qaClusterTokens(text) + for _, token := range tokens { + qaClusterAddToken(coords, token, weight) + } + for index := 1; index < len(tokens); index++ { + qaClusterAddToken(coords, core.Concat(tokens[index-1], "_", tokens[index]), weight+0.25) + } +} + +func qaClusterTokens(text string) []string { + if core.Trim(text) == "" { + return nil + } + + buffer := make([]rune, 0, len(text)) + for _, value := range text { + switch { + case unicode.IsLetter(value), unicode.IsDigit(value): + buffer = append(buffer, core.ToLower(value)) + default: + buffer = append(buffer, ' ') + } + } + + parts := core.Split(string(buffer), " ") + tokens := make([]string, 0, len(parts)) + for _, part := range parts { + part = core.Trim(part) + if len(part) < 3 || qaClusterStopWord(part) { + continue + } + tokens = append(tokens, part) + } + return tokens +} + +func qaClusterStopWord(value string) bool { + switch value { + case "the", "and", "for", "with", 
"that", "this", "from", "into", "your", "you", "are", "was", "were", "has", "have", "had", "not", "but", "can", "could", "should", "would": + return true + default: + return false + } +} + +func qaClusterAddToken(coords []float64, token string, weight float64) { + if token == "" || weight == 0 { + return + } + coords[qaClusterBucket(token)] += weight +} + +func qaClusterBucket(token string) int { + hash := fnv.New32a() + _, _ = hash.Write([]byte(token)) + return int(hash.Sum32() % qaClusterFeatureDimensions) +} + +func qaClusterVectorZero(coords []float64) bool { + for _, value := range coords { + if value != 0 { + return false + } + } + return true +} + +func qaClusterNormalise(coords []float64) { + var sum float64 + for _, value := range coords { + sum += value * value + } + if sum == 0 { + return + } + + length := math.Sqrt(sum) + for index := range coords { + coords[index] /= length + } +} + +func qaClusterUnionRadius(union *qaClusterUnion, tree *poindexter.KDTree[qaClusterPoint], point poindexter.KDPoint[qaClusterPoint], findings []QAFinding, threshold float64) { + if union == nil || tree == nil { + return + } + + neighbours, _ := tree.Radius(point.Coords, threshold) + for _, neighbour := range neighbours { + leftIndex := point.Value.Index + rightIndex := neighbour.Value.Index + if leftIndex == rightIndex { + continue + } + if !qaClusterCompatible(findings[leftIndex], findings[rightIndex]) { + continue + } + union.Union(leftIndex, rightIndex) + } +} + +func qaClusterCompatible(left, right QAFinding) bool { + if left.Tool != "" && right.Tool != "" && left.Tool != right.Tool { + return false + } + if left.Severity != "" && right.Severity != "" && left.Severity != right.Severity { + return false + } + return true +} + +func newQAClusterUnion(size int) *qaClusterUnion { + parent := make([]int, size) + setSize := make([]int, size) + for index := range parent { + parent[index] = index + setSize[index] = 1 + } + return &qaClusterUnion{ + parent: parent, + size: 
setSize, + } +} + +func (union *qaClusterUnion) Find(index int) int { + if union.parent[index] != index { + union.parent[index] = union.Find(union.parent[index]) + } + return union.parent[index] +} + +func (union *qaClusterUnion) Union(left, right int) { + leftRoot := union.Find(left) + rightRoot := union.Find(right) + if leftRoot == rightRoot { + return + } + if union.size[leftRoot] < union.size[rightRoot] { + leftRoot, rightRoot = rightRoot, leftRoot + } + union.parent[rightRoot] = leftRoot + union.size[leftRoot] += union.size[rightRoot] +} + +func qaClusterDispatchClusters(findings []QAFinding, union *qaClusterUnion) []DispatchCluster { + byRoot := make(map[int][]int, len(findings)) + for index := range findings { + root := union.Find(index) + byRoot[root] = append(byRoot[root], index) + } + + clusters := make([]DispatchCluster, 0, len(byRoot)) + for _, members := range byRoot { + clusters = append(clusters, qaClusterSummary(findings, members)) + } + sortDispatchClusters(clusters) + return clusters +} + +func qaClusterSummary(findings []QAFinding, members []int) DispatchCluster { + cluster := DispatchCluster{ + Tool: qaClusterDominantValue(findings, members, func(finding QAFinding) string { return finding.Tool }), + Severity: qaClusterDominantValue(findings, members, func(finding QAFinding) string { return finding.Severity }), + Category: qaClusterDominantValue(findings, members, func(finding QAFinding) string { return finding.Category }), + RuleID: qaClusterDominantValue(findings, members, func(finding QAFinding) string { + return firstNonEmpty(finding.Code, finding.RuleID) + }), + Count: len(members), + } + + for _, index := range members { + finding := findings[index] + if len(cluster.Samples) >= clusterSampleLimit { + break + } + cluster.Samples = append(cluster.Samples, DispatchClusterSample{ + File: finding.File, + Line: finding.Line, + Message: finding.Message, + }) + } + + return cluster +} + +func qaClusterDominantValue(findings []QAFinding, members 
[]int, extract func(QAFinding) string) string { + counts := make(map[string]int, len(members)) + bestValue := "" + bestCount := 0 + for _, index := range members { + value := extract(findings[index]) + if value == "" { + continue + } + counts[value]++ + if counts[value] > bestCount || (counts[value] == bestCount && (bestValue == "" || value < bestValue)) { + bestValue = value + bestCount = counts[value] + } + } + return bestValue +} + +// sortDispatchClusters orders clusters by descending Count then ascending +// RuleID so the report is deterministic across runs and `core-agent status` +// always shows the same ordering for identical data. +func sortDispatchClusters(clusters []DispatchCluster) { + for i := 1; i < len(clusters); i++ { + candidate := clusters[i] + j := i - 1 + for j >= 0 && clusterLess(candidate, clusters[j]) { + clusters[j+1] = clusters[j] + j-- + } + clusters[j+1] = candidate + } +} + +func clusterLess(left, right DispatchCluster) bool { + if left.Count != right.Count { + return left.Count > right.Count + } + if left.Tool != right.Tool { + return left.Tool < right.Tool + } + return left.RuleID < right.RuleID +} diff --git a/pkg/agentic/qa_test.go b/pkg/agentic/qa_test.go index 604c73d..02ac9a3 100644 --- a/pkg/agentic/qa_test.go +++ b/pkg/agentic/qa_test.go @@ -209,31 +209,52 @@ func TestQa_StringOutput_Ugly(t *testing.T) { // --- clusterFindings --- func TestQa_ClusterFindings_Good(t *testing.T) { - // Two G101 findings in the same tool merge into one cluster with count 2. + // Similar gosec findings with different rule IDs should still deduplicate. 
findings := []QAFinding{ - {Tool: "gosec", Severity: "error", Category: "security", Code: "G101", File: "a.go", Line: 10, Message: "secret"}, - {Tool: "gosec", Severity: "error", Category: "security", Code: "G101", File: "b.go", Line: 20, Message: "secret"}, - {Tool: "staticcheck", Severity: "warning", Code: "SA1000", File: "c.go", Line: 5}, + {Tool: "gosec", Severity: "error", Category: "security", Code: "G101", File: "a.go", Line: 10, Message: "hardcoded password found"}, + {Tool: "gosec", Severity: "error", Category: "security", Code: "G401", File: "b.go", Line: 20, Message: "hardcoded password found in config"}, + {Tool: "gosec", Severity: "error", Category: "security", RuleID: "HARDCODED-SECRET", File: "c.go", Line: 30, Message: "possible hardcoded password found"}, + {Tool: "gosec", Severity: "error", Category: "security", Code: "G304", File: "d.go", Line: 40, Message: "file path built from tainted input"}, + {Tool: "staticcheck", Severity: "warning", Code: "SA1000", File: "e.go", Line: 5, Message: "invalid regexp pattern"}, } clusters := clusterFindings(findings) - if assert.Len(t, clusters, 2) { - assert.Equal(t, 2, clusters[0].Count) + if assert.Len(t, clusters, 3) { + assert.Equal(t, 3, clusters[0].Count) assert.Equal(t, "gosec", clusters[0].Tool) - assert.Len(t, clusters[0].Samples, 2) + assert.Len(t, clusters[0].Samples, 3) assert.Equal(t, 1, clusters[1].Count) + assert.Equal(t, 1, clusters[2].Count) } } func TestQa_ClusterFindings_Bad(t *testing.T) { - assert.Nil(t, clusterFindings(nil)) - assert.Nil(t, clusterFindings([]QAFinding{})) + assert.NotPanics(t, func() { + assert.Nil(t, clusterFindings(nil)) + assert.Nil(t, clusterFindings([]QAFinding{})) + }) } func TestQa_ClusterFindings_Ugly(t *testing.T) { + findings := []QAFinding{ + {Tool: "gosec", Severity: "error", Code: "G101", File: "a.go", Line: 10, Message: "hardcoded password found"}, + {Tool: "gosec", Severity: "error", Code: "G304", File: "b.go", Line: 20, Message: "file path built from 
tainted input"}, + {Tool: "staticcheck", Severity: "warning", Code: "SA1000", File: "c.go", Line: 30, Message: "invalid regexp pattern"}, + {Tool: "govet", Severity: "warning", Code: "printf", File: "d.go", Line: 40, Message: "printf format mismatch"}, + {Tool: "revive", Severity: "info", Code: "var-naming", File: "e.go", Line: 50, Message: "var name should be camelCase"}, + } + clusters := clusterFindings(findings) + if assert.Len(t, clusters, 5) { + for _, cluster := range clusters { + assert.Equal(t, 1, cluster.Count) + } + } +} + +func TestQa_ClusterFindings_Ugly_SampleLimit(t *testing.T) { // 10 identical findings should cap samples at clusterSampleLimit. findings := make([]QAFinding, 10) for i := range findings { - findings[i] = QAFinding{Tool: "gosec", Code: "G101", File: "same.go", Line: i} + findings[i] = QAFinding{Tool: "gosec", Severity: "error", Code: "G101", File: "same.go", Line: i, Message: "hardcoded password found"} } clusters := clusterFindings(findings) if assert.Len(t, clusters, 1) {