Add Cosine and Weighted Cosine distance metrics, enhance KDTree helpers, and update version to 0.3.0

2025-11-03 18:51:23 +00:00 · 2025-11-03 18:51:23 +00:00 · 3886724129
commit 3886724129
parent ca6f89a99c
11 changed files with 395 additions and 13 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -10,6 +10,17 @@ The format is based on Keep a Changelog and this project adheres to Semantic Ver
 - Lint: enable `errcheck` in `.golangci.yml` with test-file exclusion to reduce noise.
 - CI: enable module cache in `actions/setup-go` to speed up workflows.

+## [0.3.0] - 2025-11-03
+### Added
+- New distance metrics: `CosineDistance` and `WeightedCosineDistance` (1 - cosine similarity), with explicit zero-vector handling (both zero → 0, exactly one zero → 1) and results clamped to [0, 2].
+- N-D normalization helpers: `ComputeNormStatsND`, `BuildND`, `BuildNDWithStats` for arbitrary dimensions, with validation errors (`ErrInvalidFeatures`, `ErrInvalidWeights`, `ErrInvalidInvert`, `ErrStatsDimMismatch`).
+- Tests: unit tests for cosine/weighted-cosine metrics; parity tests between `Build4D` and `BuildND`; error-path tests; extended fuzz to include cosine metrics.
+- pkg.go.dev examples: `ExampleBuildND`, `ExampleBuildNDWithStats`, `ExampleCosineDistance`.
+
+### Changed
+- Version bumped to `0.3.0`.
+- README: list Cosine among supported metrics.
+
 ## [0.2.1] - 2025-11-03
 ### Added
 - Normalization stats helpers: `AxisStats`, `NormStats`, `ComputeNormStats2D/3D/4D`.
--- a/README.md
+++ b/README.md
@ -13,7 +13,7 @@ A Go library package providing utility functions including sorting algorithms wi
 - 🔢 **Sorting Utilities**: Sort integers, strings, and floats in ascending or descending order
 - 🎯 **Custom Sorting**: Sort any type with custom comparison functions or key extractors
 - 🔍 **Binary Search**: Fast search on sorted data
- 🧭 **KDTree (NN Search)**: Build a KDTree over points with generic payloads; nearest, k-NN, and radius queries with Euclidean or Manhattan metrics
+- 🧭 **KDTree (NN Search)**: Build a KDTree over points with generic payloads; nearest, k-NN, and radius queries with Euclidean, Manhattan, Chebyshev, and Cosine metrics
 - 📦 **Generic Functions**: Type-safe operations using Go generics
 - ✅ **Well-Tested**: Comprehensive test coverage
 - 📖 **Documentation**: Full documentation available at GitHub Pages
--- a/doc.go
+++ b/doc.go
@ -1,4 +1,7 @@
 // Package poindexter provides sorting utilities and a KDTree with simple
 // nearest-neighbour queries. It also includes helper functions to build
-// normalised, weighted KD points for 2D/3D/4D use-cases.
+// normalised, weighted KD points for 2D/3D/4D and arbitrary N-D use-cases.
+//
+// Distance metrics include Euclidean (L2), Manhattan (L1), Chebyshev (L∞), and
+// Cosine/Weighted-Cosine for vector similarity.
 package poindexter
--- a/docs/api.md
+++ b/docs/api.md
@ -13,13 +13,13 @@ func Version() string
 Returns the current version of the library.

 **Returns:**
- `string`: The version string (e.g., "0.2.1")
+- `string`: The version string (e.g., "0.3.0")

 **Example:**

 ```go
 version := poindexter.Version()
-fmt.Println(version) // Output: 0.2.1
+fmt.Println(version) // Output: 0.3.0
 ```

 ---
--- a/examples_test.go
+++ b/examples_test.go
@ -140,6 +140,46 @@ func ExampleBuild2DWithStats() {
 	// Output: dim=2 len=3
 }

+func ExampleBuildND() {
+	type rec struct{ a, b, c float64 }
+	items := []rec{{0, 0, 0}, {1, 2, 3}, {0.5, 1, 1.5}}
+	features := []func(rec) float64{
+		func(r rec) float64 { return r.a },
+		func(r rec) float64 { return r.b },
+		func(r rec) float64 { return r.c },
+	}
+	weights := []float64{1, 0.5, 2}
+	invert := []bool{false, false, false}
+	pts, _ := poindexter.BuildND(items, func(r rec) string { return "" }, features, weights, invert)
+	tr, _ := poindexter.NewKDTree(pts)
+	fmt.Printf("dim=%d len=%d", tr.Dim(), tr.Len())
+	// Output: dim=3 len=3
+}
+
+func ExampleBuildNDWithStats() {
+	type rec struct{ a, b float64 }
+	items := []rec{{0, 0}, {1, 2}, {0.5, 1}}
+	features := []func(rec) float64{
+		func(r rec) float64 { return r.a },
+		func(r rec) float64 { return r.b },
+	}
+	stats, _ := poindexter.ComputeNormStatsND(items, features)
+	weights := []float64{1, 0.5}
+	invert := []bool{false, false}
+	pts, _ := poindexter.BuildNDWithStats(items, func(r rec) string { return "" }, features, weights, invert, stats)
+	tr, _ := poindexter.NewKDTree(pts, poindexter.WithMetric(poindexter.CosineDistance{}))
+	fmt.Printf("dim=%d len=%d", tr.Dim(), tr.Len())
+	// Output: dim=2 len=3
+}
+
+func ExampleCosineDistance() {
+	a := []float64{1, 0}
+	b := []float64{0, 1}
+	d := poindexter.CosineDistance{}.Distance(a, b)
+	fmt.Printf("%.0f", d)
+	// Output: 1
+}
+
 func ExampleBuild4DWithStats() {
 	type rec struct{ a, b, c, d float64 }
 	items := []rec{{0, 0, 0, 0}, {1, 1, 1, 1}}
--- a/fuzz_kdtree_test.go
+++ b/fuzz_kdtree_test.go
@ -67,8 +67,17 @@ func FuzzMetrics_NoNegative(f *testing.F) {
 		m1 := EuclideanDistance{}.Distance(a, b)
 		m2 := ManhattanDistance{}.Distance(a, b)
 		m3 := ChebyshevDistance{}.Distance(a, b)
-		if m1 < 0 || m2 < 0 || m3 < 0 {
-			t.Fatalf("negative metric: %v %v %v", m1, m2, m3)
+		m4 := CosineDistance{}.Distance(a, b)
+		w := make([]float64, dim)
+		for i := range w {
+			w[i] = 1
+		}
+		m5 := WeightedCosineDistance{Weights: w}.Distance(a, b)
+		if m1 < 0 || m2 < 0 || m3 < 0 || m4 < 0 || m5 < 0 {
+			t.Fatalf("negative metric: %v %v %v %v %v", m1, m2, m3, m4, m5)
+		}
+		if m4 > 2 || m5 > 2 {
+			t.Fatalf("cosine distance out of bounds: %v %v", m4, m5)
 		}
 	})
 }
--- a/kdtree.go
+++ b/kdtree.go
@ -75,6 +75,97 @@ func (ChebyshevDistance) Distance(a, b []float64) float64 {
 	return max
 }

+// CosineDistance implements 1 - cosine similarity.
+//
+// Distance is defined as 1 - (a·b)/(||a||*||b||). If both vectors are zero,
+// distance is 0. If exactly one is zero, distance is 1. Numerical results are
+// clamped to [0,2].
+// Note: For typical normalized/weighted feature vectors with non-negative entries,
+// the value will be in [0,1]. Opposite vectors in general spaces can yield up to 2.
+type CosineDistance struct{}
+
+func (CosineDistance) Distance(a, b []float64) float64 {
+	var dot, na2, nb2 float64
+	for i := range a {
+		ai := a[i]
+		bi := b[i]
+		dot += ai * bi
+		na2 += ai * ai
+		nb2 += bi * bi
+	}
+	if na2 == 0 && nb2 == 0 {
+		return 0
+	}
+	if na2 == 0 || nb2 == 0 {
+		return 1
+	}
+	den := math.Sqrt(na2) * math.Sqrt(nb2)
+	if den == 0 { // guard, though covered above
+		return 1
+	}
+	cos := dot / den
+	if cos > 1 {
+		cos = 1
+	} else if cos < -1 {
+		cos = -1
+	}
+	d := 1 - cos
+	if d < 0 {
+		return 0
+	}
+	if d > 2 {
+		return 2
+	}
+	return d
+}
+
+// WeightedCosineDistance implements 1 - weighted cosine similarity, where weights
+// scale each axis in both the dot product and the norms.
+// If Weights is nil or has zero length, or if its length differs from the input
+// vectors (or the two vectors differ in length), this falls back to CosineDistance.
+type WeightedCosineDistance struct{ Weights []float64 }
+
+func (wcd WeightedCosineDistance) Distance(a, b []float64) float64 {
+	w := wcd.Weights
+	if len(w) == 0 || len(w) != len(a) || len(a) != len(b) {
+		// Fall back to unweighted cosine when lengths mismatch or weights are missing.
+		return CosineDistance{}.Distance(a, b)
+	}
+	var dot, na2, nb2 float64
+	for i := range a {
+		wi := w[i]
+		ai := a[i]
+		bi := b[i]
+		v := wi * ai
+		dot += v * bi         // wi*ai*bi
+		na2 += v * ai         // wi*ai*ai
+		nb2 += (wi * bi) * bi // wi*bi*bi
+	}
+	if na2 == 0 && nb2 == 0 {
+		return 0
+	}
+	if na2 == 0 || nb2 == 0 {
+		return 1
+	}
+	den := math.Sqrt(na2) * math.Sqrt(nb2)
+	if den == 0 {
+		return 1
+	}
+	cos := dot / den
+	if cos > 1 {
+		cos = 1
+	} else if cos < -1 {
+		cos = -1
+	}
+	d := 1 - cos
+	if d < 0 {
+		return 0
+	}
+	if d > 2 {
+		return 2
+	}
+	return d
+}
+
 // KDOption configures KDTree construction (non-generic to allow inference).
 type KDOption func(*kdOptions)

--- a/kdtree_helpers.go
+++ b/kdtree_helpers.go
@ -1,9 +1,23 @@
 package poindexter

-// Helper builders for KDTree points with min-max normalization, optional inversion per-axis,
+import "errors"
+
+// Helper builders for KDTree points with min-max normalisation, optional inversion per-axis,
 // and per-axis weights. These are convenience utilities to make it easy to map domain
 // records into KD space for 2D/3D/4D use-cases.

+// Errors for helper builders.
+var (
+	// ErrInvalidFeatures indicates that no features were provided or a nil feature was encountered.
+	ErrInvalidFeatures = errors.New("kdtree: invalid features: provide at least one feature and ensure none are nil")
+	// ErrInvalidWeights indicates weights length doesn't match features length.
+	ErrInvalidWeights = errors.New("kdtree: invalid weights length; must match number of features")
+	// ErrInvalidInvert indicates invert flags length doesn't match features length.
+	ErrInvalidInvert = errors.New("kdtree: invalid invert length; must match number of features")
+	// ErrStatsDimMismatch indicates NormStats dimensions do not match features length.
+	ErrStatsDimMismatch = errors.New("kdtree: stats dimensionality mismatch")
+)
+
 // AxisStats holds the min/max observed for a single axis.
 type AxisStats struct {
 	Min float64
@ -16,6 +30,103 @@ type NormStats struct {
 	Stats []AxisStats
 }

+// ComputeNormStatsND computes per-axis min/max for an arbitrary number of features.
+func ComputeNormStatsND[T any](items []T, features []func(T) float64) (NormStats, error) {
+	if len(features) == 0 {
+		return NormStats{}, ErrInvalidFeatures
+	}
+	// Initialise mins/maxes on first item where possible
+	stats := make([]AxisStats, len(features))
+	if len(items) == 0 {
+		// empty items → zero stats slice of correct dim
+		return NormStats{Stats: stats}, nil
+	}
+	// Seed with first item values
+	first := items[0]
+	for i, f := range features {
+		if f == nil {
+			return NormStats{}, ErrInvalidFeatures
+		}
+		v := f(first)
+		stats[i] = AxisStats{Min: v, Max: v}
+	}
+	// Process remaining items
+	for _, it := range items[1:] {
+		for i, f := range features {
+			v := f(it)
+			if v < stats[i].Min {
+				stats[i].Min = v
+			}
+			if v > stats[i].Max {
+				stats[i].Max = v
+			}
+		}
+	}
+	return NormStats{Stats: stats}, nil
+}
+
+// BuildND constructs normalised-and-weighted KD points from an arbitrary number of features.
+// Features are min-max normalised per axis over the provided items, optionally inverted,
+// then multiplied by per-axis weights.
+func BuildND[T any](items []T, id func(T) string, features []func(T) float64, weights []float64, invert []bool) ([]KDPoint[T], error) {
+	if len(items) == 0 {
+		return nil, nil
+	}
+	if len(features) == 0 {
+		return nil, ErrInvalidFeatures
+	}
+	if len(weights) != len(features) {
+		return nil, ErrInvalidWeights
+	}
+	if len(invert) != len(features) {
+		return nil, ErrInvalidInvert
+	}
+	stats, err := ComputeNormStatsND(items, features)
+	if err != nil {
+		return nil, err
+	}
+	return BuildNDWithStats(items, id, features, weights, invert, stats)
+}
+
+// BuildNDWithStats builds points using provided normalisation stats.
+func BuildNDWithStats[T any](items []T, id func(T) string, features []func(T) float64, weights []float64, invert []bool, stats NormStats) ([]KDPoint[T], error) {
+	if len(items) == 0 {
+		return nil, nil
+	}
+	if len(features) == 0 {
+		return nil, ErrInvalidFeatures
+	}
+	if len(weights) != len(features) {
+		return nil, ErrInvalidWeights
+	}
+	if len(invert) != len(features) {
+		return nil, ErrInvalidInvert
+	}
+	if len(stats.Stats) != len(features) {
+		return nil, ErrStatsDimMismatch
+	}
+	pts := make([]KDPoint[T], len(items))
+	for i, it := range items {
+		coords := make([]float64, len(features))
+		for d, f := range features {
+			if f == nil {
+				return nil, ErrInvalidFeatures
+			}
+			n := scale01(f(it), stats.Stats[d].Min, stats.Stats[d].Max)
+			if invert[d] {
+				n = 1 - n
+			}
+			coords[d] = weights[d] * n
+		}
+		var pid string
+		if id != nil {
+			pid = id(it)
+		}
+		pts[i] = KDPoint[T]{ID: pid, Value: it, Coords: coords}
+	}
+	return pts, nil
+}
+
 // minMax returns (min,max) of a slice.
 func minMax(xs []float64) (float64, float64) {
 	if len(xs) == 0 {
@ -89,7 +200,7 @@ func ComputeNormStats4D[T any](items []T, f1, f2, f3, f4 func(T) float64) NormSt
 	return NormStats{Stats: []AxisStats{{mn1, mx1}, {mn2, mx2}, {mn3, mx3}, {mn4, mx4}}}
 }

-// Build2D constructs normalized-and-weighted KD points from items using two feature extractors.
+// Build2D constructs normalised-and-weighted KD points from items using two feature extractors.
 // - id: function to provide a stable string ID (can return "" if you don't need DeleteByID)
 // - f1,f2: feature extractors (raw values)
 // - weights: per-axis weights applied after normalization
@ -156,7 +267,7 @@ func Build2DWithStats[T any](items []T, id func(T) string, f1, f2 func(T) float6
 	return pts, nil
 }

-// Build3D constructs normalized-and-weighted KD points using three feature extractors.
+// Build3D constructs normalised-and-weighted KD points using three feature extractors.
 func Build3D[T any](items []T, id func(T) string, f1, f2, f3 func(T) float64, weights [3]float64, invert [3]bool) ([]KDPoint[T], error) {
 	if len(items) == 0 {
 		return nil, nil
@ -231,7 +342,7 @@ func Build3DWithStats[T any](items []T, id func(T) string, f1, f2, f3 func(T) fl
 	return pts, nil
 }

-// Build4D constructs normalized-and-weighted KD points using four feature extractors.
+// Build4D constructs normalised-and-weighted KD points using four feature extractors.
 func Build4D[T any](items []T, id func(T) string, f1, f2, f3, f4 func(T) float64, weights [4]float64, invert [4]bool) ([]KDPoint[T], error) {
 	if len(items) == 0 {
 		return nil, nil
--- a/kdtree_nd_test.go
+++ b/kdtree_nd_test.go
@ -0,0 +1,117 @@
+package poindexter
+
+import (
+	"fmt"
+	"testing"
+)
+
+func TestCosineDistance_Basics(t *testing.T) {
+	// identical vectors → distance 0
+	a := []float64{1, 0, 0}
+	b := []float64{1, 0, 0}
+	d := CosineDistance{}.Distance(a, b)
+	if d != 0 {
+		t.Fatalf("expected 0, got %v", d)
+	}
+	// orthogonal → distance 1
+	b = []float64{0, 1, 0}
+	d = CosineDistance{}.Distance(a, b)
+	if d < 0.999 || d > 1.001 {
+		t.Fatalf("expected ~1, got %v", d)
+	}
+	// opposite → distance 2
+	a = []float64{1, 0}
+	b = []float64{-1, 0}
+	d = CosineDistance{}.Distance(a, b)
+	if d < 1.999 || d > 2.001 {
+		t.Fatalf("expected ~2, got %v", d)
+	}
+	// zero vectors
+	a = []float64{0, 0}
+	b = []float64{0, 0}
+	d = CosineDistance{}.Distance(a, b)
+	if d != 0 {
+		t.Fatalf("both zero → 0, got %v", d)
+	}
+	// one zero
+	a = []float64{0, 0}
+	b = []float64{1, 2}
+	d = CosineDistance{}.Distance(a, b)
+	if d != 1 {
+		t.Fatalf("one zero → 1, got %v", d)
+	}
+}
+
+func TestWeightedCosineDistance_Basics(t *testing.T) {
+	w := WeightedCosineDistance{Weights: []float64{2, 0.5}}
+	a := []float64{1, 0}
+	b := []float64{1, 0}
+	d := w.Distance(a, b)
+	if d != 0 {
+		t.Fatalf("expected 0, got %v", d)
+	}
+	// orthogonal remains ~1 regardless of weights for these axes
+	b = []float64{0, 3}
+	d = w.Distance(a, b)
+	if d < 0.999 || d > 1.001 {
+		t.Fatalf("expected ~1, got %v", d)
+	}
+}
+
+func TestBuildND_ParityWithBuild4D(t *testing.T) {
+	type rec struct{ a, b, c, d float64 }
+	items := []rec{{0, 10, 100, 1}, {10, 20, 200, 2}, {5, 15, 150, 1.5}}
+	weights4 := [4]float64{1.0, 0.5, 2.0, 1.0}
+	invert4 := [4]bool{false, true, false, true}
+	pts4, err := Build4D(items,
+		func(r rec) string { return "" },
+		func(r rec) float64 { return r.a },
+		func(r rec) float64 { return r.b },
+		func(r rec) float64 { return r.c },
+		func(r rec) float64 { return r.d },
+		weights4, invert4,
+	)
+	if err != nil {
+		t.Fatalf("build4d err: %v", err)
+	}
+
+	features := []func(rec) float64{
+		func(r rec) float64 { return r.a },
+		func(r rec) float64 { return r.b },
+		func(r rec) float64 { return r.c },
+		func(r rec) float64 { return r.d },
+	}
+	wts := []float64{weights4[0], weights4[1], weights4[2], weights4[3]}
+	inv := []bool{invert4[0], invert4[1], invert4[2], invert4[3]}
+	ptsN, err := BuildND(items, func(r rec) string { return "" }, features, wts, inv)
+	if err != nil {
+		t.Fatalf("buildND err: %v", err)
+	}
+	if len(ptsN) != len(pts4) {
+		t.Fatalf("len mismatch")
+	}
+	for i := range ptsN {
+		if len(ptsN[i].Coords) != 4 {
+			t.Fatalf("dim != 4")
+		}
+		for d := 0; d < 4; d++ {
+			if fmt.Sprintf("%.6f", ptsN[i].Coords[d]) != fmt.Sprintf("%.6f", pts4[i].Coords[d]) {
+				t.Fatalf("coords mismatch at i=%d d=%d: %v vs %v", i, d, ptsN[i].Coords, pts4[i].Coords)
+			}
+		}
+	}
+}
+
+func TestBuildNDWithStats_Errors(t *testing.T) {
+	type rec struct{ x float64 }
+	items := []rec{{1}, {2}}
+	features := []func(rec) float64{func(r rec) float64 { return r.x }}
+	wts := []float64{1}
+	inv := []bool{false}
+	// stats dim mismatch
+	stats := NormStats{Stats: []AxisStats{{Min: 0, Max: 1}, {Min: 0, Max: 1}}}
+	_, err := BuildNDWithStats(items, func(r rec) string { return "" }, features, wts, inv, stats)
+	if err == nil {
+		t.Fatalf("expected error for stats dim mismatch")
+	}
+}
--- a/poindexter.go
+++ b/poindexter.go
@ -3,7 +3,7 @@ package poindexter

 // Version returns the current version of the library.
 func Version() string {
-	return "0.2.1"
+	return "0.3.0"
 }

 // Hello returns a greeting message.
--- a/poindexter_test.go
+++ b/poindexter_test.go
@ -7,8 +7,8 @@ func TestVersion(t *testing.T) {
 	if version == "" {
 		t.Error("Version should not be empty")
 	}
-	if version != "0.2.1" {
-		t.Errorf("Expected version 0.2.1, got %s", version)
+	if version != "0.3.0" {
+		t.Errorf("Expected version 0.3.0, got %s", version)
 	}
 }