diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cc58e0..7c4f5a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,17 @@ The format is based on Keep a Changelog and this project adheres to Semantic Ver - Lint: enable `errcheck` in `.golangci.yml` with test-file exclusion to reduce noise. - CI: enable module cache in `actions/setup-go` to speed up workflows. +## [0.3.0] - 2025-11-03 +### Added +- New distance metrics: `CosineDistance` and `WeightedCosineDistance` (1 - cosine similarity), with robust zero-vector handling and bounds. +- N-D normalization helpers: `ComputeNormStatsND`, `BuildND`, `BuildNDWithStats` for arbitrary dimensions, with validation errors (`ErrInvalidFeatures`, `ErrInvalidWeights`, `ErrInvalidInvert`, `ErrStatsDimMismatch`). +- Tests: unit tests for cosine/weighted-cosine metrics; parity tests between `Build4D` and `BuildND`; error-path tests; extended fuzz to include cosine metrics. +- pkg.go.dev examples: `ExampleBuildND`, `ExampleBuildNDWithStats`, `ExampleCosineDistance`. + +### Changed +- Version bumped to `0.3.0`. +- README: list Cosine among supported metrics. + ## [0.2.1] - 2025-11-03 ### Added - Normalization stats helpers: `AxisStats`, `NormStats`, `ComputeNormStats2D/3D/4D`. 
diff --git a/README.md b/README.md index 6c10d61..78a0f5e 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ A Go library package providing utility functions including sorting algorithms wi - 🔢 **Sorting Utilities**: Sort integers, strings, and floats in ascending or descending order - 🎯 **Custom Sorting**: Sort any type with custom comparison functions or key extractors - 🔍 **Binary Search**: Fast search on sorted data -- 🧭 **KDTree (NN Search)**: Build a KDTree over points with generic payloads; nearest, k-NN, and radius queries with Euclidean or Manhattan metrics +- 🧭 **KDTree (NN Search)**: Build a KDTree over points with generic payloads; nearest, k-NN, and radius queries with Euclidean, Manhattan, Chebyshev, and Cosine metrics - 📦 **Generic Functions**: Type-safe operations using Go generics - ✅ **Well-Tested**: Comprehensive test coverage - 📖 **Documentation**: Full documentation available at GitHub Pages diff --git a/doc.go b/doc.go index 8e24b7b..00378b2 100644 --- a/doc.go +++ b/doc.go @@ -1,4 +1,7 @@ // Package poindexter provides sorting utilities and a KDTree with simple // nearest-neighbour queries. It also includes helper functions to build -// normalised, weighted KD points for 2D/3D/4D use-cases. +// normalised, weighted KD points for 2D/3D/4D and arbitrary N-D use-cases. +// +// Distance metrics include Euclidean (L2), Manhattan (L1), Chebyshev (L∞), and +// Cosine/Weighted-Cosine for vector similarity. package poindexter diff --git a/docs/api.md b/docs/api.md index 4435649..8c5c9f2 100644 --- a/docs/api.md +++ b/docs/api.md @@ -13,13 +13,13 @@ func Version() string Returns the current version of the library. 
**Returns:** -- `string`: The version string (e.g., "0.2.1") +- `string`: The version string (e.g., "0.3.0") **Example:** ```go version := poindexter.Version() -fmt.Println(version) // Output: 0.2.1 +fmt.Println(version) // Output: 0.3.0 ``` --- diff --git a/examples_test.go b/examples_test.go index 3c1ca81..4a7a100 100644 --- a/examples_test.go +++ b/examples_test.go @@ -140,6 +140,46 @@ func ExampleBuild2DWithStats() { // Output: dim=2 len=3 } +func ExampleBuildND() { + type rec struct{ a, b, c float64 } + items := []rec{{0, 0, 0}, {1, 2, 3}, {0.5, 1, 1.5}} + features := []func(rec) float64{ + func(r rec) float64 { return r.a }, + func(r rec) float64 { return r.b }, + func(r rec) float64 { return r.c }, + } + weights := []float64{1, 0.5, 2} + invert := []bool{false, false, false} + pts, _ := poindexter.BuildND(items, func(r rec) string { return "" }, features, weights, invert) + tr, _ := poindexter.NewKDTree(pts) + fmt.Printf("dim=%d len=%d", tr.Dim(), tr.Len()) + // Output: dim=3 len=3 +} + +func ExampleBuildNDWithStats() { + type rec struct{ a, b float64 } + items := []rec{{0, 0}, {1, 2}, {0.5, 1}} + features := []func(rec) float64{ + func(r rec) float64 { return r.a }, + func(r rec) float64 { return r.b }, + } + stats, _ := poindexter.ComputeNormStatsND(items, features) + weights := []float64{1, 0.5} + invert := []bool{false, false} + pts, _ := poindexter.BuildNDWithStats(items, func(r rec) string { return "" }, features, weights, invert, stats) + tr, _ := poindexter.NewKDTree(pts, poindexter.WithMetric(poindexter.CosineDistance{})) + fmt.Printf("dim=%d len=%d", tr.Dim(), tr.Len()) + // Output: dim=2 len=3 +} + +func ExampleCosineDistance() { + a := []float64{1, 0} + b := []float64{0, 1} + d := poindexter.CosineDistance{}.Distance(a, b) + fmt.Printf("%.0f", d) + // Output: 1 +} + func ExampleBuild4DWithStats() { type rec struct{ a, b, c, d float64 } items := []rec{{0, 0, 0, 0}, {1, 1, 1, 1}} diff --git a/fuzz_kdtree_test.go b/fuzz_kdtree_test.go index 
16d7e57..01498cb 100644 --- a/fuzz_kdtree_test.go +++ b/fuzz_kdtree_test.go @@ -67,8 +67,17 @@ func FuzzMetrics_NoNegative(f *testing.F) { m1 := EuclideanDistance{}.Distance(a, b) m2 := ManhattanDistance{}.Distance(a, b) m3 := ChebyshevDistance{}.Distance(a, b) - if m1 < 0 || m2 < 0 || m3 < 0 { - t.Fatalf("negative metric: %v %v %v", m1, m2, m3) + m4 := CosineDistance{}.Distance(a, b) + w := make([]float64, dim) + for i := range w { + w[i] = 1 + } + m5 := WeightedCosineDistance{Weights: w}.Distance(a, b) + if m1 < 0 || m2 < 0 || m3 < 0 || m4 < 0 || m5 < 0 { + t.Fatalf("negative metric: %v %v %v %v %v", m1, m2, m3, m4, m5) + } + if m4 > 2 || m5 > 2 { + t.Fatalf("cosine distance out of bounds: %v %v", m4, m5) } }) } diff --git a/kdtree.go b/kdtree.go index 76180d2..409c4b9 100644 --- a/kdtree.go +++ b/kdtree.go @@ -75,6 +75,97 @@ func (ChebyshevDistance) Distance(a, b []float64) float64 { return max } +// CosineDistance implements 1 - cosine similarity. +// +// Distance is defined as 1 - (aยทb)/(||a||*||b||). If both vectors are zero, +// distance is 0. If exactly one is zero, distance is 1. Numerical results are +// clamped to [0,2]. +// Note: For typical normalized/weighted feature vectors with non-negative entries, +// the value will be in [0,1]. Opposite vectors in general spaces can yield up to 2. 
type CosineDistance struct{}

// Distance returns 1 - cosine similarity of a and b. The result is always in
// [0,2].
//
// Special cases:
//   - both vectors have zero norm: 0
//   - exactly one vector has zero norm: 1
//   - the similarity is undefined (a NaN component, or overflow while
//     accumulating the sums): 1
//
// Slices of different length are compared as if the shorter one were padded
// with zeros, so a length mismatch never panics.
func (CosineDistance) Distance(a, b []float64) float64 {
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	var dot, na2, nb2 float64
	for i := 0; i < n; i++ {
		ai, bi := a[i], b[i]
		dot += ai * bi
		na2 += ai * ai
		nb2 += bi * bi
	}
	// Components beyond the shared prefix pair with an implicit zero: they add
	// nothing to the dot product and only grow their own vector's norm.
	for _, v := range a[n:] {
		na2 += v * v
	}
	for _, v := range b[n:] {
		nb2 += v * v
	}
	return cosineFromSums(dot, na2, nb2)
}

// WeightedCosineDistance implements 1 - weighted cosine similarity, where weights
// scale each axis in both the dot product and the norms.
//
// If Weights is nil or empty, if its length differs from the inputs, or if the
// inputs differ in length from each other, Distance falls back to the
// unweighted CosineDistance.
type WeightedCosineDistance struct{ Weights []float64 }

// Distance returns 1 - (sum w*a*b) / sqrt((sum w*a*a) * (sum w*b*b)), in [0,2].
//
// The zero-vector conventions of CosineDistance apply to the weighted norms.
// Weights are expected to be non-negative: a negative weighted norm leaves the
// similarity undefined, and the distance is then reported as 1 rather than NaN.
func (wcd WeightedCosineDistance) Distance(a, b []float64) float64 {
	w := wcd.Weights
	if len(w) == 0 || len(w) != len(a) || len(a) != len(b) {
		// Weights unusable for these inputs: use the unweighted metric, which
		// tolerates mismatched lengths.
		return CosineDistance{}.Distance(a, b)
	}
	var dot, na2, nb2 float64
	for i, wi := range w {
		ai, bi := a[i], b[i]
		wa := wi * ai
		dot += wa * bi        // wi*ai*bi
		na2 += wa * ai        // wi*ai*ai
		nb2 += (wi * bi) * bi // wi*bi*bi
	}
	return cosineFromSums(dot, na2, nb2)
}

// cosineFromSums turns an accumulated dot product and the two squared norms
// into a cosine distance in [0,2]. It holds the conventions shared by
// CosineDistance and WeightedCosineDistance.
func cosineFromSums(dot, na2, nb2 float64) float64 {
	switch {
	case na2 == 0 && nb2 == 0:
		return 0
	case na2 == 0 || nb2 == 0:
		return 1
	case na2 < 0 || nb2 < 0:
		// Only reachable with negative weights; the norm is not defined.
		return 1
	}
	// Take one square root of the product instead of multiplying two rounded
	// roots: for identical vectors (dot == na2 == nb2) this makes the cosine
	// exactly 1, so the distance is exactly 0.
	den := math.Sqrt(na2 * nb2)
	if den == 0 || math.IsInf(den, 0) {
		// The product under- or overflowed; the separate roots have more range.
		den = math.Sqrt(na2) * math.Sqrt(nb2)
	}
	cos := dot / den
	if den == 0 || math.IsNaN(cos) {
		return 1
	}
	// Rounding can push |cos| marginally past 1; clamping bounds the result.
	if cos > 1 {
		cos = 1
	} else if cos < -1 {
		cos = -1
	}
	return 1 - cos
}

// KDOption configures KDTree construction (non-generic to allow inference).
type KDOption func(*kdOptions) diff --git a/kdtree_helpers.go b/kdtree_helpers.go index a70f0b6..063f0ae 100644 --- a/kdtree_helpers.go +++ b/kdtree_helpers.go @@ -1,9 +1,23 @@ package poindexter -// Helper builders for KDTree points with min-max normalization, optional inversion per-axis, +import "errors" + +// Helper builders for KDTree points with min-max normalisation, optional inversion per-axis, // and per-axis weights. These are convenience utilities to make it easy to map domain // records into KD space for 2D/3D/4D use-cases. +// Errors for helper builders. +var ( + // ErrInvalidFeatures indicates that no features were provided or nil feature encountered. + ErrInvalidFeatures = errors.New("kdtree: invalid features: provide at least one feature and ensure none are nil") + // ErrInvalidWeights indicates weights length doesn't match features length. + ErrInvalidWeights = errors.New("kdtree: invalid weights length; must match number of features") + // ErrInvalidInvert indicates invert flags length doesn't match features length. + ErrInvalidInvert = errors.New("kdtree: invalid invert length; must match number of features") + // ErrStatsDimMismatch indicates NormStats dimensions do not match features length. + ErrStatsDimMismatch = errors.New("kdtree: stats dimensionality mismatch") +) + // AxisStats holds the min/max observed for a single axis. type AxisStats struct { Min float64 @@ -16,6 +30,103 @@ type NormStats struct { Stats []AxisStats } +// ComputeNormStatsND computes per-axis min/max for an arbitrary number of features. 
+func ComputeNormStatsND[T any](items []T, features []func(T) float64) (NormStats, error) { + if len(features) == 0 { + return NormStats{}, ErrInvalidFeatures + } + // Initialise mins/maxes on first item where possible + stats := make([]AxisStats, len(features)) + if len(items) == 0 { + // empty items โ†’ zero stats slice of correct dim + return NormStats{Stats: stats}, nil + } + // Seed with first item values + first := items[0] + for i, f := range features { + if f == nil { + return NormStats{}, ErrInvalidFeatures + } + v := f(first) + stats[i] = AxisStats{Min: v, Max: v} + } + // Process remaining items + for _, it := range items[1:] { + for i, f := range features { + v := f(it) + if v < stats[i].Min { + stats[i].Min = v + } + if v > stats[i].Max { + stats[i].Max = v + } + } + } + return NormStats{Stats: stats}, nil +} + +// BuildND constructs normalised-and-weighted KD points from arbitrary amount features. +// Features are min-max normalised per axis over the provided items, optionally inverted, +// then multiplied by per-axis weights. +func BuildND[T any](items []T, id func(T) string, features []func(T) float64, weights []float64, invert []bool) ([]KDPoint[T], error) { + if len(items) == 0 { + return nil, nil + } + if len(features) == 0 { + return nil, ErrInvalidFeatures + } + if len(weights) != len(features) { + return nil, ErrInvalidWeights + } + if len(invert) != len(features) { + return nil, ErrInvalidInvert + } + stats, err := ComputeNormStatsND(items, features) + if err != nil { + return nil, err + } + return BuildNDWithStats(items, id, features, weights, invert, stats) +} + +// BuildNDWithStats builds points using provided normalisation stats. 
+func BuildNDWithStats[T any](items []T, id func(T) string, features []func(T) float64, weights []float64, invert []bool, stats NormStats) ([]KDPoint[T], error) { + if len(items) == 0 { + return nil, nil + } + if len(features) == 0 { + return nil, ErrInvalidFeatures + } + if len(weights) != len(features) { + return nil, ErrInvalidWeights + } + if len(invert) != len(features) { + return nil, ErrInvalidInvert + } + if len(stats.Stats) != len(features) { + return nil, ErrStatsDimMismatch + } + pts := make([]KDPoint[T], len(items)) + for i, it := range items { + coords := make([]float64, len(features)) + for d, f := range features { + if f == nil { + return nil, ErrInvalidFeatures + } + n := scale01(f(it), stats.Stats[d].Min, stats.Stats[d].Max) + if invert[d] { + n = 1 - n + } + coords[d] = weights[d] * n + } + var pid string + if id != nil { + pid = id(it) + } + pts[i] = KDPoint[T]{ID: pid, Value: it, Coords: coords} + } + return pts, nil +} + // minMax returns (min,max) of a slice. func minMax(xs []float64) (float64, float64) { if len(xs) == 0 { @@ -89,7 +200,7 @@ func ComputeNormStats4D[T any](items []T, f1, f2, f3, f4 func(T) float64) NormSt return NormStats{Stats: []AxisStats{{mn1, mx1}, {mn2, mx2}, {mn3, mx3}, {mn4, mx4}}} } -// Build2D constructs normalized-and-weighted KD points from items using two feature extractors. +// Build2D constructs normalised-and-weighted KD points from items using two feature extractors. // - id: function to provide a stable string ID (can return "" if you don't need DeleteByID) // - f1,f2: feature extractors (raw values) // - weights: per-axis weights applied after normalization @@ -156,7 +267,7 @@ func Build2DWithStats[T any](items []T, id func(T) string, f1, f2 func(T) float6 return pts, nil } -// Build3D constructs normalized-and-weighted KD points using three feature extractors. +// Build3D constructs normalised-and-weighted KD points using three feature extractors. 
func Build3D[T any](items []T, id func(T) string, f1, f2, f3 func(T) float64, weights [3]float64, invert [3]bool) ([]KDPoint[T], error) { if len(items) == 0 { return nil, nil @@ -231,7 +342,7 @@ func Build3DWithStats[T any](items []T, id func(T) string, f1, f2, f3 func(T) fl return pts, nil } -// Build4D constructs normalized-and-weighted KD points using four feature extractors. +// Build4D constructs normalised-and-weighted KD points using four feature extractors. func Build4D[T any](items []T, id func(T) string, f1, f2, f3, f4 func(T) float64, weights [4]float64, invert [4]bool) ([]KDPoint[T], error) { if len(items) == 0 { return nil, nil diff --git a/kdtree_nd_test.go b/kdtree_nd_test.go new file mode 100644 index 0000000..66e8927 --- /dev/null +++ b/kdtree_nd_test.go @@ -0,0 +1,117 @@ +package poindexter + +import ( + "fmt" + "testing" +) + +func TestCosineDistance_Basics(t *testing.T) { + // identical vectors โ†’ distance 0 + a := []float64{1, 0, 0} + b := []float64{1, 0, 0} + d := CosineDistance{}.Distance(a, b) + if d != 0 { + t.Fatalf("expected 0, got %v", d) + } + // orthogonal โ†’ distance 1 + b = []float64{0, 1, 0} + d = CosineDistance{}.Distance(a, b) + if d < 0.999 || d > 1.001 { + t.Fatalf("expected ~1, got %v", d) + } + // opposite โ†’ distance 2 + a = []float64{1, 0} + b = []float64{-1, 0} + d = CosineDistance{}.Distance(a, b) + if d < 1.999 || d > 2.001 { + t.Fatalf("expected ~2, got %v", d) + } + // zero vectors + a = []float64{0, 0} + b = []float64{0, 0} + d = CosineDistance{}.Distance(a, b) + if d != 0 { + t.Fatalf("both zero โ†’ 0, got %v", d) + } + // one zero + a = []float64{0, 0} + b = []float64{1, 2} + d = CosineDistance{}.Distance(a, b) + if d != 1 { + t.Fatalf("one zero โ†’ 1, got %v", d) + } +} + +func TestWeightedCosineDistance_Basics(t *testing.T) { + w := WeightedCosineDistance{Weights: []float64{2, 0.5}} + a := []float64{1, 0} + b := []float64{1, 0} + d := w.Distance(a, b) + if d != 0 { + t.Fatalf("expected 0, got %v", d) + } + // 
orthogonal remains ~1 regardless of weights for these axes + b = []float64{0, 3} + d = w.Distance(a, b) + if d < 0.999 || d > 1.001 { + t.Fatalf("expected ~1, got %v", d) + } +} + +func TestBuildND_ParityWithBuild4D(t *testing.T) { + type rec struct{ a, b, c, d float64 } + items := []rec{{0, 10, 100, 1}, {10, 20, 200, 2}, {5, 15, 150, 1.5}} + weights4 := [4]float64{1.0, 0.5, 2.0, 1.0} + invert4 := [4]bool{false, true, false, true} + pts4, err := Build4D(items, + func(r rec) string { return "" }, + func(r rec) float64 { return r.a }, + func(r rec) float64 { return r.b }, + func(r rec) float64 { return r.c }, + func(r rec) float64 { return r.d }, + weights4, invert4, + ) + if err != nil { + t.Fatalf("build4d err: %v", err) + } + + features := []func(rec) float64{ + func(r rec) float64 { return r.a }, + func(r rec) float64 { return r.b }, + func(r rec) float64 { return r.c }, + func(r rec) float64 { return r.d }, + } + wts := []float64{weights4[0], weights4[1], weights4[2], weights4[3]} + inv := []bool{invert4[0], invert4[1], invert4[2], invert4[3]} + ptsN, err := BuildND(items, func(r rec) string { return "" }, features, wts, inv) + if err != nil { + t.Fatalf("buildND err: %v", err) + } + if len(ptsN) != len(pts4) { + t.Fatalf("len mismatch") + } + for i := range ptsN { + if len(ptsN[i].Coords) != 4 { + t.Fatalf("dim != 4") + } + for d := 0; d < 4; d++ { + if fmt.Sprintf("%.6f", ptsN[i].Coords[d]) != fmt.Sprintf("%.6f", pts4[i].Coords[d]) { + t.Fatalf("coords mismatch at i=%d d=%d: %v vs %v", i, d, ptsN[i].Coords, pts4[i].Coords) + } + } + } +} + +func TestBuildNDWithStats_Errors(t *testing.T) { + type rec struct{ x float64 } + items := []rec{{1}, {2}} + features := []func(rec) float64{func(r rec) float64 { return r.x }} + wts := []float64{1} + inv := []bool{false} + // stats dim mismatch + stats := NormStats{Stats: []AxisStats{{Min: 0, Max: 1}, {Min: 0, Max: 1}}} + _, err := BuildNDWithStats(items, func(r rec) string { return "" }, features, wts, inv, stats) + if 
err == nil { + t.Fatalf("expected error for stats dim mismatch") + } +} diff --git a/poindexter.go b/poindexter.go index 2c04e19..aabde1b 100644 --- a/poindexter.go +++ b/poindexter.go @@ -3,7 +3,7 @@ package poindexter // Version returns the current version of the library. func Version() string { - return "0.2.1" + return "0.3.0" } // Hello returns a greeting message. diff --git a/poindexter_test.go b/poindexter_test.go index e2b3a47..9501e94 100644 --- a/poindexter_test.go +++ b/poindexter_test.go @@ -7,8 +7,8 @@ func TestVersion(t *testing.T) { if version == "" { t.Error("Version should not be empty") } - if version != "0.2.1" { - t.Errorf("Expected version 0.2.1, got %s", version) + if version != "0.3.0" { + t.Errorf("Expected version 0.3.0, got %s", version) } }