Add Cosine and Weighted Cosine distance metrics, enhance KDTree helpers, and update version to 0.3.0

This commit is contained in:
Snider 2025-11-03 18:51:23 +00:00
parent ca6f89a99c
commit 3886724129
11 changed files with 395 additions and 13 deletions

View file

@ -10,6 +10,17 @@ The format is based on Keep a Changelog and this project adheres to Semantic Ver
- Lint: enable `errcheck` in `.golangci.yml` with test-file exclusion to reduce noise.
- CI: enable module cache in `actions/setup-go` to speed up workflows.
## [0.3.0] - 2025-11-03
### Added
- New distance metrics: `CosineDistance` and `WeightedCosineDistance` (1 - cosine similarity), with robust zero-vector handling and bounds.
- N-D normalization helpers: `ComputeNormStatsND`, `BuildND`, `BuildNDWithStats` for arbitrary dimensions, with validation errors (`ErrInvalidFeatures`, `ErrInvalidWeights`, `ErrInvalidInvert`, `ErrStatsDimMismatch`).
- Tests: unit tests for cosine/weighted-cosine metrics; parity tests between `Build4D` and `BuildND`; error-path tests; extended fuzz to include cosine metrics.
- pkg.go.dev examples: `ExampleBuildND`, `ExampleBuildNDWithStats`, `ExampleCosineDistance`.
### Changed
- Version bumped to `0.3.0`.
- README: list Cosine among supported metrics.
## [0.2.1] - 2025-11-03
### Added
- Normalization stats helpers: `AxisStats`, `NormStats`, `ComputeNormStats2D/3D/4D`.

View file

@ -13,7 +13,7 @@ A Go library package providing utility functions including sorting algorithms wi
- 🔢 **Sorting Utilities**: Sort integers, strings, and floats in ascending or descending order
- 🎯 **Custom Sorting**: Sort any type with custom comparison functions or key extractors
- 🔍 **Binary Search**: Fast search on sorted data
- 🧭 **KDTree (NN Search)**: Build a KDTree over points with generic payloads; nearest, k-NN, and radius queries with Euclidean or Manhattan metrics
- 🧭 **KDTree (NN Search)**: Build a KDTree over points with generic payloads; nearest, k-NN, and radius queries with Euclidean, Manhattan, Chebyshev, and Cosine metrics
- 📦 **Generic Functions**: Type-safe operations using Go generics
- ✅ **Well-Tested**: Comprehensive test coverage
- 📖 **Documentation**: Full documentation available at GitHub Pages

5
doc.go
View file

@ -1,4 +1,7 @@
// Package poindexter provides sorting utilities and a KDTree with simple
// nearest-neighbour queries. It also includes helper functions to build
// normalised, weighted KD points for 2D/3D/4D use-cases.
// normalised, weighted KD points for 2D/3D/4D and arbitrary ND use-cases.
//
// Distance metrics include Euclidean (L2), Manhattan (L1), Chebyshev (L∞), and
// Cosine/Weighted-Cosine for vector similarity.
package poindexter

View file

@ -13,13 +13,13 @@ func Version() string
Returns the current version of the library.
**Returns:**
- `string`: The version string (e.g., "0.2.1")
- `string`: The version string (e.g., "0.3.0")
**Example:**
```go
version := poindexter.Version()
fmt.Println(version) // Output: 0.2.1
fmt.Println(version) // Output: 0.3.0
```
---

View file

@ -140,6 +140,46 @@ func ExampleBuild2DWithStats() {
// Output: dim=2 len=3
}
// ExampleBuildND builds weighted points from three features per record with
// BuildND and indexes them in a KDTree.
func ExampleBuildND() {
	type sample struct{ x, y, z float64 }

	// One extractor per axis; the slice order fixes the axis order.
	extract := []func(sample) float64{
		func(s sample) float64 { return s.x },
		func(s sample) float64 { return s.y },
		func(s sample) float64 { return s.z },
	}
	noID := func(sample) string { return "" }
	data := []sample{{0, 0, 0}, {1, 2, 3}, {0.5, 1, 1.5}}

	// Errors are discarded to keep the example short.
	points, _ := poindexter.BuildND(data, noID, extract, []float64{1, 0.5, 2}, []bool{false, false, false})
	tree, _ := poindexter.NewKDTree(points)
	fmt.Printf("dim=%d len=%d", tree.Dim(), tree.Len())
	// Output: dim=3 len=3
}
// ExampleBuildNDWithStats computes normalisation stats once with
// ComputeNormStatsND, reuses them in BuildNDWithStats, and indexes the points
// in a KDTree configured with the cosine metric.
func ExampleBuildNDWithStats() {
	type sample struct{ x, y float64 }

	extract := []func(sample) float64{
		func(s sample) float64 { return s.x },
		func(s sample) float64 { return s.y },
	}
	noID := func(sample) string { return "" }
	data := []sample{{0, 0}, {1, 2}, {0.5, 1}}

	// Errors are discarded to keep the example short.
	norm, _ := poindexter.ComputeNormStatsND(data, extract)
	points, _ := poindexter.BuildNDWithStats(data, noID, extract, []float64{1, 0.5}, []bool{false, false}, norm)

	cosine := poindexter.WithMetric(poindexter.CosineDistance{})
	tree, _ := poindexter.NewKDTree(points, cosine)
	fmt.Printf("dim=%d len=%d", tree.Dim(), tree.Len())
	// Output: dim=2 len=3
}
// ExampleCosineDistance prints the cosine distance between two orthogonal
// unit vectors.
func ExampleCosineDistance() {
	metric := poindexter.CosineDistance{}
	fmt.Printf("%.0f", metric.Distance([]float64{1, 0}, []float64{0, 1}))
	// Output: 1
}
func ExampleBuild4DWithStats() {
type rec struct{ a, b, c, d float64 }
items := []rec{{0, 0, 0, 0}, {1, 1, 1, 1}}

View file

@ -67,8 +67,17 @@ func FuzzMetrics_NoNegative(f *testing.F) {
m1 := EuclideanDistance{}.Distance(a, b)
m2 := ManhattanDistance{}.Distance(a, b)
m3 := ChebyshevDistance{}.Distance(a, b)
if m1 < 0 || m2 < 0 || m3 < 0 {
t.Fatalf("negative metric: %v %v %v", m1, m2, m3)
m4 := CosineDistance{}.Distance(a, b)
w := make([]float64, dim)
for i := range w {
w[i] = 1
}
m5 := WeightedCosineDistance{Weights: w}.Distance(a, b)
if m1 < 0 || m2 < 0 || m3 < 0 || m4 < 0 || m5 < 0 {
t.Fatalf("negative metric: %v %v %v %v %v", m1, m2, m3, m4, m5)
}
if m4 > 2 || m5 > 2 {
t.Fatalf("cosine distance out of bounds: %v %v", m4, m5)
}
})
}

View file

@ -75,6 +75,97 @@ func (ChebyshevDistance) Distance(a, b []float64) float64 {
return max
}
// CosineDistance implements 1 - cosine similarity.
//
// Distance is defined as 1 - (a·b)/(||a||*||b||). If both vectors are zero,
// distance is 0. If exactly one is zero, distance is 1. The cosine is clamped
// to [-1,1] before it is subtracted from 1, so the result lies in [0,2]
// whenever the cosine is a number; inputs containing NaN or Inf, or sums that
// overflow, can still produce NaN.
//
// Note: for typical normalised/weighted feature vectors with non-negative
// entries the value will be in [0,1]. Opposite vectors can yield up to 2.
//
// If a and b differ in length, only their leading min(len(a), len(b))
// components are compared.
type CosineDistance struct{}

// Distance returns the cosine distance between a and b.
func (CosineDistance) Distance(a, b []float64) float64 {
	// Bound the loop by the shorter slice so a length mismatch cannot index
	// out of range.
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	var dot, na2, nb2 float64
	for i := 0; i < n; i++ {
		ai, bi := a[i], b[i]
		dot += ai * bi
		na2 += ai * ai
		nb2 += bi * bi
	}
	return cosineFromSums(dot, na2, nb2)
}

// WeightedCosineDistance implements 1 - weighted cosine similarity, where
// weights scale each axis in both the dot product and the norms.
//
// If Weights is empty, or its length differs from len(a), or len(a) differs
// from len(b), Distance falls back to the unweighted CosineDistance.
//
// Weights are expected to be non-negative: a negative weight can make a
// weighted squared norm negative, and its square root is NaN.
type WeightedCosineDistance struct{ Weights []float64 }

// Distance returns the weighted cosine distance between a and b.
func (wcd WeightedCosineDistance) Distance(a, b []float64) float64 {
	w := wcd.Weights
	if len(w) == 0 || len(w) != len(a) || len(a) != len(b) {
		// Fallback to unweighted cosine when lengths mismatch or weights missing.
		return CosineDistance{}.Distance(a, b)
	}
	var dot, na2, nb2 float64
	for i, wi := range w {
		ai, bi := a[i], b[i]
		v := wi * ai
		dot += v * bi         // wi*ai*bi
		na2 += v * ai         // wi*ai*ai
		nb2 += (wi * bi) * bi // wi*bi*bi
	}
	return cosineFromSums(dot, na2, nb2)
}

// cosineFromSums turns an accumulated dot product and the two squared norms
// into a cosine distance, applying the zero-vector conventions and clamping
// the cosine to [-1,1].
func cosineFromSums(dot, na2, nb2 float64) float64 {
	switch {
	case na2 == 0 && nb2 == 0:
		return 0
	case na2 == 0 || nb2 == 0:
		return 1
	}
	// Take one square root of the product rather than multiplying two roots:
	// for identical vectors dot == na2 == nb2, and sqrt(s*s) recovers s
	// exactly, so the distance is exactly 0. sqrt(s)*sqrt(s) can be off by an
	// ulp (e.g. s = 2), which used to leak through as a tiny non-zero distance.
	den := math.Sqrt(na2 * nb2)
	if den == 0 || math.IsInf(den, 0) {
		// The product under- or overflowed; multiply the roots instead.
		den = math.Sqrt(na2) * math.Sqrt(nb2)
	}
	cos := dot / den
	if cos > 1 {
		cos = 1
	} else if cos < -1 {
		cos = -1
	}
	// With cos in [-1,1] the difference is already within [0,2]; no further
	// clamping is needed.
	return 1 - cos
}
// KDOption configures KDTree construction (non-generic to allow inference).
type KDOption func(*kdOptions)

View file

@ -1,9 +1,23 @@
package poindexter
// Helper builders for KDTree points with min-max normalization, optional inversion per-axis,
import "errors"
// Helper builders for KDTree points with min-max normalisation, optional inversion per-axis,
// and per-axis weights. These are convenience utilities to make it easy to map domain
// records into KD space for 2D/3D/4D and arbitrary N-D use-cases.
// Sentinel errors returned by the helper builders.
var (
	// ErrInvalidFeatures is returned when no features are provided or a
	// feature extractor is nil.
	ErrInvalidFeatures = errors.New("kdtree: invalid features: provide at least one feature and ensure none are nil")
	// ErrInvalidWeights is returned when the number of weights differs from
	// the number of features.
	ErrInvalidWeights = errors.New("kdtree: invalid weights length; must match number of features")
	// ErrInvalidInvert is returned when the number of invert flags differs
	// from the number of features.
	ErrInvalidInvert = errors.New("kdtree: invalid invert length; must match number of features")
	// ErrStatsDimMismatch is returned when the number of axes in a NormStats
	// differs from the number of features.
	ErrStatsDimMismatch = errors.New("kdtree: stats dimensionality mismatch")
)
// AxisStats holds the min/max observed for a single axis.
type AxisStats struct {
Min float64
@ -16,6 +30,103 @@ type NormStats struct {
Stats []AxisStats
}
// ComputeNormStatsND computes the per-axis minimum and maximum of every
// feature over items, for an arbitrary number of features.
//
// It returns ErrInvalidFeatures when features is empty or contains a nil
// extractor. The features are validated before items is inspected, so the
// outcome does not depend on whether any items were supplied. For empty items
// the returned NormStats holds one zero-valued AxisStats per feature.
func ComputeNormStatsND[T any](items []T, features []func(T) float64) (NormStats, error) {
	if len(features) == 0 {
		return NormStats{}, ErrInvalidFeatures
	}
	for _, f := range features {
		if f == nil {
			return NormStats{}, ErrInvalidFeatures
		}
	}

	stats := make([]AxisStats, len(features))
	if len(items) == 0 {
		// Empty items → zero stats slice of the correct dimensionality.
		return NormStats{Stats: stats}, nil
	}

	// Seed every axis with the first item so min and max start at a real value.
	for i, f := range features {
		v := f(items[0])
		stats[i] = AxisStats{Min: v, Max: v}
	}
	// Widen the ranges with the remaining items.
	for _, it := range items[1:] {
		for i, f := range features {
			v := f(it)
			if v < stats[i].Min {
				stats[i].Min = v
			}
			if v > stats[i].Max {
				stats[i].Max = v
			}
		}
	}
	return NormStats{Stats: stats}, nil
}
// BuildND constructs normalised-and-weighted KD points from an arbitrary number
// of features. The normalisation stats are computed over the provided items
// with ComputeNormStatsND and the points are then built by BuildNDWithStats.
//
// Empty items yield (nil, nil). Otherwise ErrInvalidFeatures is returned when
// features is empty, ErrInvalidWeights when len(weights) != len(features), and
// ErrInvalidInvert when len(invert) != len(features).
func BuildND[T any](items []T, id func(T) string, features []func(T) float64, weights []float64, invert []bool) ([]KDPoint[T], error) {
	if len(items) == 0 {
		return nil, nil
	}

	dims := len(features)
	switch {
	case dims == 0:
		return nil, ErrInvalidFeatures
	case len(weights) != dims:
		return nil, ErrInvalidWeights
	case len(invert) != dims:
		return nil, ErrInvalidInvert
	}

	norm, err := ComputeNormStatsND(items, features)
	if err != nil {
		return nil, err
	}
	return BuildNDWithStats(items, id, features, weights, invert, norm)
}
// BuildNDWithStats builds KD points using the provided normalisation stats
// instead of computing them from items.
//
// For each item and axis d, the raw feature value is passed through scale01
// with stats.Stats[d].Min and stats.Stats[d].Max, replaced by 1 - n when
// invert[d] is set, and multiplied by weights[d]. The point ID is id(item), or
// "" when id is nil.
//
// Empty items yield (nil, nil). Otherwise it returns ErrInvalidFeatures when
// features is empty or contains a nil extractor, ErrInvalidWeights or
// ErrInvalidInvert when the corresponding slice length differs from
// len(features), and ErrStatsDimMismatch when len(stats.Stats) does.
func BuildNDWithStats[T any](items []T, id func(T) string, features []func(T) float64, weights []float64, invert []bool, stats NormStats) ([]KDPoint[T], error) {
	if len(items) == 0 {
		return nil, nil
	}

	dims := len(features)
	switch {
	case dims == 0:
		return nil, ErrInvalidFeatures
	case len(weights) != dims:
		return nil, ErrInvalidWeights
	case len(invert) != dims:
		return nil, ErrInvalidInvert
	case len(stats.Stats) != dims:
		return nil, ErrStatsDimMismatch
	}
	// Validate the extractors once up front rather than per item and axis, so
	// no extractor is invoked when the feature set is invalid.
	for _, f := range features {
		if f == nil {
			return nil, ErrInvalidFeatures
		}
	}

	pts := make([]KDPoint[T], len(items))
	for i, it := range items {
		coords := make([]float64, dims)
		for d, f := range features {
			axis := stats.Stats[d]
			n := scale01(f(it), axis.Min, axis.Max)
			if invert[d] {
				n = 1 - n
			}
			coords[d] = weights[d] * n
		}
		var pid string
		if id != nil {
			pid = id(it)
		}
		pts[i] = KDPoint[T]{ID: pid, Value: it, Coords: coords}
	}
	return pts, nil
}
// minMax returns (min,max) of a slice.
func minMax(xs []float64) (float64, float64) {
if len(xs) == 0 {
@ -89,7 +200,7 @@ func ComputeNormStats4D[T any](items []T, f1, f2, f3, f4 func(T) float64) NormSt
return NormStats{Stats: []AxisStats{{mn1, mx1}, {mn2, mx2}, {mn3, mx3}, {mn4, mx4}}}
}
// Build2D constructs normalized-and-weighted KD points from items using two feature extractors.
// Build2D constructs normalised-and-weighted KD points from items using two feature extractors.
// - id: function to provide a stable string ID (can return "" if you don't need DeleteByID)
// - f1,f2: feature extractors (raw values)
// - weights: per-axis weights applied after normalization
@ -156,7 +267,7 @@ func Build2DWithStats[T any](items []T, id func(T) string, f1, f2 func(T) float6
return pts, nil
}
// Build3D constructs normalized-and-weighted KD points using three feature extractors.
// Build3D constructs normalised-and-weighted KD points using three feature extractors.
func Build3D[T any](items []T, id func(T) string, f1, f2, f3 func(T) float64, weights [3]float64, invert [3]bool) ([]KDPoint[T], error) {
if len(items) == 0 {
return nil, nil
@ -231,7 +342,7 @@ func Build3DWithStats[T any](items []T, id func(T) string, f1, f2, f3 func(T) fl
return pts, nil
}
// Build4D constructs normalized-and-weighted KD points using four feature extractors.
// Build4D constructs normalised-and-weighted KD points using four feature extractors.
func Build4D[T any](items []T, id func(T) string, f1, f2, f3, f4 func(T) float64, weights [4]float64, invert [4]bool) ([]KDPoint[T], error) {
if len(items) == 0 {
return nil, nil

117
kdtree_nd_test.go Normal file
View file

@ -0,0 +1,117 @@
package poindexter
import (
"fmt"
"testing"
)
// TestCosineDistance_Basics checks CosineDistance on identical, orthogonal,
// opposite, and zero vectors.
func TestCosineDistance_Basics(t *testing.T) {
	metric := CosineDistance{}

	// A case is either an exact expectation (exact/want) or a tolerance
	// window (lo/hi).
	cases := []struct {
		a, b   []float64
		exact  bool
		want   float64
		lo, hi float64
		format string
	}{
		// identical vectors → distance 0
		{a: []float64{1, 0, 0}, b: []float64{1, 0, 0}, exact: true, want: 0, format: "expected 0, got %v"},
		// orthogonal → distance 1
		{a: []float64{1, 0, 0}, b: []float64{0, 1, 0}, lo: 0.999, hi: 1.001, format: "expected ~1, got %v"},
		// opposite → distance 2
		{a: []float64{1, 0}, b: []float64{-1, 0}, lo: 1.999, hi: 2.001, format: "expected ~2, got %v"},
		// zero vectors
		{a: []float64{0, 0}, b: []float64{0, 0}, exact: true, want: 0, format: "both zero → 0, got %v"},
		// one zero
		{a: []float64{0, 0}, b: []float64{1, 2}, exact: true, want: 1, format: "one zero → 1, got %v"},
	}

	for _, c := range cases {
		got := metric.Distance(c.a, c.b)
		failed := got < c.lo || got > c.hi
		if c.exact {
			failed = got != c.want
		}
		if failed {
			t.Fatalf(c.format, got)
		}
	}
}
// TestWeightedCosineDistance_Basics checks WeightedCosineDistance on an
// identical pair and on an axis-aligned orthogonal pair.
func TestWeightedCosineDistance_Basics(t *testing.T) {
	metric := WeightedCosineDistance{Weights: []float64{2, 0.5}}
	unitX := []float64{1, 0}

	if got := metric.Distance(unitX, []float64{1, 0}); got != 0 {
		t.Fatalf("expected 0, got %v", got)
	}

	// orthogonal remains ~1 regardless of weights for these axes
	if got := metric.Distance(unitX, []float64{0, 3}); got < 0.999 || got > 1.001 {
		t.Fatalf("expected ~1, got %v", got)
	}
}
// TestBuildND_ParityWithBuild4D verifies that BuildND with four features
// produces the same coordinates (to six decimals) as Build4D given the same
// extractors, weights, and invert flags.
func TestBuildND_ParityWithBuild4D(t *testing.T) {
	type rec struct{ a, b, c, d float64 }
	items := []rec{{0, 10, 100, 1}, {10, 20, 200, 2}, {5, 15, 150, 1.5}}
	weights4 := [4]float64{1.0, 0.5, 2.0, 1.0}
	invert4 := [4]bool{false, true, false, true}

	noID := func(rec) string { return "" }
	features := []func(rec) float64{
		func(r rec) float64 { return r.a },
		func(r rec) float64 { return r.b },
		func(r rec) float64 { return r.c },
		func(r rec) float64 { return r.d },
	}

	pts4, err := Build4D(items, noID, features[0], features[1], features[2], features[3], weights4, invert4)
	if err != nil {
		t.Fatalf("build4d err: %v", err)
	}

	// Slicing the arrays hands BuildND the very same weights and flags.
	ptsN, err := BuildND(items, noID, features, weights4[:], invert4[:])
	if err != nil {
		t.Fatalf("buildND err: %v", err)
	}

	if len(ptsN) != len(pts4) {
		t.Fatalf("len mismatch")
	}
	for i, pn := range ptsN {
		if len(pn.Coords) != 4 {
			t.Fatalf("dim != 4")
		}
		for d, cn := range pn.Coords {
			c4 := pts4[i].Coords[d]
			if fmt.Sprintf("%.6f", cn) != fmt.Sprintf("%.6f", c4) {
				t.Fatalf("coords mismatch at i=%d d=%d: %v vs %v", i, d, pn.Coords, pts4[i].Coords)
			}
		}
	}
}
// TestBuildNDWithStats_Errors verifies that BuildNDWithStats rejects stats
// whose number of axes differs from the number of features.
func TestBuildNDWithStats_Errors(t *testing.T) {
	type rec struct{ x float64 }

	// One feature but two axes of stats → stats dim mismatch.
	twoAxisStats := NormStats{Stats: []AxisStats{{Min: 0, Max: 1}, {Min: 0, Max: 1}}}

	_, err := BuildNDWithStats(
		[]rec{{1}, {2}},
		func(rec) string { return "" },
		[]func(rec) float64{func(r rec) float64 { return r.x }},
		[]float64{1},
		[]bool{false},
		twoAxisStats,
	)
	if err == nil {
		t.Fatalf("expected error for stats dim mismatch")
	}
}

View file

@ -3,7 +3,7 @@ package poindexter
// Version returns the current version of the library as a semantic version
// string without a leading "v".
func Version() string {
	return "0.3.0"
}
// Hello returns a greeting message.

View file

@ -7,8 +7,8 @@ func TestVersion(t *testing.T) {
if version == "" {
t.Error("Version should not be empty")
}
if version != "0.2.1" {
t.Errorf("Expected version 0.2.1, got %s", version)
if version != "0.3.0" {
t.Errorf("Expected version 0.3.0, got %s", version)
}
}