Add normalization stats helpers and builders for KDTree

2025-11-03 18:36:09 +00:00 · 2025-11-03 18:36:09 +00:00 · 3ba2b4fce3
commit 3ba2b4fce3
parent f106261216
8 changed files with 432 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,23 +5,32 @@ All notable changes to this project will be documented in this file.
 The format is based on Keep a Changelog and this project adheres to Semantic Versioning.

 ## [Unreleased]
+
+## [0.2.1] - 2025-11-03
 ### Added
+- Normalization stats helpers: `AxisStats`, `NormStats`, `ComputeNormStats2D/3D/4D`.
+- Builders that reuse stats: `Build2DWithStats`, `Build3DWithStats`, `Build4DWithStats`.
+- pkg.go.dev examples: `ExampleBuild2DWithStats`, `ExampleBuild4DWithStats`.
+- Tests for stats parity, min==max safety, and dynamic update with reused stats.
+- Docs: API reference section “KDTree Normalization Stats (reuse across updates)”; updated multi-dimensional docs with WithStats snippet.
+
+### Changed
+- Bumped version to `0.2.1`.
+
+### Previously added in Unreleased
 - README badges (pkg.go.dev, CI, Go Report Card, govulncheck) and KDTree performance/concurrency notes.
 - Examples directory with runnable programs: 1D ping, 2D ping+hop, 3D ping+hop+geo, 4D ping+hop+geo+score.
 - CI workflow (Go 1.22/1.23): tidy check, build, vet, test -race, build examples, govulncheck, golangci-lint.
 - Lint configuration (.golangci.yml) with a pragmatic ruleset.
 - Contributor docs: CONTRIBUTING.md, CODE_OF_CONDUCT.md, SECURITY.md.
 - pkg.go.dev example functions for KDTree usage and helpers.
-- Fuzz tests and benchmarks for KDTree (Nearest/KNearest/Radius and metrics). 
-
-### Changed
-- Documented KDTree complexity and tie-ordering in code comments.
-- Docs: API examples synced to Version 0.2.0; added references to helpers and examples.
+- Fuzz tests and benchmarks for KDTree (Nearest/KNearest/Radius and metrics).

 ## [0.2.0] - 2025-10-??
 ### Added
 - KDTree public API with generic payloads and helper builders (Build2D/3D/4D).
 - Docs pages for DHT examples and multi-dimensional KDTree usage.

-[Unreleased]: https://github.com/Snider/Poindexter/compare/v0.2.0...HEAD
+[Unreleased]: https://github.com/Snider/Poindexter/compare/v0.2.1...HEAD
+[0.2.1]: https://github.com/Snider/Poindexter/releases/tag/v0.2.1
 [0.2.0]: https://github.com/Snider/Poindexter/releases/tag/v0.2.0
--- a/docs/api.md
+++ b/docs/api.md
@ -13,13 +13,13 @@ func Version() string
 Returns the current version of the library.

 **Returns:**
-- `string`: The version string (e.g., "0.2.0")
+- `string`: The version string (e.g., "0.2.1")

 **Example:**

 ```go
 version := poindexter.Version()
-fmt.Println(version) // Output: 0.2.0
+fmt.Println(version) // Output: 0.2.1
 ```

 ---
@ -410,3 +410,85 @@ Construct an empty KDTree with the given dimension, then populate later via `Ins
 - Concurrency: KDTree is not safe for concurrent mutation. Wrap with a mutex or share immutable snapshots for read-mostly workloads.

 See runnable examples in the repository `examples/` and the docs pages for 1D DHT and multi-dimensional KDTree usage.
+
+
+## KDTree Normalization Stats (reuse across updates)
+
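+To keep normalization consistent across dynamic updates, compute the per-axis min/max once over a baseline set and reuse those stats whenever you build points later. This avoids normalization drift, where the same raw value maps to different coordinates because the candidate set (and therefore its min/max) has changed.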
+
+### Types
+
+```go
+// AxisStats holds the min/max observed for a single axis.
+type AxisStats struct {
+    Min float64
+    Max float64
+}
+
+// NormStats holds per-axis normalization stats; for D dims, Stats has length D.
+type NormStats struct {
+    Stats []AxisStats
+}
+```
+
+### Compute normalization stats
+
+```go
+func ComputeNormStats2D[T any](items []T, f1, f2 func(T) float64) NormStats
+func ComputeNormStats3D[T any](items []T, f1, f2, f3 func(T) float64) NormStats
+func ComputeNormStats4D[T any](items []T, f1, f2, f3, f4 func(T) float64) NormStats
+```
+
+### Build with precomputed stats
+
+```go
+func Build2DWithStats[T any](
+    items []T,
+    id func(T) string,
+    f1, f2 func(T) float64,
+    weights [2]float64,
+    invert [2]bool,
+    stats NormStats,
+) ([]KDPoint[T], error)
+
+func Build3DWithStats[T any](
+    items []T,
+    id func(T) string,
+    f1, f2, f3 func(T) float64,
+    weights [3]float64,
+    invert [3]bool,
+    stats NormStats,
+) ([]KDPoint[T], error)
+
+func Build4DWithStats[T any](
+    items []T,
+    id func(T) string,
+    f1, f2, f3, f4 func(T) float64,
+    weights [4]float64,
+    invert [4]bool,
+    stats NormStats,
+) ([]KDPoint[T], error)
+```
+
+#### Example (2D)
+```go
+// Compute stats once over your baseline set
+stats := poindexter.ComputeNormStats2D(peers,
+    func(p Peer) float64 { return p.PingMS },
+    func(p Peer) float64 { return p.Hops },
+)
+
+// Build points using those stats (now or later)
+pts, _ := poindexter.Build2DWithStats(
+    peers,
+    func(p Peer) string { return p.ID },
+    func(p Peer) float64 { return p.PingMS },
+    func(p Peer) float64 { return p.Hops },
+    [2]float64{1,1}, [2]bool{false,false}, stats,
+)
+```
+
+Notes:
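+- If `min==max` for an axis, the normalized value is `0` for that axis.
+- `invert[i]` flips the normalized axis as `1 - n` before applying `weights[i]`.
+- These helpers mirror `Build2D/3D/4D`, but use your provided `NormStats` instead of recomputing from the items slice. They return `nil, nil` (no points and no error) when `items` is empty or when `len(stats.Stats)` does not match the builder's dimension, so check the result before indexing into it.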
--- a/docs/kdtree-multidimensional.md
+++ b/docs/kdtree-multidimensional.md
@ -192,7 +192,26 @@ func main() {

 ## Dynamic updates

-Your routing table changes constantly. Insert/remove peers. For consistent normalization, rebuild points when the candidate set changes (or cache and reuse your min/max stats).
+Your routing table changes constantly. Insert/remove peers. For consistent normalization, compute and reuse your min/max stats (preferred) or rebuild points when the candidate set changes.
+
+Tip: Use the WithStats helpers to reuse normalization across updates:
+
+```go
+// Compute once over your baseline
+stats := poindexter.ComputeNormStats2D(peers,
+    func(p Peer) float64 { return p.PingMS },
+    func(p Peer) float64 { return p.Hops },
+)
+
+// Build now or later using the same stats
+pts, _ := poindexter.Build2DWithStats(
+    peers,
+    func(p Peer) string { return p.ID },
+    func(p Peer) float64 { return p.PingMS },
+    func(p Peer) float64 { return p.Hops },
+    [2]float64{1,1}, [2]bool{false,false}, stats,
+)
+```

 ```go
 package main
--- a/examples_test.go
+++ b/examples_test.go
@ -119,3 +119,47 @@ func ExampleBuild4D() {
 	fmt.Println(tr.Dim())
 	// Output: 4
 }
+
+func ExampleBuild2DWithStats() {
+	type rec struct{ ping, hops float64 }
+	items := []rec{{20, 3}, {30, 2}, {15, 4}}
+	weights := [2]float64{1.0, 1.0}
+	invert := [2]bool{false, false}
+	stats := poindexter.ComputeNormStats2D(items,
+		func(r rec) float64 { return r.ping },
+		func(r rec) float64 { return r.hops },
+	)
+	pts, _ := poindexter.Build2DWithStats(items,
+		func(r rec) string { return "" },
+		func(r rec) float64 { return r.ping },
+		func(r rec) float64 { return r.hops },
+		weights, invert, stats,
+	)
+	tr, _ := poindexter.NewKDTree(pts)
+	fmt.Printf("dim=%d len=%d", tr.Dim(), tr.Len())
+	// Output: dim=2 len=3
+}
+
+func ExampleBuild4DWithStats() {
+	type rec struct{ a, b, c, d float64 }
+	items := []rec{{0, 0, 0, 0}, {1, 1, 1, 1}}
+	weights := [4]float64{1, 1, 1, 1}
+	invert := [4]bool{false, false, false, false}
+	stats := poindexter.ComputeNormStats4D(items,
+		func(r rec) float64 { return r.a },
+		func(r rec) float64 { return r.b },
+		func(r rec) float64 { return r.c },
+		func(r rec) float64 { return r.d },
+	)
+	pts, _ := poindexter.Build4DWithStats(items,
+		func(r rec) string { return "" },
+		func(r rec) float64 { return r.a },
+		func(r rec) float64 { return r.b },
+		func(r rec) float64 { return r.c },
+		func(r rec) float64 { return r.d },
+		weights, invert, stats,
+	)
+	tr, _ := poindexter.NewKDTree(pts)
+	fmt.Println(tr.Dim())
+	// Output: 4
+}
--- a/kdtree_helpers.go
+++ b/kdtree_helpers.go
@ -4,6 +4,18 @@ package poindexter
 // and per-axis weights. These are convenience utilities to make it easy to map domain
 // records into KD space for 2D/3D/4D use-cases.

+// AxisStats holds the min/max observed for a single axis.
+type AxisStats struct {
+	Min float64
+	Max float64
+}
+
+// NormStats holds per-axis normalization statistics.
+// For D dimensions, Stats has length D; Stats[i] is the min/max used to scale axis i.
+type NormStats struct {
+	Stats []AxisStats
+}
+
 // minMax returns (min,max) of a slice.
 func minMax(xs []float64) (float64, float64) {
 	if len(xs) == 0 {
@ -29,6 +41,54 @@ func scale01(v, min, max float64) float64 {
 	return (v - min) / (max - min)
 }

+// ComputeNormStats2D computes per-axis min/max for two features.
+func ComputeNormStats2D[T any](items []T, f1, f2 func(T) float64) NormStats {
+	vals1 := make([]float64, len(items))
+	vals2 := make([]float64, len(items))
+	for i, it := range items {
+		vals1[i] = f1(it)
+		vals2[i] = f2(it)
+	}
+	mn1, mx1 := minMax(vals1)
+	mn2, mx2 := minMax(vals2)
+	return NormStats{Stats: []AxisStats{{mn1, mx1}, {mn2, mx2}}}
+}
+
+// ComputeNormStats3D computes per-axis min/max for three features.
+func ComputeNormStats3D[T any](items []T, f1, f2, f3 func(T) float64) NormStats {
+	vals1 := make([]float64, len(items))
+	vals2 := make([]float64, len(items))
+	vals3 := make([]float64, len(items))
+	for i, it := range items {
+		vals1[i] = f1(it)
+		vals2[i] = f2(it)
+		vals3[i] = f3(it)
+	}
+	mn1, mx1 := minMax(vals1)
+	mn2, mx2 := minMax(vals2)
+	mn3, mx3 := minMax(vals3)
+	return NormStats{Stats: []AxisStats{{mn1, mx1}, {mn2, mx2}, {mn3, mx3}}}
+}
+
+// ComputeNormStats4D computes per-axis min/max for four features.
+func ComputeNormStats4D[T any](items []T, f1, f2, f3, f4 func(T) float64) NormStats {
+	vals1 := make([]float64, len(items))
+	vals2 := make([]float64, len(items))
+	vals3 := make([]float64, len(items))
+	vals4 := make([]float64, len(items))
+	for i, it := range items {
+		vals1[i] = f1(it)
+		vals2[i] = f2(it)
+		vals3[i] = f3(it)
+		vals4[i] = f4(it)
+	}
+	mn1, mx1 := minMax(vals1)
+	mn2, mx2 := minMax(vals2)
+	mn3, mx3 := minMax(vals3)
+	mn4, mx4 := minMax(vals4)
+	return NormStats{Stats: []AxisStats{{mn1, mx1}, {mn2, mx2}, {mn3, mx3}, {mn4, mx4}}}
+}
+
 // Build2D constructs normalized-and-weighted KD points from items using two feature extractors.
 // - id: function to provide a stable string ID (can return "" if you don't need DeleteByID)
 // - f1,f2: feature extractors (raw values)
@ -69,6 +129,33 @@ func Build2D[T any](items []T, id func(T) string, f1, f2 func(T) float64, weight
 	return pts, nil
 }

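+// Build2DWithStats builds weighted 2D points by scaling each feature with the provided normalization stats rather than min/max computed from items; it returns nil, nil when items is empty or stats does not hold exactly 2 axes.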
+func Build2DWithStats[T any](items []T, id func(T) string, f1, f2 func(T) float64, weights [2]float64, invert [2]bool, stats NormStats) ([]KDPoint[T], error) {
+	if len(items) == 0 {
+		return nil, nil
+	}
+	if len(stats.Stats) != 2 {
+		return nil, nil
+	}
+	pts := make([]KDPoint[T], len(items))
+	for i, it := range items {
+		n1 := scale01(f1(it), stats.Stats[0].Min, stats.Stats[0].Max)
+		n2 := scale01(f2(it), stats.Stats[1].Min, stats.Stats[1].Max)
+		if invert[0] {
+			n1 = 1 - n1
+		}
+		if invert[1] {
+			n2 = 1 - n2
+		}
+		pts[i] = KDPoint[T]{
+			ID:     id(it),
+			Value:  it,
+			Coords: []float64{weights[0] * n1, weights[1] * n2},
+		}
+	}
+	return pts, nil
+}
+
 // Build3D constructs normalized-and-weighted KD points using three feature extractors.
 func Build3D[T any](items []T, id func(T) string, f1, f2, f3 func(T) float64, weights [3]float64, invert [3]bool) ([]KDPoint[T], error) {
 	if len(items) == 0 {
@ -113,6 +200,37 @@ func Build3D[T any](items []T, id func(T) string, f1, f2, f3 func(T) float64, we
 	return pts, nil
 }

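+// Build3DWithStats builds weighted 3D points by scaling each feature with the provided normalization stats rather than min/max computed from items; it returns nil, nil when items is empty or stats does not hold exactly 3 axes.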
+func Build3DWithStats[T any](items []T, id func(T) string, f1, f2, f3 func(T) float64, weights [3]float64, invert [3]bool, stats NormStats) ([]KDPoint[T], error) {
+	if len(items) == 0 {
+		return nil, nil
+	}
+	if len(stats.Stats) != 3 {
+		return nil, nil
+	}
+	pts := make([]KDPoint[T], len(items))
+	for i, it := range items {
+		n1 := scale01(f1(it), stats.Stats[0].Min, stats.Stats[0].Max)
+		n2 := scale01(f2(it), stats.Stats[1].Min, stats.Stats[1].Max)
+		n3 := scale01(f3(it), stats.Stats[2].Min, stats.Stats[2].Max)
+		if invert[0] {
+			n1 = 1 - n1
+		}
+		if invert[1] {
+			n2 = 1 - n2
+		}
+		if invert[2] {
+			n3 = 1 - n3
+		}
+		pts[i] = KDPoint[T]{
+			ID:     id(it),
+			Value:  it,
+			Coords: []float64{weights[0] * n1, weights[1] * n2, weights[2] * n3},
+		}
+	}
+	return pts, nil
+}
+
 // Build4D constructs normalized-and-weighted KD points using four feature extractors.
 func Build4D[T any](items []T, id func(T) string, f1, f2, f3, f4 func(T) float64, weights [4]float64, invert [4]bool) ([]KDPoint[T], error) {
 	if len(items) == 0 {
@ -164,3 +282,38 @@ func Build4D[T any](items []T, id func(T) string, f1, f2, f3, f4 func(T) float64
 	}
 	return pts, nil
 }
+
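+// Build4DWithStats builds weighted 4D points by scaling each feature with the provided normalization stats rather than min/max computed from items; it returns nil, nil when items is empty or stats does not hold exactly 4 axes.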
+func Build4DWithStats[T any](items []T, id func(T) string, f1, f2, f3, f4 func(T) float64, weights [4]float64, invert [4]bool, stats NormStats) ([]KDPoint[T], error) {
+	if len(items) == 0 {
+		return nil, nil
+	}
+	if len(stats.Stats) != 4 {
+		return nil, nil
+	}
+	pts := make([]KDPoint[T], len(items))
+	for i, it := range items {
+		n1 := scale01(f1(it), stats.Stats[0].Min, stats.Stats[0].Max)
+		n2 := scale01(f2(it), stats.Stats[1].Min, stats.Stats[1].Max)
+		n3 := scale01(f3(it), stats.Stats[2].Min, stats.Stats[2].Max)
+		n4 := scale01(f4(it), stats.Stats[3].Min, stats.Stats[3].Max)
+		if invert[0] {
+			n1 = 1 - n1
+		}
+		if invert[1] {
+			n2 = 1 - n2
+		}
+		if invert[2] {
+			n3 = 1 - n3
+		}
+		if invert[3] {
+			n4 = 1 - n4
+		}
+		pts[i] = KDPoint[T]{
+			ID:     id(it),
+			Value:  it,
+			Coords: []float64{weights[0] * n1, weights[1] * n2, weights[2] * n3, weights[3] * n4},
+		}
+	}
+	return pts, nil
+}
--- a/kdtree_helpers_test.go
+++ b/kdtree_helpers_test.go
@ -110,3 +110,116 @@ func TestBuild4D_EndToEnd_Example(t *testing.T) {
 		t.Fatalf("expected best B, got %s", best.ID)
 	}
 }
+
+func TestComputeNormStatsAndWithStats_Parity2D(t *testing.T) {
+	type rec struct{ a, b float64 }
+	items := []rec{{0, 10}, {5, 20}, {10, 30}}
+	weights := [2]float64{1, 2}
+	invert := [2]bool{false, true}
+	// Build using automatic stats
+	autoPts, err := Build2D(items,
+		func(r rec) string { return "" },
+		func(r rec) float64 { return r.a },
+		func(r rec) float64 { return r.b },
+		weights, invert,
+	)
+	if err != nil {
+		t.Fatalf("auto build err: %v", err)
+	}
+	// Compute stats and build with stats
+	stats := ComputeNormStats2D(items,
+		func(r rec) float64 { return r.a },
+		func(r rec) float64 { return r.b },
+	)
+	withPts, err := Build2DWithStats(items,
+		func(r rec) string { return "" },
+		func(r rec) float64 { return r.a },
+		func(r rec) float64 { return r.b },
+		weights, invert, stats,
+	)
+	if err != nil {
+		t.Fatalf("with-stats build err: %v", err)
+	}
+	if len(withPts) != len(autoPts) {
+		t.Fatalf("len mismatch")
+	}
+	for i := range withPts {
+		if len(withPts[i].Coords) != 2 {
+			t.Fatalf("dim mismatch")
+		}
+		if withPts[i].Coords[0] != autoPts[i].Coords[0] || withPts[i].Coords[1] != autoPts[i].Coords[1] {
+			t.Fatalf("coords mismatch at %d: %v vs %v", i, withPts[i].Coords, autoPts[i].Coords)
+		}
+	}
+}
+
+func TestBuild3DWithStats_MinEqualsMax_Safe(t *testing.T) {
+	type rec struct{ x, y, z float64 }
+	items := []rec{{1, 2, 3}, {1, 5, 3}, {1, 9, 3}}
+	weights := [3]float64{1, 1, 1}
+	invert := [3]bool{false, false, false}
+	// x and z are constant across items (x=1, z=3), so min==max on those two axes
+	stats := NormStats{Stats: []AxisStats{{Min: 1, Max: 1}, {Min: 2, Max: 9}, {Min: 3, Max: 3}}}
+	pts, err := Build3DWithStats(items,
+		func(r rec) string { return "" },
+		func(r rec) float64 { return r.x },
+		func(r rec) float64 { return r.y },
+		func(r rec) float64 { return r.z },
+		weights, invert, stats,
+	)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	for _, p := range pts {
+		if p.Coords[0] != 0 || p.Coords[2] != 0 {
+			t.Fatalf("expected zero for min==max axes, got %v", p.Coords)
+		}
+	}
+}
+
+func TestBuild4DWithStats_DynamicUpdateExample(t *testing.T) {
+	type Peer struct {
+		ID                     string
+		Ping, Hops, Geo, Score float64
+	}
+	base := []Peer{{"A", 20, 3, 1000, 0.8}, {"B", 30, 2, 800, 0.9}}
+	weights := [4]float64{1, 1, 0.2, 1.2}
+	invert := [4]bool{false, false, false, true}
+	stats := ComputeNormStats4D(base,
+		func(p Peer) float64 { return p.Ping },
+		func(p Peer) float64 { return p.Hops },
+		func(p Peer) float64 { return p.Geo },
+		func(p Peer) float64 { return p.Score },
+	)
+	pts, err := Build4DWithStats(base,
+		func(p Peer) string { return p.ID },
+		func(p Peer) float64 { return p.Ping },
+		func(p Peer) float64 { return p.Hops },
+		func(p Peer) float64 { return p.Geo },
+		func(p Peer) float64 { return p.Score },
+		weights, invert, stats,
+	)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	tr, err := NewKDTree(pts)
+	if err != nil {
+		t.Fatalf("kdt err: %v", err)
+	}
+	// add a new peer using same stats
+	newPeer := Peer{"Z", 15, 2, 1200, 0.85}
+	newPts, _ := Build4DWithStats([]Peer{newPeer},
+		func(p Peer) string { return p.ID },
+		func(p Peer) float64 { return p.Ping },
+		func(p Peer) float64 { return p.Hops },
+		func(p Peer) float64 { return p.Geo },
+		func(p Peer) float64 { return p.Score },
+		weights, invert, stats,
+	)
+	if !tr.Insert(newPts[0]) {
+		t.Fatalf("insert failed")
+	}
+	if tr.Dim() != 4 {
+		t.Fatalf("dim != 4")
+	}
+}
--- a/poindexter.go
+++ b/poindexter.go
@ -3,7 +3,7 @@ package poindexter

 // Version returns the current version of the library.
 func Version() string {
-	return "0.2.0"
+	return "0.2.1"
 }

 // Hello returns a greeting message.
--- a/poindexter_test.go
+++ b/poindexter_test.go
@ -7,8 +7,8 @@ func TestVersion(t *testing.T) {
 	if version == "" {
 		t.Error("Version should not be empty")
 	}
-	if version != "0.2.0" {
-		t.Errorf("Expected version 0.2.0, got %s", version)
+	if version != "0.2.1" {
+		t.Errorf("Expected version 0.2.1, got %s", version)
 	}
 }