bench(metal): add 29 benchmarks baselined on M3 Ultra

MatMul (128² to 4096², token projection), Softmax, element-wise ops, fused Metal kernels (RMSNorm, LayerNorm, RoPE, SDPA), Linear, Embedding, reductions, and full sampler chain. CGO floor ~170μs. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 20:47:25 +00:00 · 2026-02-19 20:47:25 +00:00 · ff01175a62
commit ff01175a62
parent 7435648f66
3 changed files with 419 additions and 1 deletions
--- a/FINDINGS.md
+++ b/FINDINGS.md
@ -342,3 +342,76 @@ Was a stub. Now masks tokens whose probability is below `min_p * max_prob`. Uses
 - 165 internal/metal tests — all pass
 - 11 root integration tests — all pass
 - Total: 176 tests passing
+
+---
+
+## 2026-02-19: Benchmark Baseline — M3 Ultra
+
+29 benchmarks in `internal/metal/bench_test.go`. All times in ns/op, measured with `go test -bench=. -benchtime=2s`.
+
+### Matrix Multiply
+
+| Shape | ns/op | Notes |
+|-------|------:|-------|
+| 128×128 | 194,467 | CGO overhead dominates at small sizes |
+| 512×512 | 255,288 | GPU starting to amortise |
+| 1024×1024 | 474,900 | Sweet spot for Metal throughput |
+| 2048×2048 | 4,173,797 | ~4ms — good for decode step |
+| 4096×4096 | 10,715,051 | ~10.7ms — large context attention |
+| 1×2048 → 32000 (token proj) | 626,087 | Output projection per token |
+
+### Fused Metal Kernels
+
+| Operation | Shape | ns/op |
+|-----------|-------|------:|
+| RMSNorm | 1×2048 | 156,696 |
+| RMSNorm | 32×2048 | 225,164 |
+| LayerNorm | 32×2048 | 184,514 |
+| RoPE | 1×1×32×128 (decode) | 176,605 |
+| RoPE | 1×32×512×128 (prefill) | 1,443,803 |
+| SDPA causal | 1 head, seq=32 | 200,926 |
+| SDPA causal | 32 heads, seq=128 | 515,477 |
+| SDPA causal | 32 heads, seq=512 | 1,815,073 |
+
+### Softmax & Reductions
+
+| Operation | Shape | ns/op |
+|-----------|-------|------:|
+| Softmax | 1×1024 | 173,811 |
+| Softmax | 32×32000 | 948,660 |
+| Softmax | 1×128000 | 270,022 |
+| Sum | 1M elements | 175,204 |
+| Argmax | 1×32000 | 171,327 |
+
+### Element-wise (1M elements)
+
+| Operation | ns/op |
+|-----------|------:|
+| Add | 651,687 |
+| Mul | 394,941 |
+| SiLU | 1,192,843 |
+
+### Layers
+
+| Operation | Shape | ns/op |
+|-----------|-------|------:|
+| Linear | 1×2048 → 2048 | 181,417 |
+| Linear | 32×2048 → 8192 | 471,038 |
+| Embedding | 32 tokens, 32K vocab, 2048 dim | 219,154 |
+
+### Sampling (vocab=32000)
+
+| Strategy | ns/op |
+|----------|------:|
+| Greedy (argmax) | 172,698 |
+| TopK=50, temp=1.0 | 542,635 |
+| TopP=0.9, temp=1.0 | 713,538 |
+| Full (TopP+MinP+TopK) | 731,118 |
+
+### Key Observations
+
+1. **CGO floor ~170μs**: All operations have a ~170μs minimum (greedy sample, RMSNorm single row, Sum 1M). This is the CGO call + Metal command buffer overhead.
+2. **MatMul scales well**: 128² → 4096² is only ~55× slower for 1024× more work, showing good GPU utilisation.
+3. **SDPA efficient**: 32-head seq=512 attention at 1.8ms is practical for real-time inference.
+4. **Sampling overhead**: Full chain (TopP+MinP+TopK) adds ~560μs over greedy — acceptable per token.
+5. **Linear layer**: Single-token forward through 2048→2048 at 181μs suggests ~5500 layers/sec ceiling for per-token decode.
--- a/TODO.md
+++ b/TODO.md
@ -9,7 +9,7 @@ Dispatched from core/go orchestration. Pick up tasks in order.
 - [x] **Verify go generate → test round-trip** — ✅ 29/29 tests pass. CMake 3.24+, AppleClang 17.0.0, macOS SDK 26.2. Build takes ~2min on M3 Ultra.
 - [x] **Add missing tests for core operations** — ✅ 86 new tests across 4 files: array_test.go (25), ops_test.go (44), nn_test.go (8), fast_test.go (9). Covers: all scalar/array creation, shape ops, element-wise arithmetic, math functions, matrix ops, reductions, indexing, slicing, fused kernels (RMSNorm, LayerNorm, RoPE, SDPA), Linear, Embedding, RepeatKV. Found non-contiguous view bug in Floats()/DataInt32() — see FINDINGS.md.
 - [x] **Add missing tests for model/tokenizer/sample/cache** — ✅ 33 new tests: cache_test.go (10: KVCache + RotatingKVCache lifecycle, update, bounded, reset), sample_test.go (8: greedy, temperature, topK, chain, stub pass-through), tokenizer_test.go (15: Load/error, BOS/EOS, encode/decode, DecodeToken, SentencePiece space, GPT-2 byte maps). model/ still needs tests (requires model files on disk).
- [ ] **Benchmark suite** — No benchmarks exist. Add: MatMul (various sizes), Softmax, model.Forward (single token), tokenizer.Encode/Decode, full Generate (tokens/sec). Baseline on M3 Ultra.
+- [x] **Benchmark suite** — ✅ 29 benchmarks in bench_test.go. Covers: MatMul (128² to 4096², token-shaped 1×2048→32000), Softmax (1K to 128K vocab), element-wise (Add, Mul, SiLU at 1M elements), fused kernels (RMSNorm, LayerNorm, RoPE, SDPA at various shapes), Linear, Embedding, reductions (Sum, Argmax), and full sampler chain (greedy, TopK, TopP, combined). Baselined on M3 Ultra. model.Forward and tokenizer benchmarks deferred to Phase 2 (require model files on disk).

 ## Phase 2: Model Support

--- a/internal/metal/bench_test.go
+++ b/internal/metal/bench_test.go
@ -0,0 +1,345 @@
+//go:build darwin && arm64
+
+package metal
+
+import (
+	"math"
+	"testing"
+)
+
+// --- Helpers ---
+
+// randomMatrix creates a random float32 matrix of the given shape.
+func randomMatrix(rows, cols int32) *Array {
+	return RandomUniform(0, 1, []int32{rows, cols}, DTypeFloat32)
+}
+
+// randomVector creates a random float32 vector.
+func randomVector(n int32) *Array {
+	return RandomUniform(0, 1, []int32{n}, DTypeFloat32)
+}
+
+// random4D creates a random float32 4D tensor [B, H, L, D].
+func random4D(b, h, l, d int32) *Array {
+	return RandomUniform(0, 1, []int32{b, h, l, d}, DTypeFloat32)
+}
+
+// --- MatMul benchmarks (various sizes) ---
+
+func BenchmarkMatMul_128x128(b *testing.B) {
+	a := randomMatrix(128, 128)
+	w := randomMatrix(128, 128)
+	Materialize(a, w)
+	for b.Loop() {
+		c := Matmul(a, w)
+		Materialize(c)
+	}
+}
+
+func BenchmarkMatMul_512x512(b *testing.B) {
+	a := randomMatrix(512, 512)
+	w := randomMatrix(512, 512)
+	Materialize(a, w)
+	for b.Loop() {
+		c := Matmul(a, w)
+		Materialize(c)
+	}
+}
+
+func BenchmarkMatMul_1024x1024(b *testing.B) {
+	a := randomMatrix(1024, 1024)
+	w := randomMatrix(1024, 1024)
+	Materialize(a, w)
+	for b.Loop() {
+		c := Matmul(a, w)
+		Materialize(c)
+	}
+}
+
+func BenchmarkMatMul_2048x2048(b *testing.B) {
+	a := randomMatrix(2048, 2048)
+	w := randomMatrix(2048, 2048)
+	Materialize(a, w)
+	for b.Loop() {
+		c := Matmul(a, w)
+		Materialize(c)
+	}
+}
+
+func BenchmarkMatMul_4096x4096(b *testing.B) {
+	a := randomMatrix(4096, 4096)
+	w := randomMatrix(4096, 4096)
+	Materialize(a, w)
+	for b.Loop() {
+		c := Matmul(a, w)
+		Materialize(c)
+	}
+}
+
+// Token-shaped matmul: [1, D] x [D, V] — single-token forward through output projection.
+func BenchmarkMatMul_1x2048_x_2048x32000(b *testing.B) {
+	x := randomMatrix(1, 2048)
+	w := randomMatrix(2048, 32000)
+	Materialize(x, w)
+	for b.Loop() {
+		c := Matmul(x, w)
+		Materialize(c)
+	}
+}
+
+// --- Softmax benchmarks ---
+
+func BenchmarkSoftmax_1x1024(b *testing.B) {
+	x := randomMatrix(1, 1024)
+	Materialize(x)
+	for b.Loop() {
+		y := Softmax(x)
+		Materialize(y)
+	}
+}
+
+func BenchmarkSoftmax_32x32000(b *testing.B) {
+	x := randomMatrix(32, 32000)
+	Materialize(x)
+	for b.Loop() {
+		y := Softmax(x)
+		Materialize(y)
+	}
+}
+
+func BenchmarkSoftmax_1x128000(b *testing.B) {
+	x := randomMatrix(1, 128000)
+	Materialize(x)
+	for b.Loop() {
+		y := Softmax(x)
+		Materialize(y)
+	}
+}
+
+// --- Element-wise arithmetic ---
+
+func BenchmarkAdd_1M(b *testing.B) {
+	a := RandomUniform(0, 1, []int32{1000000}, DTypeFloat32)
+	c := RandomUniform(0, 1, []int32{1000000}, DTypeFloat32)
+	Materialize(a, c)
+	for b.Loop() {
+		y := Add(a, c)
+		Materialize(y)
+	}
+}
+
+func BenchmarkMul_1M(b *testing.B) {
+	a := RandomUniform(0, 1, []int32{1000000}, DTypeFloat32)
+	c := RandomUniform(0, 1, []int32{1000000}, DTypeFloat32)
+	Materialize(a, c)
+	for b.Loop() {
+		y := Mul(a, c)
+		Materialize(y)
+	}
+}
+
+func BenchmarkSiLU_1M(b *testing.B) {
+	a := RandomUniform(-3, 3, []int32{1000000}, DTypeFloat32)
+	Materialize(a)
+	for b.Loop() {
+		y := SiLU(a)
+		Materialize(y)
+	}
+}
+
+// --- Fused Metal kernels ---
+
+func BenchmarkRMSNorm_1x2048(b *testing.B) {
+	x := randomMatrix(1, 2048)
+	w := randomVector(2048)
+	Materialize(x, w)
+	for b.Loop() {
+		y := RMSNorm(x, w, 1e-5)
+		Materialize(y)
+	}
+}
+
+func BenchmarkRMSNorm_32x2048(b *testing.B) {
+	x := randomMatrix(32, 2048)
+	w := randomVector(2048)
+	Materialize(x, w)
+	for b.Loop() {
+		y := RMSNorm(x, w, 1e-5)
+		Materialize(y)
+	}
+}
+
+func BenchmarkLayerNorm_32x2048(b *testing.B) {
+	x := randomMatrix(32, 2048)
+	w := randomVector(2048)
+	bias := randomVector(2048)
+	Materialize(x, w, bias)
+	for b.Loop() {
+		y := LayerNorm(x, w, bias, 1e-5)
+		Materialize(y)
+	}
+}
+
+func BenchmarkRoPE_1x1x32x128(b *testing.B) {
+	// Single head, 32 positions, 128 dims — typical decode step shape.
+	x := random4D(1, 1, 32, 128)
+	Materialize(x)
+	for b.Loop() {
+		y := RoPE(x, 128, false, 10000.0, 1.0, 0)
+		Materialize(y)
+	}
+}
+
+func BenchmarkRoPE_1x32x512x128(b *testing.B) {
+	// 32 heads, 512 positions — typical prefill shape.
+	x := random4D(1, 32, 512, 128)
+	Materialize(x)
+	for b.Loop() {
+		y := RoPE(x, 128, false, 10000.0, 1.0, 0)
+		Materialize(y)
+	}
+}
+
+// --- Scaled Dot-Product Attention ---
+
+func BenchmarkSDPA_1head_seq32(b *testing.B) {
+	scale := float32(1.0 / math.Sqrt(128.0))
+	q := random4D(1, 1, 32, 128)
+	k := random4D(1, 1, 32, 128)
+	v := random4D(1, 1, 32, 128)
+	Materialize(q, k, v)
+	for b.Loop() {
+		y := ScaledDotProductAttention(q, k, v, scale, true)
+		Materialize(y)
+	}
+}
+
+func BenchmarkSDPA_32head_seq128(b *testing.B) {
+	scale := float32(1.0 / math.Sqrt(128.0))
+	q := random4D(1, 32, 128, 128)
+	k := random4D(1, 32, 128, 128)
+	v := random4D(1, 32, 128, 128)
+	Materialize(q, k, v)
+	for b.Loop() {
+		y := ScaledDotProductAttention(q, k, v, scale, true)
+		Materialize(y)
+	}
+}
+
+func BenchmarkSDPA_32head_seq512(b *testing.B) {
+	scale := float32(1.0 / math.Sqrt(128.0))
+	q := random4D(1, 32, 512, 128)
+	k := random4D(1, 32, 512, 128)
+	v := random4D(1, 32, 512, 128)
+	Materialize(q, k, v)
+	for b.Loop() {
+		y := ScaledDotProductAttention(q, k, v, scale, true)
+		Materialize(y)
+	}
+}
+
+// --- Neural network layers ---
+
+func BenchmarkLinear_1x2048_to_2048(b *testing.B) {
+	w := randomMatrix(2048, 2048)
+	Materialize(w)
+	layer := NewLinear(w, nil)
+	x := randomMatrix(1, 2048)
+	Materialize(x)
+	for b.Loop() {
+		y := layer.Forward(x)
+		Materialize(y)
+	}
+}
+
+func BenchmarkLinear_32x2048_to_8192(b *testing.B) {
+	w := randomMatrix(8192, 2048)
+	Materialize(w)
+	layer := NewLinear(w, nil)
+	x := randomMatrix(32, 2048)
+	Materialize(x)
+	for b.Loop() {
+		y := layer.Forward(x)
+		Materialize(y)
+	}
+}
+
+func BenchmarkEmbedding_32tokens_vocab32000_dim2048(b *testing.B) {
+	w := randomMatrix(32000, 2048)
+	Materialize(w)
+	emb := &Embedding{Weight: w}
+	indices := FromValues(make([]int32, 32), 32)
+	// Fill with random valid indices
+	for i := range 32 {
+		indices = FromValues([]int32{int32(i % 32000)}, 1)
+	}
+	indices = RandomUniform(0, 31999, []int32{32}, DTypeFloat32)
+	indices = AsType(indices, DTypeInt32)
+	Materialize(indices)
+	for b.Loop() {
+		y := emb.Forward(indices)
+		Materialize(y)
+	}
+}
+
+// --- Reductions ---
+
+func BenchmarkSum_1M(b *testing.B) {
+	a := RandomUniform(0, 1, []int32{1000000}, DTypeFloat32)
+	Materialize(a)
+	for b.Loop() {
+		y := Sum(a, 0, false)
+		Materialize(y)
+	}
+}
+
+func BenchmarkArgmax_1x32000(b *testing.B) {
+	a := randomMatrix(1, 32000)
+	Materialize(a)
+	for b.Loop() {
+		y := Argmax(a, -1, false)
+		Materialize(y)
+	}
+}
+
+// --- Sampling ---
+
+func BenchmarkSampler_Greedy(b *testing.B) {
+	logits := RandomUniform(-5, 5, []int32{1, 32000}, DTypeFloat32)
+	Materialize(logits)
+	s := newSampler(0, 0, 0, 0) // greedy
+	for b.Loop() {
+		tok := s.Sample(logits)
+		Materialize(tok)
+	}
+}
+
+func BenchmarkSampler_TopK50_Temp1(b *testing.B) {
+	logits := RandomUniform(-5, 5, []int32{1, 32000}, DTypeFloat32)
+	Materialize(logits)
+	s := newSampler(1.0, 0, 0, 50)
+	for b.Loop() {
+		tok := s.Sample(logits)
+		Materialize(tok)
+	}
+}
+
+func BenchmarkSampler_TopP09_Temp1(b *testing.B) {
+	logits := RandomUniform(-5, 5, []int32{1, 32000}, DTypeFloat32)
+	Materialize(logits)
+	s := newSampler(1.0, 0.9, 0, 0)
+	for b.Loop() {
+		tok := s.Sample(logits)
+		Materialize(tok)
+	}
+}
+
+func BenchmarkSampler_Full_TopP09_MinP01_TopK50(b *testing.B) {
+	logits := RandomUniform(-5, 5, []int32{1, 32000}, DTypeFloat32)
+	Materialize(logits)
+	s := newSampler(0.8, 0.9, 0.1, 50) // temp=0.8, topP=0.9, minP=0.1, topK=50
+	for b.Loop() {
+		tok := s.Sample(logits)
+		Materialize(tok)
+	}
+}