From df176765e73d8e4ddd8f4c590d3924e00944c9e5 Mon Sep 17 00:00:00 2001
From: Snider
Date: Thu, 19 Feb 2026 23:34:31 +0000
Subject: [PATCH] feat: add GenerateMetrics type and Metrics() to TextModel

Expose prefill/decode timing, token counts, throughput, and GPU memory
stats from the last inference operation. Same retrieval pattern as Err().

Co-Authored-By: Virgil
Co-Authored-By: Claude Opus 4.6
---
 inference.go | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/inference.go b/inference.go
index af86f91..1fbf744 100644
--- a/inference.go
+++ b/inference.go
@@ -26,6 +26,7 @@ import (
 	"fmt"
 	"iter"
 	"sync"
+	"time"
 )
 
 // Token represents a single generated token for streaming.
@@ -52,6 +53,27 @@ type BatchResult struct {
 	Err error // Per-prompt error (context cancel, OOM, etc.)
 }
 
+// GenerateMetrics holds performance metrics from the last inference operation.
+// Retrieved via TextModel.Metrics() after Generate, Chat, Classify, or BatchGenerate.
+type GenerateMetrics struct {
+	// Token counts
+	PromptTokens    int // Input tokens (sum across batch for batch ops)
+	GeneratedTokens int // Output tokens generated
+
+	// Timing
+	PrefillDuration time.Duration // Time to process the prompt(s)
+	DecodeDuration  time.Duration // Time for autoregressive decoding
+	TotalDuration   time.Duration // Wall-clock time for the full operation
+
+	// Throughput (computed)
+	PrefillTokensPerSec float64 // PromptTokens / PrefillDuration
+	DecodeTokensPerSec  float64 // GeneratedTokens / DecodeDuration
+
+	// Memory (Metal/GPU)
+	PeakMemoryBytes   uint64 // Peak GPU memory during this operation
+	ActiveMemoryBytes uint64 // Active GPU memory after operation
+}
+
 // TextModel generates text from a loaded model.
 type TextModel interface {
 	// Generate streams tokens for the given prompt.
@@ -73,6 +95,10 @@ type TextModel interface {
 	// ModelType returns the architecture identifier (e.g. "gemma3", "qwen3", "llama3").
 	ModelType() string
 
+	// Metrics returns performance metrics from the last inference operation.
+	// Valid after Generate (iterator exhausted), Chat, Classify, or BatchGenerate.
+	Metrics() GenerateMetrics
+
 	// Err returns the error from the last Generate/Chat call, if any.
 	// Check this after the iterator stops to distinguish EOS from errors.
 	Err() error