docs: expand package doc with workflow examples

Cover generate, chat, classify, batch generate, metrics, model info,
discovery, and Metal memory controls.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Snider 2026-02-19 23:44:07 +00:00
parent d7c8f176f0
commit d1fb26d51e
2 changed files with 80 additions and 5 deletions

View file

@@ -41,7 +41,7 @@ Implementation plan: `docs/plans/2026-02-19-backend-abstraction-plan.md`
- [x] **Integration tests** — 7 tests for public API (backend registration, options, LoadModel paths).
- [x] **Error handling audit** — ✅ `checkError()` replaced with `lastError() error` (reads + clears C-level error string). Added `Eval(...*Array) error` and `EvalAsync(...*Array) error` as error-returning variants of Materialize. Generate loop propagates errors via `m.lastErr`. `LoadAllSafetensors` returns `(map, error)`. Model loaders (gemma3, qwen3) check `lastError()` after safetensors load. grad.go/lora.go now surface real MLX error messages. 4 new tests in error_test.go.
- [x] **Memory management — deterministic cleanup** — ✅ `Model.Close()` now walks the full model tree (GemmaModel/Qwen3Model) and explicitly frees all weight arrays via `Free()`. Helpers: `freeLinear`, `freeEmbedding`, `freeRMSNorm`, `freeCaches`, `closeGemma`, `closeQwen3` in close.go. Handles tied output weights (skip double-free), nil safety, idempotent Close(). 8 new tests in close_test.go.
- [ ] **Documentation** — Public API has godoc but needs examples for common workflows.
- [x] **Documentation** — ✅ Package docs expanded with examples for all common workflows: Generate, Chat, Classify, BatchGenerate, Metrics, ModelInfo, Discover, memory controls. Both go-mlx and go-inference package docs updated with godoc heading sections.
## Phase 5: Ecosystem Integration (Virgil wishlist)

mlx.go — 83 changed lines
View file

@@ -2,23 +2,98 @@
//
// This package implements the [inference.Backend] interface from
// forge.lthn.ai/core/go-inference for Apple Silicon (M1-M4) GPUs.
// Import it blank to register the "metal" backend automatically:
//
// import _ "forge.lthn.ai/core/go-mlx"
//
// Build mlx-c before use:
//
// go generate ./...
//
// Load a model and generate text:
//
// import "forge.lthn.ai/core/go-inference"
// import _ "forge.lthn.ai/core/go-mlx" // register Metal backend
// # Generate text
//
// m, err := inference.LoadModel("/path/to/model/")
// if err != nil { log.Fatal(err) }
// defer m.Close()
//
// ctx := context.Background()
// for tok := range m.Generate(ctx, "What is 2+2?", inference.WithMaxTokens(128)) {
// fmt.Print(tok.Text)
// }
// if err := m.Err(); err != nil { log.Fatal(err) }
//
// # Multi-turn chat
//
// Chat applies the model's native template (Gemma3, Qwen3, Llama3):
//
// for tok := range m.Chat(ctx, []inference.Message{
// {Role: "system", Content: "You are a helpful assistant."},
// {Role: "user", Content: "Translate 'hello' to French."},
// }, inference.WithMaxTokens(64)) {
// fmt.Print(tok.Text)
// }
//
// # Batch classification
//
// Classify runs a single forward pass per prompt (prefill only, no decoding):
//
// results, err := m.Classify(ctx, []string{
// "Bonjour, comment allez-vous?",
// "The quarterly report shows growth.",
// }, inference.WithTemperature(0))
// for i, r := range results {
// fmt.Printf("prompt %d → %q\n", i, r.Token.Text)
// }
//
// # Batch generation
//
// results, err := m.BatchGenerate(ctx, []string{
// "The capital of France is",
// "Water boils at",
// }, inference.WithMaxTokens(32))
// for i, r := range results {
// for _, tok := range r.Tokens {
// fmt.Print(tok.Text)
// }
// fmt.Println()
// }
//
// # Performance metrics
//
// After any inference call, retrieve timing and memory statistics:
//
// for tok := range m.Generate(ctx, prompt, inference.WithMaxTokens(128)) {
// fmt.Print(tok.Text)
// }
// met := m.Metrics()
// fmt.Printf("decode: %.0f tok/s, peak GPU: %d MB\n",
// met.DecodeTokensPerSec, met.PeakMemoryBytes/1024/1024)
//
// # Model info
//
// info := m.Info()
// fmt.Printf("%s %d-layer, %d-bit quantised\n",
// info.Architecture, info.NumLayers, info.QuantBits)
//
// # Model discovery
//
// models, err := inference.Discover("/path/to/models/")
// for _, d := range models {
// fmt.Printf("%s (%s, %d-bit)\n", d.Path, d.ModelType, d.QuantBits)
// }
//
// # Metal memory controls
//
// These control the Metal allocator directly, not individual models:
//
// mlx.SetCacheLimit(4 << 30) // 4 GB cache limit
// mlx.SetMemoryLimit(32 << 30) // 32 GB hard limit
//
// // Between chat turns, reclaim prompt cache memory:
// mlx.ClearCache()
//
// fmt.Printf("active: %d MB, peak: %d MB\n",
// mlx.GetActiveMemory()/1024/1024, mlx.GetPeakMemory()/1024/1024)
package mlx
//go:generate cmake -S . -B build -DCMAKE_INSTALL_PREFIX=dist -DCMAKE_BUILD_TYPE=Release