From 07cd917259ecb2b86738bb97dede2a6b2f3f9c69 Mon Sep 17 00:00:00 2001 From: Snider Date: Thu, 19 Feb 2026 19:37:27 +0000 Subject: [PATCH] feat: define shared TextModel, Backend, Token, Message interfaces Zero-dependency interface package for the Core inference ecosystem. Backends (go-mlx, go-rocm) implement these interfaces. Consumers (go-ml, go-ai, go-i18n) import them. Includes: - TextModel: Generate, Chat, Err, Close (with context.Context) - Backend: Named engine registry with platform preference - Functional options: WithMaxTokens, WithTemperature, WithTopK, etc. - LoadModel: Auto-selects best available backend Co-Authored-By: Virgil --- CLAUDE.md | 76 +++++++++++++++++++++++++++ FINDINGS.md | 33 ++++++++++++ TODO.md | 37 +++++++++++++ go.mod | 3 ++ inference.go | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++ options.go | 98 ++++++++++++++++++++++++++++++++++ 6 files changed, 392 insertions(+) create mode 100644 CLAUDE.md create mode 100644 FINDINGS.md create mode 100644 TODO.md create mode 100644 go.mod create mode 100644 inference.go create mode 100644 options.go diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..44c6c3e --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,76 @@ +# CLAUDE.md + +## What This Is + +Shared inference interfaces for the Core Go ecosystem. Module: `forge.lthn.ai/core/go-inference` + +This package defines the contract between GPU-specific backends (go-mlx on macOS, go-rocm on Linux) and consumers (go-ml, go-ai, go-i18n). It has **zero dependencies** and compiles on all platforms. + +## Commands + +```bash +go test ./... # Run all tests +go vet ./... # Vet +``` + +## Architecture + +``` +go-inference (this package) ← defines TextModel, Backend, Token, Message + ↑ ↑ + │ │ +go-mlx (darwin/arm64) go-rocm (linux/amd64) + │ │ + └────── go-ml ───────┘ (wraps backends into scoring engine) + ↑ + go-ai (MCP hub) +``` + +### Key Types + +| Type | Purpose | +|------|---------| +| `TextModel` | Core interface: Generate, Chat, Err, Close | +| `Backend` | Named engine that can LoadModel → TextModel | +| `Token` | Streaming token (ID + Text) | +| `Message` | Chat message (Role + Content) | +| `GenerateOption` | Functional option for generation (temp, topK, etc.) | +| `LoadOption` | Functional option for model loading (backend, GPU layers, etc.) | + +### Backend Registry + +Backends register via `init()` with build tags. Consumers call `LoadModel()` which auto-selects the best available backend: + +```go +// Auto-detect: Metal on macOS, ROCm on Linux +m, err := inference.LoadModel("/path/to/model/") + +// Explicit backend +m, err := inference.LoadModel("/path/", inference.WithBackend("rocm")) +``` + +## Coding Standards + +- UK English +- Zero external dependencies — stdlib only +- Tests: testify assert/require +- Conventional commits +- Co-Author: `Co-Authored-By: Virgil ` +- Licence: EUPL-1.2 + +## Consumers + +- **go-mlx**: Implements `Backend` + `TextModel` for Apple Metal (darwin/arm64) +- **go-rocm**: Implements `Backend` + `TextModel` for AMD ROCm (linux/amd64) +- **go-ml**: Wraps inference backends into scoring engine, adds llama.cpp HTTP backend +- **go-ai**: MCP hub, exposes inference via MCP tools +- **go-i18n**: Uses TextModel for Gemma3-1B domain classification + +## Stability + +This package is the shared contract. Changes here affect all backends and consumers. Keep the interface minimal and stable. Add new methods only when two or more consumers need them. + +## Task Queue + +See `TODO.md` for prioritised work. 
+See `FINDINGS.md` for research notes. diff --git a/FINDINGS.md b/FINDINGS.md new file mode 100644 index 0000000..5a2b8e8 --- /dev/null +++ b/FINDINGS.md @@ -0,0 +1,33 @@ +# FINDINGS.md — go-inference Research & Discovery + +--- + +## 2026-02-19: Package Creation (Virgil) + +### Motivation + +go-mlx (darwin/arm64) and go-rocm (linux/amd64) both need to implement the same TextModel interface, but go-rocm can't import go-mlx (platform-specific CGO dependency). A shared interface package solves this. + +### Alternatives Considered + +1. **Duplicate interfaces** — Each backend defines its own TextModel. Simple but diverges over time as backends evolve independently. Rejected. +2. **Shared interface package** (chosen) — `core/go-inference` defines the contract. ~100 LOC, zero deps, compiles everywhere. +3. **Define in go-ml** — go-ml already has Backend/StreamingBackend. But go-ml has heavy deps (DuckDB, Parquet) that backends shouldn't import. Rejected. + +### Interface Design Decisions + +- **`context.Context` on Generate/Chat**: Required for HTTP handler cancellation, timeouts, graceful shutdown. go-ml's current backend_mlx.go already uses ctx. +- **`Err() error` on TextModel**: iter.Seq can't carry errors. Consumers check Err() after the iterator stops. Pattern matches database/sql Row.Err(). +- **`Chat()` on TextModel**: Models own their chat templates (Gemma3, Qwen3, Llama3 all have different formats). Keeping templates in consumers means every consumer duplicates model-specific formatting. +- **`Available() bool` on Backend**: Needed for Default() to skip unavailable backends (e.g. ROCm registered but no GPU present). +- **`GPULayers` in LoadConfig**: ROCm/llama.cpp support partial GPU offload. Metal always does full offload. Default -1 = all layers. +- **`RepeatPenalty` in GenerateConfig**: llama.cpp backends use this heavily. Metal backends can ignore it. + +### Consumer Mapping + +| Consumer | What it imports | How it uses TextModel | +|----------|----------------|----------------------| +| go-ml | go-inference | Wraps TextModel into its own Backend interface, adds scoring | +| go-ai | go-inference (via go-ml) | Exposes via MCP tools | +| go-i18n | go-inference | Direct: LoadModel → Generate(WithMaxTokens(1)) for classification | +| LEM Lab | go-inference (via go-ml) | Chat streaming for web UI | diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..d996ae5 --- /dev/null +++ b/TODO.md @@ -0,0 +1,37 @@ +# TODO.md — go-inference Task Queue + +Dispatched from core/go orchestration. This package is minimal by design. + +--- + +## Phase 1: Foundation + +- [ ] **Add tests for option application** — Verify GenerateConfig defaults, all With* options, ApplyGenerateOpts/ApplyLoadOpts behaviour. +- [ ] **Add tests for backend registry** — Register, Get, List, Default priority order, LoadModel routing. +- [ ] **Add tests for Default() platform preference** — Verify metal > rocm > llama_cpp ordering. + +## Phase 2: Integration + +- [ ] **go-mlx migration** — go-mlx Phase 4 backend abstraction should import go-inference instead of defining its own TextModel/Backend. Update go-mlx's design doc and plan to reference this package. +- [ ] **go-rocm implementation** — go-rocm implements inference.Backend + inference.TextModel. +- [ ] **go-ml migration** — go-ml's Backend/StreamingBackend should align with or wrap inference.TextModel. The go-ml Backend adds context.Context + non-streaming helpers on top. 
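+
+The Phase 2 items above reduce to satisfying `inference.Backend` and `inference.TextModel`. A minimal, illustrative skeleton (package and type names are placeholders, not part of this module):
+
+```go
+//go:build linux && amd64
+
+package rocm
+
+import (
+	"context"
+	"iter"
+
+	inference "forge.lthn.ai/core/go-inference"
+)
+
+type backend struct{}
+
+func init() { inference.Register(backend{}) }
+
+func (backend) Name() string    { return "rocm" }
+func (backend) Available() bool { return true } // real code: probe for a usable GPU
+
+func (backend) LoadModel(path string, opts ...inference.LoadOption) (inference.TextModel, error) {
+	cfg := inference.ApplyLoadOpts(opts)
+	_ = cfg // real code: pass ContextLen/GPULayers to the native loader
+	return &model{}, nil
+}
+
+type model struct{ err error }
+
+func (m *model) Generate(ctx context.Context, prompt string, opts ...inference.GenerateOption) iter.Seq[inference.Token] {
+	cfg := inference.ApplyGenerateOpts(opts)
+	return func(yield func(inference.Token) bool) {
+		_ = cfg // real code: decode up to cfg.MaxTokens, stopping on ctx.Done() or a stop token
+	}
+}
+
+func (m *model) Chat(ctx context.Context, msgs []inference.Message, opts ...inference.GenerateOption) iter.Seq[inference.Token] {
+	// real code: render msgs with the model's chat template, then generate
+	return m.Generate(ctx, "", opts...)
+}
+
+func (m *model) ModelType() string { return "llama3" }
+func (m *model) Err() error        { return m.err }
+func (m *model) Close() error      { return nil }
+```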
+ +## Phase 3: Extended Interfaces (when needed) + +- [ ] **BatchModel interface** — When go-i18n needs 5K sentences/sec, add: `type BatchModel interface { TextModel; BatchGenerate(ctx, []string, ...GenerateOption) iter.Seq2[int, Token] }`. Not before it's needed. +- [ ] **Stats interface** — When LEM Lab dashboard needs metrics: `type StatsModel interface { TextModel; Stats() GenerateStats }` with tokens/sec, peak memory, GPU util. + +--- + +## Design Principles + +1. **Minimal interface** — Only add methods when 2+ consumers need them +2. **Zero dependencies** — stdlib only, compiles everywhere +3. **Backwards compatible** — New interfaces extend, never modify existing ones +4. **Platform agnostic** — No build tags, no CGO, no OS-specific code + +## Workflow + +1. Virgil in core/go manages this package directly (too small for a dedicated Claude) +2. Changes here are coordinated with go-mlx and go-rocm Claudes via their TODO.md +3. New interface methods require Virgil approval before adding diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..9d4eb26 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module forge.lthn.ai/core/go-inference + +go 1.25.5 diff --git a/inference.go b/inference.go new file mode 100644 index 0000000..8724714 --- /dev/null +++ b/inference.go @@ -0,0 +1,145 @@ +// Package inference defines shared interfaces for text generation backends. +// +// This package is the contract between GPU-specific backends (go-mlx, go-rocm) +// and consumers (go-ml, go-ai, go-i18n). It has zero dependencies and compiles +// on all platforms. +// +// Backend implementations register via init() with build tags: +// +// // go-mlx: //go:build darwin && arm64 +// func init() { inference.Register(metal.NewBackend()) } +// +// // go-rocm: //go:build linux && amd64 +// func init() { inference.Register(rocm.NewBackend()) } +// +// Consumers load models via the registry: +// +// m, err := inference.LoadModel("/path/to/model/") +// defer m.Close() +// for tok := range m.Generate(ctx, "prompt", inference.WithMaxTokens(128)) { +// fmt.Print(tok.Text) +// } +package inference + +import ( + "context" + "fmt" + "iter" + "sync" +) + +// Token represents a single generated token for streaming. +type Token struct { + ID int32 + Text string +} + +// Message represents a chat message for multi-turn conversation. +type Message struct { + Role string // "system", "user", "assistant" + Content string +} + +// TextModel generates text from a loaded model. +type TextModel interface { + // Generate streams tokens for the given prompt. + Generate(ctx context.Context, prompt string, opts ...GenerateOption) iter.Seq[Token] + + // Chat streams tokens from a multi-turn conversation. + // The model applies its native chat template. + Chat(ctx context.Context, messages []Message, opts ...GenerateOption) iter.Seq[Token] + + // ModelType returns the architecture identifier (e.g. "gemma3", "qwen3", "llama3"). + ModelType() string + + // Err returns the error from the last Generate/Chat call, if any. + // Check this after the iterator stops to distinguish EOS from errors. + Err() error + + // Close releases all resources (GPU memory, caches, subprocess). + Close() error +} + +// Backend is a named inference engine that can load models. +type Backend interface { + // Name returns the backend identifier (e.g. "metal", "rocm", "llama_cpp"). + Name() string + + // LoadModel loads a model from the given path. 
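+	// Implementations typically build a LoadConfig via ApplyLoadOpts; fields a
+	// backend cannot honour (e.g. GPULayers on Metal, which always offloads fully)
+	// may be ignored.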
+ LoadModel(path string, opts ...LoadOption) (TextModel, error) + + // Available reports whether this backend can run on the current hardware. + Available() bool +} + +var ( + backendsMu sync.RWMutex + backends = map[string]Backend{} +) + +// Register adds a backend to the registry. Typically called from init(). +func Register(b Backend) { + backendsMu.Lock() + defer backendsMu.Unlock() + backends[b.Name()] = b +} + +// Get returns a registered backend by name. +func Get(name string) (Backend, bool) { + backendsMu.RLock() + defer backendsMu.RUnlock() + b, ok := backends[name] + return b, ok +} + +// List returns the names of all registered backends. +func List() []string { + backendsMu.RLock() + defer backendsMu.RUnlock() + names := make([]string, 0, len(backends)) + for name := range backends { + names = append(names, name) + } + return names +} + +// Default returns the first available backend. +// Prefers "metal" on macOS, "rocm" on Linux, then any registered backend. +func Default() (Backend, error) { + backendsMu.RLock() + defer backendsMu.RUnlock() + + // Platform preference order + for _, name := range []string{"metal", "rocm", "llama_cpp"} { + if b, ok := backends[name]; ok && b.Available() { + return b, nil + } + } + // Fall back to any available + for _, b := range backends { + if b.Available() { + return b, nil + } + } + return nil, fmt.Errorf("inference: no backends registered (import a backend package)") +} + +// LoadModel loads a model using the specified or default backend. +func LoadModel(path string, opts ...LoadOption) (TextModel, error) { + cfg := ApplyLoadOpts(opts) + if cfg.Backend != "" { + b, ok := Get(cfg.Backend) + if !ok { + return nil, fmt.Errorf("inference: backend %q not registered", cfg.Backend) + } + if !b.Available() { + return nil, fmt.Errorf("inference: backend %q not available on this hardware", cfg.Backend) + } + return b.LoadModel(path, opts...) + } + b, err := Default() + if err != nil { + return nil, err + } + return b.LoadModel(path, opts...) +} diff --git a/options.go b/options.go new file mode 100644 index 0000000..48cb350 --- /dev/null +++ b/options.go @@ -0,0 +1,98 @@ +package inference + +// GenerateConfig holds generation parameters. +type GenerateConfig struct { + MaxTokens int + Temperature float32 + TopK int + TopP float32 + StopTokens []int32 + RepeatPenalty float32 +} + +// DefaultGenerateConfig returns sensible defaults. +func DefaultGenerateConfig() GenerateConfig { + return GenerateConfig{ + MaxTokens: 256, + Temperature: 0.0, // greedy + } +} + +// GenerateOption configures text generation. +type GenerateOption func(*GenerateConfig) + +// WithMaxTokens sets the maximum number of tokens to generate. +func WithMaxTokens(n int) GenerateOption { + return func(c *GenerateConfig) { c.MaxTokens = n } +} + +// WithTemperature sets the sampling temperature. 0 = greedy. +func WithTemperature(t float32) GenerateOption { + return func(c *GenerateConfig) { c.Temperature = t } +} + +// WithTopK sets top-k sampling. 0 = disabled. +func WithTopK(k int) GenerateOption { + return func(c *GenerateConfig) { c.TopK = k } +} + +// WithTopP sets nucleus sampling threshold. 0 = disabled. +func WithTopP(p float32) GenerateOption { + return func(c *GenerateConfig) { c.TopP = p } +} + +// WithStopTokens sets token IDs that stop generation. +func WithStopTokens(ids ...int32) GenerateOption { + return func(c *GenerateConfig) { c.StopTokens = ids } +} + +// WithRepeatPenalty sets the repetition penalty. 0 = disabled, 1.0 = no penalty. 
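+// llama.cpp-style backends use this heavily; Metal backends may ignore it.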
+func WithRepeatPenalty(p float32) GenerateOption { + return func(c *GenerateConfig) { c.RepeatPenalty = p } +} + +// ApplyGenerateOpts builds a GenerateConfig from options. +func ApplyGenerateOpts(opts []GenerateOption) GenerateConfig { + cfg := DefaultGenerateConfig() + for _, o := range opts { + o(&cfg) + } + return cfg +} + +// LoadConfig holds model loading parameters. +type LoadConfig struct { + Backend string // "metal", "rocm", "llama_cpp" (empty = auto-detect) + ContextLen int // Context window size (0 = model default) + GPULayers int // Number of layers to offload to GPU (-1 = all, 0 = none) +} + +// LoadOption configures model loading. +type LoadOption func(*LoadConfig) + +// WithBackend selects a specific inference backend by name. +func WithBackend(name string) LoadOption { + return func(c *LoadConfig) { c.Backend = name } +} + +// WithContextLen sets the context window size. +func WithContextLen(n int) LoadOption { + return func(c *LoadConfig) { c.ContextLen = n } +} + +// WithGPULayers sets how many layers to offload to GPU. +// -1 means all layers (full GPU offload). +func WithGPULayers(n int) LoadOption { + return func(c *LoadConfig) { c.GPULayers = n } +} + +// ApplyLoadOpts builds a LoadConfig from options. +func ApplyLoadOpts(opts []LoadOption) LoadConfig { + cfg := LoadConfig{ + GPULayers: -1, // default: full GPU offload + } + for _, o := range opts { + o(&cfg) + } + return cfg +}
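
Putting the pieces together, a consumer's end-to-end flow looks roughly like the sketch below. The commented-out backend import path is illustrative (any package that calls `inference.Register` from its `init()` will do); everything else comes from `inference.go` and `options.go` above.

```go
package main

import (
	"context"
	"fmt"
	"log"

	inference "forge.lthn.ai/core/go-inference"
	// Import a backend for its init() registration, e.g.:
	// _ "forge.lthn.ai/core/go-mlx/metal" // illustrative path
)

func main() {
	ctx := context.Background()

	// Auto-select the best available backend; pass WithBackend("rocm") to force one.
	m, err := inference.LoadModel("/path/to/model/", inference.WithContextLen(4096))
	if err != nil {
		log.Fatal(err)
	}
	defer m.Close()

	msgs := []inference.Message{
		{Role: "system", Content: "You are a concise assistant."},
		{Role: "user", Content: "Summarise this package in one sentence."},
	}
	for tok := range m.Chat(ctx, msgs,
		inference.WithMaxTokens(128),
		inference.WithTemperature(0.7),
	) {
		fmt.Print(tok.Text)
	}
	// iter.Seq cannot carry errors, so check Err() once the stream stops.
	if err := m.Err(); err != nil {
		log.Fatal(err)
	}
}
```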