// Package mlx defines the types and interfaces for streaming text
// generation from loaded language models.
package mlx

import (
	"context"
	"iter"
)
// Token represents a single generated token for streaming.
type Token struct {
	// ID is the token's numeric identifier in the model's vocabulary.
	ID int32
	// Text is the token's textual form as produced by generation.
	Text string
}
// Message represents a chat turn for Chat().
type Message struct {
	// Role identifies the speaker: "user", "assistant", or "system".
	Role string
	// Content is the text of this turn.
	Content string
}
// TextModel generates text from a loaded model.
//
// Implementations hold resources (GPU memory, caches, subprocess); callers
// must call Close when finished with the model.
type TextModel interface {
	// Generate streams tokens for the given prompt.
	// Respects ctx cancellation (HTTP handlers, timeouts, graceful shutdown).
	// After the returned sequence ends, call Err to distinguish a normal
	// stop from a failure.
	Generate(ctx context.Context, prompt string, opts ...GenerateOption) iter.Seq[Token]

	// Chat formats messages using the model's native chat template, then generates.
	// The model owns its template — callers don't need to know Gemma vs Qwen formatting.
	Chat(ctx context.Context, messages []Message, opts ...GenerateOption) iter.Seq[Token]

	// ModelType returns the architecture identifier (e.g. "gemma3", "qwen3").
	ModelType() string

	// Err returns the error from the last Generate/Chat call, if any.
	// Distinguishes normal stop (EOS, max tokens) from failures (OOM, C-level error).
	// Returns nil if generation completed normally.
	Err() error

	// Close releases all resources (GPU memory, caches, subprocess).
	Close() error
}