Backend.Generate and Backend.Chat now return (Result, error) instead of (string, error). Result carries the response text and optional inference.GenerateMetrics for backends that support them. Co-Authored-By: Virgil <virgil@lethean.io>
81 lines
2.8 KiB
Go
// Package ml provides ML inference, scoring, and model management for CoreGo.
//
// It supports multiple inference backends (HTTP, llama-server, Ollama) through
// a common Backend interface, and includes an ethics-aware scoring engine with
// both heuristic and LLM-judge capabilities.
//
// Register as a CoreGo service:
//
//	core.New(
//		core.WithService(ml.NewService),
//	)
package ml
|
|
|
|
import (
|
|
"context"
|
|
|
|
"forge.lthn.ai/core/go-inference"
|
|
)
|
|
|
|
// Result holds the response text and optional inference metrics.
//
// Backends that support metrics (e.g. MLX via InferenceAdapter) populate
// Metrics; HTTP and subprocess backends leave it nil, so callers must
// nil-check Metrics before dereferencing it.
type Result struct {
	// Text is the generated response text.
	Text string

	// Metrics, when non-nil, carries the backend-reported generation
	// metrics (see inference.GenerateMetrics). Nil for backends that do
	// not report metrics.
	Metrics *inference.GenerateMetrics
}
|
|
|
|
// Backend generates text from prompts. Implementations include HTTPBackend
// (OpenAI-compatible API), LlamaBackend (managed llama-server process), and
// OllamaBackend (Ollama native API).
//
// Generate and Chat return a Result whose Metrics field may be nil for
// backends that do not report inference metrics.
type Backend interface {
	// Generate sends a single user prompt and returns the response.
	// Cancellation and deadlines are propagated via ctx.
	Generate(ctx context.Context, prompt string, opts GenOpts) (Result, error)

	// Chat sends a multi-turn conversation and returns the response.
	// Cancellation and deadlines are propagated via ctx.
	Chat(ctx context.Context, messages []Message, opts GenOpts) (Result, error)

	// Name returns the backend identifier (e.g. "http", "llama", "ollama").
	Name() string

	// Available reports whether the backend is ready to accept requests.
	Available() bool
}
|
|
|
|
// GenOpts configures a generation request.
//
// The sampling fields (TopK, TopP, RepeatPenalty) treat their zero value
// as "disabled"; see DefaultGenOpts for the suggested starting values.
type GenOpts struct {
	Temperature   float64 // sampling temperature
	MaxTokens     int     // generation length cap; 0 presumably means backend default — TODO confirm
	Model         string  // override model for this request
	TopK          int     // top-k sampling (0 = disabled)
	TopP          float64 // nucleus sampling threshold (0 = disabled)
	RepeatPenalty float64 // repetition penalty (0 = disabled, 1.0 = no penalty)
}
|
|
|
|
// Message is a type alias for inference.Message, providing backward compatibility.
// All callers continue using ml.Message — it is the same underlying type.
// Because this is an alias (=) rather than a defined type, ml.Message and
// inference.Message are fully interchangeable with no conversion needed.
type Message = inference.Message
|
|
|
|
// TokenCallback receives each generated token as text. Return a non-nil
// error to stop generation early (e.g. client disconnect); the error is
// then surfaced by the streaming method that invoked the callback.
type TokenCallback func(token string) error
|
|
|
|
// StreamingBackend extends Backend with callback-based token streaming.
//
// Deprecated: StreamingBackend is retained for backward compatibility.
// New code should use inference.TextModel with iter.Seq[Token] directly.
// See InferenceAdapter for the bridge pattern.
type StreamingBackend interface {
	Backend

	// GenerateStream streams tokens from a single prompt via the callback.
	// Generation stops early if cb returns a non-nil error.
	GenerateStream(ctx context.Context, prompt string, opts GenOpts, cb TokenCallback) error

	// ChatStream streams tokens from a chat conversation via the callback.
	// Generation stops early if cb returns a non-nil error.
	ChatStream(ctx context.Context, messages []Message, opts GenOpts, cb TokenCallback) error
}
|
|
|
|
// DefaultGenOpts returns sensible defaults for generation.
|
|
func DefaultGenOpts() GenOpts {
|
|
return GenOpts{
|
|
Temperature: 0.1,
|
|
}
|
|
}
|