go-ml/inference.go
Snider 5578671f90 feat: add Result type, break Backend interface to return Result
Backend.Generate and Backend.Chat now return (Result, error) instead of
(string, error). Result carries the response text and optional
inference.GenerateMetrics for backends that support them.

Co-Authored-By: Virgil <virgil@lethean.io>
2026-02-22 17:36:16 +00:00

81 lines
2.8 KiB
Go

// Package ml provides ML inference, scoring, and model management for CoreGo.
//
// It supports multiple inference backends (HTTP, llama-server, Ollama) through
// a common Backend interface, and includes an ethics-aware scoring engine with
// both heuristic and LLM-judge capabilities.
//
// Register as a CoreGo service:
//
//	core.New(
//		core.WithService(ml.NewService),
//	)
package ml
import (
"context"
"forge.lthn.ai/core/go-inference"
)
// Result is the value returned by Backend.Generate and Backend.Chat. It holds
// the response text and optional inference metrics.
// Backends that support metrics (e.g. MLX via InferenceAdapter) populate
// Metrics; HTTP and subprocess backends leave it nil.
type Result struct {
// Text is the response text produced by the backend.
Text string
// Metrics is optional and may be nil; check before dereferencing.
Metrics *inference.GenerateMetrics
}
// Backend generates text from prompts. Implementations include HTTPBackend
// (OpenAI-compatible API), LlamaBackend (managed llama-server process), and
// OllamaBackend (Ollama native API).
//
// Generate and Chat both return a Result, which carries the response text and
// optional metrics.
type Backend interface {
// Generate sends a single user prompt and returns the response.
// opts carries the per-request generation settings.
Generate(ctx context.Context, prompt string, opts GenOpts) (Result, error)
// Chat sends a multi-turn conversation and returns the response.
// opts carries the per-request generation settings.
Chat(ctx context.Context, messages []Message, opts GenOpts) (Result, error)
// Name returns the backend identifier (e.g. "http", "llama", "ollama").
Name() string
// Available reports whether the backend is ready to accept requests.
Available() bool
}
// GenOpts configures a generation request. It is passed by value to the
// Backend and StreamingBackend methods; DefaultGenOpts returns a starting
// value with only Temperature set.
type GenOpts struct {
Temperature float64 // sampling temperature (DefaultGenOpts sets 0.1)
MaxTokens int // presumably the cap on generated tokens; meaning of 0 is not shown here — TODO confirm
Model string // override model for this request
TopK int // top-k sampling (0 = disabled)
TopP float64 // nucleus sampling threshold (0 = disabled)
RepeatPenalty float64 // repetition penalty (0 = disabled, 1.0 = no penalty)
}
// Message is a type alias for inference.Message, kept for backward
// compatibility. Because it is an alias rather than a new type, ml.Message and
// inference.Message are identical and interchangeable without conversion, so
// all callers can continue using ml.Message.
type Message = inference.Message
// TokenCallback receives each generated token as text. Return a non-nil
// error to stop generation early (e.g. client disconnect).
//
// It is the callback type accepted by StreamingBackend.GenerateStream and
// StreamingBackend.ChatStream.
type TokenCallback func(token string) error
// StreamingBackend extends Backend with callback-based token streaming for
// single prompts and chat conversations.
//
// Deprecated: StreamingBackend is retained for backward compatibility.
// New code should use inference.TextModel with iter.Seq[Token] directly.
// See InferenceAdapter for the bridge pattern.
type StreamingBackend interface {
Backend
// GenerateStream streams tokens from a single prompt via the callback.
// cb follows the TokenCallback contract: a non-nil error from cb requests
// that generation stop early.
GenerateStream(ctx context.Context, prompt string, opts GenOpts, cb TokenCallback) error
// ChatStream streams tokens from a chat conversation via the callback.
// cb follows the TokenCallback contract: a non-nil error from cb requests
// that generation stop early.
ChatStream(ctx context.Context, messages []Message, opts GenOpts, cb TokenCallback) error
}
// DefaultGenOpts returns the baseline options for a generation request:
// Temperature is 0.1 and every other field is left at its zero value.
func DefaultGenOpts() GenOpts {
	var opts GenOpts
	opts.Temperature = 0.1
	return opts
}