From 747e703c7b670a861b6314db4f8b0f5be97087de Mon Sep 17 00:00:00 2001 From: Snider Date: Fri, 20 Feb 2026 02:05:59 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20Phase=202=20backend=20consolidation=20?= =?UTF-8?q?=E2=80=94=20Message=20alias,=20GenOpts,=20deprecation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace Message struct with type alias for inference.Message (backward compat) - Remove convertMessages() — types are now identical via alias - Extend GenOpts with TopK, TopP, RepeatPenalty (mapped in convertOpts) - Deprecate StreamingBackend with doc comment (only 2 callers, both in cli/) - Simplify HTTPTextModel.Chat() — pass messages directly - Update CLAUDE.md with Backend Architecture section - Add 2 new tests, remove 1 obsolete test Co-Authored-By: Virgil --- CLAUDE.md | 41 ++++++++++++++++++++++------ adapter.go | 30 +++++++++++---------- adapter_test.go | 57 +++++++++++++++++++++++++++++---------- backend_http_textmodel.go | 9 ++----- inference.go | 30 ++++++++++++--------- 5 files changed, 111 insertions(+), 56 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 95b18d7..92e5289 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -94,10 +94,33 @@ All resolve via `replace` directives in go.mod: | `db.go` | 258 | DuckDB analytics storage | | `gguf.go` | 369 | GGUF model format parsing | +### Backend Architecture + +Two interface families coexist, bridged by adapters: + +**`inference.TextModel`** (iterator-based) is the **preferred API** for new code. Returns `iter.Seq[inference.Token]` for streaming. Defined in `forge.lthn.ai/core/go-inference`. Use this for GPU backends (MLX Metal, ROCm) and any code that needs token-level control. + +**`ml.Backend`** (string-based) is the **compatibility layer**, still fully supported. Returns complete strings. Used by `service.go`, `judge.go`, and external consumers like `host-uk/cli`. + +**`ml.StreamingBackend`** is **deprecated**. New code should use `inference.TextModel` with `iter.Seq[Token]` directly. Retained for backward compatibility with existing callers. + +**Adapters:** + +| Adapter | Direction | File | +|---------|-----------|------| +| `InferenceAdapter` | `inference.TextModel` -> `ml.Backend` + `ml.StreamingBackend` | `adapter.go` | +| `HTTPTextModel` | `ml.HTTPBackend` -> `inference.TextModel` | `backend_http_textmodel.go` | +| `LlamaTextModel` | `ml.LlamaBackend` -> `inference.TextModel` | `backend_http_textmodel.go` | + +**Unified types (Phase 2):** + +- `ml.Message` is a type alias for `inference.Message` — the types are identical, no conversion needed between packages. +- `ml.GenOpts` extends `inference.GenerateConfig` with a `Model` field for per-request model overrides. The `convertOpts()` helper maps GenOpts to `[]inference.GenerateOption`. 
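+
+For illustration, a minimal sketch of a caller exercising the unified types (the `backend` and `ctx` values are hypothetical; any `ml.Backend` accepts the extended `GenOpts`):
+
+```go
+// Hypothetical caller; TopK/TopP/RepeatPenalty flow through convertOpts().
+reply, err := backend.Chat(ctx, []ml.Message{
+	{Role: "system", Content: "You are helpful."},
+	{Role: "user", Content: "Hello"},
+}, ml.GenOpts{
+	Temperature:   0.7,
+	TopK:          40,  // becomes inference.WithTopK(40)
+	TopP:          0.9, // becomes inference.WithTopP(0.9)
+	RepeatPenalty: 1.1, // becomes inference.WithRepeatPenalty(1.1)
+})
+```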
+ ### Key Types ```go -// Current backend interface (inference.go) +// Backend interface (inference.go) — compatibility layer type Backend interface { Generate(ctx context.Context, prompt string, opts GenOpts) (string, error) Chat(ctx context.Context, messages []Message, opts GenOpts) (string, error) @@ -105,6 +128,7 @@ type Backend interface { Available() bool } +// Deprecated: use inference.TextModel with iter.Seq[Token] directly type StreamingBackend interface { Backend GenerateStream(ctx context.Context, prompt string, opts GenOpts, cb TokenCallback) error @@ -112,15 +136,16 @@ type StreamingBackend interface { } type GenOpts struct { - Temperature float64 - MaxTokens int - Model string + Temperature float64 + MaxTokens int + Model string // override model for this request + TopK int // top-k sampling (0 = disabled) + TopP float64 // nucleus sampling threshold (0 = disabled) + RepeatPenalty float64 // repetition penalty (0 = disabled, 1.0 = no penalty) } -type Message struct { - Role string `json:"role"` - Content string `json:"content"` -} +// Type alias — identical to inference.Message +type Message = inference.Message ``` ## Coding Standards diff --git a/adapter.go b/adapter.go index b3ca8fb..09ab311 100644 --- a/adapter.go +++ b/adapter.go @@ -42,12 +42,13 @@ func (a *InferenceAdapter) Generate(ctx context.Context, prompt string, opts Gen return b.String(), nil } -// Chat converts ml.Message to inference.Message, then collects all tokens. +// Chat sends a multi-turn conversation to the underlying TextModel and collects +// all tokens. Since ml.Message is now a type alias for inference.Message, no +// conversion is needed. func (a *InferenceAdapter) Chat(ctx context.Context, messages []Message, opts GenOpts) (string, error) { - inferMsgs := convertMessages(messages) inferOpts := convertOpts(opts) var b strings.Builder - for tok := range a.model.Chat(ctx, inferMsgs, inferOpts...) { + for tok := range a.model.Chat(ctx, messages, inferOpts...) { b.WriteString(tok.Text) } if err := a.model.Err(); err != nil { @@ -70,10 +71,11 @@ func (a *InferenceAdapter) GenerateStream(ctx context.Context, prompt string, op } // ChatStream forwards each generated chat token's text to the callback. +// Since ml.Message is now a type alias for inference.Message, no conversion +// is needed. func (a *InferenceAdapter) ChatStream(ctx context.Context, messages []Message, opts GenOpts, cb TokenCallback) error { - inferMsgs := convertMessages(messages) inferOpts := convertOpts(opts) - for tok := range a.model.Chat(ctx, inferMsgs, inferOpts...) { + for tok := range a.model.Chat(ctx, messages, inferOpts...) { if err := cb(tok.Text); err != nil { return err } @@ -104,15 +106,15 @@ func convertOpts(opts GenOpts) []inference.GenerateOption { if opts.MaxTokens != 0 { out = append(out, inference.WithMaxTokens(opts.MaxTokens)) } + if opts.TopK > 0 { + out = append(out, inference.WithTopK(opts.TopK)) + } + if opts.TopP > 0 { + out = append(out, inference.WithTopP(float32(opts.TopP))) + } + if opts.RepeatPenalty > 0 { + out = append(out, inference.WithRepeatPenalty(float32(opts.RepeatPenalty))) + } // GenOpts.Model is ignored — the model is already loaded. return out } - -// convertMessages maps ml.Message to inference.Message (trivial field copy). 
-func convertMessages(msgs []Message) []inference.Message {
-	out := make([]inference.Message, len(msgs))
-	for i, m := range msgs {
-		out[i] = inference.Message{Role: m.Role, Content: m.Content}
-	}
-	return out
-}
diff --git a/adapter_test.go b/adapter_test.go
index 8a934fc..608d14b 100644
--- a/adapter_test.go
+++ b/adapter_test.go
@@ -211,20 +211,49 @@ func TestInferenceAdapter_ConvertOpts_Good(t *testing.T) {
 	assert.Len(t, opts, 1)
 }
 
-func TestInferenceAdapter_ConvertMessages_Good(t *testing.T) {
-	mlMsgs := []Message{
-		{Role: "system", Content: "You are helpful."},
-		{Role: "user", Content: "Hello"},
-		{Role: "assistant", Content: "Hi!"},
-	}
-	inferMsgs := convertMessages(mlMsgs)
-	require.Len(t, inferMsgs, 3)
-	assert.Equal(t, "system", inferMsgs[0].Role)
-	assert.Equal(t, "You are helpful.", inferMsgs[0].Content)
-	assert.Equal(t, "user", inferMsgs[1].Role)
-	assert.Equal(t, "Hello", inferMsgs[1].Content)
-	assert.Equal(t, "assistant", inferMsgs[2].Role)
-	assert.Equal(t, "Hi!", inferMsgs[2].Content)
+func TestInferenceAdapter_ConvertOpts_NewFields_Good(t *testing.T) {
+	// TopK only.
+	opts := convertOpts(GenOpts{TopK: 40})
+	assert.Len(t, opts, 1)
+
+	// TopP only.
+	opts = convertOpts(GenOpts{TopP: 0.9})
+	assert.Len(t, opts, 1)
+
+	// RepeatPenalty only.
+	opts = convertOpts(GenOpts{RepeatPenalty: 1.1})
+	assert.Len(t, opts, 1)
+
+	// All new fields set together.
+	opts = convertOpts(GenOpts{TopK: 40, TopP: 0.9, RepeatPenalty: 1.1})
+	assert.Len(t, opts, 3)
+
+	// All fields set (Temperature + MaxTokens + TopK + TopP + RepeatPenalty).
+	opts = convertOpts(GenOpts{
+		Temperature:   0.7,
+		MaxTokens:     512,
+		TopK:          40,
+		TopP:          0.9,
+		RepeatPenalty: 1.1,
+	})
+	assert.Len(t, opts, 5)
+
+	// Zero TopK/TopP/RepeatPenalty should not produce options.
+	opts = convertOpts(GenOpts{Temperature: 0.5, TopK: 0, TopP: 0, RepeatPenalty: 0})
+	assert.Len(t, opts, 1) // only Temperature
+}
+
+func TestInferenceAdapter_MessageAlias_Good(t *testing.T) {
+	// ml.Message and inference.Message are the same type — verify interchangeability.
+	mlMsg := Message{Role: "user", Content: "Hello"}
+	inferMsg := inference.Message{Role: "user", Content: "Hello"}
+	assert.Equal(t, mlMsg, inferMsg)
+
+	// Can assign directly without conversion.
+	var msgs []inference.Message
+	msgs = append(msgs, mlMsg)
+	assert.Equal(t, "user", msgs[0].Role)
+	assert.Equal(t, "Hello", msgs[0].Content)
 }
 
 func TestInferenceAdapter_NameAndAvailable_Good(t *testing.T) {
diff --git a/backend_http_textmodel.go b/backend_http_textmodel.go
index 0df3273..81cb888 100644
--- a/backend_http_textmodel.go
+++ b/backend_http_textmodel.go
@@ -60,13 +60,8 @@ func (m *HTTPTextModel) Chat(ctx context.Context, messages []inference.Message,
 		Model: m.http.Model(),
 	}
 
-	// Convert inference.Message to ml.Message.
-	mlMsgs := make([]Message, len(messages))
-	for i, msg := range messages {
-		mlMsgs[i] = Message{Role: msg.Role, Content: msg.Content}
-	}
-
-	result, err := m.http.Chat(ctx, mlMsgs, genOpts)
+	// ml.Message is now a type alias for inference.Message — no conversion needed.
+	result, err := m.http.Chat(ctx, messages, genOpts)
 	if err != nil {
 		m.lastErr = err
 		return
diff --git a/inference.go b/inference.go
index 7e1171f..7601f11 100644
--- a/inference.go
+++ b/inference.go
@@ -11,7 +11,11 @@
 // )
 package ml
 
-import "context"
+import (
+	"context"
+
+	"forge.lthn.ai/core/go-inference"
+)
 
 // Backend generates text from prompts.
Implementations include HTTPBackend // (OpenAI-compatible API), LlamaBackend (managed llama-server process), and @@ -32,25 +36,25 @@ type Backend interface { // GenOpts configures a generation request. type GenOpts struct { - Temperature float64 - MaxTokens int - Model string // override model for this request + Temperature float64 + MaxTokens int + Model string // override model for this request + TopK int // top-k sampling (0 = disabled) + TopP float64 // nucleus sampling threshold (0 = disabled) + RepeatPenalty float64 // repetition penalty (0 = disabled, 1.0 = no penalty) } -// Message is a single chat message. -type Message struct { - Role string `json:"role"` - Content string `json:"content"` -} +// Message is a type alias for inference.Message, providing backward compatibility. +// All callers continue using ml.Message — it is the same underlying type. +type Message = inference.Message // TokenCallback receives each generated token as text. Return a non-nil // error to stop generation early (e.g. client disconnect). type TokenCallback func(token string) error -// StreamingBackend extends Backend with token-by-token streaming. -// Backends that generate tokens incrementally (e.g. MLX) should implement -// this interface. The serve handler uses SSE when the client sends -// "stream": true and the active backend satisfies StreamingBackend. +// Deprecated: StreamingBackend is retained for backward compatibility. +// New code should use inference.TextModel with iter.Seq[Token] directly. +// See InferenceAdapter for the bridge pattern. type StreamingBackend interface { Backend
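+	// For illustration, a minimal sketch of the preferred iterator-based
+	// path this deprecation points to, assuming a hypothetical loaded
+	// inference.TextModel named model and a []Message slice msgs:
+	//
+	//	for tok := range model.Chat(ctx, msgs) {
+	//		fmt.Print(tok.Text)
+	//	}
+	//	if err := model.Err(); err != nil {
+	//		log.Printf("chat: %v", err)
+	//	}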