- options.go: new WithChatCompletions(resolver) and WithChatCompletionsPath(path); api.New(...) now auto-mounts at /v1/chat/completions when a resolver is configured (previously the resolver could be attached but never mounted, which would have panicked Gin) - chat_completions.go: fixed missing net/http import, dropped ModelType during discovery, Retry-After header set after c.JSON silently lost, swapped OpenAI error type/code fields, swapped validate call site, redundant nil check, builder length read before nil-receiver check - openapi.go: effective*Path helpers surface an explicit path even when the corresponding Enabled flag is false so CLI callers still get x-*-path extensions; /swagger always in authentik public paths - chat_completions_test.go: Good/Bad/Ugly coverage for new options, validation, no-resolver behaviour - openapi_test.go: fix stale assertion for CacheEnabled-gated X-Cache - go.mod: bump dappco.re/go/core/cli to v0.5.2 - Removed local go-io / go-log stubs — replace points to outer modules for single source of truth - Migrated forge.lthn.ai/core/cli imports to dappco.re/go/core/cli across cmd/api/*.go + docs Co-Authored-By: Virgil <virgil@lethean.io>
908 lines
24 KiB
Go
908 lines
24 KiB
Go
// SPDX-License-Identifier: EUPL-1.2
|
|
|
|
package api
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"math/rand"
|
|
"net/http"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
"unicode"
|
|
|
|
"dappco.re/go/core"
|
|
inference "dappco.re/go/core/inference"
|
|
|
|
"github.com/gin-gonic/gin"
|
|
"gopkg.in/yaml.v3"
|
|
)
|
|
|
|
// defaultChatCompletionsPath is the conventional OpenAI-style route for the
// chat completions endpoint.
const defaultChatCompletionsPath = "/v1/chat/completions"

// Sampling defaults substituted by chatRequestOptions when a request omits
// the corresponding parameter.
const (
	chatDefaultTemperature = 1.0
	chatDefaultTopP        = 0.95
	chatDefaultTopK        = 64
	chatDefaultMaxTokens   = 2048
)

// channelMarker introduces a channel switch in generated text. The characters
// that follow it name the channel (for example "thought").
const channelMarker = "<|channel>"
|
|
|
|
// ChatCompletionRequest is the OpenAI-compatible request body.
|
|
//
|
|
// body := ChatCompletionRequest{
|
|
// Model: "lemer",
|
|
// Messages: []ChatMessage{{Role: "user", Content: "What is 2+2?"}},
|
|
// Stream: true,
|
|
// }
|
|
type ChatCompletionRequest struct {
|
|
Model string `json:"model"`
|
|
Messages []ChatMessage `json:"messages"`
|
|
Temperature *float32 `json:"temperature,omitempty"`
|
|
TopP *float32 `json:"top_p,omitempty"`
|
|
TopK *int `json:"top_k,omitempty"`
|
|
MaxTokens *int `json:"max_tokens,omitempty"`
|
|
Stream bool `json:"stream,omitempty"`
|
|
Stop []string `json:"stop,omitempty"`
|
|
User string `json:"user,omitempty"`
|
|
}
|
|
|
|
// ChatMessage is one turn of a conversation: who spoke (Role) and what was
// said (Content).
//
//	msg := ChatMessage{Role: "user", Content: "Hello"}
type ChatMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}
|
|
|
|
// ChatCompletionResponse is the OpenAI-compatible response body.
|
|
//
|
|
// resp.Choices[0].Message.Content // "4"
|
|
type ChatCompletionResponse struct {
|
|
ID string `json:"id"`
|
|
Object string `json:"object"`
|
|
Created int64 `json:"created"`
|
|
Model string `json:"model"`
|
|
Choices []ChatChoice `json:"choices"`
|
|
Usage ChatUsage `json:"usage"`
|
|
Thought *string `json:"thought,omitempty"`
|
|
}
|
|
|
|
// ChatChoice is a single response option.
|
|
//
|
|
// choice.Message.Content // The generated text
|
|
// choice.FinishReason // "stop", "length", or "error"
|
|
type ChatChoice struct {
|
|
Index int `json:"index"`
|
|
Message ChatMessage `json:"message"`
|
|
FinishReason string `json:"finish_reason"`
|
|
}
|
|
|
|
// ChatUsage carries the token accounting for one request.
//
//	usage.TotalTokens // PromptTokens + CompletionTokens
type ChatUsage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}
|
|
|
|
// ChatCompletionChunk is a single SSE chunk during streaming.
|
|
//
|
|
// chunk.Choices[0].Delta.Content // Partial token text
|
|
type ChatCompletionChunk struct {
|
|
ID string `json:"id"`
|
|
Object string `json:"object"`
|
|
Created int64 `json:"created"`
|
|
Model string `json:"model"`
|
|
Choices []ChatChunkChoice `json:"choices"`
|
|
Thought *string `json:"thought,omitempty"`
|
|
}
|
|
|
|
// ChatChunkChoice is a streaming delta.
|
|
//
|
|
// delta.Content // New token(s) in this chunk
|
|
type ChatChunkChoice struct {
|
|
Index int `json:"index"`
|
|
Delta ChatMessageDelta `json:"delta"`
|
|
FinishReason *string `json:"finish_reason"`
|
|
}
|
|
|
|
// ChatMessageDelta is the incremental part of a message carried by one
// streaming chunk. Both fields are dropped from the JSON when empty.
//
//	delta.Role    // "assistant" on the first chunk only
//	delta.Content // newly generated text, if any
type ChatMessageDelta struct {
	Role    string `json:"role,omitempty"`
	Content string `json:"content,omitempty"`
}
|
|
|
|
// chatCompletionError is the inner object of an OpenAI-style error response.
// Param is left out of the JSON when empty.
type chatCompletionError struct {
	Message string `json:"message"`
	Type    string `json:"type"`
	Param   string `json:"param,omitempty"`
	Code    string `json:"code"`
}
|
|
|
|
type chatCompletionErrorResponse struct {
|
|
Error chatCompletionError `json:"error"`
|
|
}
|
|
|
|
// modelResolutionError describes why a model name could not be resolved.
// code selects the HTTP mapping, param names the offending request field and
// msg is the human-readable text returned by Error.
type modelResolutionError struct {
	code  string
	param string
	msg   string
}
|
|
|
|
func (e *modelResolutionError) Error() string {
|
|
if e == nil {
|
|
return ""
|
|
}
|
|
return e.msg
|
|
}
|
|
|
|
// ModelResolver resolves model names to loaded inference.TextModel instances.
|
|
//
|
|
// Resolution order:
|
|
//
|
|
// 1. Exact cache hit
|
|
// 2. ~/.core/models.yaml path mapping
|
|
// 3. discovery by architecture via inference.Discover()
|
|
type ModelResolver struct {
|
|
mu sync.RWMutex
|
|
loadedByName map[string]inference.TextModel
|
|
loadedByPath map[string]inference.TextModel
|
|
discovery map[string]string
|
|
}
|
|
|
|
// NewModelResolver constructs a ModelResolver with empty caches. The returned
|
|
// resolver is safe for concurrent use — ResolveModel serialises cache updates
|
|
// through an internal sync.RWMutex.
|
|
//
|
|
// resolver := api.NewModelResolver()
|
|
// engine, _ := api.New(api.WithChatCompletions(resolver))
|
|
func NewModelResolver() *ModelResolver {
|
|
return &ModelResolver{
|
|
loadedByName: make(map[string]inference.TextModel),
|
|
loadedByPath: make(map[string]inference.TextModel),
|
|
discovery: make(map[string]string),
|
|
}
|
|
}
|
|
|
|
// ResolveModel maps a model name to a loaded inference.TextModel.
|
|
// Cached models are reused. Unknown names return an error.
|
|
func (r *ModelResolver) ResolveModel(name string) (inference.TextModel, error) {
|
|
if r == nil {
|
|
return nil, &modelResolutionError{
|
|
code: "model_not_found",
|
|
param: "model",
|
|
msg: "model resolver is not configured",
|
|
}
|
|
}
|
|
|
|
requested := core.Lower(strings.TrimSpace(name))
|
|
if requested == "" {
|
|
return nil, &modelResolutionError{
|
|
code: "invalid_request_error",
|
|
param: "model",
|
|
msg: "model is required",
|
|
}
|
|
}
|
|
|
|
r.mu.RLock()
|
|
if cached, ok := r.loadedByName[requested]; ok {
|
|
r.mu.RUnlock()
|
|
return cached, nil
|
|
}
|
|
r.mu.RUnlock()
|
|
|
|
if path, ok := r.lookupModelPath(requested); ok {
|
|
return r.loadByPath(requested, path)
|
|
}
|
|
|
|
if path, ok := r.resolveDiscoveredPath(requested); ok {
|
|
return r.loadByPath(requested, path)
|
|
}
|
|
|
|
return nil, &modelResolutionError{
|
|
code: "model_not_found",
|
|
param: "model",
|
|
msg: fmt.Sprintf("model %q not found", requested),
|
|
}
|
|
}
|
|
|
|
func (r *ModelResolver) loadByPath(name, path string) (inference.TextModel, error) {
|
|
cleanPath := core.Path(path)
|
|
r.mu.Lock()
|
|
if cached, ok := r.loadedByPath[cleanPath]; ok {
|
|
r.loadedByName[name] = cached
|
|
r.mu.Unlock()
|
|
return cached, nil
|
|
}
|
|
r.mu.Unlock()
|
|
|
|
loaded, err := inference.LoadModel(cleanPath)
|
|
if err != nil {
|
|
if strings.Contains(strings.ToLower(err.Error()), "loading") {
|
|
return nil, &modelResolutionError{
|
|
code: "model_loading",
|
|
param: "model",
|
|
msg: err.Error(),
|
|
}
|
|
}
|
|
return nil, &modelResolutionError{
|
|
code: "model_not_found",
|
|
param: "model",
|
|
msg: err.Error(),
|
|
}
|
|
}
|
|
|
|
r.mu.Lock()
|
|
r.loadedByName[name] = loaded
|
|
r.loadedByPath[cleanPath] = loaded
|
|
r.mu.Unlock()
|
|
return loaded, nil
|
|
}
|
|
|
|
func (r *ModelResolver) lookupModelPath(name string) (string, bool) {
|
|
mappings, ok := r.modelsYAMLMapping()
|
|
if !ok {
|
|
return "", false
|
|
}
|
|
|
|
if path, ok := mappings[name]; ok && strings.TrimSpace(path) != "" {
|
|
return path, true
|
|
}
|
|
return "", false
|
|
}
|
|
|
|
func (r *ModelResolver) modelsYAMLMapping() (map[string]string, bool) {
|
|
configPath := core.Path(core.Env("DIR_HOME"), ".core", "models.yaml")
|
|
data, err := os.ReadFile(configPath)
|
|
if err != nil {
|
|
return nil, false
|
|
}
|
|
|
|
var content any
|
|
if err := yaml.Unmarshal(data, &content); err != nil {
|
|
return nil, false
|
|
}
|
|
|
|
root, ok := content.(map[string]any)
|
|
if !ok || root == nil {
|
|
return nil, false
|
|
}
|
|
|
|
normalized := make(map[string]string)
|
|
|
|
if models, ok := root["models"].(map[string]any); ok && models != nil {
|
|
for key, raw := range models {
|
|
if value, ok := raw.(string); ok {
|
|
normalized[core.Lower(strings.TrimSpace(key))] = strings.TrimSpace(value)
|
|
}
|
|
}
|
|
}
|
|
|
|
for key, raw := range root {
|
|
value, ok := raw.(string)
|
|
if !ok {
|
|
continue
|
|
}
|
|
normalized[core.Lower(strings.TrimSpace(key))] = strings.TrimSpace(value)
|
|
}
|
|
|
|
if len(normalized) == 0 {
|
|
return nil, false
|
|
}
|
|
return normalized, true
|
|
}
|
|
|
|
func (r *ModelResolver) resolveDiscoveredPath(name string) (string, bool) {
|
|
candidates := []string{name}
|
|
if n := strings.IndexRune(name, ':'); n > 0 {
|
|
candidates = append(candidates, name[:n])
|
|
}
|
|
|
|
r.mu.RLock()
|
|
for _, candidate := range candidates {
|
|
if path, ok := r.discovery[candidate]; ok {
|
|
r.mu.RUnlock()
|
|
return path, true
|
|
}
|
|
}
|
|
r.mu.RUnlock()
|
|
|
|
base := core.Path(core.Env("DIR_HOME"), ".core", "models")
|
|
var discovered string
|
|
for _, m := range discoveryModels(base) {
|
|
modelType := strings.ToLower(strings.TrimSpace(m.ModelType))
|
|
for _, candidate := range candidates {
|
|
if candidate != "" && candidate == modelType {
|
|
discovered = m.Path
|
|
break
|
|
}
|
|
}
|
|
if discovered != "" {
|
|
break
|
|
}
|
|
}
|
|
|
|
if discovered == "" {
|
|
return "", false
|
|
}
|
|
|
|
r.mu.Lock()
|
|
for _, candidate := range candidates {
|
|
if candidate != "" {
|
|
r.discovery[candidate] = discovered
|
|
}
|
|
}
|
|
r.mu.Unlock()
|
|
|
|
return discovered, true
|
|
}
|
|
|
|
// discoveredModel pairs the path of a discovered model with its model type
// as returned by discoveryModels.
type discoveredModel struct {
	Path      string
	ModelType string
}
|
|
|
|
// discoveryModels enumerates locally discovered models under base and
|
|
// returns Path + ModelType pairs for name resolution.
|
|
//
|
|
// for _, m := range discoveryModels(base) {
|
|
// _ = m.Path
|
|
// }
|
|
func discoveryModels(base string) []discoveredModel {
|
|
var out []discoveredModel
|
|
for m := range inference.Discover(base) {
|
|
if m.Path == "" || m.ModelType == "" {
|
|
continue
|
|
}
|
|
out = append(out, discoveredModel{Path: m.Path, ModelType: m.ModelType})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// ThinkingExtractor splits a generated token stream into user-facing content
// and thinking-channel text, switching on "<|channel>NAME" markers.
//
//	extractor := NewThinkingExtractor()
//	for tok := range model.Chat(ctx, messages) {
//		extractor.Process(tok)
//	}
//	response := extractor.Content()  // User-facing text
//	thinking := extractor.Thinking() // Internal reasoning (may be nil)
type ThinkingExtractor struct {
	// currentChannel is the channel new text is routed to; "thought" goes
	// to the thought buffer, anything else to content.
	currentChannel string
	content        strings.Builder
	thought        strings.Builder
}
|
|
|
|
// NewThinkingExtractor constructs a ThinkingExtractor that starts on the
|
|
// "assistant" channel. Tokens are routed to Content() until a
|
|
// "<|channel>thought" marker switches the stream to the thinking channel (and
|
|
// similarly back).
|
|
//
|
|
// extractor := api.NewThinkingExtractor()
|
|
func NewThinkingExtractor() *ThinkingExtractor {
|
|
return &ThinkingExtractor{
|
|
currentChannel: "assistant",
|
|
}
|
|
}
|
|
|
|
// Process feeds a single generated token into the extractor. Tokens are
|
|
// appended to the current channel buffer (content or thought), switching on
|
|
// the "<|channel>NAME" marker. Non-streaming handlers call Process in a loop
|
|
// and then read Content and Thinking when generation completes.
|
|
//
|
|
// for tok := range model.Chat(ctx, messages) {
|
|
// extractor.Process(tok)
|
|
// }
|
|
func (te *ThinkingExtractor) Process(token inference.Token) {
|
|
te.writeDeltas(token.Text)
|
|
}
|
|
|
|
// Content returns all text accumulated on the user-facing "assistant" channel
|
|
// so far. Safe to call on a nil receiver (returns "").
|
|
//
|
|
// text := extractor.Content()
|
|
func (te *ThinkingExtractor) Content() string {
|
|
if te == nil {
|
|
return ""
|
|
}
|
|
return te.content.String()
|
|
}
|
|
|
|
// Thinking returns all text accumulated on the internal "thought" channel so
|
|
// far or nil when no thinking tokens were produced. Safe to call on a nil
|
|
// receiver.
|
|
//
|
|
// if thinking := extractor.Thinking(); thinking != nil {
|
|
// response.Thought = thinking
|
|
// }
|
|
func (te *ThinkingExtractor) Thinking() *string {
|
|
if te == nil {
|
|
return nil
|
|
}
|
|
if te.thought.Len() == 0 {
|
|
return nil
|
|
}
|
|
out := te.thought.String()
|
|
return &out
|
|
}
|
|
|
|
// writeDeltas tokenises text into the current channel, switching channels
|
|
// whenever it encounters the "<|channel>NAME" marker. It returns the content
|
|
// and thought fragments that were added to the builders during this call so
|
|
// streaming handlers can emit only the new bytes to the wire.
|
|
//
|
|
// contentDelta, thoughtDelta := extractor.writeDeltas(tok.Text)
|
|
func (te *ThinkingExtractor) writeDeltas(text string) (string, string) {
|
|
if te == nil {
|
|
return "", ""
|
|
}
|
|
|
|
beforeContentLen := te.content.Len()
|
|
beforeThoughtLen := te.thought.Len()
|
|
|
|
remaining := text
|
|
for {
|
|
next := strings.Index(remaining, channelMarker)
|
|
if next < 0 {
|
|
te.writeToCurrentChannel(remaining)
|
|
break
|
|
}
|
|
|
|
te.writeToCurrentChannel(remaining[:next])
|
|
remaining = remaining[next+len(channelMarker):]
|
|
|
|
remaining = strings.TrimLeftFunc(remaining, unicode.IsSpace)
|
|
if remaining == "" {
|
|
break
|
|
}
|
|
|
|
chanName, consumed := parseChannelName(remaining)
|
|
if consumed <= 0 {
|
|
te.writeToCurrentChannel(channelMarker)
|
|
if remaining != "" {
|
|
te.writeToCurrentChannel(remaining)
|
|
}
|
|
break
|
|
}
|
|
|
|
if chanName == "" {
|
|
te.writeToCurrentChannel(channelMarker)
|
|
} else {
|
|
te.currentChannel = chanName
|
|
}
|
|
remaining = remaining[consumed:]
|
|
}
|
|
|
|
return te.content.String()[beforeContentLen:], te.thought.String()[beforeThoughtLen:]
|
|
}
|
|
|
|
func (te *ThinkingExtractor) writeToCurrentChannel(text string) {
|
|
if text == "" {
|
|
return
|
|
}
|
|
|
|
if te.currentChannel == "thought" {
|
|
te.thought.WriteString(text)
|
|
return
|
|
}
|
|
te.content.WriteString(text)
|
|
}
|
|
|
|
// parseChannelName reads a channel name from the start of s. A name is the
// longest prefix made of ASCII letters, digits, '_' and '-'. It returns the
// lower-cased name and the number of bytes consumed, or ("", 0) when s does
// not start with a name byte.
func parseChannelName(s string) (string, int) {
	isNameByte := func(c byte) bool {
		switch {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
			return true
		}
		return c == '_' || c == '-'
	}

	end := 0
	for end < len(s) && isNameByte(s[end]) {
		end++
	}
	if end == 0 {
		return "", 0
	}
	return strings.ToLower(s[:end]), end
}
|
|
|
|
type chatCompletionsHandler struct {
|
|
resolver *ModelResolver
|
|
}
|
|
|
|
func newChatCompletionsHandler(resolver *ModelResolver) *chatCompletionsHandler {
|
|
return &chatCompletionsHandler{
|
|
resolver: resolver,
|
|
}
|
|
}
|
|
|
|
func (h *chatCompletionsHandler) ServeHTTP(c *gin.Context) {
|
|
if h == nil || h.resolver == nil {
|
|
writeChatCompletionError(c, http.StatusServiceUnavailable, "invalid_request_error", "model", "chat handler is not configured", "model")
|
|
return
|
|
}
|
|
|
|
var req ChatCompletionRequest
|
|
if err := decodeJSONBody(c.Request.Body, &req); err != nil {
|
|
writeChatCompletionError(c, 400, "invalid_request_error", "body", "invalid request body", "")
|
|
return
|
|
}
|
|
|
|
if err := validateChatRequest(&req); err != nil {
|
|
chatErr, ok := err.(*chatCompletionRequestError)
|
|
if !ok {
|
|
writeChatCompletionError(c, http.StatusBadRequest, "invalid_request_error", "body", err.Error(), "")
|
|
return
|
|
}
|
|
writeChatCompletionError(c, chatErr.Status, chatErr.Type, chatErr.Param, chatErr.Message, chatErr.Code)
|
|
return
|
|
}
|
|
|
|
model, err := h.resolver.ResolveModel(req.Model)
|
|
if err != nil {
|
|
status, errType, errCode, errParam := mapResolverError(err)
|
|
writeChatCompletionError(c, status, errType, errParam, err.Error(), errCode)
|
|
return
|
|
}
|
|
|
|
options, err := chatRequestOptions(&req)
|
|
if err != nil {
|
|
writeChatCompletionError(c, 400, "invalid_request_error", "stop", err.Error(), "")
|
|
return
|
|
}
|
|
|
|
messages := make([]inference.Message, 0, len(req.Messages))
|
|
for _, msg := range req.Messages {
|
|
messages = append(messages, inference.Message{
|
|
Role: msg.Role,
|
|
Content: msg.Content,
|
|
})
|
|
}
|
|
|
|
if req.Stream {
|
|
h.serveStreaming(c, model, req, messages, options...)
|
|
return
|
|
}
|
|
h.serveNonStreaming(c, model, req, messages, options...)
|
|
}
|
|
|
|
func (h *chatCompletionsHandler) serveNonStreaming(c *gin.Context, model inference.TextModel, req ChatCompletionRequest, messages []inference.Message, opts ...inference.GenerateOption) {
|
|
ctx := c.Request.Context()
|
|
created := time.Now().Unix()
|
|
completionID := newChatCompletionID()
|
|
|
|
extractor := NewThinkingExtractor()
|
|
for tok := range model.Chat(ctx, messages, opts...) {
|
|
extractor.Process(tok)
|
|
}
|
|
if err := model.Err(); err != nil {
|
|
if strings.Contains(strings.ToLower(err.Error()), "loading") {
|
|
writeChatCompletionError(c, http.StatusServiceUnavailable, "model_loading", "model", err.Error(), "")
|
|
return
|
|
}
|
|
writeChatCompletionError(c, http.StatusInternalServerError, "inference_error", "model", err.Error(), "")
|
|
return
|
|
}
|
|
|
|
metrics := model.Metrics()
|
|
content := extractor.Content()
|
|
finishReason := "stop"
|
|
if isTokenLengthCapReached(req.MaxTokens, metrics.GeneratedTokens) {
|
|
finishReason = "length"
|
|
}
|
|
|
|
response := ChatCompletionResponse{
|
|
ID: completionID,
|
|
Object: "chat.completion",
|
|
Created: created,
|
|
Model: req.Model,
|
|
Choices: []ChatChoice{
|
|
{
|
|
Index: 0,
|
|
Message: ChatMessage{
|
|
Role: "assistant",
|
|
Content: content,
|
|
},
|
|
FinishReason: finishReason,
|
|
},
|
|
},
|
|
Usage: ChatUsage{
|
|
PromptTokens: metrics.PromptTokens,
|
|
CompletionTokens: metrics.GeneratedTokens,
|
|
TotalTokens: metrics.PromptTokens + metrics.GeneratedTokens,
|
|
},
|
|
}
|
|
if thought := extractor.Thinking(); thought != nil {
|
|
response.Thought = thought
|
|
}
|
|
|
|
c.JSON(http.StatusOK, response)
|
|
}
|
|
|
|
func (h *chatCompletionsHandler) serveStreaming(c *gin.Context, model inference.TextModel, req ChatCompletionRequest, messages []inference.Message, opts ...inference.GenerateOption) {
|
|
ctx := c.Request.Context()
|
|
created := time.Now().Unix()
|
|
completionID := newChatCompletionID()
|
|
|
|
c.Header("Content-Type", "text/event-stream")
|
|
c.Header("Cache-Control", "no-cache")
|
|
c.Header("Connection", "keep-alive")
|
|
c.Status(200)
|
|
c.Writer.Flush()
|
|
|
|
extractor := NewThinkingExtractor()
|
|
chunkFirst := true
|
|
sentAny := false
|
|
|
|
for tok := range model.Chat(ctx, messages, opts...) {
|
|
contentDelta, thoughtDelta := extractor.writeDeltas(tok.Text)
|
|
if !chunkFirst && contentDelta == "" && thoughtDelta == "" {
|
|
continue
|
|
}
|
|
|
|
delta := ChatMessageDelta{}
|
|
if chunkFirst {
|
|
delta.Role = "assistant"
|
|
}
|
|
delta.Content = contentDelta
|
|
|
|
chunk := ChatCompletionChunk{
|
|
ID: completionID,
|
|
Object: "chat.completion.chunk",
|
|
Created: created,
|
|
Model: req.Model,
|
|
Choices: []ChatChunkChoice{
|
|
{
|
|
Index: 0,
|
|
Delta: delta,
|
|
FinishReason: nil,
|
|
},
|
|
},
|
|
}
|
|
if thoughtDelta != "" {
|
|
t := thoughtDelta
|
|
chunk.Thought = &t
|
|
}
|
|
|
|
if encoded, encodeErr := json.Marshal(chunk); encodeErr == nil {
|
|
c.Writer.WriteString(fmt.Sprintf("data: %s\n\n", encoded))
|
|
c.Writer.Flush()
|
|
sentAny = true
|
|
}
|
|
chunkFirst = false
|
|
}
|
|
|
|
if err := model.Err(); err != nil && !sentAny {
|
|
if strings.Contains(strings.ToLower(err.Error()), "loading") {
|
|
writeChatCompletionError(c, http.StatusServiceUnavailable, "model_loading", "model", err.Error(), "")
|
|
return
|
|
}
|
|
writeChatCompletionError(c, http.StatusInternalServerError, "inference_error", "model", err.Error(), "")
|
|
return
|
|
}
|
|
|
|
finishReason := "stop"
|
|
metrics := model.Metrics()
|
|
if err := model.Err(); err != nil {
|
|
finishReason = "error"
|
|
}
|
|
if finishReason != "error" && isTokenLengthCapReached(req.MaxTokens, metrics.GeneratedTokens) {
|
|
finishReason = "length"
|
|
}
|
|
|
|
finished := finishReason
|
|
finalChunk := ChatCompletionChunk{
|
|
ID: completionID,
|
|
Object: "chat.completion.chunk",
|
|
Created: created,
|
|
Model: req.Model,
|
|
Choices: []ChatChunkChoice{
|
|
{
|
|
Index: 0,
|
|
Delta: ChatMessageDelta{},
|
|
FinishReason: &finished,
|
|
},
|
|
},
|
|
}
|
|
if encoded, encodeErr := json.Marshal(finalChunk); encodeErr == nil {
|
|
c.Writer.WriteString(fmt.Sprintf("data: %s\n\n", encoded))
|
|
}
|
|
c.Writer.WriteString("data: [DONE]\n\n")
|
|
c.Writer.Flush()
|
|
}
|
|
|
|
// chatCompletionRequestError is a request validation failure together with
// everything needed to render it: the HTTP status and the OpenAI-style type,
// code, param and message fields.
type chatCompletionRequestError struct {
	Status  int
	Type    string
	Code    string
	Param   string
	Message string
}
|
|
|
|
func (e *chatCompletionRequestError) Error() string {
|
|
if e == nil {
|
|
return ""
|
|
}
|
|
return e.Message
|
|
}
|
|
|
|
func validateChatRequest(req *ChatCompletionRequest) error {
|
|
if strings.TrimSpace(req.Model) == "" {
|
|
return &chatCompletionRequestError{
|
|
Status: 400,
|
|
Type: "invalid_request_error",
|
|
Code: "invalid_request_error",
|
|
Param: "model",
|
|
Message: "model is required",
|
|
}
|
|
}
|
|
|
|
if len(req.Messages) == 0 {
|
|
return &chatCompletionRequestError{
|
|
Status: 400,
|
|
Type: "invalid_request_error",
|
|
Code: "invalid_request_error",
|
|
Param: "messages",
|
|
Message: "messages must be a non-empty array",
|
|
}
|
|
}
|
|
|
|
for i, msg := range req.Messages {
|
|
if strings.TrimSpace(msg.Role) == "" {
|
|
return &chatCompletionRequestError{
|
|
Status: 400,
|
|
Type: "invalid_request_error",
|
|
Code: "invalid_request_error",
|
|
Param: fmt.Sprintf("messages[%d].role", i),
|
|
Message: "message role is required",
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func chatRequestOptions(req *ChatCompletionRequest) ([]inference.GenerateOption, error) {
|
|
opts := make([]inference.GenerateOption, 0, 5)
|
|
opts = append(opts, inference.WithTemperature(chatResolvedFloat(req.Temperature, chatDefaultTemperature)))
|
|
opts = append(opts, inference.WithTopP(chatResolvedFloat(req.TopP, chatDefaultTopP)))
|
|
opts = append(opts, inference.WithTopK(chatResolvedInt(req.TopK, chatDefaultTopK)))
|
|
opts = append(opts, inference.WithMaxTokens(chatResolvedInt(req.MaxTokens, chatDefaultMaxTokens)))
|
|
|
|
stops, err := parsedStopTokens(req.Stop)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(stops) > 0 {
|
|
opts = append(opts, inference.WithStopTokens(stops...))
|
|
}
|
|
return opts, nil
|
|
}
|
|
|
|
// chatResolvedFloat returns *v when the caller supplied a value and def when
// v is nil.
//
// Spec §11.2: "When a parameter is omitted (nil), the server applies the
// calibrated default. When explicitly set (including 0.0), the server honours
// the caller's value."
//
//	temperature := chatResolvedFloat(req.Temperature, chatDefaultTemperature)
func chatResolvedFloat(v *float32, def float32) float32 {
	if v != nil {
		return *v
	}
	return def
}
|
|
|
|
// chatResolvedInt returns *v when the caller supplied a value (zero
// included) and def when v is nil.
//
//	topK := chatResolvedInt(req.TopK, chatDefaultTopK)
func chatResolvedInt(v *int, def int) int {
	if v != nil {
		return *v
	}
	return def
}
|
|
|
|
// parsedStopTokens converts the request's stop entries into token IDs. Each
// entry is trimmed and must be a base-10 integer that fits in 32 bits. An
// empty input yields (nil, nil); a blank or non-numeric entry yields an
// error and no tokens.
func parsedStopTokens(stops []string) ([]int32, error) {
	if len(stops) == 0 {
		return nil, nil
	}

	ids := make([]int32, len(stops))
	for i, entry := range stops {
		trimmed := strings.TrimSpace(entry)
		if trimmed == "" {
			return nil, fmt.Errorf("stop entries cannot be empty")
		}
		value, err := strconv.ParseInt(trimmed, 10, 32)
		if err != nil {
			return nil, fmt.Errorf("invalid stop token %q", trimmed)
		}
		ids[i] = int32(value)
	}
	return ids, nil
}
|
|
|
|
// isTokenLengthCapReached reports whether generated has reached the
// max-token budget. A nil or non-positive budget never counts as reached.
//
//	if isTokenLengthCapReached(req.MaxTokens, metrics.GeneratedTokens) {
//		finishReason = "length"
//	}
func isTokenLengthCapReached(maxTokens *int, generated int) bool {
	return maxTokens != nil && *maxTokens > 0 && generated >= *maxTokens
}
|
|
|
|
func mapResolverError(err error) (int, string, string, string) {
|
|
resErr, ok := err.(*modelResolutionError)
|
|
if !ok {
|
|
return 500, "inference_error", "inference_error", "model"
|
|
}
|
|
switch resErr.code {
|
|
case "model_loading":
|
|
return http.StatusServiceUnavailable, "model_loading", "model_loading", resErr.param
|
|
case "model_not_found":
|
|
return 404, "model_not_found", "model_not_found", resErr.param
|
|
default:
|
|
return 500, "inference_error", "inference_error", resErr.param
|
|
}
|
|
}
|
|
|
|
func writeChatCompletionError(c *gin.Context, status int, errType, param, message, code string) {
|
|
if status <= 0 {
|
|
status = http.StatusInternalServerError
|
|
}
|
|
resp := chatCompletionErrorResponse{
|
|
Error: chatCompletionError{
|
|
Message: message,
|
|
Type: errType,
|
|
Param: param,
|
|
Code: codeOrDefault(code, errType),
|
|
},
|
|
}
|
|
c.Header("Content-Type", "application/json")
|
|
if status == http.StatusServiceUnavailable {
|
|
// Retry-After must be set BEFORE c.JSON commits headers to the
|
|
// wire. RFC 9110 §10.2.3 allows either seconds or an HTTP-date;
|
|
// we use seconds for simplicity and OpenAI parity.
|
|
c.Header("Retry-After", "10")
|
|
}
|
|
c.JSON(status, resp)
|
|
}
|
|
|
|
// codeOrDefault picks the error code to report: code when set, otherwise
// fallback, otherwise "inference_error".
func codeOrDefault(code, fallback string) string {
	for _, candidate := range []string{code, fallback} {
		if candidate != "" {
			return candidate
		}
	}
	return "inference_error"
}
|
|
|
|
// newChatCompletionID builds an identifier of the form
// "chatcmpl-<unix seconds>-<six-digit random number>". The random part comes
// from math/rand, so IDs are not guaranteed to be unique.
func newChatCompletionID() string {
	stamp := time.Now().Unix()
	suffix := rand.Intn(1_000_000)
	return fmt.Sprintf("chatcmpl-%d-%06d", stamp, suffix)
}
|
|
|
|
// decodeJSONBody strictly decodes a single JSON value from reader into dest.
//
// It fails when reader is nil, when the input is not valid JSON for dest,
// when the input contains a field dest does not declare, or when anything
// other than whitespace follows the first JSON value. An empty body is
// reported as io.EOF.
func decodeJSONBody(reader io.Reader, dest any) error {
	if reader == nil {
		return fmt.Errorf("request body is missing")
	}

	decoder := json.NewDecoder(reader)
	decoder.DisallowUnknownFields()
	if err := decoder.Decode(dest); err != nil {
		return err
	}

	// Decode stops after the first value. A second Decode returns io.EOF
	// only when nothing but whitespace is left, so any other outcome means
	// trailing data that must not be silently accepted.
	var trailing json.RawMessage
	if err := decoder.Decode(&trailing); err != io.EOF {
		return fmt.Errorf("request body must contain a single JSON value")
	}
	return nil
}
|