## Summary

- Extract PHP/Laravel commands to `core/php` repo (42 files, standalone module)
- Extract CI/release + SDK commands to `core/ci` repo (10 files)
- Remove `internal/variants/` build tag system entirely
- Move all 30 remaining command packages from `internal/cmd/` to top-level `cmd/`
- Rewrite `main.go` with direct imports — no more variant selection
- PHP and CI are now optional via commented import lines in main.go

Co-authored-by: Claude <developers@lethean.io>
Reviewed-on: #2
Co-authored-by: Charon <charon@lthn.ai>
Co-committed-by: Charon <charon@lthn.ai>
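
The last two bullets describe the new composition model: `main.go` imports command packages directly, and the optional PHP and CI modules are re-enabled by uncommenting their import lines. A minimal sketch of that shape, assuming the `pkg/cli` command type mirrors a cobra-style `AddCommand`/`Execute` API and using illustrative module paths and helpers (none of these identifiers are taken from the actual commit):

```go
package main

import (
	"os"

	"forge.lthn.ai/core/go/pkg/cli"

	// Command packages now live under top-level cmd/ and are imported directly;
	// the internal/variants build-tag selection step is gone.
	// mlcmd "forge.lthn.ai/core/go/cmd/ml"

	// Optional modules: uncomment these lines to compile the extracted
	// PHP/Laravel and CI/release commands back in (paths are illustrative).
	// phpcmd "forge.lthn.ai/core/php/cmd"
	// cicmd "forge.lthn.ai/core/ci/cmd"
)

func main() {
	root := &cli.Command{Use: "core"} // "core" is an assumed binary name

	// root.AddCommand(mlcmd.ServeCmd)
	// phpcmd.Register(root)
	// cicmd.Register(root)

	if err := root.Execute(); err != nil {
		os.Exit(1)
	}
}
```
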
package ml

import (
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"time"

	"forge.lthn.ai/core/go/pkg/cli"
	"forge.lthn.ai/core/go/pkg/ml"
)

var serveCmd = &cli.Command{
	Use:   "serve",
	Short: "Start OpenAI-compatible inference server",
	Long:  "Starts an HTTP server serving /v1/completions and /v1/chat/completions using the configured ML backend.",
	RunE:  runServe,
}

var (
	serveBind      string
	serveModelPath string
)

func init() {
	serveCmd.Flags().StringVar(&serveBind, "bind", "0.0.0.0:8090", "Address to bind")
	serveCmd.Flags().StringVar(&serveModelPath, "model-path", "", "Path to model directory (for mlx backend)")
}

// completionRequest and completionResponse mirror the OpenAI-style
// /v1/completions request and response bodies.
type completionRequest struct {
	Model       string  `json:"model"`
	Prompt      string  `json:"prompt"`
	MaxTokens   int     `json:"max_tokens"`
	Temperature float64 `json:"temperature"`
}

type completionResponse struct {
	ID      string             `json:"id"`
	Object  string             `json:"object"`
	Created int64              `json:"created"`
	Model   string             `json:"model"`
	Choices []completionChoice `json:"choices"`
	Usage   usageInfo          `json:"usage"`
}

type completionChoice struct {
	Text         string `json:"text"`
	Index        int    `json:"index"`
	FinishReason string `json:"finish_reason"`
}

// chatRequest and chatResponse mirror the OpenAI-style
// /v1/chat/completions request and response bodies.
type chatRequest struct {
	Model       string       `json:"model"`
	Messages    []ml.Message `json:"messages"`
	MaxTokens   int          `json:"max_tokens"`
	Temperature float64      `json:"temperature"`
}

type chatResponse struct {
	ID      string       `json:"id"`
	Object  string       `json:"object"`
	Created int64        `json:"created"`
	Model   string       `json:"model"`
	Choices []chatChoice `json:"choices"`
}

type chatChoice struct {
	Message      ml.Message `json:"message"`
	Index        int        `json:"index"`
	FinishReason string     `json:"finish_reason"`
}

// usageInfo reports token accounting in the OpenAI response format.
type usageInfo struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

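// createServeBackend is not defined in this file; per the comment in runServe
// below it lives elsewhere in the package (the native MLX variant appears to
// be gated behind the mlx build tag). Judging only from the calls made here,
// the backend it returns is assumed to expose roughly this method set; the
// authoritative definition lives in forge.lthn.ai/core/go/pkg/ml and may differ:
//
//	Name() string
//	Generate(ctx context.Context, prompt string, opts ml.GenOpts) (string, error)
//	Chat(ctx context.Context, msgs []ml.Message, opts ml.GenOpts) (string, error)
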
func runServe(cmd *cli.Command, args []string) error {
	// Try native MLX backend first (macOS arm64 with mlx tag + model-path set),
	// fall back to HTTP proxy backend.
	backend, err := createServeBackend()
	if err != nil {
		return err
	}

	mux := http.NewServeMux()

	mux.HandleFunc("POST /v1/completions", func(w http.ResponseWriter, r *http.Request) {
		body, _ := io.ReadAll(r.Body)
		var req completionRequest
		if err := json.Unmarshal(body, &req); err != nil {
			http.Error(w, err.Error(), 400)
			return
		}

		opts := ml.GenOpts{
			Temperature: req.Temperature,
			MaxTokens:   req.MaxTokens,
			Model:       req.Model,
		}

		text, err := backend.Generate(r.Context(), req.Prompt, opts)
		if err != nil {
			http.Error(w, err.Error(), 500)
			return
		}

		resp := completionResponse{
			ID:      fmt.Sprintf("cmpl-%d", time.Now().UnixNano()),
			Object:  "text_completion",
			Created: time.Now().Unix(),
			Model:   backend.Name(),
			Choices: []completionChoice{{Text: text, FinishReason: "stop"}},
		}

		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(resp)
	})

	mux.HandleFunc("POST /v1/chat/completions", func(w http.ResponseWriter, r *http.Request) {
		body, _ := io.ReadAll(r.Body)
		var req chatRequest
		if err := json.Unmarshal(body, &req); err != nil {
			http.Error(w, err.Error(), 400)
			return
		}

		opts := ml.GenOpts{
			Temperature: req.Temperature,
			MaxTokens:   req.MaxTokens,
			Model:       req.Model,
		}

		text, err := backend.Chat(r.Context(), req.Messages, opts)
		if err != nil {
			http.Error(w, err.Error(), 500)
			return
		}

		resp := chatResponse{
			ID:      fmt.Sprintf("chatcmpl-%d", time.Now().UnixNano()),
			Object:  "chat.completion",
			Created: time.Now().Unix(),
			Model:   backend.Name(),
			Choices: []chatChoice{{
				Message:      ml.Message{Role: "assistant", Content: text},
				FinishReason: "stop",
			}},
		}

		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(resp)
	})

	mux.HandleFunc("GET /v1/models", func(w http.ResponseWriter, r *http.Request) {
		resp := struct {
			Object string `json:"object"`
			Data   []struct {
				ID string `json:"id"`
			} `json:"data"`
		}{
			Object: "list",
			Data: []struct {
				ID string `json:"id"`
			}{{ID: backend.Name()}},
		}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(resp)
	})

	slog.Info("ml serve: starting", "bind", serveBind, "backend", backend.Name())
	fmt.Printf("Serving on http://%s\n", serveBind)
	return http.ListenAndServe(serveBind, mux)
}
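
With the server running, any OpenAI-style client can call it. Below is a minimal sketch of a Go client for the `/v1/chat/completions` handler above, assuming the server listens locally on the default `--bind` address; the model name sent here is arbitrary, since the handler only forwards it to the backend via `GenOpts`.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// Request/response shapes matching the serve handler; only the fields this
// example needs are declared.
type chatMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

type chatReq struct {
	Model       string        `json:"model"`
	Messages    []chatMessage `json:"messages"`
	MaxTokens   int           `json:"max_tokens"`
	Temperature float64       `json:"temperature"`
}

type chatResp struct {
	Choices []struct {
		Message chatMessage `json:"message"`
	} `json:"choices"`
}

func main() {
	body, _ := json.Marshal(chatReq{
		Model:       "default",
		Messages:    []chatMessage{{Role: "user", Content: "Hello"}},
		MaxTokens:   128,
		Temperature: 0.7,
	})

	// Address matches the serve command's default --bind value (0.0.0.0:8090).
	resp, err := http.Post("http://127.0.0.1:8090/v1/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out chatResp
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	if len(out.Choices) > 0 {
		fmt.Println(out.Choices[0].Message.Content)
	}
}
```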