cli/internal/cmd/ml/cmd_serve.go

package ml

import (
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"time"

	"forge.lthn.ai/core/cli/pkg/cli"
	"forge.lthn.ai/core/cli/pkg/ml"
)

// serveCmd implements "ml serve": an OpenAI-compatible HTTP inference
// server backed by the configured ML backend.
var serveCmd = &cli.Command{
	Use:   "serve",
	Short: "Start OpenAI-compatible inference server",
	Long:  "Starts an HTTP server serving /v1/completions and /v1/chat/completions using the configured ML backend.",
	RunE:  runServe,
}

var (
	serveBind      string
	serveModelPath string
)

func init() {
	serveCmd.Flags().StringVar(&serveBind, "bind", "0.0.0.0:8090", "Address to bind")
	serveCmd.Flags().StringVar(&serveModelPath, "model-path", "", "Path to model directory (for mlx backend)")
}

// completionRequest is the subset of the OpenAI /v1/completions request
// body that this server consumes.
type completionRequest struct {
	Model       string  `json:"model"`
	Prompt      string  `json:"prompt"`
	MaxTokens   int     `json:"max_tokens"`
	Temperature float64 `json:"temperature"`
}
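
// A POST /v1/completions body decodes into completionRequest; for example
// (field values illustrative):
//
//	{"model":"default","prompt":"Hello","max_tokens":64,"temperature":0.7}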

type completionResponse struct {
	ID      string             `json:"id"`
	Object  string             `json:"object"`
	Created int64              `json:"created"`
	Model   string             `json:"model"`
	Choices []completionChoice `json:"choices"`
	Usage   usageInfo          `json:"usage"`
}

type completionChoice struct {
	Text         string `json:"text"`
	Index        int    `json:"index"`
	FinishReason string `json:"finish_reason"`
}

type chatRequest struct {
	Model       string       `json:"model"`
	Messages    []ml.Message `json:"messages"`
	MaxTokens   int          `json:"max_tokens"`
	Temperature float64      `json:"temperature"`
}
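
// A POST /v1/chat/completions body decodes into chatRequest; ml.Message
// carries role and content (see the assistant reply constructed below), e.g.:
//
//	{"model":"default","messages":[{"role":"user","content":"Hi"}],"max_tokens":64}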

type chatResponse struct {
	ID      string       `json:"id"`
	Object  string       `json:"object"`
	Created int64        `json:"created"`
	Model   string       `json:"model"`
	Choices []chatChoice `json:"choices"`
}

type chatChoice struct {
	Message      ml.Message `json:"message"`
	Index        int        `json:"index"`
	FinishReason string     `json:"finish_reason"`
}

type usageInfo struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

func runServe(cmd *cli.Command, args []string) error {
	// Try the native MLX backend first (macOS arm64 built with the mlx tag
	// and --model-path set), falling back to the HTTP proxy backend.
	backend, err := createServeBackend()
	if err != nil {
		return err
	}

	mux := http.NewServeMux()

	mux.HandleFunc("POST /v1/completions", func(w http.ResponseWriter, r *http.Request) {
		body, err := io.ReadAll(r.Body)
		if err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		var req completionRequest
		if err := json.Unmarshal(body, &req); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		opts := ml.GenOpts{
			Temperature: req.Temperature,
			MaxTokens:   req.MaxTokens,
			Model:       req.Model,
		}
		text, err := backend.Generate(r.Context(), req.Prompt, opts)
		if err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}
		resp := completionResponse{
			ID:      fmt.Sprintf("cmpl-%d", time.Now().UnixNano()),
			Object:  "text_completion",
			Created: time.Now().Unix(),
			Model:   backend.Name(),
			Choices: []completionChoice{{Text: text, FinishReason: "stop"}},
			// Usage is left zero: the backend does not report token counts.
		}
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(resp)
	})
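
	// Example client call (port per --bind; values illustrative):
	//
	//	curl -s http://127.0.0.1:8090/v1/completions \
	//	  -H 'Content-Type: application/json' \
	//	  -d '{"prompt":"Hello","max_tokens":32}'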
mux.HandleFunc("POST /v1/chat/completions", func(w http.ResponseWriter, r *http.Request) {
body, _ := io.ReadAll(r.Body)
var req chatRequest
if err := json.Unmarshal(body, &req); err != nil {
http.Error(w, err.Error(), 400)
return
}
opts := ml.GenOpts{
Temperature: req.Temperature,
MaxTokens: req.MaxTokens,
Model: req.Model,
}
text, err := backend.Chat(r.Context(), req.Messages, opts)
if err != nil {
http.Error(w, err.Error(), 500)
return
}
resp := chatResponse{
ID: fmt.Sprintf("chatcmpl-%d", time.Now().UnixNano()),
Object: "chat.completion",
Created: time.Now().Unix(),
Model: backend.Name(),
Choices: []chatChoice{{
Message: ml.Message{Role: "assistant", Content: text},
FinishReason: "stop",
}},
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(resp)
})
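
	// Example client call (values illustrative):
	//
	//	curl -s http://127.0.0.1:8090/v1/chat/completions \
	//	  -H 'Content-Type: application/json' \
	//	  -d '{"messages":[{"role":"user","content":"Hi"}],"max_tokens":32}'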
mux.HandleFunc("GET /v1/models", func(w http.ResponseWriter, r *http.Request) {
resp := struct {
Object string `json:"object"`
Data []struct {
ID string `json:"id"`
} `json:"data"`
}{
Object: "list",
Data: []struct {
ID string `json:"id"`
}{{ID: backend.Name()}},
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(resp)
})
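
	// Example: curl -s http://127.0.0.1:8090/v1/models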

	slog.Info("ml serve: starting", "bind", serveBind, "backend", backend.Name())
	fmt.Printf("Serving on http://%s\n", serveBind)
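
	// Note: http.ListenAndServe runs without timeouts. A hardened variant
	// (sketch, not wired in) would wrap mux in an http.Server:
	//
	//	srv := &http.Server{
	//		Addr:              serveBind,
	//		Handler:           mux,
	//		ReadHeaderTimeout: 10 * time.Second,
	//	}
	//	return srv.ListenAndServe()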
	return http.ListenAndServe(serveBind, mux)
}