2026-02-19 20:58:46 +00:00
|
|
|
package llamacpp
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"bufio"
|
|
|
|
|
"bytes"
|
|
|
|
|
"context"
|
|
|
|
|
"encoding/json"
|
|
|
|
|
"fmt"
|
|
|
|
|
"io"
|
|
|
|
|
"iter"
|
|
|
|
|
"net/http"
|
|
|
|
|
"strings"
|
2026-02-19 21:04:02 +00:00
|
|
|
"sync"
|
2026-03-16 21:08:52 +00:00
|
|
|
|
|
|
|
|
coreerr "forge.lthn.ai/core/go-log"
|
2026-02-19 20:58:46 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// ChatMessage is a single message in a conversation.
type ChatMessage struct {
	// Role is the author of the message (e.g. "system", "user", "assistant"
	// for OpenAI-compatible servers).
	Role string `json:"role"`
	// Content is the text of the message.
	Content string `json:"content"`
}
|
|
|
|
|
|
|
|
|
|
// ChatRequest is the request body for /v1/chat/completions.
type ChatRequest struct {
	// Messages is the conversation to complete.
	Messages []ChatMessage `json:"messages"`
	// MaxTokens caps the number of generated tokens; omitted when zero.
	MaxTokens int `json:"max_tokens,omitempty"`
	// Temperature is always serialized (no omitempty) so an explicit 0
	// reaches the server rather than falling back to its default.
	Temperature float32 `json:"temperature"`
	// Sampling parameters; zero values are omitted from the payload.
	TopK          int     `json:"top_k,omitempty"`
	TopP          float32 `json:"top_p,omitempty"`
	RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
	// Stream is forced to true by ChatComplete before the request is sent.
	Stream bool `json:"stream"`
}
|
|
|
|
|
|
|
|
|
|
// CompletionRequest is the request body for /v1/completions.
type CompletionRequest struct {
	// Prompt is the raw text to complete.
	Prompt string `json:"prompt"`
	// MaxTokens caps the number of generated tokens; omitted when zero.
	MaxTokens int `json:"max_tokens,omitempty"`
	// Temperature is always serialized (no omitempty) so an explicit 0
	// reaches the server rather than falling back to its default.
	Temperature float32 `json:"temperature"`
	// Sampling parameters; zero values are omitted from the payload.
	TopK          int     `json:"top_k,omitempty"`
	TopP          float32 `json:"top_p,omitempty"`
	RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
	// Stream is forced to true by Complete before the request is sent.
	Stream bool `json:"stream"`
}
|
|
|
|
|
|
|
|
|
|
// chatChunkResponse models one streamed SSE chunk from /v1/chat/completions.
// Only the fields this package reads are declared; other chunk fields are
// ignored by encoding/json.
type chatChunkResponse struct {
	Choices []struct {
		// Delta carries the incremental content added by this chunk.
		Delta struct {
			Content string `json:"content"`
		} `json:"delta"`
		// FinishReason is decoded but not inspected by the streaming loop
		// in this file; presumably set on the final chunk of a choice.
		FinishReason *string `json:"finish_reason"`
	} `json:"choices"`
}
|
|
|
|
|
|
|
|
|
|
// completionChunkResponse models one streamed SSE chunk from /v1/completions.
// Only the fields this package reads are declared; other chunk fields are
// ignored by encoding/json.
type completionChunkResponse struct {
	Choices []struct {
		// Text is the incremental completion text in this chunk.
		Text string `json:"text"`
		// FinishReason is decoded but not inspected by the streaming loop
		// in this file; presumably set on the final chunk of a choice.
		FinishReason *string `json:"finish_reason"`
	} `json:"choices"`
}
|
|
|
|
|
|
|
|
|
|
// ChatComplete sends a streaming chat completion request to /v1/chat/completions.
|
|
|
|
|
// It returns an iterator over text chunks and a function that returns any error
|
|
|
|
|
// that occurred during the request or while reading the stream.
|
|
|
|
|
func (c *Client) ChatComplete(ctx context.Context, req ChatRequest) (iter.Seq[string], func() error) {
|
|
|
|
|
req.Stream = true
|
|
|
|
|
|
|
|
|
|
body, err := json.Marshal(req)
|
|
|
|
|
if err != nil {
|
2026-03-16 21:08:52 +00:00
|
|
|
return noChunks, func() error { return coreerr.E("llamacpp.ChatComplete", "marshal chat request", err) }
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/v1/chat/completions", bytes.NewReader(body))
|
|
|
|
|
if err != nil {
|
2026-03-16 21:08:52 +00:00
|
|
|
return noChunks, func() error { return coreerr.E("llamacpp.ChatComplete", "create chat request", err) }
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
2026-02-19 21:04:02 +00:00
|
|
|
httpReq.Header.Set("Accept", "text/event-stream")
|
2026-02-19 20:58:46 +00:00
|
|
|
|
|
|
|
|
resp, err := c.httpClient.Do(httpReq)
|
|
|
|
|
if err != nil {
|
2026-03-16 21:08:52 +00:00
|
|
|
return noChunks, func() error { return coreerr.E("llamacpp.ChatComplete", "chat request", err) }
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
|
|
|
defer resp.Body.Close()
|
|
|
|
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
|
|
|
|
|
return noChunks, func() error {
|
2026-03-16 21:08:52 +00:00
|
|
|
return coreerr.E("llamacpp.ChatComplete", fmt.Sprintf("chat returned %d: %s", resp.StatusCode, strings.TrimSpace(string(respBody))), nil)
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-19 21:04:02 +00:00
|
|
|
var (
|
|
|
|
|
streamErr error
|
|
|
|
|
closeOnce sync.Once
|
|
|
|
|
closeBody = func() { closeOnce.Do(func() { resp.Body.Close() }) }
|
|
|
|
|
)
|
2026-02-19 20:58:46 +00:00
|
|
|
sseData := parseSSE(resp.Body, &streamErr)
|
|
|
|
|
|
|
|
|
|
tokens := func(yield func(string) bool) {
|
2026-02-19 21:04:02 +00:00
|
|
|
defer closeBody()
|
2026-02-19 20:58:46 +00:00
|
|
|
for raw := range sseData {
|
|
|
|
|
var chunk chatChunkResponse
|
|
|
|
|
if err := json.Unmarshal([]byte(raw), &chunk); err != nil {
|
2026-03-16 21:08:52 +00:00
|
|
|
streamErr = coreerr.E("llamacpp.ChatComplete", "decode chat chunk", err)
|
2026-02-19 20:58:46 +00:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if len(chunk.Choices) == 0 {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
text := chunk.Choices[0].Delta.Content
|
|
|
|
|
if text == "" {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if !yield(text) {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-19 21:04:02 +00:00
|
|
|
return tokens, func() error {
|
|
|
|
|
closeBody()
|
|
|
|
|
return streamErr
|
|
|
|
|
}
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Complete sends a streaming completion request to /v1/completions.
|
|
|
|
|
// It returns an iterator over text chunks and a function that returns any error
|
|
|
|
|
// that occurred during the request or while reading the stream.
|
|
|
|
|
func (c *Client) Complete(ctx context.Context, req CompletionRequest) (iter.Seq[string], func() error) {
|
|
|
|
|
req.Stream = true
|
|
|
|
|
|
|
|
|
|
body, err := json.Marshal(req)
|
|
|
|
|
if err != nil {
|
2026-03-16 21:08:52 +00:00
|
|
|
return noChunks, func() error { return coreerr.E("llamacpp.Complete", "marshal completion request", err) }
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/v1/completions", bytes.NewReader(body))
|
|
|
|
|
if err != nil {
|
2026-03-16 21:08:52 +00:00
|
|
|
return noChunks, func() error { return coreerr.E("llamacpp.Complete", "create completion request", err) }
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
2026-02-19 21:04:02 +00:00
|
|
|
httpReq.Header.Set("Accept", "text/event-stream")
|
2026-02-19 20:58:46 +00:00
|
|
|
|
|
|
|
|
resp, err := c.httpClient.Do(httpReq)
|
|
|
|
|
if err != nil {
|
2026-03-16 21:08:52 +00:00
|
|
|
return noChunks, func() error { return coreerr.E("llamacpp.Complete", "completion request", err) }
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
|
|
|
defer resp.Body.Close()
|
|
|
|
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 256))
|
|
|
|
|
return noChunks, func() error {
|
2026-03-16 21:08:52 +00:00
|
|
|
return coreerr.E("llamacpp.Complete", fmt.Sprintf("completion returned %d: %s", resp.StatusCode, strings.TrimSpace(string(respBody))), nil)
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-19 21:04:02 +00:00
|
|
|
var (
|
|
|
|
|
streamErr error
|
|
|
|
|
closeOnce sync.Once
|
|
|
|
|
closeBody = func() { closeOnce.Do(func() { resp.Body.Close() }) }
|
|
|
|
|
)
|
2026-02-19 20:58:46 +00:00
|
|
|
sseData := parseSSE(resp.Body, &streamErr)
|
|
|
|
|
|
|
|
|
|
tokens := func(yield func(string) bool) {
|
2026-02-19 21:04:02 +00:00
|
|
|
defer closeBody()
|
2026-02-19 20:58:46 +00:00
|
|
|
for raw := range sseData {
|
|
|
|
|
var chunk completionChunkResponse
|
|
|
|
|
if err := json.Unmarshal([]byte(raw), &chunk); err != nil {
|
2026-03-16 21:08:52 +00:00
|
|
|
streamErr = coreerr.E("llamacpp.Complete", "decode completion chunk", err)
|
2026-02-19 20:58:46 +00:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if len(chunk.Choices) == 0 {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
text := chunk.Choices[0].Text
|
|
|
|
|
if text == "" {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if !yield(text) {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-19 21:04:02 +00:00
|
|
|
return tokens, func() error {
|
|
|
|
|
closeBody()
|
|
|
|
|
return streamErr
|
|
|
|
|
}
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// parseSSE reads SSE-formatted lines from r and yields the payload of each
|
|
|
|
|
// "data: " line. It stops when it encounters "[DONE]" or an I/O error.
|
|
|
|
|
// Any read error (other than EOF) is stored via errOut.
|
|
|
|
|
func parseSSE(r io.Reader, errOut *error) iter.Seq[string] {
|
|
|
|
|
return func(yield func(string) bool) {
|
|
|
|
|
scanner := bufio.NewScanner(r)
|
|
|
|
|
for scanner.Scan() {
|
|
|
|
|
line := scanner.Text()
|
|
|
|
|
if !strings.HasPrefix(line, "data: ") {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
payload := strings.TrimPrefix(line, "data: ")
|
|
|
|
|
if payload == "[DONE]" {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if !yield(payload) {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if err := scanner.Err(); err != nil {
|
2026-03-16 21:08:52 +00:00
|
|
|
*errOut = coreerr.E("llamacpp.parseSSE", "read SSE stream", err)
|
2026-02-19 20:58:46 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// noChunks is an empty iterator returned when an error occurs before streaming
// begins; it never calls yield.
func noChunks(_ func(string) bool) {}
|