From e64fa7b4ff14d4f3f158b71a966e543aacb6bb2f Mon Sep 17 00:00:00 2001
From: Snider
Date: Tue, 17 Feb 2026 19:18:31 +0000
Subject: [PATCH] feat(ml): add context windowing, fix chat UI max-tokens
- Add --max-context flag to serve (sliding window, default 4 messages)
to prevent KV-cache explosion on multi-turn conversations
- Pass server max-tokens to chat UI HTML attribute instead of
hardcoded 2048 in JavaScript
- Add chat.js and chat_embed.go for embedded LEM chat UI
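
Example invocation (binary name and model path are illustrative; the flags
are the ones added in this patch):

    ml serve --model-path ./models/lem --max-context 8 --max-tokens 2048

The page served at / then renders <lem-chat model="..." max-tokens="2048">,
so the embedded UI inherits the server cap instead of its old hardcoded
JavaScript default.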
Co-Authored-By: Virgil
---
cmd/ml/chat.js | 832 +++++++++++++++++++++++++++++++++++++++++++
cmd/ml/chat_embed.go | 44 +++
cmd/ml/cmd_serve.go | 309 +++++++++++++++-
3 files changed, 1178 insertions(+), 7 deletions(-)
create mode 100644 cmd/ml/chat.js
create mode 100644 cmd/ml/chat_embed.go
diff --git a/cmd/ml/chat.js b/cmd/ml/chat.js
new file mode 100644
index 00000000..8948708c
--- /dev/null
+++ b/cmd/ml/chat.js
@@ -0,0 +1,832 @@
+// src/styles.ts
+var chatStyles = `
+ :host {
+ display: flex;
+ flex-direction: column;
+ background: var(--lem-bg, #1a1a1e);
+ color: var(--lem-text, #e0e0e0);
+ font-family: var(--lem-font, system-ui, -apple-system, sans-serif);
+ font-size: 14px;
+ line-height: 1.5;
+ border-radius: 12px;
+ overflow: hidden;
+ border: 1px solid rgba(255, 255, 255, 0.08);
+ }
+
+ .header {
+ display: flex;
+ align-items: center;
+ gap: 10px;
+ padding: 14px 18px;
+ background: rgba(255, 255, 255, 0.03);
+ border-bottom: 1px solid rgba(255, 255, 255, 0.06);
+ flex-shrink: 0;
+ }
+
+ .header-icon {
+ width: 28px;
+ height: 28px;
+ border-radius: 8px;
+ background: var(--lem-accent, #5865f2);
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ font-size: 14px;
+ font-weight: 700;
+ color: #fff;
+ }
+
+ .header-title {
+ font-size: 15px;
+ font-weight: 600;
+ color: var(--lem-text, #e0e0e0);
+ }
+
+ .header-model {
+ font-size: 11px;
+ color: rgba(255, 255, 255, 0.35);
+ margin-left: auto;
+ font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+ }
+
+ .header-status {
+ width: 8px;
+ height: 8px;
+ border-radius: 50%;
+ background: #43b581;
+ flex-shrink: 0;
+ }
+
+ .header-status.disconnected {
+ background: #f04747;
+ }
+`;
+var messagesStyles = `
+ :host {
+ display: block;
+ flex: 1;
+ overflow-y: auto;
+ overflow-x: hidden;
+ padding: 16px 0;
+ scroll-behavior: smooth;
+ }
+
+ :host::-webkit-scrollbar {
+ width: 6px;
+ }
+
+ :host::-webkit-scrollbar-track {
+ background: transparent;
+ }
+
+ :host::-webkit-scrollbar-thumb {
+ background: rgba(255, 255, 255, 0.12);
+ border-radius: 3px;
+ }
+
+ .empty {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ justify-content: center;
+ height: 100%;
+ gap: 12px;
+ color: rgba(255, 255, 255, 0.25);
+ }
+
+ .empty-icon {
+ font-size: 36px;
+ opacity: 0.4;
+ }
+
+ .empty-text {
+ font-size: 14px;
+ }
+`;
+var messageStyles = `
+ :host {
+ display: block;
+ padding: 6px 18px;
+ }
+
+ :host([role="user"]) .bubble {
+ background: var(--lem-msg-user, #2a2a3e);
+ margin-left: 40px;
+ border-radius: 12px 12px 4px 12px;
+ }
+
+ :host([role="assistant"]) .bubble {
+ background: var(--lem-msg-assistant, #1e1e2a);
+ margin-right: 40px;
+ border-radius: 12px 12px 12px 4px;
+ }
+
+ .bubble {
+ padding: 10px 14px;
+ word-wrap: break-word;
+ overflow-wrap: break-word;
+ }
+
+ .role {
+ font-size: 11px;
+ font-weight: 600;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+ margin-bottom: 4px;
+ color: rgba(255, 255, 255, 0.35);
+ }
+
+ :host([role="assistant"]) .role {
+ color: var(--lem-accent, #5865f2);
+ }
+
+ .content {
+ color: var(--lem-text, #e0e0e0);
+ line-height: 1.6;
+ }
+
+ .content p {
+ margin: 0 0 8px 0;
+ }
+
+ .content p:last-child {
+ margin-bottom: 0;
+ }
+
+ .content strong {
+ font-weight: 600;
+ color: #fff;
+ }
+
+ .content em {
+ font-style: italic;
+ color: rgba(255, 255, 255, 0.8);
+ }
+
+ .content code {
+ font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+ font-size: 12px;
+ background: rgba(0, 0, 0, 0.3);
+ padding: 2px 5px;
+ border-radius: 4px;
+ color: #e8a0bf;
+ }
+
+ .content pre {
+ margin: 8px 0;
+ padding: 12px;
+ background: rgba(0, 0, 0, 0.35);
+ border-radius: 8px;
+ overflow-x: auto;
+ border: 1px solid rgba(255, 255, 255, 0.06);
+ }
+
+ .content pre code {
+ background: none;
+ padding: 0;
+ font-size: 12px;
+ color: #c9d1d9;
+ line-height: 1.5;
+ }
+
+ .think-panel {
+ margin: 6px 0 8px;
+ padding: 8px 12px;
+ background: rgba(88, 101, 242, 0.06);
+ border-left: 2px solid rgba(88, 101, 242, 0.3);
+ border-radius: 0 6px 6px 0;
+ font-size: 12px;
+ color: rgba(255, 255, 255, 0.45);
+ line-height: 1.5;
+ max-height: 200px;
+ overflow-y: auto;
+ }
+
+ .think-panel::-webkit-scrollbar {
+ width: 4px;
+ }
+
+ .think-panel::-webkit-scrollbar-thumb {
+ background: rgba(255, 255, 255, 0.1);
+ border-radius: 2px;
+ }
+
+ .think-label {
+ font-size: 10px;
+ font-weight: 600;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+ color: rgba(88, 101, 242, 0.5);
+ margin-bottom: 4px;
+ cursor: pointer;
+ user-select: none;
+ }
+
+ .think-label:hover {
+ color: rgba(88, 101, 242, 0.7);
+ }
+
+ .think-panel.collapsed .think-content {
+ display: none;
+ }
+
+ .cursor {
+ display: inline-block;
+ width: 7px;
+ height: 16px;
+ background: var(--lem-accent, #5865f2);
+ border-radius: 1px;
+ animation: blink 0.8s step-end infinite;
+ vertical-align: text-bottom;
+ margin-left: 2px;
+ }
+
+ @keyframes blink {
+ 50% { opacity: 0; }
+ }
+`;
+var inputStyles = `
+ :host {
+ display: block;
+ padding: 12px 16px 16px;
+ border-top: 1px solid rgba(255, 255, 255, 0.06);
+ flex-shrink: 0;
+ }
+
+ .input-wrapper {
+ display: flex;
+ align-items: flex-end;
+ gap: 10px;
+ background: rgba(255, 255, 255, 0.05);
+ border: 1px solid rgba(255, 255, 255, 0.08);
+ border-radius: 12px;
+ padding: 8px 12px;
+ transition: border-color 0.15s;
+ }
+
+ .input-wrapper:focus-within {
+ border-color: var(--lem-accent, #5865f2);
+ }
+
+ textarea {
+ flex: 1;
+ background: none;
+ border: none;
+ outline: none;
+ color: var(--lem-text, #e0e0e0);
+ font-family: inherit;
+ font-size: 14px;
+ line-height: 1.5;
+ resize: none;
+ max-height: 120px;
+ min-height: 22px;
+ padding: 0;
+ }
+
+ textarea::placeholder {
+ color: rgba(255, 255, 255, 0.25);
+ }
+
+ .send-btn {
+ background: var(--lem-accent, #5865f2);
+ border: none;
+ border-radius: 8px;
+ color: #fff;
+ width: 32px;
+ height: 32px;
+ cursor: pointer;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ flex-shrink: 0;
+ transition: opacity 0.15s, transform 0.1s;
+ }
+
+ .send-btn:hover {
+ opacity: 0.85;
+ }
+
+ .send-btn:active {
+ transform: scale(0.95);
+ }
+
+ .send-btn:disabled {
+ opacity: 0.3;
+ cursor: default;
+ transform: none;
+ }
+
+ .send-btn svg {
+ width: 16px;
+ height: 16px;
+ }
+`;
+
+// src/lem-messages.ts
+var LemMessages = class extends HTMLElement {
+ shadow;
+ container;
+ emptyEl;
+ shouldAutoScroll = true;
+ constructor() {
+ super();
+ this.shadow = this.attachShadow({ mode: "open" });
+ }
+ connectedCallback() {
+ const style = document.createElement("style");
+ style.textContent = messagesStyles;
+ this.container = document.createElement("div");
+ this.emptyEl = document.createElement("div");
+ this.emptyEl.className = "empty";
+ const emptyIcon = document.createElement("div");
+ emptyIcon.className = "empty-icon";
+ emptyIcon.textContent = "\u2728";
+ const emptyText = document.createElement("div");
+ emptyText.className = "empty-text";
+ emptyText.textContent = "Start a conversation";
+ this.emptyEl.appendChild(emptyIcon);
+ this.emptyEl.appendChild(emptyText);
+ this.shadow.appendChild(style);
+ this.shadow.appendChild(this.emptyEl);
+ this.shadow.appendChild(this.container);
+ this.addEventListener("scroll", () => {
+ const threshold = 60;
+ this.shouldAutoScroll = this.scrollHeight - this.scrollTop - this.clientHeight < threshold;
+ });
+ }
+ addMessage(role, text) {
+ this.emptyEl.style.display = "none";
+ const msg = document.createElement("lem-message");
+ msg.setAttribute("role", role);
+ this.container.appendChild(msg);
+ if (text) {
+ msg.text = text;
+ }
+ this.scrollToBottom();
+ return msg;
+ }
+ scrollToBottom() {
+ if (this.shouldAutoScroll) {
+ requestAnimationFrame(() => {
+ this.scrollTop = this.scrollHeight;
+ });
+ }
+ }
+ clear() {
+ this.container.replaceChildren();
+ this.emptyEl.style.display = "";
+ this.shouldAutoScroll = true;
+ }
+};
+customElements.define("lem-messages", LemMessages);
+
+// src/markdown.ts
+function escapeHtml(text) {
+ return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
+}
+function parseInline(text) {
+ let result = escapeHtml(text);
+ result = result.replace(/`([^`]+)`/g, "<code>$1</code>");
+ result = result.replace(/\*\*(.+?)\*\*/g, "<strong>$1</strong>");
+ result = result.replace(/__(.+?)__/g, "<strong>$1</strong>");
+ result = result.replace(/(?<!\*)\*([^*]+)\*(?!\*)/g, "<em>$1</em>");
+ result = result.replace(/(?<!_)_([^_]+)_(?!_)/g, "<em>$1</em>");
+ return result;
+}
+function renderMarkdown(text) {
+ const lines = text.split("\n");
+ const output = [];
+ let inCodeBlock = false;
+ let codeLines = [];
+ let codeLang = "";
+ for (const line of lines) {
+ if (line.trimStart().startsWith("```")) {
+ if (!inCodeBlock) {
+ inCodeBlock = true;
+ codeLang = line.trimStart().slice(3).trim();
+ codeLines = [];
+ } else {
+ const langAttr = codeLang ? ` data-lang="${escapeHtml(codeLang)}"` : "";
+ output.push(
+ `<pre><code${langAttr}>${escapeHtml(codeLines.join("\n"))}</code></pre>`
+ );
+ inCodeBlock = false;
+ codeLines = [];
+ codeLang = "";
+ }
+ continue;
+ }
+ if (inCodeBlock) {
+ codeLines.push(line);
+ continue;
+ }
+ if (line.trim() === "") {
+ output.push("");
+ continue;
+ }
+ output.push(parseInline(line));
+ }
+ if (inCodeBlock) {
+ const langAttr = codeLang ? ` data-lang="${escapeHtml(codeLang)}"` : "";
+ output.push(
+ `<pre><code${langAttr}>${escapeHtml(codeLines.join("\n"))}</code></pre>`
+ );
+ }
+ const paragraphs = [];
+ let current = [];
+ for (const line of output) {
+ if (line === "") {
+ if (current.length > 0) {
+ paragraphs.push(wrapParagraph(current));
+ current = [];
+ }
+ } else {
+ current.push(line);
+ }
+ }
+ if (current.length > 0) {
+ paragraphs.push(wrapParagraph(current));
+ }
+ return paragraphs.join("");
+}
+function wrapParagraph(lines) {
+ const joined = lines.join("<br>");
+ if (joined.startsWith("<pre>")) {
+ return joined;
+ }
+ return `<p>${joined}</p>`;
+}
+
+// src/lem-message.ts
+var LemMessage = class extends HTMLElement {
+ shadow;
+ thinkPanel;
+ thinkContent;
+ thinkLabel;
+ contentEl;
+ cursorEl;
+ _text = "";
+ _streaming = false;
+ _thinkCollapsed = false;
+ constructor() {
+ super();
+ this.shadow = this.attachShadow({ mode: "open" });
+ }
+ connectedCallback() {
+ const role = this.getAttribute("role") || "user";
+ const style = document.createElement("style");
+ style.textContent = messageStyles;
+ const bubble = document.createElement("div");
+ bubble.className = "bubble";
+ const roleEl = document.createElement("div");
+ roleEl.className = "role";
+ roleEl.textContent = role === "assistant" ? "LEM" : "You";
+ this.thinkPanel = document.createElement("div");
+ this.thinkPanel.className = "think-panel";
+ this.thinkPanel.style.display = "none";
+ this.thinkLabel = document.createElement("div");
+ this.thinkLabel.className = "think-label";
+ this.thinkLabel.textContent = "\u25BC reasoning";
+ this.thinkLabel.addEventListener("click", () => {
+ this._thinkCollapsed = !this._thinkCollapsed;
+ this.thinkPanel.classList.toggle("collapsed", this._thinkCollapsed);
+ this.thinkLabel.textContent = this._thinkCollapsed ? "\u25B6 reasoning" : "\u25BC reasoning";
+ });
+ this.thinkContent = document.createElement("div");
+ this.thinkContent.className = "think-content";
+ this.thinkPanel.appendChild(this.thinkLabel);
+ this.thinkPanel.appendChild(this.thinkContent);
+ this.contentEl = document.createElement("div");
+ this.contentEl.className = "content";
+ bubble.appendChild(roleEl);
+ if (role === "assistant") {
+ bubble.appendChild(this.thinkPanel);
+ }
+ bubble.appendChild(this.contentEl);
+ this.shadow.appendChild(style);
+ this.shadow.appendChild(bubble);
+ if (this._text) {
+ this.render();
+ }
+ }
+ get text() {
+ return this._text;
+ }
+ set text(value) {
+ this._text = value;
+ this.render();
+ }
+ get streaming() {
+ return this._streaming;
+ }
+ set streaming(value) {
+ this._streaming = value;
+ this.render();
+ }
+ appendToken(token) {
+ this._text += token;
+ this.render();
+ }
+ /**
+ * Splits text into think/response portions and renders each.
+ *
+ * Safety: renderMarkdown() escapes all HTML entities (& < > ") before any
+ * inline formatting is applied. The source is the local MLX model output,
+ * not arbitrary user HTML. Shadow DOM provides additional isolation.
+ */
+ render() {
+ if (!this.contentEl) return;
+ const { think, response } = this.splitThink(this._text);
+ if (think !== null && this.thinkPanel) {
+ this.thinkPanel.style.display = "";
+ this.thinkContent.textContent = think;
+ }
+ const responseHtml = renderMarkdown(response);
+ this.contentEl.innerHTML = responseHtml;
+ if (this._streaming) {
+ if (!this.cursorEl) {
+ this.cursorEl = document.createElement("span");
+ this.cursorEl.className = "cursor";
+ }
+ if (think !== null && !this._text.includes("</think>")) {
+ this.thinkContent.appendChild(this.cursorEl);
+ } else {
+ const lastChild = this.contentEl.lastElementChild || this.contentEl;
+ lastChild.appendChild(this.cursorEl);
+ }
+ }
+ }
+ /**
+ * Split raw text into think content and response content.
+ * Returns { think: string | null, response: string }
+ */
+ splitThink(text) {
+ const thinkStart = text.indexOf("<think>");
+ if (thinkStart === -1) {
+ return { think: null, response: text };
+ }
+ const afterOpen = thinkStart + "<think>".length;
+ const thinkEnd = text.indexOf("</think>", afterOpen);
+ if (thinkEnd === -1) {
+ return {
+ think: text.slice(afterOpen).trim(),
+ response: text.slice(0, thinkStart).trim()
+ };
+ }
+ const thinkText = text.slice(afterOpen, thinkEnd).trim();
+ const beforeThink = text.slice(0, thinkStart).trim();
+ const afterThink = text.slice(thinkEnd + "</think>".length).trim();
+ const response = [beforeThink, afterThink].filter(Boolean).join("\n");
+ return { think: thinkText, response };
+ }
+};
+customElements.define("lem-message", LemMessage);
+
+// src/lem-input.ts
+var LemInput = class extends HTMLElement {
+ shadow;
+ textarea;
+ sendBtn;
+ _disabled = false;
+ constructor() {
+ super();
+ this.shadow = this.attachShadow({ mode: "open" });
+ }
+ connectedCallback() {
+ const style = document.createElement("style");
+ style.textContent = inputStyles;
+ const wrapper = document.createElement("div");
+ wrapper.className = "input-wrapper";
+ this.textarea = document.createElement("textarea");
+ this.textarea.rows = 1;
+ this.textarea.placeholder = "Message LEM...";
+ this.sendBtn = document.createElement("button");
+ this.sendBtn.className = "send-btn";
+ this.sendBtn.type = "button";
+ this.sendBtn.disabled = true;
+ this.sendBtn.appendChild(this.createSendIcon());
+ wrapper.appendChild(this.textarea);
+ wrapper.appendChild(this.sendBtn);
+ this.shadow.appendChild(style);
+ this.shadow.appendChild(wrapper);
+ this.textarea.addEventListener("input", () => {
+ this.textarea.style.height = "auto";
+ this.textarea.style.height = Math.min(this.textarea.scrollHeight, 120) + "px";
+ this.sendBtn.disabled = this._disabled || this.textarea.value.trim() === "";
+ });
+ this.textarea.addEventListener("keydown", (e) => {
+ if (e.key === "Enter" && !e.shiftKey) {
+ e.preventDefault();
+ this.submit();
+ }
+ });
+ this.sendBtn.addEventListener("click", () => this.submit());
+ }
+ /** Build the send arrow SVG using DOM API (no innerHTML) */
+ createSendIcon() {
+ const ns = "http://www.w3.org/2000/svg";
+ const svg = document.createElementNS(ns, "svg");
+ svg.setAttribute("viewBox", "0 0 24 24");
+ svg.setAttribute("fill", "none");
+ svg.setAttribute("stroke", "currentColor");
+ svg.setAttribute("stroke-width", "2");
+ svg.setAttribute("stroke-linecap", "round");
+ svg.setAttribute("stroke-linejoin", "round");
+ svg.setAttribute("width", "16");
+ svg.setAttribute("height", "16");
+ const line = document.createElementNS(ns, "line");
+ line.setAttribute("x1", "22");
+ line.setAttribute("y1", "2");
+ line.setAttribute("x2", "11");
+ line.setAttribute("y2", "13");
+ const polygon = document.createElementNS(ns, "polygon");
+ polygon.setAttribute("points", "22 2 15 22 11 13 2 9 22 2");
+ svg.appendChild(line);
+ svg.appendChild(polygon);
+ return svg;
+ }
+ submit() {
+ const text = this.textarea.value.trim();
+ if (!text || this._disabled) return;
+ this.dispatchEvent(
+ new CustomEvent("lem-send", {
+ bubbles: true,
+ composed: true,
+ detail: { text }
+ })
+ );
+ this.textarea.value = "";
+ this.textarea.style.height = "auto";
+ this.sendBtn.disabled = true;
+ this.textarea.focus();
+ }
+ get disabled() {
+ return this._disabled;
+ }
+ set disabled(value) {
+ this._disabled = value;
+ this.textarea.disabled = value;
+ this.sendBtn.disabled = value || this.textarea.value.trim() === "";
+ this.textarea.placeholder = value ? "LEM is thinking..." : "Message LEM...";
+ }
+ focus() {
+ this.textarea?.focus();
+ }
+};
+customElements.define("lem-input", LemInput);
+
+// src/lem-chat.ts
+var LemChat = class extends HTMLElement {
+ shadow;
+ messages;
+ input;
+ statusEl;
+ history = [];
+ abortController = null;
+ static get observedAttributes() {
+ return ["endpoint", "model", "system-prompt", "max-tokens", "temperature"];
+ }
+ constructor() {
+ super();
+ this.shadow = this.attachShadow({ mode: "open" });
+ }
+ connectedCallback() {
+ const style = document.createElement("style");
+ style.textContent = chatStyles;
+ const header = document.createElement("div");
+ header.className = "header";
+ this.statusEl = document.createElement("div");
+ this.statusEl.className = "header-status";
+ const icon = document.createElement("div");
+ icon.className = "header-icon";
+ icon.textContent = "L";
+ const title = document.createElement("div");
+ title.className = "header-title";
+ title.textContent = "LEM";
+ const modelLabel = document.createElement("div");
+ modelLabel.className = "header-model";
+ modelLabel.textContent = this.getAttribute("model") || "local";
+ header.appendChild(this.statusEl);
+ header.appendChild(icon);
+ header.appendChild(title);
+ header.appendChild(modelLabel);
+ this.messages = document.createElement("lem-messages");
+ this.input = document.createElement("lem-input");
+ this.shadow.appendChild(style);
+ this.shadow.appendChild(header);
+ this.shadow.appendChild(this.messages);
+ this.shadow.appendChild(this.input);
+ this.addEventListener("lem-send", ((e) => {
+ this.handleSend(e.detail.text);
+ }));
+ const systemPrompt = this.getAttribute("system-prompt");
+ if (systemPrompt) {
+ this.history.push({ role: "system", content: systemPrompt });
+ }
+ this.checkConnection();
+ requestAnimationFrame(() => this.input.focus());
+ }
+ disconnectedCallback() {
+ this.abortController?.abort();
+ }
+ get endpoint() {
+ const attr = this.getAttribute("endpoint");
+ if (!attr) return window.location.origin;
+ return attr;
+ }
+ get model() {
+ return this.getAttribute("model") || "";
+ }
+ get maxTokens() {
+ const val = this.getAttribute("max-tokens");
+ return val ? parseInt(val, 10) : 2048;
+ }
+ get temperature() {
+ const val = this.getAttribute("temperature");
+ return val ? parseFloat(val) : 0.7;
+ }
+ async checkConnection() {
+ try {
+ const resp = await fetch(`${this.endpoint}/v1/models`, {
+ signal: AbortSignal.timeout(3e3)
+ });
+ this.statusEl.classList.toggle("disconnected", !resp.ok);
+ } catch {
+ this.statusEl.classList.add("disconnected");
+ }
+ }
+ async handleSend(text) {
+ this.messages.addMessage("user", text);
+ this.history.push({ role: "user", content: text });
+ const assistantMsg = this.messages.addMessage("assistant");
+ assistantMsg.streaming = true;
+ this.input.disabled = true;
+ this.abortController?.abort();
+ this.abortController = new AbortController();
+ let fullResponse = "";
+ try {
+ const response = await fetch(`${this.endpoint}/v1/chat/completions`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ signal: this.abortController.signal,
+ body: JSON.stringify({
+ model: this.model,
+ messages: this.history,
+ max_tokens: this.maxTokens,
+ temperature: this.temperature,
+ stream: true
+ })
+ });
+ if (!response.ok) {
+ throw new Error(`Server error: ${response.status}`);
+ }
+ if (!response.body) {
+ throw new Error("No response body");
+ }
+ const reader = response.body.getReader();
+ const decoder = new TextDecoder();
+ let buffer = "";
+ while (true) {
+ const { done, value } = await reader.read();
+ if (done) break;
+ buffer += decoder.decode(value, { stream: true });
+ const lines = buffer.split("\n");
+ buffer = lines.pop() || "";
+ for (const line of lines) {
+ if (!line.startsWith("data: ")) continue;
+ const data = line.slice(6).trim();
+ if (data === "[DONE]") continue;
+ try {
+ const chunk = JSON.parse(data);
+ const delta = chunk.choices?.[0]?.delta;
+ if (delta?.content) {
+ fullResponse += delta.content;
+ assistantMsg.appendToken(delta.content);
+ this.messages.scrollToBottom();
+ }
+ } catch {
+ }
+ }
+ }
+ } catch (err) {
+ if (err instanceof Error && err.name === "AbortError") {
+ } else {
+ const errorText = err instanceof Error ? err.message : "Connection failed";
+ if (!fullResponse) {
+ assistantMsg.text = `\u26A0\uFE0F ${errorText}`;
+ }
+ this.statusEl.classList.add("disconnected");
+ }
+ } finally {
+ assistantMsg.streaming = false;
+ this.input.disabled = false;
+ this.input.focus();
+ this.abortController = null;
+ if (fullResponse) {
+ this.history.push({ role: "assistant", content: fullResponse });
+ }
+ }
+ }
+};
+customElements.define("lem-chat", LemChat);
+export {
+ LemChat
+};
diff --git a/cmd/ml/chat_embed.go b/cmd/ml/chat_embed.go
new file mode 100644
index 00000000..447d7a95
--- /dev/null
+++ b/cmd/ml/chat_embed.go
@@ -0,0 +1,44 @@
+package ml
+
+import (
+ _ "embed"
+)
+
+//go:embed chat.js
+var lemChatJS []byte
+
+// chatHTML is the page served at "/". fmt.Fprintf fills the two verbs with
+// the backend model name (%s) and the server-level max-tokens cap (%d).
+const chatHTML = `<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>LEM Chat</title>
+<script type="module" src="/chat.js"></script>
+<style>
+html, body { margin: 0; height: 100vh; background: #111; }
+lem-chat { height: 100vh; }
+</style>
+</head>
+<body>
+<lem-chat model="%s" max-tokens="%d"></lem-chat>
+</body>
+</html>
+`
diff --git a/cmd/ml/cmd_serve.go b/cmd/ml/cmd_serve.go
index 1f0ab8a2..180ced2a 100644
--- a/cmd/ml/cmd_serve.go
+++ b/cmd/ml/cmd_serve.go
@@ -1,15 +1,21 @@
package ml
import (
+ "context"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
+ "os"
+ "os/signal"
+ "runtime"
+ "sync/atomic"
+ "syscall"
"time"
"forge.lthn.ai/core/go/pkg/cli"
- "forge.lthn.ai/core/go/pkg/ml"
+ "forge.lthn.ai/core/go-ai/ml"
)
var serveCmd = &cli.Command{
@@ -20,13 +26,23 @@ var serveCmd = &cli.Command{
}
var (
- serveBind string
- serveModelPath string
+ serveBind string
+ serveModelPath string
+ serveThreads int
+ serveMaxTokens int
+ serveTimeout int
+ serveMaxRequests int
+ serveMaxContext int
)
func init() {
serveCmd.Flags().StringVar(&serveBind, "bind", "0.0.0.0:8090", "Address to bind")
serveCmd.Flags().StringVar(&serveModelPath, "model-path", "", "Path to model directory (for mlx backend)")
+ serveCmd.Flags().IntVar(&serveThreads, "threads", 0, "Max CPU threads (0 = all available)")
+ serveCmd.Flags().IntVar(&serveMaxTokens, "max-tokens", 4096, "Default max tokens per request")
+ serveCmd.Flags().IntVar(&serveTimeout, "timeout", 300, "Request timeout in seconds")
+ serveCmd.Flags().IntVar(&serveMaxRequests, "max-requests", 1, "Max concurrent requests (Metal is single-stream)")
+ serveCmd.Flags().IntVar(&serveMaxContext, "max-context", 4, "Max chat messages to keep (sliding window, 0=unlimited)")
}
type completionRequest struct {
@@ -34,6 +50,7 @@ type completionRequest struct {
Prompt string `json:"prompt"`
MaxTokens int `json:"max_tokens"`
Temperature float64 `json:"temperature"`
+ Stream bool `json:"stream"`
}
type completionResponse struct {
@@ -56,6 +73,7 @@ type chatRequest struct {
Messages []ml.Message `json:"messages"`
MaxTokens int `json:"max_tokens"`
Temperature float64 `json:"temperature"`
+ Stream bool `json:"stream"`
}
type chatResponse struct {
@@ -72,6 +90,40 @@ type chatChoice struct {
FinishReason string `json:"finish_reason"`
}
+// SSE streaming types (OpenAI chunk format)
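+// Each chunk is emitted as one SSE "data:" line, e.g. (values illustrative):
+//   data: {"id":"chatcmpl-1712","object":"chat.completion.chunk","created":1712000000,"model":"mlx","choices":[{"delta":{"content":"Hi"},"index":0,"finish_reason":null}]}
+// The stream ends with a finish_reason:"stop" chunk followed by "data: [DONE]".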
+type chatChunkResponse struct {
+ ID string `json:"id"`
+ Object string `json:"object"`
+ Created int64 `json:"created"`
+ Model string `json:"model"`
+ Choices []chatChunkChoice `json:"choices"`
+}
+
+type chatChunkChoice struct {
+ Delta chatChunkDelta `json:"delta"`
+ Index int `json:"index"`
+ FinishReason *string `json:"finish_reason"`
+}
+
+type chatChunkDelta struct {
+ Role string `json:"role,omitempty"`
+ Content string `json:"content,omitempty"`
+}
+
+type completionChunkResponse struct {
+ ID string `json:"id"`
+ Object string `json:"object"`
+ Created int64 `json:"created"`
+ Model string `json:"model"`
+ Choices []completionChunkChoice `json:"choices"`
+}
+
+type completionChunkChoice struct {
+ Text string `json:"text"`
+ Index int `json:"index"`
+ FinishReason *string `json:"finish_reason"`
+}
+
type usageInfo struct {
PromptTokens int `json:"prompt_tokens"`
CompletionTokens int `json:"completion_tokens"`
@@ -79,16 +131,54 @@ type usageInfo struct {
}
func runServe(cmd *cli.Command, args []string) error {
- // Try native MLX backend first (macOS arm64 with mlx tag + model-path set),
- // fall back to HTTP proxy backend.
+ // Cap CPU threads
+ if serveThreads > 0 {
+ prev := runtime.GOMAXPROCS(serveThreads)
+ slog.Info("ml serve: capped threads", "threads", serveThreads, "previous", prev)
+ }
+
backend, err := createServeBackend()
if err != nil {
return err
}
+ // Check if backend supports streaming
+ streamer, canStream := backend.(ml.StreamingBackend)
+
+ // Request tracking
+ var activeRequests atomic.Int32
+ startTime := time.Now()
+
mux := http.NewServeMux()
+ // Health endpoint
+ mux.HandleFunc("GET /healthz", func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "application/json")
+ json.NewEncoder(w).Encode(map[string]any{
+ "status": "ok",
+ "model": backend.Name(),
+ "uptime_seconds": int(time.Since(startTime).Seconds()),
+ "active_requests": activeRequests.Load(),
+ "max_threads": runtime.GOMAXPROCS(0),
+ "max_tokens": serveMaxTokens,
+ "max_context": serveMaxContext,
+ })
+ })
+
mux.HandleFunc("POST /v1/completions", func(w http.ResponseWriter, r *http.Request) {
+ // Concurrency gate
+ if int(activeRequests.Load()) >= serveMaxRequests {
+ http.Error(w, `{"error":"server busy, max concurrent requests reached"}`, http.StatusTooManyRequests)
+ return
+ }
+ activeRequests.Add(1)
+ defer activeRequests.Add(-1)
+
+ // Request timeout
+ ctx, cancel := context.WithTimeout(r.Context(), time.Duration(serveTimeout)*time.Second)
+ defer cancel()
+ r = r.WithContext(ctx)
+
body, _ := io.ReadAll(r.Body)
var req completionRequest
if err := json.Unmarshal(body, &req); err != nil {
@@ -96,12 +186,67 @@ func runServe(cmd *cli.Command, args []string) error {
return
}
+ // Enforce server-level max-tokens cap
+ if req.MaxTokens == 0 || req.MaxTokens > serveMaxTokens {
+ req.MaxTokens = serveMaxTokens
+ }
+
opts := ml.GenOpts{
Temperature: req.Temperature,
MaxTokens: req.MaxTokens,
Model: req.Model,
}
+ // Streaming path
+ if req.Stream && canStream {
+ id := fmt.Sprintf("cmpl-%d", time.Now().UnixNano())
+ created := time.Now().Unix()
+
+ w.Header().Set("Content-Type", "text/event-stream")
+ w.Header().Set("Cache-Control", "no-cache")
+ w.Header().Set("Connection", "keep-alive")
+ w.Header().Set("X-Accel-Buffering", "no")
+ flusher, ok := w.(http.Flusher)
+ if !ok {
+ http.Error(w, "streaming not supported", 500)
+ return
+ }
+
+ err := streamer.GenerateStream(r.Context(), req.Prompt, opts, func(token string) error {
+ chunk := completionChunkResponse{
+ ID: id,
+ Object: "text_completion",
+ Created: created,
+ Model: backend.Name(),
+ Choices: []completionChunkChoice{{Text: token}},
+ }
+ data, _ := json.Marshal(chunk)
+ fmt.Fprintf(w, "data: %s\n\n", data)
+ flusher.Flush()
+ return nil
+ })
+
+ if err != nil {
+ slog.Error("stream error", "err", err)
+ }
+
+ // Send final chunk with finish_reason
+ stop := "stop"
+ final := completionChunkResponse{
+ ID: id,
+ Object: "text_completion",
+ Created: created,
+ Model: backend.Name(),
+ Choices: []completionChunkChoice{{FinishReason: &stop}},
+ }
+ data, _ := json.Marshal(final)
+ fmt.Fprintf(w, "data: %s\n\n", data)
+ fmt.Fprintf(w, "data: [DONE]\n\n")
+ flusher.Flush()
+ return
+ }
+
+ // Non-streaming path
text, err := backend.Generate(r.Context(), req.Prompt, opts)
if err != nil {
http.Error(w, err.Error(), 500)
@@ -121,6 +266,19 @@ func runServe(cmd *cli.Command, args []string) error {
})
mux.HandleFunc("POST /v1/chat/completions", func(w http.ResponseWriter, r *http.Request) {
+ // Concurrency gate
+ if int(activeRequests.Load()) >= serveMaxRequests {
+ http.Error(w, `{"error":"server busy, max concurrent requests reached"}`, http.StatusTooManyRequests)
+ return
+ }
+ activeRequests.Add(1)
+ defer activeRequests.Add(-1)
+
+ // Request timeout
+ ctx, cancel := context.WithTimeout(r.Context(), time.Duration(serveTimeout)*time.Second)
+ defer cancel()
+ r = r.WithContext(ctx)
+
body, _ := io.ReadAll(r.Body)
var req chatRequest
if err := json.Unmarshal(body, &req); err != nil {
@@ -128,12 +286,97 @@ func runServe(cmd *cli.Command, args []string) error {
return
}
+ // Enforce server-level max-tokens cap
+ if req.MaxTokens == 0 || req.MaxTokens > serveMaxTokens {
+ req.MaxTokens = serveMaxTokens
+ }
+
+ // Sliding window: keep system prompt (if any) + last N messages
+ // Prevents KV-cache explosion on multi-turn conversations
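+ // Example: with --max-context=4 and messages [sys, u1, a1, u2, a2, u3, a3],
+ // the request is trimmed to [sys, u2, a2, u3, a3].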
+ if serveMaxContext > 0 && len(req.Messages) > serveMaxContext {
+ var kept []ml.Message
+ rest := req.Messages
+ // Preserve system message if present
+ if len(rest) > 0 && rest[0].Role == "system" {
+ kept = append(kept, rest[0])
+ rest = rest[1:]
+ }
+ // Keep only the last N user/assistant messages
+ if len(rest) > serveMaxContext {
+ rest = rest[len(rest)-serveMaxContext:]
+ }
+ req.Messages = append(kept, rest...)
+ slog.Debug("ml serve: context window applied", "kept", len(req.Messages))
+ }
+
opts := ml.GenOpts{
Temperature: req.Temperature,
MaxTokens: req.MaxTokens,
Model: req.Model,
}
+ // Streaming path
+ if req.Stream && canStream {
+ id := fmt.Sprintf("chatcmpl-%d", time.Now().UnixNano())
+ created := time.Now().Unix()
+
+ w.Header().Set("Content-Type", "text/event-stream")
+ w.Header().Set("Cache-Control", "no-cache")
+ w.Header().Set("Connection", "keep-alive")
+ w.Header().Set("X-Accel-Buffering", "no")
+ flusher, ok := w.(http.Flusher)
+ if !ok {
+ http.Error(w, "streaming not supported", 500)
+ return
+ }
+
+ // Send initial role chunk
+ roleChunk := chatChunkResponse{
+ ID: id,
+ Object: "chat.completion.chunk",
+ Created: created,
+ Model: backend.Name(),
+ Choices: []chatChunkChoice{{Delta: chatChunkDelta{Role: "assistant"}}},
+ }
+ data, _ := json.Marshal(roleChunk)
+ fmt.Fprintf(w, "data: %s\n\n", data)
+ flusher.Flush()
+
+ err := streamer.ChatStream(r.Context(), req.Messages, opts, func(token string) error {
+ chunk := chatChunkResponse{
+ ID: id,
+ Object: "chat.completion.chunk",
+ Created: created,
+ Model: backend.Name(),
+ Choices: []chatChunkChoice{{Delta: chatChunkDelta{Content: token}}},
+ }
+ data, _ := json.Marshal(chunk)
+ fmt.Fprintf(w, "data: %s\n\n", data)
+ flusher.Flush()
+ return nil
+ })
+
+ if err != nil {
+ slog.Error("stream error", "err", err)
+ }
+
+ // Send final chunk with finish_reason
+ stop := "stop"
+ final := chatChunkResponse{
+ ID: id,
+ Object: "chat.completion.chunk",
+ Created: created,
+ Model: backend.Name(),
+ Choices: []chatChunkChoice{{Delta: chatChunkDelta{}, FinishReason: &stop}},
+ }
+ data, _ = json.Marshal(final)
+ fmt.Fprintf(w, "data: %s\n\n", data)
+ fmt.Fprintf(w, "data: [DONE]\n\n")
+ flusher.Flush()
+ return
+ }
+
+ // Non-streaming path
text, err := backend.Chat(r.Context(), req.Messages, opts)
if err != nil {
http.Error(w, err.Error(), 500)
@@ -171,7 +414,59 @@ func runServe(cmd *cli.Command, args []string) error {
json.NewEncoder(w).Encode(resp)
})
- slog.Info("ml serve: starting", "bind", serveBind, "backend", backend.Name())
+ // Serve the lem-chat UI at root — same origin, no CORS needed
+ mux.HandleFunc("GET /chat.js", func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "application/javascript")
+ w.Write(lemChatJS)
+ })
+
+ mux.HandleFunc("GET /", func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/" {
+ http.NotFound(w, r)
+ return
+ }
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ fmt.Fprintf(w, chatHTML, backend.Name(), serveMaxTokens)
+ })
+
+ slog.Info("ml serve: starting",
+ "bind", serveBind,
+ "backend", backend.Name(),
+ "streaming", canStream,
+ "threads", runtime.GOMAXPROCS(0),
+ "max_tokens", serveMaxTokens,
+ "max_context_msgs", serveMaxContext,
+ "timeout_s", serveTimeout,
+ "max_requests", serveMaxRequests,
+ )
fmt.Printf("Serving on http://%s\n", serveBind)
- return http.ListenAndServe(serveBind, mux)
+
+ // Graceful shutdown on SIGINT/SIGTERM
+ srv := &http.Server{
+ Addr: serveBind,
+ Handler: mux,
+ }
+
+ errCh := make(chan error, 1)
+ go func() {
+ errCh <- srv.ListenAndServe()
+ }()
+
+ sigCh := make(chan os.Signal, 1)
+ signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
+
+ select {
+ case sig := <-sigCh:
+ slog.Info("ml serve: shutting down", "signal", sig)
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ defer cancel()
+ if err := srv.Shutdown(ctx); err != nil {
+ slog.Error("ml serve: shutdown error", "err", err)
+ return err
+ }
+ slog.Info("ml serve: stopped cleanly")
+ return nil
+ case err := <-errCh:
+ return err
+ }
}