go-rocm/server_test.go

//go:build linux && amd64

package rocm

import (
	"context"
	"os"
	"strings"
	"testing"

	"forge.lthn.ai/core/go-inference"
	coreerr "forge.lthn.ai/core/go-log"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestFindLlamaServer_InPATH verifies that findLlamaServer succeeds in the
// ambient environment and that the returned path mentions "llama-server".
//
// NOTE(review): this relies on llama-server being installed on the host (the
// original author noted /usr/local/bin/llama-server on their machine); expect
// a failure on hosts without it.
func TestFindLlamaServer_InPATH(t *testing.T) {
	located, lookupErr := findLlamaServer()
	require.NoError(t, lookupErr)

	assert.Contains(t, located, "llama-server")
}

// TestFindLlamaServer_EnvOverride verifies that findLlamaServer returns the
// exact path supplied through ROCM_LLAMA_SERVER_PATH.
//
// The override points at a fixed location on the development host. A missing
// override path makes findLlamaServer return an error, so the test is skipped
// when nothing is installed there instead of failing on unrelated machines.
func TestFindLlamaServer_EnvOverride(t *testing.T) {
	const override = "/usr/local/bin/llama-server"

	// Guard against hosts where the binary is not installed at this path.
	if _, err := os.Stat(override); err != nil {
		t.Skipf("llama-server not available at %s: %v", override, err)
	}

	t.Setenv("ROCM_LLAMA_SERVER_PATH", override)
	path, err := findLlamaServer()
	require.NoError(t, err)
	assert.Equal(t, override, path)
}

// TestFindLlamaServer_EnvNotFound verifies that pointing
// ROCM_LLAMA_SERVER_PATH at a path that does not exist yields an error whose
// message contains "not found".
func TestFindLlamaServer_EnvNotFound(t *testing.T) {
	const missing = "/nonexistent/llama-server"
	t.Setenv("ROCM_LLAMA_SERVER_PATH", missing)

	_, lookupErr := findLlamaServer()

	assert.ErrorContains(t, lookupErr, "not found")
}

// TestFreePort verifies that freePort succeeds and that the port it reports
// lies strictly between 0 and 65536.
func TestFreePort(t *testing.T) {
	got, portErr := freePort()
	require.NoError(t, portErr)

	// Lower bound first, then the exclusive upper bound.
	assert.Greater(t, got, 0)
	assert.Less(t, got, 65536)
}

// TestFreePort_UniquePerCall calls freePort twice in a row and checks that
// both calls succeed and both results are valid port numbers.
//
// Distinctness is deliberately not asserted: nothing visible here guarantees
// that two consecutive calls return different ports, so a hard inequality
// check could be flaky. A collision is logged instead so it is visible with
// `go test -v`.
func TestFreePort_UniquePerCall(t *testing.T) {
	p1, err := freePort()
	require.NoError(t, err)
	p2, err := freePort()
	require.NoError(t, err)

	// Both results must be usable port numbers.
	assert.Greater(t, p1, 0)
	assert.Less(t, p1, 65536)
	assert.Greater(t, p2, 0)
	assert.Less(t, p2, 65536)

	if p1 == p2 {
		t.Logf("freePort returned the same port on consecutive calls: %v", p1)
	}
}

// TestServerEnv_HIPVisibleDevices verifies that the environment produced by
// serverEnv contains exactly one HIP_VISIBLE_DEVICES entry and that it is
// "HIP_VISIBLE_DEVICES=0".
func TestServerEnv_HIPVisibleDevices(t *testing.T) {
	var matches []string
	for _, entry := range serverEnv() {
		if !strings.HasPrefix(entry, "HIP_VISIBLE_DEVICES=") {
			continue
		}
		matches = append(matches, entry)
	}

	want := []string{"HIP_VISIBLE_DEVICES=0"}
	assert.Equal(t, want, matches)
}

// TestServerEnv_FiltersExistingHIP verifies that a HIP_VISIBLE_DEVICES value
// already present in the process environment ("1" here) does not survive:
// serverEnv must yield a single entry, "HIP_VISIBLE_DEVICES=0".
func TestServerEnv_FiltersExistingHIP(t *testing.T) {
	// Seed the environment with a conflicting value before building the env.
	t.Setenv("HIP_VISIBLE_DEVICES", "1")

	var found []string
	for _, kv := range serverEnv() {
		if !strings.HasPrefix(kv, "HIP_VISIBLE_DEVICES=") {
			continue
		}
		found = append(found, kv)
	}

	expected := []string{"HIP_VISIBLE_DEVICES=0"}
	assert.Equal(t, expected, found)
}

func TestAvailable(t *testing.T) {
	b := &rocmBackend{}
	if _, err := os.Stat("/dev/kfd"); err != nil {
		t.Skip("no ROCm hardware")
	}
	assert.True(t, b.Available())
}


// TestServerAlive_Running verifies that alive reports true for a server whose
// exited channel has not been closed.
func TestServerAlive_Running(t *testing.T) {
	stillOpen := make(chan struct{}) // never closed in this test
	srv := &server{exited: stillOpen}

	assert.True(t, srv.alive())
}

// TestServerAlive_Exited verifies that alive reports false once the server's
// exited channel is closed and an exit error has been recorded.
func TestServerAlive_Exited(t *testing.T) {
	done := make(chan struct{})
	close(done)

	srv := &server{
		exited:  done,
		exitErr: coreerr.E("test", "process killed", nil),
	}

	assert.False(t, srv.alive())
}

// TestGenerate_ServerDead verifies that Generate yields nothing when the
// backing server has already exited, and that Err then reports an error
// containing "server has exited".
func TestGenerate_ServerDead(t *testing.T) {
	// Build a server that looks like it has already terminated.
	done := make(chan struct{})
	close(done)
	dead := &server{
		exited:  done,
		exitErr: coreerr.E("test", "process killed", nil),
	}
	model := &rocmModel{srv: dead}

	yielded := 0
	for range model.Generate(context.Background(), "hello") {
		yielded++
	}

	assert.Equal(t, 0, yielded)
	assert.ErrorContains(t, model.Err(), "server has exited")
}

// TestStartServer_RetriesOnProcessExit verifies that startServer gives up
// with a "failed after 3 attempts" error when the launched binary dies right
// away. /bin/false is used because it starts successfully and then exits
// immediately with code 1.
func TestStartServer_RetriesOnProcessExit(t *testing.T) {
	const (
		binary = "/bin/false"
		model  = "/nonexistent/model.gguf"
	)

	_, startErr := startServer(binary, model, 999, 0, 0)
	require.Error(t, startErr)

	msg := startErr.Error()
	assert.Contains(t, msg, "failed after 3 attempts")
}

// TestChat_ServerDead verifies that Chat yields nothing when the backing
// server has already exited, and that Err then reports an error containing
// "server has exited".
func TestChat_ServerDead(t *testing.T) {
	// Build a server that looks like it has already terminated.
	done := make(chan struct{})
	close(done)
	dead := &server{
		exited:  done,
		exitErr: coreerr.E("test", "process killed", nil),
	}
	model := &rocmModel{srv: dead}

	conversation := []inference.Message{
		{Role: "user", Content: "hello"},
	}

	yielded := 0
	for range model.Chat(context.Background(), conversation) {
		yielded++
	}

	assert.Equal(t, 0, yielded)
	assert.ErrorContains(t, model.Err(), "server has exited")
}
feat: server lifecycle and helpers for llama-server subprocess Adds server.go with the process lifecycle layer that manages spawning llama-server, waiting for readiness, and graceful shutdown. Includes three helper functions (findLlamaServer, freePort, serverEnv) and the full startServer/waitReady/stop lifecycle. The serverEnv function critically filters HIP_VISIBLE_DEVICES to mask the Ryzen 9 iGPU which crashes llama-server if not excluded. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:08:07 +00:00			`//go:build linux && amd64`

			`package rocm`

			`import (`
feat: detect server crash before Generate/Chat calls Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 21:34:46 +00:00			`"context"`
feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`"os"`
feat: server lifecycle and helpers for llama-server subprocess Adds server.go with the process lifecycle layer that manages spawning llama-server, waiting for readiness, and graceful shutdown. Includes three helper functions (findLlamaServer, freePort, serverEnv) and the full startServer/waitReady/stop lifecycle. The serverEnv function critically filters HIP_VISIBLE_DEVICES to mask the Ryzen 9 iGPU which crashes llama-server if not excluded. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:08:07 +00:00			`"strings"`
			`"testing"`

feat: detect server crash before Generate/Chat calls Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 21:34:46 +00:00			`"forge.lthn.ai/core/go-inference"`
fix(dx): audit coding standards and add tests for untested paths - CLAUDE.md: document coreerr.E() error handling and go-io exclusion - server_test.go: replace fmt.Errorf with coreerr.E() in test fixtures - gguf_test.go: add tests for v2 format, skipValue (all type branches), readTypedValue uint64 path, unsupported version, truncated file - discover_test.go: add test for corrupt GGUF file skipping - vram_test.go: add tests for invalid/empty sysfs content Coverage: 65.8% → 79.2% (+13.4%) Co-Authored-By: Virgil <virgil@lethean.io> 2026-03-17 08:50:17 +00:00			`coreerr "forge.lthn.ai/core/go-log"`
feat: server lifecycle and helpers for llama-server subprocess Adds server.go with the process lifecycle layer that manages spawning llama-server, waiting for readiness, and graceful shutdown. Includes three helper functions (findLlamaServer, freePort, serverEnv) and the full startServer/waitReady/stop lifecycle. The serverEnv function critically filters HIP_VISIBLE_DEVICES to mask the Ryzen 9 iGPU which crashes llama-server if not excluded. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:08:07 +00:00			`"github.com/stretchr/testify/assert"`
			`"github.com/stretchr/testify/require"`
			`)`

			`func TestFindLlamaServer_InPATH(t *testing.T) {`
			`// llama-server is at /usr/local/bin/llama-server on this machine.`
			`path, err := findLlamaServer()`
			`require.NoError(t, err)`
			`assert.Contains(t, path, "llama-server")`
			`}`

			`func TestFindLlamaServer_EnvOverride(t *testing.T) {`
			`t.Setenv("ROCM_LLAMA_SERVER_PATH", "/usr/local/bin/llama-server")`
			`path, err := findLlamaServer()`
			`require.NoError(t, err)`
			`assert.Equal(t, "/usr/local/bin/llama-server", path)`
			`}`

			`func TestFindLlamaServer_EnvNotFound(t *testing.T) {`
			`t.Setenv("ROCM_LLAMA_SERVER_PATH", "/nonexistent/llama-server")`
			`_, err := findLlamaServer()`
			`assert.ErrorContains(t, err, "not found")`
			`}`

			`func TestFreePort(t *testing.T) {`
			`port, err := freePort()`
			`require.NoError(t, err)`
			`assert.Greater(t, port, 0)`
			`assert.Less(t, port, 65536)`
			`}`

			`func TestFreePort_UniquePerCall(t *testing.T) {`
			`p1, err := freePort()`
			`require.NoError(t, err)`
			`p2, err := freePort()`
			`require.NoError(t, err)`
			`_ = p1`
			`_ = p2`
			`}`

			`func TestServerEnv_HIPVisibleDevices(t *testing.T) {`
			`env := serverEnv()`
			`var hipVals []string`
			`for _, e := range env {`
			`if strings.HasPrefix(e, "HIP_VISIBLE_DEVICES=") {`
			`hipVals = append(hipVals, e)`
			`}`
			`}`
			`assert.Equal(t, []string{"HIP_VISIBLE_DEVICES=0"}, hipVals)`
			`}`

			`func TestServerEnv_FiltersExistingHIP(t *testing.T) {`
			`t.Setenv("HIP_VISIBLE_DEVICES", "1")`
			`env := serverEnv()`
			`var hipVals []string`
			`for _, e := range env {`
			`if strings.HasPrefix(e, "HIP_VISIBLE_DEVICES=") {`
			`hipVals = append(hipVals, e)`
			`}`
			`}`
			`assert.Equal(t, []string{"HIP_VISIBLE_DEVICES=0"}, hipVals)`
			`}`
feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00
			`func TestAvailable(t *testing.T) {`
			`b := &rocmBackend{}`
			`if _, err := os.Stat("/dev/kfd"); err != nil {`
			`t.Skip("no ROCm hardware")`
			`}`
			`assert.True(t, b.Available())`
			`}`

feat: detect server crash before Generate/Chat calls Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 21:34:46 +00:00
			`func TestServerAlive_Running(t *testing.T) {`
			`s := &server{exited: make(chan struct{})}`
			`assert.True(t, s.alive())`
			`}`

			`func TestServerAlive_Exited(t *testing.T) {`
			`exited := make(chan struct{})`
			`close(exited)`
fix(dx): audit coding standards and add tests for untested paths - CLAUDE.md: document coreerr.E() error handling and go-io exclusion - server_test.go: replace fmt.Errorf with coreerr.E() in test fixtures - gguf_test.go: add tests for v2 format, skipValue (all type branches), readTypedValue uint64 path, unsupported version, truncated file - discover_test.go: add test for corrupt GGUF file skipping - vram_test.go: add tests for invalid/empty sysfs content Coverage: 65.8% → 79.2% (+13.4%) Co-Authored-By: Virgil <virgil@lethean.io> 2026-03-17 08:50:17 +00:00			`s := &server{exited: exited, exitErr: coreerr.E("test", "process killed", nil)}`
feat: detect server crash before Generate/Chat calls Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 21:34:46 +00:00			`assert.False(t, s.alive())`
			`}`

			`func TestGenerate_ServerDead(t *testing.T) {`
			`exited := make(chan struct{})`
			`close(exited)`
			`s := &server{`
			`exited: exited,`
fix(dx): audit coding standards and add tests for untested paths - CLAUDE.md: document coreerr.E() error handling and go-io exclusion - server_test.go: replace fmt.Errorf with coreerr.E() in test fixtures - gguf_test.go: add tests for v2 format, skipValue (all type branches), readTypedValue uint64 path, unsupported version, truncated file - discover_test.go: add test for corrupt GGUF file skipping - vram_test.go: add tests for invalid/empty sysfs content Coverage: 65.8% → 79.2% (+13.4%) Co-Authored-By: Virgil <virgil@lethean.io> 2026-03-17 08:50:17 +00:00			`exitErr: coreerr.E("test", "process killed", nil),`
feat: detect server crash before Generate/Chat calls Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 21:34:46 +00:00			`}`
			`m := &rocmModel{srv: s}`

			`var count int`
			`for range m.Generate(context.Background(), "hello") {`
			`count++`
			`}`
			`assert.Equal(t, 0, count)`
			`assert.ErrorContains(t, m.Err(), "server has exited")`
			`}`

feat: retry port selection in startServer on process failure Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:40:05 +00:00			`func TestStartServer_RetriesOnProcessExit(t *testing.T) {`
			`// /bin/false starts successfully but exits immediately with code 1.`
			`// startServer should retry up to 3 times, then fail.`
feat: pass --parallel N to llama-server for concurrent inference slots Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 23:13:19 +00:00			`_, err := startServer("/bin/false", "/nonexistent/model.gguf", 999, 0, 0)`
feat: retry port selection in startServer on process failure Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:40:05 +00:00			`require.Error(t, err)`
			`assert.Contains(t, err.Error(), "failed after 3 attempts")`
			`}`

feat: detect server crash before Generate/Chat calls Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 21:34:46 +00:00			`func TestChat_ServerDead(t *testing.T) {`
			`exited := make(chan struct{})`
			`close(exited)`
			`s := &server{`
			`exited: exited,`
fix(dx): audit coding standards and add tests for untested paths - CLAUDE.md: document coreerr.E() error handling and go-io exclusion - server_test.go: replace fmt.Errorf with coreerr.E() in test fixtures - gguf_test.go: add tests for v2 format, skipValue (all type branches), readTypedValue uint64 path, unsupported version, truncated file - discover_test.go: add test for corrupt GGUF file skipping - vram_test.go: add tests for invalid/empty sysfs content Coverage: 65.8% → 79.2% (+13.4%) Co-Authored-By: Virgil <virgil@lethean.io> 2026-03-17 08:50:17 +00:00			`exitErr: coreerr.E("test", "process killed", nil),`
feat: detect server crash before Generate/Chat calls Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 21:34:46 +00:00			`}`
			`m := &rocmModel{srv: s}`

			`msgs := []inference.Message{{Role: "user", Content: "hello"}}`
			`var count int`
			`for range m.Chat(context.Background(), msgs) {`
			`count++`
			`}`
			`assert.Equal(t, 0, count)`
			`assert.ErrorContains(t, m.Err(), "server has exited")`
			`}`