go-rocm/backend.go

//go:build linux && amd64

package rocm

import (
	"os"
	"strings"

	coreerr "forge.lthn.ai/core/go-log"
	"forge.lthn.ai/core/go-inference"
	"forge.lthn.ai/core/go-rocm/internal/gguf"
)

// rocmBackend implements inference.Backend for AMD ROCm GPUs.
// It has no fields, so the backend value itself carries no state.
type rocmBackend struct{}

// Name returns the identifier of this backend, the fixed string "rocm".
func (b *rocmBackend) Name() string {
	return "rocm"
}

// Available reports whether ROCm GPU inference can run on this machine.
// Two things must hold: the ROCm kernel driver node /dev/kfd can be stat'ed,
// and findLlamaServer locates a llama-server binary. The binary lookup is
// skipped when the driver node is missing.
func (b *rocmBackend) Available() bool {
	_, kfdErr := os.Stat("/dev/kfd")
	if kfdErr != nil {
		return false
	}

	_, binErr := findLlamaServer()
	return binErr == nil
}

// LoadModel loads the GGUF model at path onto the AMD GPU via llama-server and
// returns a TextModel backed by the started server.
//
// The model architecture, layer count and quantisation details are taken from
// the GGUF metadata rather than guessed from the filename. When the load
// options carry no context length, the model's own context length is used,
// capped at 4096, to prevent VRAM exhaustion on models with 128K+ native
// context.
//
// An error is returned when llama-server cannot be found, when the metadata
// cannot be read, or when the server fails to start.
func (b *rocmBackend) LoadModel(path string, opts ...inference.LoadOption) (inference.TextModel, error) {
	cfg := inference.ApplyLoadOpts(opts)

	serverBin, err := findLlamaServer()
	if err != nil {
		return nil, err
	}

	meta, err := gguf.ReadMetadata(path)
	if err != nil {
		return nil, coreerr.E("rocm.LoadModel", "read model metadata", err)
	}

	// An explicit context length always wins; otherwise use the capped
	// length from the metadata, if the metadata provides one.
	contextLen := cfg.ContextLen
	if contextLen == 0 && meta.ContextLength > 0 {
		contextLen = int(min(meta.ContextLength, 4096))
	}

	server, err := startServer(serverBin, path, cfg.GPULayers, contextLen, cfg.ParallelSlots)
	if err != nil {
		return nil, err
	}

	// Quantised file types are recognised by name prefix; each entry gives
	// the bit width and group size reported for that family.
	quantFormats := []struct {
		prefix      string
		bits, group int
	}{
		{"Q4_", 4, 32},
		{"Q5_", 5, 32},
		{"Q8_", 8, 32},
		{"Q2_", 2, 16},
		{"Q3_", 3, 32},
		{"Q6_", 6, 64},
	}

	// Unrecognised file types leave both values at zero.
	var bits, group int
	switch name := gguf.FileTypeName(meta.FileType); name {
	case "F16":
		bits = 16
	case "F32":
		bits = 32
	default:
		for _, q := range quantFormats {
			if strings.HasPrefix(name, q.prefix) {
				bits, group = q.bits, q.group
				break
			}
		}
	}

	info := inference.ModelInfo{
		Architecture: meta.Architecture,
		NumLayers:    int(meta.BlockCount),
		QuantBits:    bits,
		QuantGroup:   group,
	}

	return &rocmModel{
		srv:       server,
		modelType: meta.Architecture,
		modelInfo: info,
	}, nil
}
feat: scaffold go-rocm AMD GPU inference package Implements inference.Backend via llama-server subprocess (llama.cpp + HIP/ROCm). Targets RX 7800 XT (gfx1101, RDNA 3, 16GB VRAM). Includes: - Backend registration with build tags (linux/amd64) - Stub backend.go with llama-server lifecycle outline - CLAUDE.md with build instructions for llama.cpp + ROCm - TODO.md with 5-phase task queue - FINDINGS.md with hardware specs, VRAM budget, design rationale Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 19:39:40 +00:00			`//go:build linux && amd64`

			`package rocm`

feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`import (`
			`"os"`
feat: implement Classify, BatchGenerate, Info, Metrics on rocmModel Brings rocmModel into compliance with the updated inference.TextModel interface from go-inference. - Classify: simulates prefill-only via max_tokens=1, temperature=0 - BatchGenerate: sequential autoregressive per prompt via /v1/completions - Info: populates ModelInfo from GGUF metadata (architecture, layers, quant) - Metrics: captures timing + VRAM usage via sysfs after each operation - Refactors duplicate server-exit error handling into setServerExitErr() - Adds timing instrumentation to existing Generate and Chat methods Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-24 18:50:37 +00:00			`"strings"`
feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00
refactor: replace fmt.Errorf/errors.New with coreerr.E() Co-Authored-By: Virgil <virgil@lethean.io> 2026-03-16 21:08:52 +00:00			`coreerr "forge.lthn.ai/core/go-log"`
feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`"forge.lthn.ai/core/go-inference"`
feat: use GGUF metadata for model type and context window auto-detection Replaces filename-based guessModelType with GGUF header parsing. Caps default context at 4096 to prevent VRAM exhaustion on models with 128K+ native context. Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 22:23:07 +00:00			`"forge.lthn.ai/core/go-rocm/internal/gguf"`
feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`)`
feat: scaffold go-rocm AMD GPU inference package Implements inference.Backend via llama-server subprocess (llama.cpp + HIP/ROCm). Targets RX 7800 XT (gfx1101, RDNA 3, 16GB VRAM). Includes: - Backend registration with build tags (linux/amd64) - Stub backend.go with llama-server lifecycle outline - CLAUDE.md with build instructions for llama.cpp + ROCm - TODO.md with 5-phase task queue - FINDINGS.md with hardware specs, VRAM budget, design rationale Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 19:39:40 +00:00
			`// rocmBackend implements inference.Backend for AMD ROCm GPUs.`
			`type rocmBackend struct{}`

			`func (b *rocmBackend) Name() string { return "rocm" }`

feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`// Available reports whether ROCm GPU inference can run on this machine.`
			`// Checks for the ROCm kernel driver (/dev/kfd) and a findable llama-server binary.`
feat: scaffold go-rocm AMD GPU inference package Implements inference.Backend via llama-server subprocess (llama.cpp + HIP/ROCm). Targets RX 7800 XT (gfx1101, RDNA 3, 16GB VRAM). Includes: - Backend registration with build tags (linux/amd64) - Stub backend.go with llama-server lifecycle outline - CLAUDE.md with build instructions for llama.cpp + ROCm - TODO.md with 5-phase task queue - FINDINGS.md with hardware specs, VRAM budget, design rationale Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 19:39:40 +00:00			`func (b *rocmBackend) Available() bool {`
feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`if _, err := os.Stat("/dev/kfd"); err != nil {`
			`return false`
			`}`
			`if _, err := findLlamaServer(); err != nil {`
			`return false`
			`}`
			`return true`
feat: scaffold go-rocm AMD GPU inference package Implements inference.Backend via llama-server subprocess (llama.cpp + HIP/ROCm). Targets RX 7800 XT (gfx1101, RDNA 3, 16GB VRAM). Includes: - Backend registration with build tags (linux/amd64) - Stub backend.go with llama-server lifecycle outline - CLAUDE.md with build instructions for llama.cpp + ROCm - TODO.md with 5-phase task queue - FINDINGS.md with hardware specs, VRAM budget, design rationale Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 19:39:40 +00:00			`}`

feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`// LoadModel loads a GGUF model onto the AMD GPU via llama-server.`
feat: use GGUF metadata for model type and context window auto-detection Replaces filename-based guessModelType with GGUF header parsing. Caps default context at 4096 to prevent VRAM exhaustion on models with 128K+ native context. Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 22:23:07 +00:00			`// Model architecture is read from GGUF metadata (replacing filename-based guessing).`
			`// If no context length is specified, defaults to min(model_context_length, 4096)`
			`// to prevent VRAM exhaustion on models with 128K+ native context.`
feat: scaffold go-rocm AMD GPU inference package Implements inference.Backend via llama-server subprocess (llama.cpp + HIP/ROCm). Targets RX 7800 XT (gfx1101, RDNA 3, 16GB VRAM). Includes: - Backend registration with build tags (linux/amd64) - Stub backend.go with llama-server lifecycle outline - CLAUDE.md with build instructions for llama.cpp + ROCm - TODO.md with 5-phase task queue - FINDINGS.md with hardware specs, VRAM budget, design rationale Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 19:39:40 +00:00			`func (b *rocmBackend) LoadModel(path string, opts ...inference.LoadOption) (inference.TextModel, error) {`
feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`cfg := inference.ApplyLoadOpts(opts)`

			`binary, err := findLlamaServer()`
			`if err != nil {`
			`return nil, err`
			`}`

feat: use GGUF metadata for model type and context window auto-detection Replaces filename-based guessModelType with GGUF header parsing. Caps default context at 4096 to prevent VRAM exhaustion on models with 128K+ native context. Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 22:23:07 +00:00			`meta, err := gguf.ReadMetadata(path)`
			`if err != nil {`
refactor: replace fmt.Errorf/errors.New with coreerr.E() Co-Authored-By: Virgil <virgil@lethean.io> 2026-03-16 21:08:52 +00:00			`return nil, coreerr.E("rocm.LoadModel", "read model metadata", err)`
feat: use GGUF metadata for model type and context window auto-detection Replaces filename-based guessModelType with GGUF header parsing. Caps default context at 4096 to prevent VRAM exhaustion on models with 128K+ native context. Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 22:23:07 +00:00			`}`

			`ctxLen := cfg.ContextLen`
			`if ctxLen == 0 && meta.ContextLength > 0 {`
			`ctxLen = int(min(meta.ContextLength, 4096))`
			`}`

feat: pass --parallel N to llama-server for concurrent inference slots Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 23:13:19 +00:00			`srv, err := startServer(binary, path, cfg.GPULayers, ctxLen, cfg.ParallelSlots)`
feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`if err != nil {`
			`return nil, err`
			`}`

feat: implement Classify, BatchGenerate, Info, Metrics on rocmModel Brings rocmModel into compliance with the updated inference.TextModel interface from go-inference. - Classify: simulates prefill-only via max_tokens=1, temperature=0 - BatchGenerate: sequential autoregressive per prompt via /v1/completions - Info: populates ModelInfo from GGUF metadata (architecture, layers, quant) - Metrics: captures timing + VRAM usage via sysfs after each operation - Refactors duplicate server-exit error handling into setServerExitErr() - Adds timing instrumentation to existing Generate and Chat methods Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-24 18:50:37 +00:00			`// Map quantisation file type to bit width.`
			`quantBits := 0`
			`quantGroup := 0`
			`ftName := gguf.FileTypeName(meta.FileType)`
			`switch {`
			`case strings.HasPrefix(ftName, "Q4_"):`
			`quantBits = 4`
			`quantGroup = 32`
			`case strings.HasPrefix(ftName, "Q5_"):`
			`quantBits = 5`
			`quantGroup = 32`
			`case strings.HasPrefix(ftName, "Q8_"):`
			`quantBits = 8`
			`quantGroup = 32`
			`case strings.HasPrefix(ftName, "Q2_"):`
			`quantBits = 2`
			`quantGroup = 16`
			`case strings.HasPrefix(ftName, "Q3_"):`
			`quantBits = 3`
			`quantGroup = 32`
			`case strings.HasPrefix(ftName, "Q6_"):`
			`quantBits = 6`
			`quantGroup = 64`
			`case ftName == "F16":`
			`quantBits = 16`
			`case ftName == "F32":`
			`quantBits = 32`
			`}`

feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`return &rocmModel{`
			`srv: srv,`
feat: use GGUF metadata for model type and context window auto-detection Replaces filename-based guessModelType with GGUF header parsing. Caps default context at 4096 to prevent VRAM exhaustion on models with 128K+ native context. Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-19 22:23:07 +00:00			`modelType: meta.Architecture,`
feat: implement Classify, BatchGenerate, Info, Metrics on rocmModel Brings rocmModel into compliance with the updated inference.TextModel interface from go-inference. - Classify: simulates prefill-only via max_tokens=1, temperature=0 - BatchGenerate: sequential autoregressive per prompt via /v1/completions - Info: populates ModelInfo from GGUF metadata (architecture, layers, quant) - Metrics: captures timing + VRAM usage via sysfs after each operation - Refactors duplicate server-exit error handling into setServerExitErr() - Adds timing instrumentation to existing Generate and Chat methods Co-Authored-By: Virgil <virgil@lethean.io> 2026-02-24 18:50:37 +00:00			`modelInfo: inference.ModelInfo{`
			`Architecture: meta.Architecture,`
			`NumLayers: int(meta.BlockCount),`
			`QuantBits: quantBits,`
			`QuantGroup: quantGroup,`
			`},`
feat: Backend Available() and LoadModel() with GPU detection Replace stub backend with real implementation: Available() checks /dev/kfd and llama-server presence, LoadModel() wires up server lifecycle to return a rocmModel. Add guessModelType() for architecture detection from GGUF filenames (handles hyphenated variants like Llama-3). Add TestAvailable and TestGuessModelType. Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-02-19 21:12:02 +00:00			`}, nil`
			`}`