Principle 1 — Predictable Names: - rocmModel.srv → rocmModel.server (struct field) - recordMetrics: met → metrics (local var) - backend.go/model.go: cfg → config (local vars) - gguf.go: tc/kc → tensorCount32/kvCount32 (v2 count reads) Principle 2 — Comments as Usage Examples: - Added concrete usage examples to all exported functions: VRAMInfo, ModelInfo, DiscoverModels, GetVRAMInfo, ROCmAvailable, LoadModel, Available, NewClient, Health, ChatComplete, Complete, ReadMetadata, FileTypeName Principle 5 — Test naming (_Good/_Bad/_Ugly): - All test functions renamed to AX-7 convention across: discover_test.go, vram_test.go, server_test.go, internal/gguf/gguf_test.go, internal/llamacpp/client_test.go, internal/llamacpp/health_test.go Also: fix go.sum missing entry for dappco.re/go/core transitive dep (pulled in by go-inference replace directive). All tests pass: go test ./... -short -count=1 Co-Authored-By: Virgil <virgil@lethean.io> Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
74 lines
1.8 KiB
Go
74 lines
1.8 KiB
Go
//go:build linux && amd64
|
|
|
|
package rocm
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
|
|
coreerr "forge.lthn.ai/core/go-log"
|
|
)
|
|
|
|
// GetVRAMInfo reads VRAM usage for the discrete GPU from sysfs.
|
|
// It identifies the dGPU by selecting the card with the largest VRAM total,
|
|
// which avoids hardcoding card numbers (e.g. card0=iGPU, card1=dGPU on Ryzen).
|
|
//
|
|
// Note: total and used are read non-atomically from sysfs; transient
|
|
// inconsistencies are possible under heavy allocation churn.
|
|
//
|
|
// info, err := rocm.GetVRAMInfo()
|
|
// fmt.Printf("VRAM: %d MiB used / %d MiB total (free: %d MiB)",
|
|
// info.Used/(1024*1024), info.Total/(1024*1024), info.Free/(1024*1024))
|
|
func GetVRAMInfo() (VRAMInfo, error) {
|
|
cards, err := filepath.Glob("/sys/class/drm/card[0-9]*/device/mem_info_vram_total")
|
|
if err != nil {
|
|
return VRAMInfo{}, coreerr.E("rocm.GetVRAMInfo", "glob vram sysfs", err)
|
|
}
|
|
if len(cards) == 0 {
|
|
return VRAMInfo{}, coreerr.E("rocm.GetVRAMInfo", "no GPU VRAM info found in sysfs", nil)
|
|
}
|
|
|
|
var bestDir string
|
|
var bestTotal uint64
|
|
|
|
for _, totalPath := range cards {
|
|
total, err := readSysfsUint64(totalPath)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if total > bestTotal {
|
|
bestTotal = total
|
|
bestDir = filepath.Dir(totalPath)
|
|
}
|
|
}
|
|
|
|
if bestDir == "" {
|
|
return VRAMInfo{}, coreerr.E("rocm.GetVRAMInfo", "no readable VRAM sysfs entries", nil)
|
|
}
|
|
|
|
used, err := readSysfsUint64(filepath.Join(bestDir, "mem_info_vram_used"))
|
|
if err != nil {
|
|
return VRAMInfo{}, coreerr.E("rocm.GetVRAMInfo", "read vram used", err)
|
|
}
|
|
|
|
free := uint64(0)
|
|
if bestTotal > used {
|
|
free = bestTotal - used
|
|
}
|
|
|
|
return VRAMInfo{
|
|
Total: bestTotal,
|
|
Used: used,
|
|
Free: free,
|
|
}, nil
|
|
}
|
|
|
|
// readSysfsUint64 reads the whole file at path and parses its contents as a
// base-10 unsigned 64-bit integer. Leading and trailing whitespace (such as
// the newline that terminates the value) is ignored.
//
// A read failure yields (0, err). A parse failure yields whatever
// strconv.ParseUint returns for the trimmed text.
func readSysfsUint64(path string) (uint64, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return 0, readErr
	}

	text := strings.TrimSpace(string(raw))
	return strconv.ParseUint(text, 10, 64)
}
|