feat(runtime): align provider lifecycle with AX requirements
Improve provider discovery dedupe, startup/shutdown robustness, and health-check cancellation semantics. Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
0669feb69b
commit
f361fd69f6
3 changed files with 137 additions and 54 deletions
37
providers.go
37
providers.go
|
|
@ -3,6 +3,7 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"forge.lthn.ai/core/api/pkg/provider"
|
||||
|
|
@ -40,15 +41,26 @@ func (p *ProvidersAPI) list(c *gin.Context) {
|
|||
registryInfo := p.registry.Info()
|
||||
runtimeInfo := p.runtime.List()
|
||||
|
||||
// Merge runtime provider info with registry info
|
||||
// Merge registry and runtime provider data without duplication.
|
||||
providers := make([]providerDTO, 0, len(registryInfo)+len(runtimeInfo))
|
||||
seen := make(map[string]struct{}, len(registryInfo)+len(runtimeInfo))
|
||||
|
||||
providerKey := func(name, namespace string) string {
|
||||
return fmt.Sprintf("%s|%s", name, namespace)
|
||||
}
|
||||
|
||||
for _, info := range registryInfo {
|
||||
key := providerKey(info.Name, info.BasePath)
|
||||
if _, ok := seen[key]; ok {
|
||||
continue
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
|
||||
dto := providerDTO{
|
||||
Name: info.Name,
|
||||
BasePath: info.BasePath,
|
||||
Channels: info.Channels,
|
||||
Status: "active",
|
||||
Channels: info.Channels,
|
||||
}
|
||||
if info.Element != nil {
|
||||
dto.Element = &elementDTO{
|
||||
|
|
@ -61,21 +73,21 @@ func (p *ProvidersAPI) list(c *gin.Context) {
|
|||
|
||||
// Add runtime providers not already in registry
|
||||
for _, ri := range runtimeInfo {
|
||||
found := false
|
||||
for _, p := range providers {
|
||||
if p.Name == ri.Code {
|
||||
found = true
|
||||
break
|
||||
key := providerKey(ri.Code, ri.Namespace)
|
||||
if _, ok := seen[key]; ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
seen[key] = struct{}{}
|
||||
|
||||
providers = append(providers, providerDTO{
|
||||
Name: ri.Code,
|
||||
BasePath: ri.Namespace,
|
||||
Status: "active",
|
||||
Status: ri.Status,
|
||||
Code: ri.Code,
|
||||
Version: ri.Version,
|
||||
Namespace: ri.Namespace,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, providersResponse{Providers: providers})
|
||||
}
|
||||
|
|
@ -88,6 +100,9 @@ type providerDTO struct {
|
|||
Name string `json:"name"`
|
||||
BasePath string `json:"basePath"`
|
||||
Status string `json:"status,omitempty"`
|
||||
Code string `json:"code,omitempty"`
|
||||
Version string `json:"version,omitempty"`
|
||||
Namespace string `json:"namespace,omitempty"`
|
||||
Element *elementDTO `json:"element,omitempty"`
|
||||
Channels []string `json:"channels,omitempty"`
|
||||
}
|
||||
|
|
|
|||
135
runtime.go
135
runtime.go
|
|
@ -58,9 +58,6 @@ func defaultProvidersDir() string {
|
|||
// Providers that fail to start are logged and skipped — they do not prevent
|
||||
// other providers from starting.
|
||||
func (rm *RuntimeManager) StartAll(ctx context.Context) error {
|
||||
rm.mu.Lock()
|
||||
defer rm.mu.Unlock()
|
||||
|
||||
dir := defaultProvidersDir()
|
||||
discovered, err := marketplace.DiscoverProviders(dir)
|
||||
if err != nil {
|
||||
|
|
@ -73,23 +70,39 @@ func (rm *RuntimeManager) StartAll(ctx context.Context) error {
|
|||
}
|
||||
|
||||
log.Printf("runtime: discovered %d provider(s) in %s", len(discovered), dir)
|
||||
started := make([]*RuntimeProvider, 0, len(discovered))
|
||||
seen := make(map[string]struct{}, len(discovered))
|
||||
|
||||
for _, dp := range discovered {
|
||||
key := fmt.Sprintf("%s|%s", dp.Manifest.Code, dp.Manifest.Namespace)
|
||||
if _, ok := seen[key]; ok {
|
||||
log.Printf("runtime: skipped duplicate provider discovery for %s", dp.Manifest.Code)
|
||||
continue
|
||||
}
|
||||
|
||||
rp, err := rm.startProvider(ctx, dp)
|
||||
if err != nil {
|
||||
log.Printf("runtime: failed to start %s: %v", dp.Manifest.Code, err)
|
||||
continue
|
||||
}
|
||||
rm.providers = append(rm.providers, rp)
|
||||
seen[key] = struct{}{}
|
||||
started = append(started, rp)
|
||||
log.Printf("runtime: started %s on port %d", dp.Manifest.Code, rp.Port)
|
||||
}
|
||||
|
||||
rm.mu.Lock()
|
||||
rm.providers = append(rm.providers, started...)
|
||||
rm.mu.Unlock()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// startProvider starts a single provider binary and registers its proxy.
|
||||
func (rm *RuntimeManager) startProvider(ctx context.Context, dp marketplace.DiscoveredProvider) (*RuntimeProvider, error) {
|
||||
m := dp.Manifest
|
||||
if rm.engine == nil {
|
||||
return nil, coreerr.E("runtime.startProvider", "runtime engine not configured", nil)
|
||||
}
|
||||
|
||||
// Assign a free port.
|
||||
port, err := findFreePort()
|
||||
|
|
@ -120,9 +133,9 @@ func (rm *RuntimeManager) startProvider(ctx context.Context, dp marketplace.Disc
|
|||
|
||||
// Wait for health check.
|
||||
healthURL := fmt.Sprintf("http://127.0.0.1:%d/health", port)
|
||||
if err := waitForHealth(healthURL, 10*time.Second); err != nil {
|
||||
// Kill the process if health check fails.
|
||||
_ = cmd.Process.Kill()
|
||||
if err := waitForHealth(ctx, healthURL, 10*time.Second); err != nil {
|
||||
// Stop the process if health check fails.
|
||||
stopProviderProcess(cmd, 2*time.Second)
|
||||
return nil, coreerr.E("runtime.startProvider", fmt.Sprintf("health check failed for %s", m.Code), err)
|
||||
}
|
||||
|
||||
|
|
@ -171,27 +184,20 @@ func (rm *RuntimeManager) startProvider(ctx context.Context, dp marketplace.Disc
|
|||
// StopAll terminates all running provider processes.
|
||||
func (rm *RuntimeManager) StopAll() {
|
||||
rm.mu.Lock()
|
||||
defer rm.mu.Unlock()
|
||||
|
||||
for _, rp := range rm.providers {
|
||||
if rp.Cmd != nil && rp.Cmd.Process != nil {
|
||||
log.Printf("runtime: stopping %s (pid %d)", rp.Manifest.Code, rp.Cmd.Process.Pid)
|
||||
_ = rp.Cmd.Process.Signal(os.Interrupt)
|
||||
|
||||
// Give the process 5 seconds to exit gracefully.
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- rp.Cmd.Wait() }()
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
// Exited cleanly.
|
||||
case <-time.After(5 * time.Second):
|
||||
_ = rp.Cmd.Process.Kill()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
providers := rm.providers
|
||||
rm.providers = nil
|
||||
rm.mu.Unlock()
|
||||
|
||||
for _, rp := range providers {
|
||||
if rp.Cmd != nil && rp.Cmd.Process != nil {
|
||||
name := ""
|
||||
if rp.Manifest != nil {
|
||||
name = rp.Manifest.Code
|
||||
}
|
||||
log.Printf("runtime: stopping provider (%s) pid %d", name, rp.Cmd.Process.Pid)
|
||||
stopProviderProcess(rp.Cmd, 5*time.Second)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// List returns a copy of all running provider info.
|
||||
|
|
@ -208,6 +214,7 @@ func (rm *RuntimeManager) List() []RuntimeProviderInfo {
|
|||
Namespace: rp.Manifest.Namespace,
|
||||
Port: rp.Port,
|
||||
Dir: rp.Dir,
|
||||
Status: "running",
|
||||
})
|
||||
}
|
||||
return infos
|
||||
|
|
@ -221,6 +228,7 @@ type RuntimeProviderInfo struct {
|
|||
Namespace string `json:"namespace"`
|
||||
Port int `json:"port"`
|
||||
Dir string `json:"dir"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
// findFreePort asks the OS for an available TCP port on 127.0.0.1.
|
||||
|
|
@ -238,22 +246,81 @@ func findFreePort() (int, error) {
|
|||
}
|
||||
|
||||
// waitForHealth polls a health URL until it returns 200 or the timeout expires.
|
||||
func waitForHealth(url string, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
client := &http.Client{Timeout: 2 * time.Second}
|
||||
func waitForHealth(ctx context.Context, url string, timeout time.Duration) error {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
if timeout <= 0 {
|
||||
timeout = 5 * time.Second
|
||||
}
|
||||
|
||||
for time.Now().Before(deadline) {
|
||||
resp, err := client.Get(url)
|
||||
timeoutCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
client := &http.Client{Timeout: 2 * time.Second}
|
||||
ticker := time.NewTicker(100 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
req, err := http.NewRequestWithContext(timeoutCtx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return coreerr.E("runtime.waitForHealth", "create health request", err)
|
||||
}
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err == nil {
|
||||
resp.Body.Close()
|
||||
if resp.StatusCode == http.StatusOK {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
select {
|
||||
case <-timeoutCtx.Done():
|
||||
return coreerr.E(
|
||||
"runtime.waitForHealth",
|
||||
fmt.Sprintf("timed out after %s: %s", timeout, url),
|
||||
timeoutCtx.Err(),
|
||||
)
|
||||
case <-ticker.C:
|
||||
// Keep polling until timeout.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return coreerr.E("runtime.waitForHealth", fmt.Sprintf("timed out after %s: %s", timeout, url), nil)
|
||||
func stopProviderProcess(cmd *exec.Cmd, timeout time.Duration) {
|
||||
if cmd == nil || cmd.Process == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if timeout <= 0 {
|
||||
timeout = 1 * time.Second
|
||||
}
|
||||
|
||||
_ = cmd.Process.Signal(os.Interrupt)
|
||||
if stopProviderProcessWait(cmd, timeout) {
|
||||
return
|
||||
}
|
||||
|
||||
_ = cmd.Process.Kill()
|
||||
stopProviderProcessWait(cmd, 2*time.Second)
|
||||
}
|
||||
|
||||
func stopProviderProcessWait(cmd *exec.Cmd, timeout time.Duration) bool {
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
_ = cmd.Wait()
|
||||
close(done)
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(timeout)
|
||||
defer timer.Stop()
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
return true
|
||||
case <-timer.C:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// staticAssetGroup is a simple RouteGroup that serves static files.
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ func TestWaitForHealth_Good(t *testing.T) {
|
|||
}))
|
||||
defer srv.Close()
|
||||
|
||||
err := waitForHealth(srv.URL, 5*time.Second)
|
||||
err := waitForHealth(context.Background(), srv.URL, 5*time.Second)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
|
||||
|
|
@ -46,13 +46,13 @@ func TestWaitForHealth_Bad_Timeout(t *testing.T) {
|
|||
}))
|
||||
defer srv.Close()
|
||||
|
||||
err := waitForHealth(srv.URL, 500*time.Millisecond)
|
||||
err := waitForHealth(context.Background(), srv.URL, 500*time.Millisecond)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "timed out")
|
||||
}
|
||||
|
||||
func TestWaitForHealth_Bad_NoServer(t *testing.T) {
|
||||
err := waitForHealth("http://127.0.0.1:1", 500*time.Millisecond)
|
||||
err := waitForHealth(context.Background(), "http://127.0.0.1:1", 500*time.Millisecond)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "timed out")
|
||||
}
|
||||
|
|
@ -92,6 +92,7 @@ func TestRuntimeManager_List_Good_WithProviders(t *testing.T) {
|
|||
assert.Equal(t, "test", infos[0].Namespace)
|
||||
assert.Equal(t, 12345, infos[0].Port)
|
||||
assert.Equal(t, "/tmp/test-provider", infos[0].Dir)
|
||||
assert.Equal(t, "running", infos[0].Status)
|
||||
}
|
||||
|
||||
func TestRuntimeManager_StopAll_Good_Empty(t *testing.T) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue