- Export ReadStatus (was readStatus) for cross-package use
- AgentCompleted now emits agent.completed with repo/agent/workspace/status for every finished task, not just failures
- queue.drained only fires when genuinely empty — verified by checking PIDs are alive via kill(0), not just trusting stale status files
- Fix Docker mount paths: /root/ → /home/dev/ for non-root container
- Update all callers and tests

Co-Authored-By: Virgil <virgil@lethean.io>
666 lines
16 KiB
Go
666 lines
16 KiB
Go
// SPDX-License-Identifier: EUPL-1.2
|
|
|
|
// Package monitor provides a background subsystem that watches the ecosystem
|
|
// and pushes notifications to connected MCP clients.
|
|
//
|
|
// Checks performed on each tick:
|
|
// - Agent completions: scans workspace for newly completed agents
|
|
// - Repo drift: checks forge for repos with unpushed/unpulled changes
|
|
// - Inbox: checks for unread agent messages
|
|
package monitor
|
|
|
|
import (
	"context"
	"encoding/json"
	"errors"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sync"
	"syscall"
	"time"

	"dappco.re/go/agent/pkg/agentic"
	core "dappco.re/go/core"
	coremcp "forge.lthn.ai/core/mcp/pkg/mcp"
	"github.com/modelcontextprotocol/go-sdk/mcp"
)
|
|
|
|
// fs provides unrestricted filesystem access (root "/" = no sandbox).
|
|
//
|
|
// r := fs.Read(core.Concat(wsRoot, "/", name, "/status.json"))
|
|
// if text, ok := resultString(r); ok { json.Unmarshal([]byte(text), &st) }
|
|
var fs = agentic.LocalFs()
|
|
|
|
// workspaceStatusPaths returns all status.json files across both old and new workspace layouts.
|
|
// Old: workspace/{name}/status.json (1 level)
|
|
// New: workspace/{org}/{repo}/{identifier}/status.json (3 levels)
|
|
func workspaceStatusPaths(wsRoot string) []string {
|
|
old := core.PathGlob(core.Concat(wsRoot, "/*/status.json"))
|
|
new := core.PathGlob(core.Concat(wsRoot, "/*/*/*/status.json"))
|
|
return append(old, new...)
|
|
}
|
|
|
|
func workspaceStatusPath(wsDir string) string {
|
|
return core.Concat(wsDir, "/status.json")
|
|
}
|
|
|
|
func brainKeyPath(home string) string {
|
|
return filepath.Join(home, ".claude", "brain.key")
|
|
}
|
|
|
|
func monitorPath(path string) string {
|
|
ds := core.Env("DS")
|
|
return core.Replace(core.Replace(path, "\\", ds), "/", ds)
|
|
}
|
|
|
|
func monitorHomeDir() string {
|
|
if d := core.Env("CORE_HOME"); d != "" {
|
|
return d
|
|
}
|
|
return core.Env("DIR_HOME")
|
|
}
|
|
|
|
func monitorAPIURL() string {
|
|
if u := core.Env("CORE_API_URL"); u != "" {
|
|
return u
|
|
}
|
|
return "https://api.lthn.sh"
|
|
}
|
|
|
|
func monitorBrainKey() string {
|
|
if k := core.Env("CORE_BRAIN_KEY"); k != "" {
|
|
return k
|
|
}
|
|
if r := fs.Read(brainKeyPath(monitorHomeDir())); r.OK {
|
|
if value, ok := resultString(r); ok {
|
|
return core.Trim(value)
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func resultString(r core.Result) (string, bool) {
|
|
value, ok := r.Value.(string)
|
|
if !ok {
|
|
return "", false
|
|
}
|
|
return value, true
|
|
}
|
|
|
|
// ChannelNotifier pushes events to connected MCP sessions.
|
|
//
|
|
// mon.SetNotifier(notifier)
|
|
type ChannelNotifier interface {
|
|
ChannelSend(ctx context.Context, channel string, data any)
|
|
}
|
|
|
|
// Subsystem implements mcp.Subsystem for background monitoring.
|
|
//
|
|
// mon := monitor.New(monitor.Options{Interval: 2 * time.Minute})
|
|
// mon.SetNotifier(notifier)
|
|
// mon.Start(ctx)
|
|
type Subsystem struct {
|
|
server *mcp.Server
|
|
notifier ChannelNotifier
|
|
interval time.Duration
|
|
cancel context.CancelFunc
|
|
wg sync.WaitGroup
|
|
|
|
// Track last seen state to only notify on changes
|
|
lastCompletedCount int // completed workspaces seen on the last scan
|
|
seenCompleted map[string]bool // workspace names we've already notified about
|
|
seenRunning map[string]bool // workspace names we've already sent start notification for
|
|
completionsSeeded bool // true after first completions check
|
|
lastInboxMaxID int // highest message ID seen
|
|
inboxSeeded bool // true after first inbox check
|
|
lastSyncTimestamp int64
|
|
mu sync.Mutex
|
|
|
|
// Event-driven poke channel — dispatch goroutine sends here on completion
|
|
poke chan struct{}
|
|
}
|
|
|
|
var _ coremcp.Subsystem = (*Subsystem)(nil)
|
|
var _ agentic.CompletionNotifier = (*Subsystem)(nil)
|
|
|
|
// SetNotifier wires up channel event broadcasting.
|
|
//
|
|
// mon.SetNotifier(notifier)
|
|
func (m *Subsystem) SetNotifier(n ChannelNotifier) {
|
|
m.notifier = n
|
|
}
|
|
|
|
// Options configures the monitor interval.
|
|
//
|
|
// monitor.New(monitor.Options{Interval: 30 * time.Second})
|
|
type Options struct {
|
|
// Interval between checks (default: 2 minutes)
|
|
Interval time.Duration
|
|
}
|
|
|
|
// New creates a monitor subsystem.
|
|
//
|
|
// mon := monitor.New(monitor.Options{Interval: 30 * time.Second})
|
|
func New(opts ...Options) *Subsystem {
|
|
interval := 2 * time.Minute
|
|
if len(opts) > 0 && opts[0].Interval > 0 {
|
|
interval = opts[0].Interval
|
|
}
|
|
// Override via env for debugging
|
|
if envInterval := os.Getenv("MONITOR_INTERVAL"); envInterval != "" {
|
|
if d, err := time.ParseDuration(envInterval); err == nil {
|
|
interval = d
|
|
}
|
|
}
|
|
return &Subsystem{
|
|
interval: interval,
|
|
poke: make(chan struct{}, 1),
|
|
seenCompleted: make(map[string]bool),
|
|
seenRunning: make(map[string]bool),
|
|
}
|
|
}
|
|
|
|
// debugChannel sends a debug message via the notifier so it arrives as a channel event.
|
|
func (m *Subsystem) debugChannel(msg string) {
|
|
if m.notifier != nil {
|
|
m.notifier.ChannelSend(context.Background(), "monitor.debug", map[string]any{"msg": msg})
|
|
}
|
|
}
|
|
|
|
// Name returns the subsystem identifier used by MCP registration.
|
|
//
|
|
// mon.Name() // "monitor"
|
|
func (m *Subsystem) Name() string { return "monitor" }
|
|
|
|
// RegisterTools binds the monitor resource to an MCP server.
|
|
//
|
|
// mon.RegisterTools(server)
|
|
func (m *Subsystem) RegisterTools(server *mcp.Server) {
|
|
m.server = server
|
|
|
|
// Register a resource that clients can read for current status
|
|
server.AddResource(&mcp.Resource{
|
|
Name: "Agent Status",
|
|
URI: "status://agents",
|
|
Description: "Current status of all agent workspaces",
|
|
MIMEType: "application/json",
|
|
}, m.agentStatusResource)
|
|
}
|
|
|
|
// Start begins the background monitoring loop after MCP startup.
|
|
//
|
|
// mon.Start(ctx)
|
|
func (m *Subsystem) Start(ctx context.Context) {
|
|
monCtx, cancel := context.WithCancel(ctx)
|
|
m.cancel = cancel
|
|
|
|
core.Print(os.Stderr, "monitor: started (interval=%s, notifier=%v)", m.interval, m.notifier != nil)
|
|
|
|
m.wg.Add(1)
|
|
go func() {
|
|
defer m.wg.Done()
|
|
m.loop(monCtx)
|
|
}()
|
|
}
|
|
|
|
// Shutdown stops the monitoring loop and waits for it to exit.
|
|
//
|
|
// _ = mon.Shutdown(ctx)
|
|
func (m *Subsystem) Shutdown(_ context.Context) error {
|
|
if m.cancel != nil {
|
|
m.cancel()
|
|
}
|
|
m.wg.Wait()
|
|
return nil
|
|
}
|
|
|
|
// Poke triggers an immediate check cycle (legacy — prefer AgentStarted/AgentCompleted).
|
|
func (m *Subsystem) Poke() {
|
|
select {
|
|
case m.poke <- struct{}{}:
|
|
default:
|
|
}
|
|
}
|
|
|
|
// AgentStarted is called when an agent spawns.
|
|
// No individual notification — fleet status is checked on completion.
|
|
//
|
|
// mon.AgentStarted("codex:gpt-5.3-codex-spark", "go-io", "core/go-io/task-5")
|
|
func (m *Subsystem) AgentStarted(agent, repo, workspace string) {
|
|
// No-op — we only notify on failures and queue drain
|
|
}
|
|
|
|
// AgentCompleted is called when an agent finishes.
|
|
// Emits agent.completed for every finish, then checks if the queue is empty.
|
|
//
|
|
// mon.AgentCompleted("codex", "go-io", "core/go-io/task-5", "completed")
|
|
func (m *Subsystem) AgentCompleted(agent, repo, workspace, status string) {
|
|
m.mu.Lock()
|
|
m.seenCompleted[workspace] = true
|
|
m.mu.Unlock()
|
|
|
|
if m.notifier != nil {
|
|
m.notifier.ChannelSend(context.Background(), "agent.completed", map[string]any{
|
|
"repo": repo,
|
|
"agent": agent,
|
|
"workspace": workspace,
|
|
"status": status,
|
|
})
|
|
}
|
|
|
|
m.Poke()
|
|
go m.checkIdleAfterDelay()
|
|
}
|
|
|
|
// checkIdleAfterDelay waits briefly then checks if the fleet is genuinely idle.
|
|
// Only emits queue.drained when there are truly zero running or queued agents,
|
|
// verified by checking PIDs are alive, not just trusting status files.
|
|
func (m *Subsystem) checkIdleAfterDelay() {
|
|
time.Sleep(5 * time.Second) // wait for queue drain to fill slots
|
|
if m.notifier == nil {
|
|
return
|
|
}
|
|
|
|
running, queued := m.countLiveWorkspaces()
|
|
if running == 0 && queued == 0 {
|
|
m.notifier.ChannelSend(context.Background(), "queue.drained", map[string]any{
|
|
"running": running,
|
|
"queued": queued,
|
|
})
|
|
}
|
|
}
|
|
|
|
// countLiveWorkspaces counts workspaces that are genuinely active.
|
|
// For "running" status, verifies the PID is still alive.
|
|
func (m *Subsystem) countLiveWorkspaces() (running, queued int) {
|
|
wsRoot := agentic.WorkspaceRoot()
|
|
old := core.PathGlob(core.JoinPath(wsRoot, "*", "status.json"))
|
|
deep := core.PathGlob(core.JoinPath(wsRoot, "*", "*", "*", "status.json"))
|
|
for _, path := range append(old, deep...) {
|
|
wsDir := core.PathDir(path)
|
|
st, err := agentic.ReadStatus(wsDir)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
switch st.Status {
|
|
case "running":
|
|
if st.PID > 0 && pidAlive(st.PID) {
|
|
running++
|
|
}
|
|
case "queued":
|
|
queued++
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// pidAlive checks whether a process is still running.
|
|
func pidAlive(pid int) bool {
|
|
proc, err := os.FindProcess(pid)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
err = proc.Signal(syscall.Signal(0))
|
|
return err == nil
|
|
}
|
|
|
|
func (m *Subsystem) loop(ctx context.Context) {
|
|
// Initial check after short delay (let server fully start)
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-time.After(5 * time.Second):
|
|
}
|
|
|
|
// Initialise sync timestamp to now (don't pull everything on first run)
|
|
m.initSyncTimestamp()
|
|
|
|
// Run first check immediately
|
|
m.check(ctx)
|
|
|
|
ticker := time.NewTicker(m.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
m.check(ctx)
|
|
case <-m.poke:
|
|
m.check(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Subsystem) check(ctx context.Context) {
|
|
var messages []string
|
|
|
|
// Check agent completions
|
|
if msg := m.checkCompletions(); msg != "" {
|
|
messages = append(messages, msg)
|
|
}
|
|
|
|
// Harvest completed workspaces — push branches, check for binaries
|
|
if msg := m.harvestCompleted(); msg != "" {
|
|
messages = append(messages, msg)
|
|
}
|
|
|
|
// Check inbox
|
|
if msg := m.checkInbox(); msg != "" {
|
|
messages = append(messages, msg)
|
|
}
|
|
|
|
// Sync repos from other agents' changes
|
|
if msg := m.syncRepos(); msg != "" {
|
|
messages = append(messages, msg)
|
|
}
|
|
|
|
// Only notify if there's something new
|
|
if len(messages) == 0 {
|
|
return
|
|
}
|
|
|
|
combined := core.Join("\n", messages...)
|
|
m.notify(ctx, combined)
|
|
|
|
// Notify resource subscribers that agent status changed
|
|
if m.server != nil {
|
|
m.server.ResourceUpdated(ctx, &mcp.ResourceUpdatedNotificationParams{
|
|
URI: "status://agents",
|
|
})
|
|
}
|
|
}
|
|
|
|
// checkCompletions scans workspace for newly completed agents.
|
|
// Tracks by workspace name (not count) so harvest status rewrites
|
|
// don't suppress future notifications.
|
|
func (m *Subsystem) checkCompletions() string {
|
|
wsRoot := agentic.WorkspaceRoot()
|
|
entries := workspaceStatusPaths(wsRoot)
|
|
|
|
running := 0
|
|
queued := 0
|
|
completed := 0
|
|
var newlyCompleted []string
|
|
|
|
m.mu.Lock()
|
|
seeded := m.completionsSeeded
|
|
for _, entry := range entries {
|
|
r := fs.Read(entry)
|
|
if !r.OK {
|
|
continue
|
|
}
|
|
entryData, ok := resultString(r)
|
|
if !ok {
|
|
continue
|
|
}
|
|
var st struct {
|
|
Status string `json:"status"`
|
|
Repo string `json:"repo"`
|
|
Agent string `json:"agent"`
|
|
}
|
|
if json.Unmarshal([]byte(entryData), &st) != nil {
|
|
continue
|
|
}
|
|
|
|
// Use full relative path as dedup key — "core/go/main" not just "main"
|
|
wsDir := filepath.Dir(entry)
|
|
wsName := wsDir
|
|
if len(wsDir) > len(wsRoot)+1 {
|
|
wsName = wsDir[len(wsRoot)+1:]
|
|
}
|
|
|
|
switch st.Status {
|
|
case "completed":
|
|
completed++
|
|
if !m.seenCompleted[wsName] {
|
|
m.seenCompleted[wsName] = true
|
|
if seeded {
|
|
newlyCompleted = append(newlyCompleted, core.Sprintf("%s (%s)", st.Repo, st.Agent))
|
|
}
|
|
}
|
|
case "running":
|
|
running++
|
|
if !m.seenRunning[wsName] && seeded {
|
|
m.seenRunning[wsName] = true
|
|
// No individual start notification — too noisy
|
|
}
|
|
case "queued":
|
|
queued++
|
|
case "blocked", "failed":
|
|
if !m.seenCompleted[wsName] {
|
|
m.seenCompleted[wsName] = true
|
|
if seeded {
|
|
newlyCompleted = append(newlyCompleted, core.Sprintf("%s (%s) [%s]", st.Repo, st.Agent, st.Status))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
m.lastCompletedCount = completed
|
|
m.completionsSeeded = true
|
|
m.mu.Unlock()
|
|
|
|
if len(newlyCompleted) == 0 {
|
|
return ""
|
|
}
|
|
|
|
// Emit agent.completed for each newly finished task
|
|
if m.notifier != nil {
|
|
for _, desc := range newlyCompleted {
|
|
m.notifier.ChannelSend(context.Background(), "agent.completed", map[string]any{
|
|
"description": desc,
|
|
})
|
|
}
|
|
}
|
|
|
|
// Only emit queue.drained when genuinely empty — verified by live PID check
|
|
liveRunning, liveQueued := m.countLiveWorkspaces()
|
|
if m.notifier != nil && liveRunning == 0 && liveQueued == 0 {
|
|
m.notifier.ChannelSend(context.Background(), "queue.drained", map[string]any{
|
|
"completed": len(newlyCompleted),
|
|
})
|
|
}
|
|
|
|
msg := core.Sprintf("%d agent(s) completed", len(newlyCompleted))
|
|
if running > 0 {
|
|
msg = core.Concat(msg, core.Sprintf(", %d still running", running))
|
|
}
|
|
if queued > 0 {
|
|
msg = core.Concat(msg, core.Sprintf(", %d queued", queued))
|
|
}
|
|
return msg
|
|
}
|
|
|
|
// checkInbox checks for unread messages.
|
|
func (m *Subsystem) checkInbox() string {
|
|
apiKeyStr := os.Getenv("CORE_BRAIN_KEY")
|
|
if apiKeyStr == "" {
|
|
home, _ := os.UserHomeDir()
|
|
keyFile := brainKeyPath(home)
|
|
r := fs.Read(keyFile)
|
|
if !r.OK {
|
|
return ""
|
|
}
|
|
value, ok := resultString(r)
|
|
if !ok {
|
|
return ""
|
|
}
|
|
apiKeyStr = value
|
|
}
|
|
|
|
// Call the API to check inbox
|
|
apiURL := os.Getenv("CORE_API_URL")
|
|
if apiURL == "" {
|
|
apiURL = "https://api.lthn.sh"
|
|
}
|
|
req, err := http.NewRequest("GET", core.Concat(apiURL, "/v1/messages/inbox?agent=", url.QueryEscape(agentic.AgentName())), nil)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
req.Header.Set("Authorization", core.Concat("Bearer ", core.Trim(apiKeyStr)))
|
|
|
|
client := &http.Client{Timeout: 10 * time.Second}
|
|
httpResp, err := client.Do(req)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
defer httpResp.Body.Close()
|
|
|
|
if httpResp.StatusCode != 200 {
|
|
return ""
|
|
}
|
|
|
|
var resp struct {
|
|
Data []struct {
|
|
ID int `json:"id"`
|
|
Read bool `json:"read"`
|
|
From string `json:"from"`
|
|
Subject string `json:"subject"`
|
|
Content string `json:"content"`
|
|
} `json:"data"`
|
|
}
|
|
if json.NewDecoder(httpResp.Body).Decode(&resp) != nil {
|
|
m.debugChannel("checkInbox: failed to decode response")
|
|
return ""
|
|
}
|
|
|
|
// Find max ID, count unread, collect new messages
|
|
maxID := 0
|
|
unread := 0
|
|
|
|
m.mu.Lock()
|
|
prevMaxID := m.lastInboxMaxID
|
|
seeded := m.inboxSeeded
|
|
m.mu.Unlock()
|
|
|
|
type newMessage struct {
|
|
ID int `json:"id"`
|
|
From string `json:"from"`
|
|
Subject string `json:"subject"`
|
|
Content string `json:"content"`
|
|
}
|
|
var newMessages []newMessage
|
|
|
|
for _, msg := range resp.Data {
|
|
if msg.ID > maxID {
|
|
maxID = msg.ID
|
|
}
|
|
if !msg.Read {
|
|
unread++
|
|
}
|
|
// Collect messages newer than what we've seen
|
|
if msg.ID > prevMaxID {
|
|
newMessages = append(newMessages, newMessage{
|
|
ID: msg.ID,
|
|
From: msg.From,
|
|
Subject: msg.Subject,
|
|
Content: msg.Content,
|
|
})
|
|
}
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.lastInboxMaxID = maxID
|
|
m.inboxSeeded = true
|
|
m.mu.Unlock()
|
|
|
|
// First check after startup: seed, don't fire
|
|
if !seeded {
|
|
return ""
|
|
}
|
|
|
|
// Only fire if there are new messages (higher ID than last seen)
|
|
if maxID <= prevMaxID || len(newMessages) == 0 {
|
|
return ""
|
|
}
|
|
|
|
// Push channel event with full message content
|
|
if m.notifier != nil {
|
|
m.notifier.ChannelSend(context.Background(), "inbox.message", map[string]any{
|
|
"new": len(newMessages),
|
|
"total": unread,
|
|
"messages": newMessages,
|
|
})
|
|
}
|
|
|
|
return core.Sprintf("%d unread message(s) in inbox", unread)
|
|
}
|
|
|
|
// notify sends a log notification to all connected MCP sessions.
|
|
func (m *Subsystem) notify(ctx context.Context, message string) {
|
|
if m.server == nil {
|
|
return
|
|
}
|
|
|
|
// Use the server's session list to broadcast
|
|
for ss := range m.server.Sessions() {
|
|
ss.Log(ctx, &mcp.LoggingMessageParams{
|
|
Level: "info",
|
|
Logger: "monitor",
|
|
Data: message,
|
|
})
|
|
}
|
|
}
|
|
|
|
// agentStatusResource returns current workspace status as a JSON resource.
|
|
func (m *Subsystem) agentStatusResource(ctx context.Context, req *mcp.ReadResourceRequest) (*mcp.ReadResourceResult, error) {
|
|
wsRoot := agentic.WorkspaceRoot()
|
|
entries := workspaceStatusPaths(wsRoot)
|
|
|
|
type wsInfo struct {
|
|
Name string `json:"name"`
|
|
Status string `json:"status"`
|
|
Repo string `json:"repo"`
|
|
Agent string `json:"agent"`
|
|
PRURL string `json:"pr_url,omitempty"`
|
|
}
|
|
|
|
var workspaces []wsInfo
|
|
for _, entry := range entries {
|
|
r := fs.Read(entry)
|
|
if !r.OK {
|
|
continue
|
|
}
|
|
entryData, ok := resultString(r)
|
|
if !ok {
|
|
continue
|
|
}
|
|
var st struct {
|
|
Status string `json:"status"`
|
|
Repo string `json:"repo"`
|
|
Agent string `json:"agent"`
|
|
PRURL string `json:"pr_url"`
|
|
}
|
|
if json.Unmarshal([]byte(entryData), &st) != nil {
|
|
continue
|
|
}
|
|
entryDir := filepath.Dir(entry)
|
|
entryName := entryDir
|
|
if len(entryDir) > len(wsRoot)+1 {
|
|
entryName = entryDir[len(wsRoot)+1:]
|
|
}
|
|
workspaces = append(workspaces, wsInfo{
|
|
Name: entryName,
|
|
Status: st.Status,
|
|
Repo: st.Repo,
|
|
Agent: st.Agent,
|
|
PRURL: st.PRURL,
|
|
})
|
|
}
|
|
|
|
result, err := json.Marshal(workspaces)
|
|
if err != nil {
|
|
return nil, core.E("monitor.agentStatus", "failed to encode workspace status", err)
|
|
}
|
|
return &mcp.ReadResourceResult{
|
|
Contents: []*mcp.ResourceContents{
|
|
{
|
|
URI: "status://agents",
|
|
MIMEType: "application/json",
|
|
Text: string(result),
|
|
},
|
|
},
|
|
}, nil
|
|
}
|