From 57fb567a685899af1e99c4c6503c963d38a4676f Mon Sep 17 00:00:00 2001 From: Virgil Date: Thu, 2 Apr 2026 13:55:56 +0000 Subject: [PATCH] feat(gui): add webview element screenshots --- pkg/display/display.go | 26 ++++++++ pkg/mcp/tools_webview.go | 39 ++++++++++++ pkg/webview/messages.go | 6 ++ pkg/webview/service.go | 121 ++++++++++++++++++++++++++++++++++++ pkg/webview/service_test.go | 50 ++++++++++++++- pkg/window/platform.go | 28 +++++---- 6 files changed, 256 insertions(+), 14 deletions(-) diff --git a/pkg/display/display.go b/pkg/display/display.go index 4797195..9803012 100644 --- a/pkg/display/display.go +++ b/pkg/display/display.go @@ -345,6 +345,16 @@ func (s *Service) handleWSMessage(msg WSMessage) (any, bool, error) { return nil, false, e } result, handled, err = s.Core().PERFORM(webview.TaskScreenshot{Window: w}) + case "webview:screenshot-element": + w, e := wsRequire(msg.Data, "window") + if e != nil { + return nil, false, e + } + sel, e := wsRequire(msg.Data, "selector") + if e != nil { + return nil, false, e + } + result, handled, err = s.Core().PERFORM(webview.TaskScreenshotElement{Window: w, Selector: sel}) case "webview:scroll": w, e := wsRequire(msg.Data, "window") if e != nil { @@ -521,6 +531,22 @@ func (s *Service) handleWSMessage(msg WSMessage) (any, bool, error) { } sel, _ := msg.Data["selector"].(string) // selector optional for dom-tree (defaults to root) result, handled, err = s.Core().QUERY(webview.QueryDOMTree{Window: w, Selector: sel}) + case "webview:source": + w, e := wsRequire(msg.Data, "window") + if e != nil { + return nil, false, e + } + result, handled, err = s.Core().QUERY(webview.QueryDOMTree{Window: w}) + case "webview:element-info": + w, e := wsRequire(msg.Data, "window") + if e != nil { + return nil, false, e + } + sel, e := wsRequire(msg.Data, "selector") + if e != nil { + return nil, false, e + } + result, handled, err = s.Core().QUERY(webview.QuerySelector{Window: w, Selector: sel}) case "webview:url": w, e := wsRequire(msg.Data, "window") if e != nil { diff --git a/pkg/mcp/tools_webview.go b/pkg/mcp/tools_webview.go index 3f4c368..22b8c70 100644 --- a/pkg/mcp/tools_webview.go +++ b/pkg/mcp/tools_webview.go @@ -110,6 +110,30 @@ func (s *Subsystem) webviewScreenshot(_ context.Context, _ *mcp.CallToolRequest, return nil, WebviewScreenshotOutput{Base64: sr.Base64, MimeType: sr.MimeType}, nil } +// --- webview_screenshot_element --- + +type WebviewScreenshotElementInput struct { + Window string `json:"window"` + Selector string `json:"selector"` +} + +type WebviewScreenshotElementOutput struct { + Base64 string `json:"base64"` + MimeType string `json:"mimeType"` +} + +func (s *Subsystem) webviewScreenshotElement(_ context.Context, _ *mcp.CallToolRequest, input WebviewScreenshotElementInput) (*mcp.CallToolResult, WebviewScreenshotElementOutput, error) { + result, _, err := s.core.PERFORM(webview.TaskScreenshotElement{Window: input.Window, Selector: input.Selector}) + if err != nil { + return nil, WebviewScreenshotElementOutput{}, err + } + sr, ok := result.(webview.ScreenshotResult) + if !ok { + return nil, WebviewScreenshotElementOutput{}, fmt.Errorf("unexpected result type from webview element screenshot") + } + return nil, WebviewScreenshotElementOutput{Base64: sr.Base64, MimeType: sr.MimeType}, nil +} + // --- webview_scroll --- type WebviewScrollInput struct { @@ -328,6 +352,12 @@ func (s *Subsystem) webviewQuery(_ context.Context, _ *mcp.CallToolRequest, inpu return nil, WebviewQueryOutput{Element: el}, nil } +// --- webview_element_info --- + +func (s *Subsystem) webviewElementInfo(_ context.Context, _ *mcp.CallToolRequest, input WebviewQueryInput) (*mcp.CallToolResult, WebviewQueryOutput, error) { + return s.webviewQuery(nil, nil, input) +} + // --- webview_query_all --- type WebviewQueryAllInput struct { @@ -374,6 +404,12 @@ func (s *Subsystem) webviewDOMTree(_ context.Context, _ *mcp.CallToolRequest, in return nil, WebviewDOMTreeOutput{HTML: html}, nil } +// --- webview_source --- + +func (s *Subsystem) webviewSource(_ context.Context, _ *mcp.CallToolRequest, input WebviewDOMTreeInput) (*mcp.CallToolResult, WebviewDOMTreeOutput, error) { + return s.webviewDOMTree(nil, nil, input) +} + // --- webview_computed_style --- type WebviewComputedStyleInput struct { @@ -613,6 +649,7 @@ func (s *Subsystem) registerWebviewTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{Name: "webview_type", Description: "Type text into an element in a webview"}, s.webviewType) mcp.AddTool(server, &mcp.Tool{Name: "webview_navigate", Description: "Navigate a webview to a URL"}, s.webviewNavigate) mcp.AddTool(server, &mcp.Tool{Name: "webview_screenshot", Description: "Capture a webview screenshot as base64 PNG"}, s.webviewScreenshot) + mcp.AddTool(server, &mcp.Tool{Name: "webview_screenshot_element", Description: "Capture a specific element as base64 PNG"}, s.webviewScreenshotElement) mcp.AddTool(server, &mcp.Tool{Name: "webview_scroll", Description: "Scroll a webview to an absolute position"}, s.webviewScroll) mcp.AddTool(server, &mcp.Tool{Name: "webview_hover", Description: "Hover over an element in a webview"}, s.webviewHover) mcp.AddTool(server, &mcp.Tool{Name: "webview_select", Description: "Select an option in a select element"}, s.webviewSelect) @@ -622,8 +659,10 @@ func (s *Subsystem) registerWebviewTools(server *mcp.Server) { mcp.AddTool(server, &mcp.Tool{Name: "webview_console", Description: "Get captured console messages from a webview"}, s.webviewConsole) mcp.AddTool(server, &mcp.Tool{Name: "webview_console_clear", Description: "Clear captured console messages"}, s.webviewConsoleClear) mcp.AddTool(server, &mcp.Tool{Name: "webview_query", Description: "Find a single DOM element by CSS selector"}, s.webviewQuery) + mcp.AddTool(server, &mcp.Tool{Name: "webview_element_info", Description: "Get detailed information about a DOM element"}, s.webviewElementInfo) mcp.AddTool(server, &mcp.Tool{Name: "webview_query_all", Description: "Find all DOM elements matching a CSS selector"}, s.webviewQueryAll) mcp.AddTool(server, &mcp.Tool{Name: "webview_dom_tree", Description: "Get HTML content of a webview"}, s.webviewDOMTree) + mcp.AddTool(server, &mcp.Tool{Name: "webview_source", Description: "Get page HTML source"}, s.webviewSource) mcp.AddTool(server, &mcp.Tool{Name: "webview_computed_style", Description: "Get computed styles for an element"}, s.webviewComputedStyle) mcp.AddTool(server, &mcp.Tool{Name: "webview_performance", Description: "Get page performance metrics"}, s.webviewPerformance) mcp.AddTool(server, &mcp.Tool{Name: "webview_resources", Description: "List loaded page resources"}, s.webviewResources) diff --git a/pkg/webview/messages.go b/pkg/webview/messages.go index a063438..9459057 100644 --- a/pkg/webview/messages.go +++ b/pkg/webview/messages.go @@ -94,6 +94,12 @@ type TaskScreenshot struct { Window string `json:"window"` } +// TaskScreenshotElement captures a specific element as PNG. Result: ScreenshotResult +type TaskScreenshotElement struct { + Window string `json:"window"` + Selector string `json:"selector"` +} + // TaskScroll scrolls to an absolute position (window.scrollTo). Result: nil type TaskScroll struct { Window string `json:"window"` diff --git a/pkg/webview/service.go b/pkg/webview/service.go index 44c4266..b5f9c69 100644 --- a/pkg/webview/service.go +++ b/pkg/webview/service.go @@ -2,10 +2,15 @@ package webview import ( + "bytes" "context" "encoding/base64" "encoding/json" "fmt" + "image" + "image/draw" + "image/png" + "math" "reflect" "strconv" "strings" @@ -381,6 +386,19 @@ func (s *Service) handleTask(_ *core.Core, t core.Task) (any, bool, error) { Base64: base64.StdEncoding.EncodeToString(png), MimeType: "image/png", }, true, nil + case TaskScreenshotElement: + conn, err := s.getConn(t.Window) + if err != nil { + return nil, true, err + } + png, err := captureElementScreenshot(conn, t.Selector) + if err != nil { + return nil, true, err + } + return ScreenshotResult{ + Base64: base64.StdEncoding.EncodeToString(png), + MimeType: "image/png", + }, true, nil case TaskScroll: conn, err := s.getConn(t.Window) if err != nil { @@ -433,8 +451,26 @@ func (s *Service) handleTask(_ *core.Core, t core.Task) (any, bool, error) { _, err = conn.Evaluate(highlightScript(t.Selector, t.Colour)) return nil, true, err case TaskOpenDevTools: + ws, err := core.ServiceFor[*window.Service](s.Core(), "window") + if err != nil { + return nil, true, err + } + pw, ok := ws.Manager().Get(t.Window) + if !ok { + return nil, true, fmt.Errorf("window not found: %s", t.Window) + } + pw.OpenDevTools() return nil, true, nil case TaskCloseDevTools: + ws, err := core.ServiceFor[*window.Service](s.Core(), "window") + if err != nil { + return nil, true, err + } + pw, ok := ws.Manager().Get(t.Window) + if !ok { + return nil, true, fmt.Errorf("window not found: %s", t.Window) + } + pw.CloseDevTools() return nil, true, nil case TaskInjectNetworkLogging: conn, err := s.getConn(t.Window) @@ -502,6 +538,91 @@ func coerceToNetworkEntries(v any) ([]NetworkEntry, error) { return coerceJSON[[]NetworkEntry](v) } +type elementScreenshotBounds struct { + Left float64 `json:"left"` + Top float64 `json:"top"` + Width float64 `json:"width"` + Height float64 `json:"height"` + DevicePixelRatio float64 `json:"devicePixelRatio"` +} + +func elementScreenshotScript(selector string) string { + sel := jsQuote(selector) + return fmt.Sprintf(`(function(){ + const el = document.querySelector(%s); + if (!el) return null; + try { el.scrollIntoView({block: "center", inline: "center"}); } catch (e) {} + const rect = el.getBoundingClientRect(); + return { + left: rect.left, + top: rect.top, + width: rect.width, + height: rect.height, + devicePixelRatio: window.devicePixelRatio || 1 + }; +})()`, sel) +} + +func captureElementScreenshot(conn connector, selector string) ([]byte, error) { + result, err := conn.Evaluate(elementScreenshotScript(selector)) + if err != nil { + return nil, err + } + if result == nil { + return nil, fmt.Errorf("webview: element not found: %s", selector) + } + bounds, err := coerceJSON[elementScreenshotBounds](result) + if err != nil { + return nil, err + } + if bounds.Width <= 0 || bounds.Height <= 0 { + return nil, fmt.Errorf("webview: element has no measurable bounds: %s", selector) + } + raw, err := conn.Screenshot() + if err != nil { + return nil, err + } + img, _, err := image.Decode(bytes.NewReader(raw)) + if err != nil { + return nil, err + } + + scale := bounds.DevicePixelRatio + if scale <= 0 { + scale = 1 + } + left := int(math.Floor(bounds.Left * scale)) + top := int(math.Floor(bounds.Top * scale)) + right := int(math.Ceil((bounds.Left + bounds.Width) * scale)) + bottom := int(math.Ceil((bounds.Top + bounds.Height) * scale)) + + srcBounds := img.Bounds() + if left < srcBounds.Min.X { + left = srcBounds.Min.X + } + if top < srcBounds.Min.Y { + top = srcBounds.Min.Y + } + if right > srcBounds.Max.X { + right = srcBounds.Max.X + } + if bottom > srcBounds.Max.Y { + bottom = srcBounds.Max.Y + } + if right <= left || bottom <= top { + return nil, fmt.Errorf("webview: element is outside the captured screenshot: %s", selector) + } + + crop := image.NewRGBA(image.Rect(0, 0, right-left, bottom-top)) + draw.Draw(crop, crop.Bounds(), img, image.Point{X: left, Y: top}, draw.Src) + + var buf bytes.Buffer + if err := png.Encode(&buf, crop); err != nil { + return nil, err + } + return buf.Bytes(), nil +} + // realConnector wraps *gowebview.Webview, converting types at the boundary. type realConnector struct { wv *gowebview.Webview diff --git a/pkg/webview/service_test.go b/pkg/webview/service_test.go index cf605ae..980c791 100644 --- a/pkg/webview/service_test.go +++ b/pkg/webview/service_test.go @@ -2,7 +2,12 @@ package webview import ( + "bytes" "context" + "encoding/base64" + "image" + "image/color" + "image/png" "strings" "testing" @@ -107,7 +112,11 @@ func (m *mockConnector) GetConsole() []ConsoleMessage { return m.console } func newTestService(t *testing.T, mock *mockConnector) (*Service, *core.Core) { t.Helper() factory := Register() - c, err := core.New(core.WithService(factory), core.WithServiceLock()) + c, err := core.New( + core.WithService(window.Register(window.NewMockPlatform())), + core.WithService(factory), + core.WithServiceLock(), + ) require.NoError(t, err) require.NoError(t, c.ServiceStartup(context.Background(), nil)) svc := core.MustServiceFor[*Service](c, "webview") @@ -203,6 +212,43 @@ func TestTaskScreenshot_Good(t *testing.T) { assert.NotEmpty(t, sr.Base64) } +func TestTaskScreenshotElement_Good(t *testing.T) { + img := image.NewRGBA(image.Rect(0, 0, 4, 4)) + for y := 0; y < 4; y++ { + for x := 0; x < 4; x++ { + img.SetRGBA(x, y, color.RGBA{R: uint8(x * 40), G: uint8(y * 40), B: 200, A: 255}) + } + } + var buf bytes.Buffer + require.NoError(t, png.Encode(&buf, img)) + + mock := &mockConnector{ + screenshot: buf.Bytes(), + evalFn: func(script string) (any, error) { + return map[string]any{ + "left": 1.0, + "top": 1.0, + "width": 2.0, + "height": 2.0, + "devicePixelRatio": 1.0, + }, nil + }, + } + _, c := newTestService(t, mock) + + result, handled, err := c.PERFORM(TaskScreenshotElement{Window: "main", Selector: "#card"}) + require.NoError(t, err) + assert.True(t, handled) + sr, ok := result.(ScreenshotResult) + require.True(t, ok) + + raw, err := base64.StdEncoding.DecodeString(sr.Base64) + require.NoError(t, err) + decoded, err := png.Decode(bytes.NewReader(raw)) + require.NoError(t, err) + assert.Equal(t, image.Rect(0, 0, 2, 2), decoded.Bounds()) +} + func TestTaskClearConsole_Good(t *testing.T) { mock := &mockConnector{} _, c := newTestService(t, mock) @@ -214,6 +260,8 @@ func TestTaskClearConsole_Good(t *testing.T) { func TestTaskDevTools_Good(t *testing.T) { _, c := newTestService(t, &mockConnector{}) + _, _, err := c.PERFORM(window.TaskOpenWindow{Opts: []window.WindowOption{window.WithName("main")}}) + require.NoError(t, err) _, handled, err := c.PERFORM(TaskOpenDevTools{Window: "main"}) require.NoError(t, err) assert.True(t, handled) diff --git a/pkg/window/platform.go b/pkg/window/platform.go index ae4e2e6..58943b0 100644 --- a/pkg/window/platform.go +++ b/pkg/window/platform.go @@ -9,19 +9,19 @@ type Platform interface { // PlatformWindowOptions are the backend-specific options passed to CreateWindow. type PlatformWindowOptions struct { - Name string - Title string - URL string - Width, Height int - X, Y int - MinWidth, MinHeight int - MaxWidth, MaxHeight int - Frameless bool - Hidden bool - AlwaysOnTop bool - BackgroundColour [4]uint8 // RGBA - DisableResize bool - EnableFileDrop bool + Name string + Title string + URL string + Width, Height int + X, Y int + MinWidth, MinHeight int + MaxWidth, MaxHeight int + Frameless bool + Hidden bool + AlwaysOnTop bool + BackgroundColour [4]uint8 // RGBA + DisableResize bool + EnableFileDrop bool } // PlatformWindow is a live window handle from the backend. @@ -54,6 +54,8 @@ type PlatformWindow interface { Hide() Fullscreen() UnFullscreen() + OpenDevTools() + CloseDevTools() // Events OnWindowEvent(handler func(event WindowEvent))