fix(html): keep striptags single-pass
Some checks are pending
Security Scan / security (push) Waiting to run
Test / test (push) Waiting to run

Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
Virgil 2026-04-04 01:48:31 +00:00
parent 68874bed0b
commit acaf9d83a0
2 changed files with 32 additions and 17 deletions

View file

@@ -4,6 +4,7 @@ package html
import (
"strings"
"unicode"
"dappco.re/go/core/i18n/reversal"
)
@@ -15,33 +16,39 @@ import (
// StripTags removes HTML tags from html and returns the remaining text with
// its whitespace normalised, using a single pass over the input and no
// post-processing of the result.
//
// Everything from a '<' up to and including the next '>' is discarded. A tag
// boundary or a run of Unicode whitespace that falls between two pieces of
// text is emitted as exactly one ASCII space. Tags and whitespace that occur
// before the first text rune, or after the last one, produce no output, so
// the result never has leading or trailing spaces.
//
// This is a plain scanner, not an HTML parser: a '>' inside an attribute value
// ends the tag early, a '>' outside any tag is kept as text, and an
// unterminated '<' swallows the rest of the input.
func StripTags(html string) string {
	var b strings.Builder
	inTag := false
	// pendingSpace records that a separator is owed; it is written only when
	// another text rune follows, which is what keeps trailing space out.
	pendingSpace := false
	// seenText stays false until the first text rune is written, which is what
	// keeps leading space out.
	seenText := false

	for _, r := range html {
		if inTag {
			if r == '>' {
				inTag = false
				// A closed tag separates the text on either side of it.
				if seenText {
					pendingSpace = true
				}
			}
			continue
		}

		if r == '<' {
			inTag = true
			continue
		}

		if unicode.IsSpace(r) {
			// Collapse the whole whitespace run into one owed separator.
			if seenText {
				pendingSpace = true
			}
			continue
		}

		if pendingSpace {
			b.WriteByte(' ')
			pendingSpace = false
		}
		b.WriteRune(r)
		seenText = true
	}

	return b.String()
}
// pipeline.go: Imprint renders a node tree to HTML, strips tags, tokenises the text,

View file

@@ -32,6 +32,14 @@ func TestStripTags_MultipleRegions(t *testing.T) {
}
}
// TestStripTags_BoundaryWhitespace checks that empty elements placed before and
// after the only text-bearing element add no leading or trailing whitespace to
// the stripped output.
func TestStripTags_BoundaryWhitespace(t *testing.T) {
	const expected = "hello"
	actual := StripTags(`<p></p><p>hello</p><p></p>`)
	if actual == expected {
		return
	}
	t.Errorf("StripTags(boundary whitespace) = %q, want %q", actual, expected)
}
func TestStripTags_Empty(t *testing.T) {
got := StripTags("")
if got != "" {