fix(html): keep striptags single-pass
Co-Authored-By: Virgil <virgil@lethean.io>
This commit is contained in:
parent
68874bed0b
commit
acaf9d83a0
2 changed files with 32 additions and 17 deletions
41
pipeline.go
41
pipeline.go
|
|
@ -4,6 +4,7 @@ package html
|
|||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"dappco.re/go/core/i18n/reversal"
|
||||
)
|
||||
|
|
@ -15,33 +16,39 @@ import (
|
|||
// StripTags removes HTML tags from html and returns the remaining text with
// whitespace normalised, in a single pass over the input.
//
// Everything between '<' and the next '>' is dropped. Each tag boundary and
// each run of Unicode whitespace (as reported by unicode.IsSpace) collapses to
// one ASCII space between text runs. Separators are emitted lazily, only when
// more text follows, so the result never has leading or trailing whitespace
// and no trailing trim pass is needed. An unterminated tag swallows the rest
// of the input.
func StripTags(html string) string {
	var b strings.Builder
	inTag := false
	pendingSpace := false // a separator is owed before the next text rune
	seenText := false     // stays false until the first text rune, suppressing leading separators
	for _, r := range html {
		if inTag {
			if r == '>' {
				inTag = false
				// A closing '>' separates adjacent text runs, e.g. "<p>a</p><p>b</p>".
				if seenText {
					pendingSpace = true
				}
			}
			continue
		}

		if r == '<' {
			inTag = true
			continue
		}

		if unicode.IsSpace(r) {
			if seenText {
				pendingSpace = true
			}
			continue
		}

		// Flush the deferred separator only now that real text follows it.
		if pendingSpace {
			b.WriteByte(' ')
			pendingSpace = false
		}
		b.WriteRune(r)
		seenText = true
	}
	return b.String()
}
|
||||
|
||||
// pipeline.go: Imprint renders a node tree to HTML, strips tags, tokenises the text,
|
||||
|
|
|
|||
|
|
@ -32,6 +32,14 @@ func TestStripTags_MultipleRegions(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestStripTags_BoundaryWhitespace(t *testing.T) {
|
||||
got := StripTags(`<p></p><p>hello</p><p></p>`)
|
||||
want := "hello"
|
||||
if got != want {
|
||||
t.Errorf("StripTags(boundary whitespace) = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStripTags_Empty(t *testing.T) {
|
||||
got := StripTags("")
|
||||
if got != "" {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue