diff --git a/pipeline.go b/pipeline.go index f452c8e..3159b4f 100644 --- a/pipeline.go +++ b/pipeline.go @@ -4,6 +4,7 @@ package html import ( "strings" + "unicode" "dappco.re/go/core/i18n/reversal" ) @@ -15,33 +16,39 @@ import ( func StripTags(html string) string { var b strings.Builder inTag := false - prevSpace := true // starts true to trim leading space + pendingSpace := false + seenText := false for _, r := range html { + if inTag { + if r == '>' { + inTag = false + if seenText { + pendingSpace = true + } + } + continue + } + if r == '<' { inTag = true continue } - if r == '>' { - inTag = false - if !prevSpace { - b.WriteByte(' ') - prevSpace = true + + if unicode.IsSpace(r) { + if seenText { + pendingSpace = true } continue } - if !inTag { - if r == ' ' || r == '\t' || r == '\n' { - if !prevSpace { - b.WriteByte(' ') - prevSpace = true - } - } else { - b.WriteRune(r) - prevSpace = false - } + + if pendingSpace { + b.WriteByte(' ') + pendingSpace = false } + b.WriteRune(r) + seenText = true } - return strings.TrimSpace(b.String()) + return b.String() } // pipeline.go: Imprint renders a node tree to HTML, strips tags, tokenises the text, diff --git a/pipeline_test.go b/pipeline_test.go index d6637b3..265b0e3 100644 --- a/pipeline_test.go +++ b/pipeline_test.go @@ -32,6 +32,14 @@ func TestStripTags_MultipleRegions(t *testing.T) { } } +func TestStripTags_BoundaryWhitespace(t *testing.T) { + got := StripTags(`
hello
`) + want := "hello" + if got != want { + t.Errorf("StripTags(boundary whitespace) = %q, want %q", got, want) + } +} + func TestStripTags_Empty(t *testing.T) { got := StripTags("") if got != "" {