diff --git a/pipeline.go b/pipeline.go index 146533a..bd5bf42 100644 --- a/pipeline.go +++ b/pipeline.go @@ -14,36 +14,59 @@ import ( // Does not handle script/style element content (go-html does not generate these). func StripTags(html string) string { b := core.NewBuilder() + runes := []rune(html) inTag := false prevSpace := true // starts true to trim leading space - for _, r := range html { - if r == '<' { - inTag = true - continue - } - if r == '>' { - inTag = false - if !prevSpace { - b.WriteByte(' ') - prevSpace = true - } - continue - } - if !inTag { - if r == ' ' || r == '\t' || r == '\n' { + for i := 0; i < len(runes); i++ { + r := runes[i] + if inTag { + if r == '>' { + inTag = false if !prevSpace { b.WriteByte(' ') prevSpace = true } - } else { - b.WriteRune(r) - prevSpace = false } + continue + } + + switch r { + case '<': + if i+1 < len(runes) && isTagStartRune(runes[i+1]) { + inTag = true + continue + } + b.WriteRune(r) + prevSpace = false + case '>': + b.WriteRune(r) + prevSpace = false + case ' ', '\t', '\n', '\r': + if !prevSpace { + b.WriteByte(' ') + prevSpace = true + } + default: + b.WriteRune(r) + prevSpace = false } } return core.Trim(b.String()) } +func isTagStartRune(r rune) bool { + switch { + case r >= 'a' && r <= 'z': + return true + case r >= 'A' && r <= 'Z': + return true + case r == '/', r == '!', r == '?': + return true + default: + return false + } +} + // Imprint renders a node tree to HTML, strips tags, tokenises the text, // and returns a GrammarImprint — the full render-reverse pipeline. // Usage example: imp := Imprint(Text("welcome"), NewContext()) diff --git a/pipeline_test.go b/pipeline_test.go index 9e556d2..7450ee9 100644 --- a/pipeline_test.go +++ b/pipeline_test.go @@ -46,6 +46,14 @@ func TestStripTags_NoTags_Good(t *testing.T) { } } +func TestStripTags_PreservesComparisonOperators_Good(t *testing.T) { + got := StripTags(`
1 < 2 and 3 > 2
`) + want := "1 < 2 and 3 > 2" + if got != want { + t.Errorf("StripTags(comparisons) = %q, want %q", got, want) + } +} + func TestStripTags_Entities_Good(t *testing.T) { got := StripTags(`<script>`) want := "<script>"