diff options
author | Bjørn Erik Pedersen <[email protected]> | 2022-05-25 10:56:14 +0200 |
---|---|---|
committer | Bjørn Erik Pedersen <[email protected]> | 2022-05-25 17:55:23 +0200 |
commit | 3854a6fa6c323d1c09aa71a0626c9eef62709294 (patch) | |
tree | ea3727c14f73fb73aef89d43795dd6d6f75f1220 | |
parent | cd0112a05a9ddb7043c9808284f93d8099c48473 (diff) | |
download | hugo-3854a6fa6c323d1c09aa71a0626c9eef62709294.tar.gz hugo-3854a6fa6c323d1c09aa71a0626c9eef62709294.zip |
Fix Plainify edge cases
This commit replaces the main part of `helpers.StripHTML` with Go's implementation in its html/template package.
It's a little slower, but correctness is more important:
```bash
BenchmarkStripHTMLOld-10 680316 1764 ns/op 728 B/op 4 allocs/op
BenchmarkStripHTMLNew-10 384520 3099 ns/op 2089 B/op 10 allocs/op
```
Fixes #9199
Fixes #9909
Closes #9410
-rw-r--r-- | helpers/content.go | 40 | ||||
-rw-r--r-- | helpers/content_test.go | 38 | ||||
-rw-r--r-- | hugolib/page__per_output.go | 2 | ||||
-rw-r--r-- | hugolib/page_test.go | 5 | ||||
-rw-r--r-- | tpl/internal/go_templates/htmltemplate/hugo_template.go | 5 | ||||
-rw-r--r-- | tpl/strings/strings.go | 7 | ||||
-rw-r--r-- | tpl/template.go | 46 | ||||
-rw-r--r-- | tpl/template_test.go | 41 | ||||
-rw-r--r-- | tpl/transform/transform.go | 3 | ||||
-rw-r--r-- | tpl/transform/transform_test.go | 1 |
10 files changed, 103 insertions, 85 deletions
diff --git a/helpers/content.go b/helpers/content.go index 835663b76..d04e34a07 100644 --- a/helpers/content.go +++ b/helpers/content.go @@ -34,7 +34,6 @@ import ( "github.com/gohugoio/hugo/markup" - bp "github.com/gohugoio/hugo/bufferpool" "github.com/gohugoio/hugo/config" ) @@ -104,45 +103,6 @@ func NewContentSpec(cfg config.Provider, logger loggers.Logger, contentFs afero. return spec, nil } -var stripHTMLReplacer = strings.NewReplacer("\n", " ", "</p>", "\n", "<br>", "\n", "<br />", "\n") - -// StripHTML accepts a string, strips out all HTML tags and returns it. -func StripHTML(s string) string { - // Shortcut strings with no tags in them - if !strings.ContainsAny(s, "<>") { - return s - } - s = stripHTMLReplacer.Replace(s) - - // Walk through the string removing all tags - b := bp.GetBuffer() - defer bp.PutBuffer(b) - var inTag, isSpace, wasSpace bool - for _, r := range s { - if !inTag { - isSpace = false - } - - switch { - case r == '<': - inTag = true - case r == '>': - inTag = false - case unicode.IsSpace(r): - isSpace = true - fallthrough - default: - if !inTag && (!isSpace || (isSpace && !wasSpace)) { - b.WriteRune(r) - } - } - - wasSpace = isSpace - - } - return b.String() -} - // stripEmptyNav strips out empty <nav> tags from content. func stripEmptyNav(in []byte) []byte { return bytes.Replace(in, []byte("<nav>\n</nav>\n\n"), []byte(``), -1) diff --git a/helpers/content_test.go b/helpers/content_test.go index 4b67b44f0..54b7ef3f9 100644 --- a/helpers/content_test.go +++ b/helpers/content_test.go @@ -52,44 +52,6 @@ func TestTrimShortHTML(t *testing.T) { } } -func TestStripHTML(t *testing.T) { - type test struct { - input, expected string - } - data := []test{ - {"<h1>strip h1 tag <h1>", "strip h1 tag "}, - {"<p> strip p tag </p>", " strip p tag "}, - {"</br> strip br<br>", " strip br\n"}, - {"</br> strip br2<br />", " strip br2\n"}, - {"This <strong>is</strong> a\nnewline", "This is a newline"}, - {"No Tags", "No Tags"}, - {`<p>Summary Next Line. -<figure > - - <img src="/not/real" /> - - -</figure> -. -More text here.</p> - -<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"}, - } - for i, d := range data { - output := StripHTML(d.input) - if d.expected != output { - t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output) - } - } -} - -func BenchmarkStripHTML(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - StripHTML(tstHTMLContent) - } -} - func TestStripEmptyNav(t *testing.T) { c := qt.New(t) cleaned := stripEmptyNav([]byte("do<nav>\n</nav>\n\nbedobedo")) diff --git a/hugolib/page__per_output.go b/hugolib/page__per_output.go index 6460b120b..59299b709 100644 --- a/hugolib/page__per_output.go +++ b/hugolib/page__per_output.go @@ -201,7 +201,7 @@ func newPageContentOutput(p *pageState, po *pageOutput) (*pageContentOutput, err }) cp.initPlain = cp.initMain.Branch(func() (any, error) { - cp.plain = helpers.StripHTML(string(cp.content)) + cp.plain = tpl.StripHTML(string(cp.content)) cp.plainWords = strings.Fields(cp.plain) cp.setWordCounts(p.m.isCJKLanguage) diff --git a/hugolib/page_test.go b/hugolib/page_test.go index b93173131..e5f8840a6 100644 --- a/hugolib/page_test.go +++ b/hugolib/page_test.go @@ -26,6 +26,7 @@ import ( "github.com/gohugoio/hugo/htesting" "github.com/gohugoio/hugo/markup/asciidocext" "github.com/gohugoio/hugo/markup/rst" + "github.com/gohugoio/hugo/tpl" "github.com/gohugoio/hugo/config" @@ -40,7 +41,6 @@ import ( qt "github.com/frankban/quicktest" "github.com/gohugoio/hugo/deps" - "github.com/gohugoio/hugo/helpers" ) const ( @@ -351,7 +351,7 @@ func normalizeExpected(ext, str string) string { default: return str case "html": - return strings.Trim(helpers.StripHTML(str), " ") + return strings.Trim(tpl.StripHTML(str), " ") case "ad": paragraphs := strings.Split(str, "</p>") expected := "" @@ -1736,6 +1736,7 @@ Len Summary: {{ len .Summary }} Len Content: {{ len .Content }} SUMMARY:{{ .Summary }}:{{ len .Summary }}:END + `} b := newTestSitesBuilder(t) diff --git a/tpl/internal/go_templates/htmltemplate/hugo_template.go b/tpl/internal/go_templates/htmltemplate/hugo_template.go index eba54fbbf..99edf8f68 100644 --- a/tpl/internal/go_templates/htmltemplate/hugo_template.go +++ b/tpl/internal/go_templates/htmltemplate/hugo_template.go @@ -34,3 +34,8 @@ func (t *Template) Prepare() (*template.Template, error) { } return t.text, nil } + +// See https://github.com/golang/go/issues/5884 +func StripTags(html string) string { + return stripTags(html) +} diff --git a/tpl/strings/strings.go b/tpl/strings/strings.go index 482a8a837..a49451483 100644 --- a/tpl/strings/strings.go +++ b/tpl/strings/strings.go @@ -25,6 +25,7 @@ import ( "github.com/gohugoio/hugo/common/text" "github.com/gohugoio/hugo/deps" "github.com/gohugoio/hugo/helpers" + "github.com/gohugoio/hugo/tpl" "github.com/spf13/cast" ) @@ -52,7 +53,7 @@ func (ns *Namespace) CountRunes(s any) (int, error) { } counter := 0 - for _, r := range helpers.StripHTML(ss) { + for _, r := range tpl.StripHTML(ss) { if !helpers.IsWhitespace(r) { counter++ } @@ -83,11 +84,11 @@ func (ns *Namespace) CountWords(s any) (int, error) { } if !isCJKLanguage { - return len(strings.Fields(helpers.StripHTML((ss)))), nil + return len(strings.Fields(tpl.StripHTML(ss))), nil } counter := 0 - for _, word := range strings.Fields(helpers.StripHTML(ss)) { + for _, word := range strings.Fields(tpl.StripHTML(ss)) { runeCount := utf8.RuneCountInString(word) if len(word) == runeCount { counter++ diff --git a/tpl/template.go b/tpl/template.go index 299b7208d..738750de7 100644 --- a/tpl/template.go +++ b/tpl/template.go @@ -18,9 +18,14 @@ import ( "io" "reflect" "regexp" + "strings" + "unicode" + + bp "github.com/gohugoio/hugo/bufferpool" "github.com/gohugoio/hugo/output" + htmltemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/htmltemplate" texttemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/texttemplate" ) @@ -163,3 +168,44 @@ func GetHasLockFromContext(ctx context.Context) bool { func SetHasLockInContext(ctx context.Context, hasLock bool) context.Context { return context.WithValue(ctx, texttemplate.HasLockContextKey, hasLock) } + +const hugoNewLinePlaceholder = "___hugonl_" + +var ( + stripHTMLReplacerPre = strings.NewReplacer("\n", " ", "</p>", hugoNewLinePlaceholder, "<br>", hugoNewLinePlaceholder, "<br />", hugoNewLinePlaceholder) + whitespaceRe = regexp.MustCompile(`\s+`) +) + +// StripHTML strips out all HTML tags in s. +func StripHTML(s string) string { + // Shortcut strings with no tags in them + if !strings.ContainsAny(s, "<>") { + return s + } + + pre := stripHTMLReplacerPre.Replace(s) + preReplaced := pre != s + + s = htmltemplate.StripTags(pre) + + if preReplaced { + s = strings.ReplaceAll(s, hugoNewLinePlaceholder, "\n") + } + + var wasSpace bool + b := bp.GetBuffer() + defer bp.PutBuffer(b) + for _, r := range s { + isSpace := unicode.IsSpace(r) + if !(isSpace && wasSpace) { + b.WriteRune(r) + } + wasSpace = isSpace + } + + if b.Len() > 0 { + s = b.String() + } + + return s +} diff --git a/tpl/template_test.go b/tpl/template_test.go index afd3c4b00..d989b7158 100644 --- a/tpl/template_test.go +++ b/tpl/template_test.go @@ -28,3 +28,44 @@ func TestExtractBaseof(t *testing.T) { c.Assert(extractBaseOf("not baseof for you"), qt.Equals, "") c.Assert(extractBaseOf("template: blog/baseof.html:23:11:"), qt.Equals, "blog/baseof.html") } + +func TestStripHTML(t *testing.T) { + type test struct { + input, expected string + } + data := []test{ + {"<h1>strip h1 tag <h1>", "strip h1 tag "}, + {"<p> strip p tag </p>", " strip p tag "}, + {"</br> strip br<br>", " strip br\n"}, + {"</br> strip br2<br />", " strip br2\n"}, + {"This <strong>is</strong> a\nnewline", "This is a newline"}, + {"No Tags", "No Tags"}, + {`<p>Summary Next Line. +<figure > + + <img src="/not/real" /> + + +</figure> +. +More text here.</p> + +<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"}, + + // Issue 9199 + {"<div data-action='click->my-controller#doThing'>qwe</div>", "qwe"}, + {"Hello, World!", "Hello, World!"}, + {"foo&bar", "foo&bar"}, + {`Hello <a href="www.example.com/">World</a>!`, "Hello World!"}, + {"Foo <textarea>Bar</textarea> Baz", "Foo Bar Baz"}, + {"Foo <!-- Bar --> Baz", "Foo Baz"}, + } + for i, d := range data { + output := StripHTML(d.input) + if d.expected != output { + t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output) + } + } +} + +const tstHTMLContent = "<!DOCTYPE html><html><head><script src=\"http://two/foobar.js\"></script></head><body><nav><ul><li hugo-nav=\"section_0\"></li><li hugo-nav=\"section_1\"></li></ul></nav><article>content <a href=\"http://two/foobar\">foobar</a>. Follow up</article><p>This is some text.<br>And some more.</p></body></html>" diff --git a/tpl/transform/transform.go b/tpl/transform/transform.go index 7c0914378..69ab98a9a 100644 --- a/tpl/transform/transform.go +++ b/tpl/transform/transform.go @@ -22,6 +22,7 @@ import ( "github.com/gohugoio/hugo/cache/namedmemcache" "github.com/gohugoio/hugo/markup/converter/hooks" "github.com/gohugoio/hugo/markup/highlight" + "github.com/gohugoio/hugo/tpl" "github.com/gohugoio/hugo/deps" "github.com/gohugoio/hugo/helpers" @@ -141,7 +142,7 @@ func (ns *Namespace) Plainify(s any) (string, error) { return "", err } - return helpers.StripHTML(ss), nil + return tpl.StripHTML(ss), nil } // For internal use. diff --git a/tpl/transform/transform_test.go b/tpl/transform/transform_test.go index ab2fd3b9e..edef4e1bd 100644 --- a/tpl/transform/transform_test.go +++ b/tpl/transform/transform_test.go @@ -237,6 +237,7 @@ func TestPlainify(t *testing.T) { expect any }{ {"<em>Note:</em> blah <b>blah</b>", "Note: blah blah"}, + {"<div data-action='click->my-controller#doThing'>qwe</div>", "qwe"}, // errors {tstNoStringer{}, false}, } { |