diff options
author | Dirk Olbrich <[email protected]> | 2021-04-12 23:42:51 +0200 |
---|---|---|
committer | Bjørn Erik Pedersen <[email protected]> | 2021-04-20 17:24:17 +0200 |
commit | bc80022e033a5462d1a9ce541f40a050994011cc (patch) | |
tree | 2767906a3b8dec84e332e35f4c0a2236dff4add7 /publisher | |
parent | 2bb9496ce29dfe90e8b3664ed8cf7f895011b2d4 (diff) | |
download | hugo-bc80022e033a5462d1a9ce541f40a050994011cc.tar.gz hugo-bc80022e033a5462d1a9ce541f40a050994011cc.zip |
publisher: Exclude comment and doctype elements from writeStats
- Reorder code blocks
- Rename cssClassCollectorWriter to htmlElementCollectorWriter, as it just collect html element information
- Expand benchmark to test for minified and unminified content
Fixes #8396
Fixes #8417
Diffstat (limited to 'publisher')
-rw-r--r-- | publisher/htmlElementsCollector.go | 295 | ||||
-rw-r--r-- | publisher/htmlElementsCollector_test.go | 150 |
2 files changed, 291 insertions, 154 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index d9479aafa..9f4be1ff5 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -20,21 +20,10 @@ import ( "strings" "sync" - "github.com/gohugoio/hugo/helpers" "golang.org/x/net/html" -) - -func newHTMLElementsCollector() *htmlElementsCollector { - return &htmlElementsCollector{ - elementSet: make(map[string]bool), - } -} -func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter { - return &cssClassCollectorWriter{ - collector: collector, - } -} + "github.com/gohugoio/hugo/helpers" +) // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { @@ -59,7 +48,50 @@ func (h *HTMLElements) Sort() { sort.Strings(h.IDs) } -type cssClassCollectorWriter struct { +type htmlElementsCollector struct { + // Contains the raw HTML string. We will get the same element + // several times, and want to avoid costly reparsing when this + // is used for aggregated data only. + elementSet map[string]bool + + elements []htmlElement + + mu sync.RWMutex +} + +func newHTMLElementsCollector() *htmlElementsCollector { + return &htmlElementsCollector{ + elementSet: make(map[string]bool), + } +} + +func (c *htmlElementsCollector) getHTMLElements() HTMLElements { + var ( + classes []string + ids []string + tags []string + ) + + for _, el := range c.elements { + classes = append(classes, el.Classes...) + ids = append(ids, el.IDs...) + tags = append(tags, el.Tag) + } + + classes = helpers.UniqueStringsSorted(classes) + ids = helpers.UniqueStringsSorted(ids) + tags = helpers.UniqueStringsSorted(tags) + + els := HTMLElements{ + Classes: classes, + IDs: ids, + Tags: tags, + } + + return els +} + +type htmlElementsCollectorWriter struct { collector *htmlElementsCollector buff bytes.Buffer @@ -70,11 +102,19 @@ type cssClassCollectorWriter struct { quoteValue byte } -func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { +func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { + return &htmlElementsCollectorWriter{ + collector: collector, + } +} + +// Write splits the incoming stream into single html element and writes these into elementSet +func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { n = len(p) i := 0 for i < len(p) { + // if is not collecting, cycle through byte stream until start bracket "<" is found if !w.isCollecting { for ; i < len(p); i++ { b := p[i] @@ -86,109 +126,89 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { } if w.isCollecting { + // if is collecting, cycle through byte stream until end bracket ">" is found + // disregard any ">" if within a quote + // write bytes until found to buffer for ; i < len(p); i++ { b := p[i] w.toggleIfQuote(b) + w.buff.WriteByte(b) + if !w.inQuote && b == '>' { w.endCollecting() break } - w.buff.WriteByte(b) } + } - if !w.isCollecting { - if w.inPreTag != "" { - s := w.buff.String() - if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName { - w.inPreTag = "" - } - w.buff.Reset() - continue - } - - // First check if we have processed this element before. - w.collector.mu.RLock() - - // See https://github.com/dominikh/go-tools/issues/723 - //lint:ignore S1030 This construct avoids memory allocation for the string. - seen := w.collector.elementSet[string(w.buff.Bytes())] - w.collector.mu.RUnlock() - if seen { - w.buff.Reset() - continue - } - - s := w.buff.String() - - w.buff.Reset() + // if no end bracket ">" is found while collecting, but the stream ended + // this could mean we received chunks of a stream from e.g. the minify functionality + // next if loop will be skipped - if strings.HasPrefix(s, "</") { - continue + // at this point we have collected an element line between angle brackets "<" and ">" + if !w.isCollecting { + s := w.buff.String() + w.buff.Reset() + + // filter out unwanted tags + // empty string, just in case + // if within preformatted code blocks <pre>, <textarea>, <script>, <style> + // comments and doctype tags + // end tags + switch { + case s == "": // empty string + continue + case w.inPreTag != "": // within preformatted code block + if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName { + w.inPreTag = "" } + continue + case strings.HasPrefix(s, "<!"): // comment or doctype tag + continue + case strings.HasPrefix(s, "</"): // end tag + continue + } - key := s - - s, tagName := w.insertStandinHTMLElement(s) - el := parseHTMLElement(s) - el.Tag = tagName - if w.isPreFormatted(tagName) { - w.inPreTag = tagName - } + // check if we have processed this element before. + w.collector.mu.RLock() + seen := w.collector.elementSet[s] + w.collector.mu.RUnlock() + if seen { + continue + } - w.collector.mu.Lock() - w.collector.elementSet[key] = true - if el.Tag != "" { - w.collector.elements = append(w.collector.elements, el) - } - w.collector.mu.Unlock() + // check if a preformatted code block started + if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) { + w.inPreTag = tagName + } + // parse each collected element + el, err := parseHTMLElement(s) + if err != nil { + return n, err } + + // write this tag to the element set + w.collector.mu.Lock() + w.collector.elementSet[s] = true + w.collector.elements = append(w.collector.elements, el) + w.collector.mu.Unlock() } } return } -// No need to look inside these for HTML elements. -func (c *cssClassCollectorWriter) isPreFormatted(s string) bool { - return s == "pre" || s == "textarea" || s == "script" -} - -// The net/html parser does not handle single table elements as input, e.g. tbody. -// We only care about the element/class/ids, so just store away the original tag name -// and pretend it's a <div>. -func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, string) { - tag := el[1:] - spacei := strings.Index(tag, " ") - if spacei != -1 { - tag = tag[:spacei] - } - tag = strings.Trim(tag, "\n ") - newv := strings.Replace(el, tag, "div", 1) - return newv, strings.ToLower(tag) -} - -func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) { - if !strings.HasPrefix(s, "</") { - return "", false - } - s = strings.TrimPrefix(s, "</") - s = strings.TrimSuffix(s, ">") - return strings.ToLower(strings.TrimSpace(s)), true +func (c *htmlElementsCollectorWriter) startCollecting() { + c.isCollecting = true } -func (c *cssClassCollectorWriter) endCollecting() { +func (c *htmlElementsCollectorWriter) endCollecting() { c.isCollecting = false c.inQuote = false - -} - -func (c *cssClassCollectorWriter) startCollecting() { - c.isCollecting = true - } -func (c *cssClassCollectorWriter) toggleIfQuote(b byte) { +func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) { if isQuote(b) { if c.inQuote && b == c.quoteValue { c.inQuote = false @@ -199,51 +219,46 @@ func (c *cssClassCollectorWriter) toggleIfQuote(b byte) { } } -type htmlElement struct { - Tag string - Classes []string - IDs []string +func isQuote(b byte) bool { + return b == '"' || b == '\'' } -type htmlElementsCollector struct { - // Contains the raw HTML string. We will get the same element - // several times, and want to avoid costly reparsing when this - // is used for aggregated data only. - elementSet map[string]bool +func parseStartTag(s string) (string, bool) { + if strings.HasPrefix(s, "</") || strings.HasPrefix(s, "<!") { + return "", false + } - elements []htmlElement + s = strings.TrimPrefix(s, "<") + s = strings.TrimSuffix(s, ">") - mu sync.RWMutex -} + spaceIndex := strings.Index(s, " ") + if spaceIndex != -1 { + s = s[:spaceIndex] + } -func (c *htmlElementsCollector) getHTMLElements() HTMLElements { - var ( - classes []string - ids []string - tags []string - ) + return strings.ToLower(strings.TrimSpace(s)), true +} - for _, el := range c.elements { - classes = append(classes, el.Classes...) - ids = append(ids, el.IDs...) - tags = append(tags, el.Tag) +func parseEndTag(s string) (string, bool) { + if !strings.HasPrefix(s, "</") { + return "", false } - classes = helpers.UniqueStringsSorted(classes) - ids = helpers.UniqueStringsSorted(ids) - tags = helpers.UniqueStringsSorted(tags) + s = strings.TrimPrefix(s, "</") + s = strings.TrimSuffix(s, ">") - els := HTMLElements{ - Classes: classes, - IDs: ids, - Tags: tags, - } + return strings.ToLower(strings.TrimSpace(s)), true +} - return els +// No need to look inside these for HTML elements. +func isPreFormatted(s string) bool { + return s == "pre" || s == "textarea" || s == "script" || s == "style" } -func isQuote(b byte) bool { - return b == '"' || b == '\'' +type htmlElement struct { + Tag string + Classes []string + IDs []string } var ( @@ -252,11 +267,29 @@ var ( classAttrRe = regexp.MustCompile(`(?i)^class$|transition`) ) -func parseHTMLElement(elStr string) (el htmlElement) { - elStr = strings.TrimSpace(elStr) - if !strings.HasSuffix(elStr, ">") { - elStr += ">" +func parseHTMLElement(elStr string) (el htmlElement, err error) { + var tagBuffer string = "" + exceptionList := map[string]bool{ + "thead": true, + "tbody": true, + "tfoot": true, + "td": true, + "tr": true, } + + tagName, ok := parseStartTag(elStr) + if !ok { + return + } + + // The net/html parser does not handle single table elements as input, e.g. tbody. + // We only care about the element/class/ids, so just store away the original tag name + // and pretend it's a <div>. + if exceptionList[tagName] { + tagBuffer = tagName + elStr = strings.Replace(elStr, tagName, "div", 1) + } + n, err := html.Parse(strings.NewReader(elStr)) if err != nil { return @@ -287,7 +320,6 @@ func parseHTMLElement(elStr string) (el htmlElement) { val = strings.Join(lines, "\n") val = jsonAttrRe.ReplaceAllString(val, "$1") el.Classes = append(el.Classes, strings.Fields(val)...) - } } } @@ -301,5 +333,10 @@ func parseHTMLElement(elStr string) (el htmlElement) { walk(n) + // did we replaced the start tag? + if tagBuffer != "" { + el.Tag = tagBuffer + } + return } diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go index 5b2d85d47..1ada27c18 100644 --- a/publisher/htmlElementsCollector_test.go +++ b/publisher/htmlElementsCollector_test.go @@ -14,17 +14,17 @@ package publisher import ( + "bytes" "fmt" "strings" "testing" - "github.com/gohugoio/hugo/minifiers" - "github.com/gohugoio/hugo/media" + "github.com/gohugoio/hugo/minifiers" "github.com/gohugoio/hugo/output" - "github.com/spf13/viper" qt "github.com/frankban/quicktest" + "github.com/spf13/viper" ) func TestClassCollector(t *testing.T) { @@ -50,7 +50,6 @@ func TestClassCollector(t *testing.T) { skipMinifyTest := map[string]bool{ "Script tags content should be skipped": true, // https://github.com/tdewolff/minify/issues/396 - } for _, test := range []struct { @@ -62,56 +61,57 @@ func TestClassCollector(t *testing.T) { {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")}, {"single quote", `<body class='b a'></body>`, f("body", "a b", "")}, {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")}, - {"thead", ` - https://github.com/gohugoio/hugo/issues/7318 -<table class="cl1"> + // https://github.com/gohugoio/hugo/issues/7318 + {"thead", `<table class="cl1"> <thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead> <tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody> </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")}, // https://github.com/gohugoio/hugo/issues/7161 {"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")}, - {"AlpineJS bind 1", `<body> - <div x-bind:class="{ + <div x-bind:class="{ 'class1': data.open, 'class2 class3': data.foo == 'bar' }"> - </div> - </body>`, f("body div", "class1 class2 class3", "")}, - - { - "Alpine bind 2", `<div x-bind:class="{ 'bg-black': filter.checked }" - class="inline-block mr-1 mb-2 rounded bg-gray-300 px-2 py-2">FOO</div>`, + </div> +</body>`, f("body div", "class1 class2 class3", "")}, + {"AlpineJS bind 2", `<div x-bind:class="{ 'bg-black': filter.checked }" class="inline-block mr-1 mb-2 rounded bg-gray-300 px-2 py-2">FOO</div>`, f("div", "bg-black bg-gray-300 inline-block mb-2 mr-1 px-2 py-2 rounded", ""), }, - - {"Alpine bind 3", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")}, - {"Alpine bind 4", `<div x-bind:class="{ 'text-gray-800': !checked, + {"AlpineJS bind 3", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")}, + {"AlpineJS bind 4", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")}, - - {"Alpine bind 5", `<a x-bind:class="{ + {"AlpineJS bind 5", `<a x-bind:class="{ 'text-a': a && b, 'text-b': !a && b || c, 'pl-3': a === 1, pl-2: b == 3, 'text-gray-600': (a > 1) - }" class="block w-36 cursor-pointer pr-3 no-underline capitalize"></a>`, f("a", "block capitalize cursor-pointer no-underline pl-2 pl-3 pr-3 text-a text-b text-gray-600 w-36", "")}, - - {"Alpine transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")}, + {"AlpineJS transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")}, {"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")}, // Issue #7746 {"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")}, // Issue #7567 {"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")}, + {"Style tags content should be skipped", `<style>p{color: red;font-size: 20px;}</style><div class="foo"></div>`, f("div style", "foo", "")}, {"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")}, - {"Textare tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")}, + {"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")}, + {"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")}, + {"Comments should be skipped", `<!-- example comment -->`, f("", "", "")}, + // Issue #8417 + {"Tabs inline", `<hr id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")}, + {"Tabs on multiple rows", `<form + id="a" + action="www.example.com" + method="post" +></form> +<div id="b" class="foo">d</div>`, f("div form", "foo", "a b")}, } { for _, minify := range []bool{false, true} { c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) { w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) - if minify { if skipMinifyTest[test.name] { c.Skip("skip minify test") @@ -152,6 +152,106 @@ func BenchmarkClassCollectorWriter(b *testing.B) { for i := 0; i < b.N; i++ { w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) fmt.Fprint(w, benchHTML) + } +} +const benchHTML = ` +<!DOCTYPE html> +<html> +<head> +<title>title</title> +<style> + a {color: red;} + .c {color: blue;} +</style> +</head> +<body id="i1" class="a b c d"> +<a class="c d e"></a> +<hr> +<a class="c d e"></a> +<a class="c d e"></a> +<hr> +<a id="i2" class="c d e f"></a> +<a id="i3" class="c d e"></a> +<a class="c d e"></a> +<p>To force<br> line breaks<br> in a text,<br> use the br<br> element.</p> +<hr> +<a class="c d e"></a> +<a class="c d e"></a> +<a class="c d e"></a> +<a class="c d e"></a> +<table> + <thead class="ch"> + <tr> + <th>Month</th> + <th>Savings</th> + </tr> + </thead> + <tbody class="cb"> + <tr> + <td>January</td> + <td>$100</td> + </tr> + <tr> + <td>February</td> + <td>$200</td> + </tr> + </tbody> + <tfoot class="cf"> + <tr> + <td></td> + <td>$300</td> + </tr> + </tfoot> +</table> +</body> +</html> +` + +func BenchmarkElementsCollectorWriter(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, benchHTML) + } +} + +func BenchmarkElementsCollectorWriterMinified(b *testing.B) { + b.ReportAllocs() + v := viper.New() + m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) + var buf bytes.Buffer + m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML)) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, buf.String()) + } +} + +func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) { + b.ReportAllocs() + v := viper.New() + m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + m.Minify(media.HTMLType, w, strings.NewReader(benchHTML)) + } +} + +func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) { + b.ReportAllocs() + v := viper.New() + m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + var buf bytes.Buffer + m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML)) + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, buf.String()) } } |