aboutsummaryrefslogtreecommitdiffhomepage
path: root/publisher
diff options
context:
space:
mode:
authorDirk Olbrich <[email protected]>2021-04-12 23:42:51 +0200
committerBjørn Erik Pedersen <[email protected]>2021-04-20 17:24:17 +0200
commitbc80022e033a5462d1a9ce541f40a050994011cc (patch)
tree2767906a3b8dec84e332e35f4c0a2236dff4add7 /publisher
parent2bb9496ce29dfe90e8b3664ed8cf7f895011b2d4 (diff)
downloadhugo-bc80022e033a5462d1a9ce541f40a050994011cc.tar.gz
hugo-bc80022e033a5462d1a9ce541f40a050994011cc.zip
publisher: Exclude comment and doctype elements from writeStats
- Reorder code blocks - Rename cssClassCollectorWriter to htmlElementCollectorWriter, as it just collect html element information - Expand benchmark to test for minified and unminified content Fixes #8396 Fixes #8417
Diffstat (limited to 'publisher')
-rw-r--r--publisher/htmlElementsCollector.go295
-rw-r--r--publisher/htmlElementsCollector_test.go150
2 files changed, 291 insertions, 154 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index d9479aafa..9f4be1ff5 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -20,21 +20,10 @@ import (
"strings"
"sync"
- "github.com/gohugoio/hugo/helpers"
"golang.org/x/net/html"
-)
-
-func newHTMLElementsCollector() *htmlElementsCollector {
- return &htmlElementsCollector{
- elementSet: make(map[string]bool),
- }
-}
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter {
- return &cssClassCollectorWriter{
- collector: collector,
- }
-}
+ "github.com/gohugoio/hugo/helpers"
+)
// HTMLElements holds lists of tags and attribute values for classes and id.
type HTMLElements struct {
@@ -59,7 +48,50 @@ func (h *HTMLElements) Sort() {
sort.Strings(h.IDs)
}
-type cssClassCollectorWriter struct {
+type htmlElementsCollector struct {
+ // Contains the raw HTML string. We will get the same element
+ // several times, and want to avoid costly reparsing when this
+ // is used for aggregated data only.
+ elementSet map[string]bool
+
+ elements []htmlElement
+
+ mu sync.RWMutex
+}
+
+func newHTMLElementsCollector() *htmlElementsCollector {
+ return &htmlElementsCollector{
+ elementSet: make(map[string]bool),
+ }
+}
+
+func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
+ var (
+ classes []string
+ ids []string
+ tags []string
+ )
+
+ for _, el := range c.elements {
+ classes = append(classes, el.Classes...)
+ ids = append(ids, el.IDs...)
+ tags = append(tags, el.Tag)
+ }
+
+ classes = helpers.UniqueStringsSorted(classes)
+ ids = helpers.UniqueStringsSorted(ids)
+ tags = helpers.UniqueStringsSorted(tags)
+
+ els := HTMLElements{
+ Classes: classes,
+ IDs: ids,
+ Tags: tags,
+ }
+
+ return els
+}
+
+type htmlElementsCollectorWriter struct {
collector *htmlElementsCollector
buff bytes.Buffer
@@ -70,11 +102,19 @@ type cssClassCollectorWriter struct {
quoteValue byte
}
-func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+ return &htmlElementsCollectorWriter{
+ collector: collector,
+ }
+}
+
+// Write splits the incoming stream into single html element and writes these into elementSet
+func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
n = len(p)
i := 0
for i < len(p) {
+ // if is not collecting, cycle through byte stream until start bracket "<" is found
if !w.isCollecting {
for ; i < len(p); i++ {
b := p[i]
@@ -86,109 +126,89 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
}
if w.isCollecting {
+ // if is collecting, cycle through byte stream until end bracket ">" is found
+ // disregard any ">" if within a quote
+ // write bytes until found to buffer
for ; i < len(p); i++ {
b := p[i]
w.toggleIfQuote(b)
+ w.buff.WriteByte(b)
+
if !w.inQuote && b == '>' {
w.endCollecting()
break
}
- w.buff.WriteByte(b)
}
+ }
- if !w.isCollecting {
- if w.inPreTag != "" {
- s := w.buff.String()
- if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName {
- w.inPreTag = ""
- }
- w.buff.Reset()
- continue
- }
-
- // First check if we have processed this element before.
- w.collector.mu.RLock()
-
- // See https://github.com/dominikh/go-tools/issues/723
- //lint:ignore S1030 This construct avoids memory allocation for the string.
- seen := w.collector.elementSet[string(w.buff.Bytes())]
- w.collector.mu.RUnlock()
- if seen {
- w.buff.Reset()
- continue
- }
-
- s := w.buff.String()
-
- w.buff.Reset()
+ // if no end bracket ">" is found while collecting, but the stream ended
+ // this could mean we received chunks of a stream from e.g. the minify functionality
+ // next if loop will be skipped
- if strings.HasPrefix(s, "</") {
- continue
+ // at this point we have collected an element line between angle brackets "<" and ">"
+ if !w.isCollecting {
+ s := w.buff.String()
+ w.buff.Reset()
+
+ // filter out unwanted tags
+ // empty string, just in case
+ // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
+ // comments and doctype tags
+ // end tags
+ switch {
+ case s == "": // empty string
+ continue
+ case w.inPreTag != "": // within preformatted code block
+ if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
+ w.inPreTag = ""
}
+ continue
+ case strings.HasPrefix(s, "<!"): // comment or doctype tag
+ continue
+ case strings.HasPrefix(s, "</"): // end tag
+ continue
+ }
- key := s
-
- s, tagName := w.insertStandinHTMLElement(s)
- el := parseHTMLElement(s)
- el.Tag = tagName
- if w.isPreFormatted(tagName) {
- w.inPreTag = tagName
- }
+ // check if we have processed this element before.
+ w.collector.mu.RLock()
+ seen := w.collector.elementSet[s]
+ w.collector.mu.RUnlock()
+ if seen {
+ continue
+ }
- w.collector.mu.Lock()
- w.collector.elementSet[key] = true
- if el.Tag != "" {
- w.collector.elements = append(w.collector.elements, el)
- }
- w.collector.mu.Unlock()
+ // check if a preformatted code block started
+ if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
+ w.inPreTag = tagName
+ }
+ // parse each collected element
+ el, err := parseHTMLElement(s)
+ if err != nil {
+ return n, err
}
+
+ // write this tag to the element set
+ w.collector.mu.Lock()
+ w.collector.elementSet[s] = true
+ w.collector.elements = append(w.collector.elements, el)
+ w.collector.mu.Unlock()
}
}
return
}
-// No need to look inside these for HTML elements.
-func (c *cssClassCollectorWriter) isPreFormatted(s string) bool {
- return s == "pre" || s == "textarea" || s == "script"
-}
-
-// The net/html parser does not handle single table elements as input, e.g. tbody.
-// We only care about the element/class/ids, so just store away the original tag name
-// and pretend it's a <div>.
-func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, string) {
- tag := el[1:]
- spacei := strings.Index(tag, " ")
- if spacei != -1 {
- tag = tag[:spacei]
- }
- tag = strings.Trim(tag, "\n ")
- newv := strings.Replace(el, tag, "div", 1)
- return newv, strings.ToLower(tag)
-}
-
-func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) {
- if !strings.HasPrefix(s, "</") {
- return "", false
- }
- s = strings.TrimPrefix(s, "</")
- s = strings.TrimSuffix(s, ">")
- return strings.ToLower(strings.TrimSpace(s)), true
+func (c *htmlElementsCollectorWriter) startCollecting() {
+ c.isCollecting = true
}
-func (c *cssClassCollectorWriter) endCollecting() {
+func (c *htmlElementsCollectorWriter) endCollecting() {
c.isCollecting = false
c.inQuote = false
-
-}
-
-func (c *cssClassCollectorWriter) startCollecting() {
- c.isCollecting = true
-
}
-func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
+func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
if isQuote(b) {
if c.inQuote && b == c.quoteValue {
c.inQuote = false
@@ -199,51 +219,46 @@ func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
}
}
-type htmlElement struct {
- Tag string
- Classes []string
- IDs []string
+func isQuote(b byte) bool {
+ return b == '"' || b == '\''
}
-type htmlElementsCollector struct {
- // Contains the raw HTML string. We will get the same element
- // several times, and want to avoid costly reparsing when this
- // is used for aggregated data only.
- elementSet map[string]bool
+func parseStartTag(s string) (string, bool) {
+ if strings.HasPrefix(s, "</") || strings.HasPrefix(s, "<!") {
+ return "", false
+ }
- elements []htmlElement
+ s = strings.TrimPrefix(s, "<")
+ s = strings.TrimSuffix(s, ">")
- mu sync.RWMutex
-}
+ spaceIndex := strings.Index(s, " ")
+ if spaceIndex != -1 {
+ s = s[:spaceIndex]
+ }
-func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
- var (
- classes []string
- ids []string
- tags []string
- )
+ return strings.ToLower(strings.TrimSpace(s)), true
+}
- for _, el := range c.elements {
- classes = append(classes, el.Classes...)
- ids = append(ids, el.IDs...)
- tags = append(tags, el.Tag)
+func parseEndTag(s string) (string, bool) {
+ if !strings.HasPrefix(s, "</") {
+ return "", false
}
- classes = helpers.UniqueStringsSorted(classes)
- ids = helpers.UniqueStringsSorted(ids)
- tags = helpers.UniqueStringsSorted(tags)
+ s = strings.TrimPrefix(s, "</")
+ s = strings.TrimSuffix(s, ">")
- els := HTMLElements{
- Classes: classes,
- IDs: ids,
- Tags: tags,
- }
+ return strings.ToLower(strings.TrimSpace(s)), true
+}
- return els
+// No need to look inside these for HTML elements.
+func isPreFormatted(s string) bool {
+ return s == "pre" || s == "textarea" || s == "script" || s == "style"
}
-func isQuote(b byte) bool {
- return b == '"' || b == '\''
+type htmlElement struct {
+ Tag string
+ Classes []string
+ IDs []string
}
var (
@@ -252,11 +267,29 @@ var (
classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
)
-func parseHTMLElement(elStr string) (el htmlElement) {
- elStr = strings.TrimSpace(elStr)
- if !strings.HasSuffix(elStr, ">") {
- elStr += ">"
+func parseHTMLElement(elStr string) (el htmlElement, err error) {
+ var tagBuffer string = ""
+ exceptionList := map[string]bool{
+ "thead": true,
+ "tbody": true,
+ "tfoot": true,
+ "td": true,
+ "tr": true,
}
+
+ tagName, ok := parseStartTag(elStr)
+ if !ok {
+ return
+ }
+
+ // The net/html parser does not handle single table elements as input, e.g. tbody.
+ // We only care about the element/class/ids, so just store away the original tag name
+ // and pretend it's a <div>.
+ if exceptionList[tagName] {
+ tagBuffer = tagName
+ elStr = strings.Replace(elStr, tagName, "div", 1)
+ }
+
n, err := html.Parse(strings.NewReader(elStr))
if err != nil {
return
@@ -287,7 +320,6 @@ func parseHTMLElement(elStr string) (el htmlElement) {
val = strings.Join(lines, "\n")
val = jsonAttrRe.ReplaceAllString(val, "$1")
el.Classes = append(el.Classes, strings.Fields(val)...)
-
}
}
}
@@ -301,5 +333,10 @@ func parseHTMLElement(elStr string) (el htmlElement) {
walk(n)
+ // did we replaced the start tag?
+ if tagBuffer != "" {
+ el.Tag = tagBuffer
+ }
+
return
}
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
index 5b2d85d47..1ada27c18 100644
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -14,17 +14,17 @@
package publisher
import (
+ "bytes"
"fmt"
"strings"
"testing"
- "github.com/gohugoio/hugo/minifiers"
-
"github.com/gohugoio/hugo/media"
+ "github.com/gohugoio/hugo/minifiers"
"github.com/gohugoio/hugo/output"
- "github.com/spf13/viper"
qt "github.com/frankban/quicktest"
+ "github.com/spf13/viper"
)
func TestClassCollector(t *testing.T) {
@@ -50,7 +50,6 @@ func TestClassCollector(t *testing.T) {
skipMinifyTest := map[string]bool{
"Script tags content should be skipped": true, // https://github.com/tdewolff/minify/issues/396
-
}
for _, test := range []struct {
@@ -62,56 +61,57 @@ func TestClassCollector(t *testing.T) {
{"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
{"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
{"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
- {"thead", `
- https://github.com/gohugoio/hugo/issues/7318
-<table class="cl1">
+ // https://github.com/gohugoio/hugo/issues/7318
+ {"thead", `<table class="cl1">
<thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
<tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
</table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
// https://github.com/gohugoio/hugo/issues/7161
{"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
-
{"AlpineJS bind 1", `<body>
- <div x-bind:class="{
+ <div x-bind:class="{
'class1': data.open,
'class2 class3': data.foo == 'bar'
}">
- </div>
- </body>`, f("body div", "class1 class2 class3", "")},
-
- {
- "Alpine bind 2", `<div x-bind:class="{ 'bg-black': filter.checked }"
- class="inline-block mr-1 mb-2 rounded bg-gray-300 px-2 py-2">FOO</div>`,
+ </div>
+</body>`, f("body div", "class1 class2 class3", "")},
+ {"AlpineJS bind 2", `<div x-bind:class="{ 'bg-black': filter.checked }" class="inline-block mr-1 mb-2 rounded bg-gray-300 px-2 py-2">FOO</div>`,
f("div", "bg-black bg-gray-300 inline-block mb-2 mr-1 px-2 py-2 rounded", ""),
},
-
- {"Alpine bind 3", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
- {"Alpine bind 4", `<div x-bind:class="{ 'text-gray-800': !checked,
+ {"AlpineJS bind 3", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
+ {"AlpineJS bind 4", `<div x-bind:class="{ 'text-gray-800': !checked,
'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
-
- {"Alpine bind 5", `<a x-bind:class="{
+ {"AlpineJS bind 5", `<a x-bind:class="{
'text-a': a && b,
'text-b': !a && b || c,
'pl-3': a === 1,
pl-2: b == 3,
'text-gray-600': (a > 1)
-
}" class="block w-36 cursor-pointer pr-3 no-underline capitalize"></a>`, f("a", "block capitalize cursor-pointer no-underline pl-2 pl-3 pr-3 text-a text-b text-gray-600 w-36", "")},
-
- {"Alpine transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
+ {"AlpineJS transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
{"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
// Issue #7746
{"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")},
// Issue #7567
{"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")},
+ {"Style tags content should be skipped", `<style>p{color: red;font-size: 20px;}</style><div class="foo"></div>`, f("div style", "foo", "")},
{"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")},
- {"Textare tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
+ {"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
+ {"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
+ {"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
+ // Issue #8417
+ {"Tabs inline", `<hr id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
+ {"Tabs on multiple rows", `<form
+ id="a"
+ action="www.example.com"
+ method="post"
+></form>
+<div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
} {
for _, minify := range []bool{false, true} {
c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-
if minify {
if skipMinifyTest[test.name] {
c.Skip("skip minify test")
@@ -152,6 +152,106 @@ func BenchmarkClassCollectorWriter(b *testing.B) {
for i := 0; i < b.N; i++ {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
fmt.Fprint(w, benchHTML)
+ }
+}
+const benchHTML = `
+<!DOCTYPE html>
+<html>
+<head>
+<title>title</title>
+<style>
+ a {color: red;}
+ .c {color: blue;}
+</style>
+</head>
+<body id="i1" class="a b c d">
+<a class="c d e"></a>
+<hr>
+<a class="c d e"></a>
+<a class="c d e"></a>
+<hr>
+<a id="i2" class="c d e f"></a>
+<a id="i3" class="c d e"></a>
+<a class="c d e"></a>
+<p>To force<br> line breaks<br> in a text,<br> use the br<br> element.</p>
+<hr>
+<a class="c d e"></a>
+<a class="c d e"></a>
+<a class="c d e"></a>
+<a class="c d e"></a>
+<table>
+ <thead class="ch">
+ <tr>
+ <th>Month</th>
+ <th>Savings</th>
+ </tr>
+ </thead>
+ <tbody class="cb">
+ <tr>
+ <td>January</td>
+ <td>$100</td>
+ </tr>
+ <tr>
+ <td>February</td>
+ <td>$200</td>
+ </tr>
+ </tbody>
+ <tfoot class="cf">
+ <tr>
+ <td></td>
+ <td>$300</td>
+ </tr>
+ </tfoot>
+</table>
+</body>
+</html>
+`
+
+func BenchmarkElementsCollectorWriter(b *testing.B) {
+ b.ReportAllocs()
+ for i := 0; i < b.N; i++ {
+ w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
+ fmt.Fprint(w, benchHTML)
+ }
+}
+
+func BenchmarkElementsCollectorWriterMinified(b *testing.B) {
+ b.ReportAllocs()
+ v := viper.New()
+ m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
+ var buf bytes.Buffer
+ m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
+ b.ResetTimer()
+
+ for i := 0; i < b.N; i++ {
+ w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
+ fmt.Fprint(w, buf.String())
+ }
+}
+
+func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) {
+ b.ReportAllocs()
+ v := viper.New()
+ m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
+ b.ResetTimer()
+
+ for i := 0; i < b.N; i++ {
+ w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
+ m.Minify(media.HTMLType, w, strings.NewReader(benchHTML))
+ }
+}
+
+func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) {
+ b.ReportAllocs()
+ v := viper.New()
+ m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
+ b.ResetTimer()
+
+ for i := 0; i < b.N; i++ {
+ var buf bytes.Buffer
+ m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
+ w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
+ fmt.Fprint(w, buf.String())
}
}