diff options
author | Bjørn Erik Pedersen <[email protected]> | 2021-05-19 03:45:36 +0200 |
---|---|---|
committer | Bjørn Erik Pedersen <[email protected]> | 2021-05-19 03:45:36 +0200 |
commit | dc6b7a75ff5b7fcb8a0b0e3f7ed406422d847624 (patch) | |
tree | f567d55d5ba900a488d777cb5fb979f414d2e061 | |
parent | 3f515f0e3395b24776ae24045b846ff2b33b8906 (diff) | |
download | hugo-dc6b7a75ff5b7fcb8a0b0e3f7ed406422d847624.tar.gz hugo-dc6b7a75ff5b7fcb8a0b0e3f7ed406422d847624.zip |
Revert "publisher: Make the HTML element collector more robust"
This reverts commit ef0f1a726901d6c614040cfc2d7e8f9a2ca97816.
-rw-r--r-- | common/text/transform.go | 22 | ||||
-rw-r--r-- | publisher/htmlElementsCollector.go | 379 | ||||
-rw-r--r-- | publisher/htmlElementsCollector_test.go | 38 |
3 files changed, 153 insertions, 286 deletions
diff --git a/common/text/transform.go b/common/text/transform.go index 2d51f6c33..f59577803 100644 --- a/common/text/transform.go +++ b/common/text/transform.go @@ -45,25 +45,3 @@ func RemoveAccentsString(s string) string { accentTransformerPool.Put(t) return s } - -// Chunk splits s into strings of size. -func Chunk(s string, size int) []string { - if size >= len(s) { - return []string{s} - } - var chunks []string - chunk := make([]rune, size) - l := 0 - for _, r := range s { - chunk[l] = r - l++ - if l == size { - chunks = append(chunks, string(chunk)) - l = 0 - } - } - if l > 0 { - chunks = append(chunks, string(chunk[:l])) - } - return chunks -} diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index 1bc1a09bc..9dc28c4c2 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -19,51 +19,12 @@ import ( "sort" "strings" "sync" - "unicode" - "unicode/utf8" "golang.org/x/net/html" "github.com/gohugoio/hugo/helpers" ) -const eof = -1 - -var ( - htmlJsonFixer = strings.NewReplacer(", ", "\n") - jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`) - classAttrRe = regexp.MustCompile(`(?i)^class$|transition`) - - skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`) - skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`) - endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`) - - exceptionList = map[string]bool{ - "thead": true, - "tbody": true, - "tfoot": true, - "td": true, - "tr": true, - } -) - -func newHTMLElementsCollector() *htmlElementsCollector { - return &htmlElementsCollector{ - elementSet: make(map[string]bool), - } -} - -func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { - w := &htmlElementsCollectorWriter{ - collector: collector, - state: htmlLexStart, - } - - w.defaultLexElementInside = w.lexElementInside(htmlLexStart) - - return w -} - // HTMLElements holds lists of tags and attribute values for classes and id. type HTMLElements struct { Tags []string `json:"tags"` @@ -87,12 +48,6 @@ func (h *HTMLElements) Sort() { sort.Strings(h.IDs) } -type htmlElement struct { - Tag string - Classes []string - IDs []string -} - type htmlElementsCollector struct { // Contains the raw HTML string. We will get the same element // several times, and want to avoid costly reparsing when this @@ -104,6 +59,12 @@ type htmlElementsCollector struct { mu sync.RWMutex } +func newHTMLElementsCollector() *htmlElementsCollector { + return &htmlElementsCollector{ + elementSet: make(map[string]bool), + } +} + func (c *htmlElementsCollector) getHTMLElements() HTMLElements { var ( classes []string @@ -132,118 +93,114 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements { type htmlElementsCollectorWriter struct { collector *htmlElementsCollector + buff bytes.Buffer - r rune // Current rune - width int // The width in bytes of r - input []byte // The current slice written to Write - pos int // The current position in input - - err error - - inQuote rune - - buff bytes.Buffer + isCollecting bool + inPreTag string - // Current state - state htmlCollectorStateFunc + inQuote bool + quoteValue byte +} - // Precompiled state funcs - defaultLexElementInside htmlCollectorStateFunc +func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter { + return &htmlElementsCollectorWriter{ + collector: collector, + } } -// Write collects HTML elements from p. +// Write splits the incoming stream into single html element. func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) { n = len(p) - w.input = p - w.pos = 0 - - for { - w.r = w.next() - if w.r == eof { - return + i := 0 + + for i < len(p) { + // If we are not collecting, cycle through byte stream until start bracket "<" is found. + if !w.isCollecting { + for ; i < len(p); i++ { + b := p[i] + if b == '<' { + w.startCollecting() + break + } + } } - w.state = w.state(w) - } -} - -func (l *htmlElementsCollectorWriter) backup() { - l.pos -= l.width - l.r, _ = utf8.DecodeRune(l.input[l.pos:]) -} -func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { - var s htmlCollectorStateFunc - s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { - w.buff.WriteRune(w.r) - if condition() { - w.buff.Reset() - return resolve + if w.isCollecting { + // If we are collecting, cycle through byte stream until end bracket ">" is found, + // disregard any ">" if within a quote, + // write bytes until found to buffer. + for ; i < len(p); i++ { + b := p[i] + w.toggleIfQuote(b) + w.buff.WriteByte(b) + + if !w.inQuote && b == '>' { + w.endCollecting() + break + } + } } - return s - } - return s -} -func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc { - var s htmlCollectorStateFunc - s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc { - if condition(w.r) { - return resolve - } - return s - } - return s -} + // If no end bracket ">" is found while collecting, but the stream ended + // this could mean we received chunks of a stream from e.g. the minify functionality + // next if loop will be skipped. -// Starts with e.g. "<body " or "<div" -func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc { - var s htmlCollectorStateFunc - s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { - w.buff.WriteRune(w.r) - - // Skip any text inside a quote. - if w.r == '\'' || w.r == '"' { - if w.inQuote == w.r { - w.inQuote = 0 - } else if w.inQuote == 0 { - w.inQuote = w.r + // At this point we have collected an element line between angle brackets "<" and ">". + if !w.isCollecting { + if w.buff.Len() == 0 { + continue } - } - if w.inQuote != 0 { - return s - } + if w.inPreTag != "" { // within preformatted code block + s := w.buff.String() + w.buff.Reset() + if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName { + w.inPreTag = "" + } + continue + } - if w.r == '>' { + // First check if we have processed this element before. + w.collector.mu.RLock() // Work with the bytes slice as long as it's practical, // to save memory allocations. b := w.buff.Bytes() - defer func() { - w.buff.Reset() - }() - - // First check if we have processed this element before. - w.collector.mu.RLock() - + // See https://github.com/dominikh/go-tools/issues/723 + //lint:ignore S1030 This construct avoids memory allocation for the string. seen := w.collector.elementSet[string(b)] w.collector.mu.RUnlock() if seen { - return resolve + w.buff.Reset() + continue + } + + // Filter out unwanted tags + // if within preformatted code blocks <pre>, <textarea>, <script>, <style> + // comments and doctype tags + // end tags. + switch { + case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag + w.buff.Reset() + continue + case bytes.HasPrefix(b, []byte("</")): // end tag + w.buff.Reset() + continue } s := w.buff.String() + w.buff.Reset() - if s == "" { - return resolve + // Check if a preformatted code block started. + if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) { + w.inPreTag = tagName } // Parse each collected element. el, err := parseHTMLElement(s) if err != nil { - w.err = err - return resolve + return n, err } // Write this tag to the element set. @@ -251,137 +208,109 @@ func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStat w.collector.elementSet[s] = true w.collector.elements = append(w.collector.elements, el) w.collector.mu.Unlock() - - return resolve - } - - return s } - return s + return } -func (l *htmlElementsCollectorWriter) next() rune { - if l.pos >= len(l.input) { - l.width = 0 - return eof - } - - runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:]) - l.width = runeWidth - l.pos += l.width - return runeValue +func (c *htmlElementsCollectorWriter) startCollecting() { + c.isCollecting = true } -// returns the next state in HTML element scanner. -type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc +func (c *htmlElementsCollectorWriter) endCollecting() { + c.isCollecting = false + c.inQuote = false +} -// At "<", buffer empty. -// Potentially starting a HTML element. -func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { - if w.r == '>' || unicode.IsSpace(w.r) { - if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) { - w.buff.Reset() - return htmlLexStart +func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) { + if isQuote(b) { + if c.inQuote && b == c.quoteValue { + c.inQuote = false + } else if !c.inQuote { + c.inQuote = true + c.quoteValue = b } + } +} - tagName := w.buff.Bytes()[1:] - - switch { - case skipInnerElementRe.Match(tagName): - // pre, script etc. We collect classes etc. on the surrounding - // element, but skip the inner content. - w.backup() +func isQuote(b byte) bool { + return b == '"' || b == '\'' +} - // tagName will be overwritten, so make a copy. - tagNameCopy := make([]byte, len(tagName)) - copy(tagNameCopy, tagName) +func parseStartTag(s string) (string, bool) { + s = strings.TrimPrefix(s, "<") + s = strings.TrimSuffix(s, ">") - return w.lexElementInside( - w.consumeBuffUntil( - func() bool { - if w.r != '>' { - return false - } - m := endTagRe.FindSubmatch(w.buff.Bytes()) - if m == nil { - return false - } - return bytes.EqualFold(m[1], tagNameCopy) - }, - htmlLexStart, - )) - case skipAllElementRe.Match(tagName): - // E.g. "<!DOCTYPE ..." - w.buff.Reset() - return w.consumeRuneUntil(func(r rune) bool { - return r == '>' - }, htmlLexStart) - default: - w.backup() - return w.defaultLexElementInside - } + spaceIndex := strings.Index(s, " ") + if spaceIndex != -1 { + s = s[:spaceIndex] } - w.buff.WriteRune(w.r) + return strings.ToLower(strings.TrimSpace(s)), true +} - // If it's a comment, skip to its end. - if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) { - w.buff.Reset() - return htmlLexToEndOfComment +func parseEndTag(s string) (string, bool) { + if !strings.HasPrefix(s, "</") { + return "", false } - return htmlLexElementStart + s = strings.TrimPrefix(s, "</") + s = strings.TrimSuffix(s, ">") + + return strings.ToLower(strings.TrimSpace(s)), true } -// Entry state func. -// Looks for a opening bracket, '<'. -func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { - if w.r == '<' { - w.backup() - w.buff.Reset() - return htmlLexElementStart - } +// No need to look inside these for HTML elements. +func isPreFormatted(s string) bool { + return s == "pre" || s == "textarea" || s == "script" || s == "style" +} - return htmlLexStart +type htmlElement struct { + Tag string + Classes []string + IDs []string } -// After "<!--", buff empty. -func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc { - w.buff.WriteRune(w.r) +var ( + htmlJsonFixer = strings.NewReplacer(", ", "\n") + jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`) + classAttrRe = regexp.MustCompile(`(?i)^class$|transition`) - if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) { - // Done, start looking for HTML elements again. - return htmlLexStart + exceptionList = map[string]bool{ + "thead": true, + "tbody": true, + "tfoot": true, + "td": true, + "tr": true, } - - return htmlLexToEndOfComment -} +) func parseHTMLElement(elStr string) (el htmlElement, err error) { + var tagBuffer string = "" - tagName := parseStartTag(elStr) - - el.Tag = strings.ToLower(tagName) - tagNameToParse := el.Tag + tagName, ok := parseStartTag(elStr) + if !ok { + return + } // The net/html parser does not handle single table elements as input, e.g. tbody. // We only care about the element/class/ids, so just store away the original tag name // and pretend it's a <div>. - if exceptionList[el.Tag] { + if exceptionList[tagName] { + tagBuffer = tagName elStr = strings.Replace(elStr, tagName, "div", 1) - tagNameToParse = "div" } n, err := html.Parse(strings.NewReader(elStr)) if err != nil { return } - var walk func(*html.Node) walk = func(n *html.Node) { - if n.Type == html.ElementNode && n.Data == tagNameToParse { + if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) { + el.Tag = n.Data + for _, a := range n.Attr { switch { case strings.EqualFold(a.Key, "id"): @@ -416,20 +345,10 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) { walk(n) - return -} - -// Variants of s -// <body class="b a"> -// <div> -func parseStartTag(s string) string { - spaceIndex := strings.IndexFunc(s, func(r rune) bool { - return unicode.IsSpace(r) - }) - - if spaceIndex == -1 { - return s[1 : len(s)-1] + // did we replaced the start tag? + if tagBuffer != "" { + el.Tag = tagBuffer } - return s[1:spaceIndex] + return } diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go index 2eac31f73..0c8b2b65b 100644 --- a/publisher/htmlElementsCollector_test.go +++ b/publisher/htmlElementsCollector_test.go @@ -15,12 +15,8 @@ package publisher import ( "fmt" - "math/rand" "strings" "testing" - "time" - - "github.com/gohugoio/hugo/common/text" "github.com/gohugoio/hugo/media" "github.com/gohugoio/hugo/minifiers" @@ -32,7 +28,6 @@ import ( func TestClassCollector(t *testing.T) { c := qt.New((t)) - rnd := rand.New(rand.NewSource(time.Now().Unix())) f := func(tags, classes, ids string) HTMLElements { var tagss, classess, idss []string @@ -62,20 +57,14 @@ func TestClassCollector(t *testing.T) { expect HTMLElements }{ {"basic", `<body class="b a"></body>`, f("body", "a b", "")}, - {"duplicates", `<div class="b a b"></div><div class="b a b"></div>x'`, f("div", "a b", "")}, + {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")}, {"single quote", `<body class='b a'></body>`, f("body", "a b", "")}, {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")}, - {"short", `<i>`, f("i", "", "")}, - {"invalid", `< body class="b a"></body><div></div>`, f("div", "", "")}, // https://github.com/gohugoio/hugo/issues/7318 {"thead", `<table class="cl1"> <thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead> <tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody> </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")}, - {"thead uppercase", `<TABLE class="CL1"> - <THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD> - <TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY> -</TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")}, // https://github.com/gohugoio/hugo/issues/7161 {"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")}, {"AlpineJS bind 1", `<body> @@ -109,11 +98,6 @@ func TestClassCollector(t *testing.T) { {"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")}, {"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")}, {"Comments should be skipped", `<!-- example comment -->`, f("", "", "")}, - {"Comments with elements before and after", `<div></div><!-- example comment --><span><span>`, f("div span", "", "")}, - // Issue #8530 - {"Comment with single quote", `<!-- Hero Area Image d'accueil --><i class="foo">`, f("i", "foo", "")}, - {"Uppercase tags", `<DIV></DIV>`, f("div", "", "")}, - {"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")}, // Issue #8417 {"Tabs inline", `<hr id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")}, {"Tabs on multiple rows", `<form @@ -124,29 +108,16 @@ func TestClassCollector(t *testing.T) { <div id="b" class="foo">d</div>`, f("div form", "foo", "a b")}, } { - for _, variant := range []struct { - minify bool - stream bool - }{ - {minify: false, stream: false}, - {minify: true, stream: false}, - {minify: false, stream: true}, - } { - - c.Run(fmt.Sprintf("%s--minify-%t--stream-%t", test.name, variant.minify, variant.stream), func(c *qt.C) { + for _, minify := range []bool{false, true} { + c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) { w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) - if variant.minify { + if minify { if skipMinifyTest[test.name] { c.Skip("skip minify test") } v := viper.New() m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v) m.Minify(media.HTMLType, w, strings.NewReader(test.html)) - } else if variant.stream { - chunks := text.Chunk(test.html, rnd.Intn(41)+1) - for _, chunk := range chunks { - fmt.Fprint(w, chunk) - } } else { fmt.Fprint(w, test.html) } @@ -155,7 +126,6 @@ func TestClassCollector(t *testing.T) { }) } } - } func BenchmarkElementsCollectorWriter(b *testing.B) { |