| field | value | date |
|---|---|---|
| author | Bjørn Erik Pedersen <[email protected]> | 2021-05-13 13:10:32 +0200 |
| committer | Bjørn Erik Pedersen <[email protected]> | 2021-05-19 20:43:49 +0200 |
| commit | f518b4f71e1a61b09d660b5c284121ebf3b3b86b (patch) | |
| tree | 910197dbf1526769a2f365cbc571791c8f417418 | |
| parent | dc6b7a75ff5b7fcb8a0b0e3f7ed406422d847624 (diff) | |
| download | hugo-f518b4f71e1a61b09d660b5c284121ebf3b3b86b.tar.gz, hugo-f518b4f71e1a61b09d660b5c284121ebf3b3b86b.zip | |
publisher: Make the HTML element collector more robust
Fixes #8530
| mode | path | changed |
|---|---|---|
| -rw-r--r-- | publisher/htmlElementsCollector.go | 389 |
| -rw-r--r-- | publisher/htmlElementsCollector_test.go | 37 |

2 files changed, 271 insertions, 155 deletions
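The patch replaces the previous byte-scanning `Write` loop with a rune-based state machine: each state is a function that consumes the current rune and returns the next state (the `htmlCollectorStateFunc` type in the diff). As a quick orientation before the diff itself, here is a minimal, self-contained sketch of that state-function pattern; the names (`lexer`, `lexText`, `lexTag`) are illustrative only and are not part of Hugo.

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

const eof = -1

// stateFn consumes the current rune and returns the state to use for the
// next rune, mirroring the htmlCollectorStateFunc type in the patch.
type stateFn func(*lexer) stateFn

type lexer struct {
	input string
	pos   int
	r     rune
	buff  []rune
	tags  []string
}

func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		return eof
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += w
	return r
}

// lexText scans plain text until an opening bracket '<' is seen.
func lexText(l *lexer) stateFn {
	if l.r == '<' {
		l.buff = l.buff[:0]
		return lexTag
	}
	return lexText
}

// lexTag collects a tag name until '>' or a space.
func lexTag(l *lexer) stateFn {
	if l.r == '>' || l.r == ' ' {
		if len(l.buff) > 0 {
			l.tags = append(l.tags, string(l.buff))
		}
		return lexText
	}
	l.buff = append(l.buff, l.r)
	return lexTag
}

func main() {
	l := &lexer{input: `<div class="x"><span>hi</span>`}
	for state := stateFn(lexText); ; {
		l.r = l.next()
		if l.r == eof {
			break
		}
		state = state(l)
	}
	fmt.Println(l.tags) // [div span /span]
}
```

Because the position, buffer and current state live on the lexer rather than in loop variables, the input can arrive in arbitrary chunks; that is the property the real collector relies on when it receives minified output piecewise.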
```diff
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index 9dc28c4c2..ca6e2d940 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -19,12 +19,51 @@ import (
 	"sort"
 	"strings"
 	"sync"
+	"unicode"
+	"unicode/utf8"
 
 	"golang.org/x/net/html"
 
 	"github.com/gohugoio/hugo/helpers"
 )
 
+const eof = -1
+
+var (
+	htmlJsonFixer = strings.NewReplacer(", ", "\n")
+	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
+	classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
+
+	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
+	skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
+	endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
+
+	exceptionList = map[string]bool{
+		"thead": true,
+		"tbody": true,
+		"tfoot": true,
+		"td":    true,
+		"tr":    true,
+	}
+)
+
+func newHTMLElementsCollector() *htmlElementsCollector {
+	return &htmlElementsCollector{
+		elementSet: make(map[string]bool),
+	}
+}
+
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+	w := &htmlElementsCollectorWriter{
+		collector: collector,
+		state:     htmlLexStart,
+	}
+
+	w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
+
+	return w
+}
+
 // HTMLElements holds lists of tags and attribute values for classes and id.
 type HTMLElements struct {
 	Tags    []string `json:"tags"`
@@ -48,6 +87,12 @@ func (h *HTMLElements) Sort() {
 	sort.Strings(h.IDs)
 }
 
+type htmlElement struct {
+	Tag     string
+	Classes []string
+	IDs     []string
+}
+
 type htmlElementsCollector struct {
 	// Contains the raw HTML string. We will get the same element
 	// several times, and want to avoid costly reparsing when this
@@ -59,12 +104,6 @@ type htmlElementsCollector struct {
 	mu sync.RWMutex
 }
 
-func newHTMLElementsCollector() *htmlElementsCollector {
-	return &htmlElementsCollector{
-		elementSet: make(map[string]bool),
-	}
-}
-
 func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
 	var (
 		classes []string
@@ -93,114 +132,125 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
 
 type htmlElementsCollectorWriter struct {
 	collector *htmlElementsCollector
-	buff      bytes.Buffer
 
-	isCollecting bool
-	inPreTag     string
+	r     rune   // Current rune
+	width int    // The width in bytes of r
+	input []byte // The current slice written to Write
+	pos   int    // The current position in input
+
+	err error
 
-	inQuote    bool
-	quoteValue byte
+	inQuote rune
+
+	buff bytes.Buffer
+
+	// Current state
+	state htmlCollectorStateFunc
+
+	// Precompiled state funcs
+	defaultLexElementInside htmlCollectorStateFunc
 }
 
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
-	return &htmlElementsCollectorWriter{
-		collector: collector,
+// Write collects HTML elements from p, which must contain complete runes.
+func (w *htmlElementsCollectorWriter) Write(p []byte) (int, error) {
+	if p == nil {
+		return 0, nil
 	}
-}
 
-// Write splits the incoming stream into single html element.
-func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
-	n = len(p)
-	i := 0
-
-	for i < len(p) {
-		// If we are not collecting, cycle through byte stream until start bracket "<" is found.
-		if !w.isCollecting {
-			for ; i < len(p); i++ {
-				b := p[i]
-				if b == '<' {
-					w.startCollecting()
-					break
-				}
-			}
+	w.input = p
+
+	for {
+		w.r = w.next()
+		if w.r == eof || w.r == utf8.RuneError {
+			break
 		}
+		w.state = w.state(w)
+	}
 
-		if w.isCollecting {
-			// If we are collecting, cycle through byte stream until end bracket ">" is found,
-			// disregard any ">" if within a quote,
-			// write bytes until found to buffer.
-			for ; i < len(p); i++ {
-				b := p[i]
-				w.toggleIfQuote(b)
-				w.buff.WriteByte(b)
-
-				if !w.inQuote && b == '>' {
-					w.endCollecting()
-					break
-				}
-			}
+	w.pos = 0
+	w.input = nil
+
+	return len(p), nil
+}
+
+func (l *htmlElementsCollectorWriter) backup() {
+	l.pos -= l.width
+	l.r, _ = utf8.DecodeRune(l.input[l.pos:])
+}
+
+func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+	var s htmlCollectorStateFunc
+	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
+		w.buff.WriteRune(w.r)
+		if condition() {
+			w.buff.Reset()
+			return resolve
 		}
+		return s
+	}
+	return s
+}
 
-		// If no end bracket ">" is found while collecting, but the stream ended
-		// this could mean we received chunks of a stream from e.g. the minify functionality
-		// next if loop will be skipped.
+func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+	var s htmlCollectorStateFunc
+	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
+		if condition(w.r) {
+			return resolve
+		}
+		return s
+	}
+	return s
+}
 
-		// At this point we have collected an element line between angle brackets "<" and ">".
-		if !w.isCollecting {
-			if w.buff.Len() == 0 {
-				continue
+// Starts with e.g. "<body " or "<div"
+func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+	var s htmlCollectorStateFunc
+	s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+		w.buff.WriteRune(w.r)
+
+		// Skip any text inside a quote.
+		if w.r == '\'' || w.r == '"' {
+			if w.inQuote == w.r {
+				w.inQuote = 0
+			} else if w.inQuote == 0 {
+				w.inQuote = w.r
 			}
+		}
 
-			if w.inPreTag != "" { // within preformatted code block
-				s := w.buff.String()
-				w.buff.Reset()
-				if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
-					w.inPreTag = ""
-				}
-				continue
-			}
+		if w.inQuote != 0 {
+			return s
+		}
 
-			// First check if we have processed this element before.
-			w.collector.mu.RLock()
+		if w.r == '>' {
 
 			// Work with the bytes slice as long as it's practical,
 			// to save memory allocations.
 			b := w.buff.Bytes()
 
-			// See https://github.com/dominikh/go-tools/issues/723
-			//lint:ignore S1030 This construct avoids memory allocation for the string.
+			defer func() {
+				w.buff.Reset()
+			}()
+
+			// First check if we have processed this element before.
+			w.collector.mu.RLock()
+
 			seen := w.collector.elementSet[string(b)]
 			w.collector.mu.RUnlock()
 			if seen {
-				w.buff.Reset()
-				continue
-			}
-
-			// Filter out unwanted tags
-			// if within preformatted code blocks <pre>, <textarea>, <script>, <style>
-			// comments and doctype tags
-			// end tags.
-			switch {
-			case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
-				w.buff.Reset()
-				continue
-			case bytes.HasPrefix(b, []byte("</")): // end tag
-				w.buff.Reset()
-				continue
+				return resolve
 			}
 
 			s := w.buff.String()
-			w.buff.Reset()
 
-			// Check if a preformatted code block started.
-			if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
-				w.inPreTag = tagName
+			if s == "" {
+				return resolve
 			}
 
 			// Parse each collected element.
 			el, err := parseHTMLElement(s)
 			if err != nil {
-				return n, err
+				w.err = err
+				return resolve
 			}
 
 			// Write this tag to the element set.
@@ -208,109 +258,138 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
 			w.collector.elementSet[s] = true
 			w.collector.elements = append(w.collector.elements, el)
 			w.collector.mu.Unlock()
+
+			return resolve
+		}
+
+		return s
 	}
 
-	return
+	return s
 }
 
-func (c *htmlElementsCollectorWriter) startCollecting() {
-	c.isCollecting = true
-}
+func (l *htmlElementsCollectorWriter) next() rune {
+	if l.pos >= len(l.input) {
+		l.width = 0
+		return eof
+	}
 
-func (c *htmlElementsCollectorWriter) endCollecting() {
-	c.isCollecting = false
-	c.inQuote = false
+	runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
+
+	l.width = runeWidth
+	l.pos += l.width
+	return runeValue
 }
 
-func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
-	if isQuote(b) {
-		if c.inQuote && b == c.quoteValue {
-			c.inQuote = false
-		} else if !c.inQuote {
-			c.inQuote = true
-			c.quoteValue = b
+// returns the next state in HTML element scanner.
+type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
+
+// At "<", buffer empty.
+// Potentially starting a HTML element.
+func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+	if w.r == '>' || unicode.IsSpace(w.r) {
+		if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
+			w.buff.Reset()
+			return htmlLexStart
 		}
-	}
-}
 
-func isQuote(b byte) bool {
-	return b == '"' || b == '\''
-}
+		tagName := w.buff.Bytes()[1:]
 
-func parseStartTag(s string) (string, bool) {
-	s = strings.TrimPrefix(s, "<")
-	s = strings.TrimSuffix(s, ">")
+		switch {
+		case skipInnerElementRe.Match(tagName):
+			// pre, script etc. We collect classes etc. on the surrounding
+			// element, but skip the inner content.
+			w.backup()
 
-	spaceIndex := strings.Index(s, " ")
-	if spaceIndex != -1 {
-		s = s[:spaceIndex]
+			// tagName will be overwritten, so make a copy.
+			tagNameCopy := make([]byte, len(tagName))
+			copy(tagNameCopy, tagName)
+
+			return w.lexElementInside(
+				w.consumeBuffUntil(
+					func() bool {
+						if w.r != '>' {
+							return false
+						}
+						m := endTagRe.FindSubmatch(w.buff.Bytes())
+						if m == nil {
+							return false
+						}
+						return bytes.EqualFold(m[1], tagNameCopy)
+					},
+					htmlLexStart,
+				))
+		case skipAllElementRe.Match(tagName):
+			// E.g. "<!DOCTYPE ..."
+			w.buff.Reset()
+			return w.consumeRuneUntil(func(r rune) bool {
+				return r == '>'
+			}, htmlLexStart)
+		default:
+			w.backup()
+			return w.defaultLexElementInside
+		}
 	}
 
-	return strings.ToLower(strings.TrimSpace(s)), true
-}
+	w.buff.WriteRune(w.r)
 
-func parseEndTag(s string) (string, bool) {
-	if !strings.HasPrefix(s, "</") {
-		return "", false
+	// If it's a comment, skip to its end.
+	if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
+		w.buff.Reset()
+		return htmlLexToEndOfComment
 	}
 
-	s = strings.TrimPrefix(s, "</")
-	s = strings.TrimSuffix(s, ">")
-
-	return strings.ToLower(strings.TrimSpace(s)), true
+	return htmlLexElementStart
 }
 
-// No need to look inside these for HTML elements.
-func isPreFormatted(s string) bool {
-	return s == "pre" || s == "textarea" || s == "script" || s == "style"
-}
+// Entry state func.
+// Looks for a opening bracket, '<'.
+func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+	if w.r == '<' {
+		w.backup()
+		w.buff.Reset()
+		return htmlLexElementStart
+	}
 
-type htmlElement struct {
-	Tag     string
-	Classes []string
-	IDs     []string
+	return htmlLexStart
 }
 
-var (
-	htmlJsonFixer = strings.NewReplacer(", ", "\n")
-	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
-	classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
+// After "<!--", buff empty.
+func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+	w.buff.WriteRune(w.r)
 
-	exceptionList = map[string]bool{
-		"thead": true,
-		"tbody": true,
-		"tfoot": true,
-		"td":    true,
-		"tr":    true,
+	if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
		// Done, start looking for HTML elements again.
+		return htmlLexStart
 	}
-)
+
+	return htmlLexToEndOfComment
+}
 
 func parseHTMLElement(elStr string) (el htmlElement, err error) {
-	var tagBuffer string = ""
-
-	tagName, ok := parseStartTag(elStr)
-	if !ok {
-		return
-	}
+	tagName := parseStartTag(elStr)
+
+	el.Tag = strings.ToLower(tagName)
+	tagNameToParse := el.Tag
 
 	// The net/html parser does not handle single table elements as input, e.g. tbody.
 	// We only care about the element/class/ids, so just store away the original tag name
 	// and pretend it's a <div>.
-	if exceptionList[tagName] {
-		tagBuffer = tagName
+	if exceptionList[el.Tag] {
 		elStr = strings.Replace(elStr, tagName, "div", 1)
+		tagNameToParse = "div"
 	}
 
 	n, err := html.Parse(strings.NewReader(elStr))
 	if err != nil {
 		return
 	}
+
 	var walk func(*html.Node)
 	walk = func(n *html.Node) {
-		if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
-			el.Tag = n.Data
-
+		if n.Type == html.ElementNode && n.Data == tagNameToParse {
 			for _, a := range n.Attr {
 				switch {
 				case strings.EqualFold(a.Key, "id"):
@@ -345,10 +424,20 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
 
 	walk(n)
 
-	// did we replaced the start tag?
-	if tagBuffer != "" {
-		el.Tag = tagBuffer
+	return
+}
+
+// Variants of s
+//	<body class="b a">
+//	<div>
+func parseStartTag(s string) string {
+	spaceIndex := strings.IndexFunc(s, func(r rune) bool {
+		return unicode.IsSpace(r)
+	})
+
+	if spaceIndex == -1 {
+		return s[1 : len(s)-1]
 	}
 
-	return
+	return s[1:spaceIndex]
 }
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
index 0c8b2b65b..2400b1612 100644
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -14,9 +14,13 @@
 package publisher
 
 import (
+	"bytes"
 	"fmt"
+	"io"
+	"math/rand"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/gohugoio/hugo/media"
 	"github.com/gohugoio/hugo/minifiers"
@@ -28,6 +32,7 @@ import (
 
 func TestClassCollector(t *testing.T) {
 	c := qt.New((t))
+	rnd := rand.New(rand.NewSource(time.Now().Unix()))
 
 	f := func(tags, classes, ids string) HTMLElements {
 		var tagss, classess, idss []string
@@ -57,14 +62,20 @@ func TestClassCollector(t *testing.T) {
 		expect  HTMLElements
 	}{
 		{"basic", `<body class="b a"></body>`, f("body", "a b", "")},
-		{"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
+		{"duplicates", `<div class="b a b"></div><div class="b a b"></div>x'`, f("div", "a b", "")},
 		{"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
 		{"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
+		{"short", `<i>`, f("i", "", "")},
+		{"invalid", `< body class="b a"></body><div></div>`, f("div", "", "")},
 		// https://github.com/gohugoio/hugo/issues/7318
 		{"thead", `<table class="cl1">
 	<thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
 	<tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
 </table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
+		{"thead uppercase", `<TABLE class="CL1">
+	<THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD>
+	<TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY>
+</TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")},
 		// https://github.com/gohugoio/hugo/issues/7161
 		{"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
 		{"AlpineJS bind 1", `<body>
@@ -98,6 +109,11 @@ func TestClassCollector(t *testing.T) {
 		{"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
 		{"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
 		{"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
+		{"Comments with elements before and after", `<div></div><!-- example comment --><span><span>`, f("div span", "", "")},
+		// Issue #8530
+		{"Comment with single quote", `<!-- Hero Area Image d'accueil --><i class="foo">`, f("i", "foo", "")},
+		{"Uppercase tags", `<DIV></DIV>`, f("div", "", "")},
+		{"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")},
 		// Issue #8417
 		{"Tabs inline", `<hr id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
 		{"Tabs on multiple rows", `<form
@@ -106,26 +122,37 @@ func TestClassCollector(t *testing.T) {
 			method="post"
 ></form>
 <div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
+		{"Big input, multibyte runes", strings.Repeat(`神真美好 `, rnd.Intn(500)+1) + "<div id=\"神真美好\" class=\"foo\">" + strings.Repeat(`神真美好 `, rnd.Intn(100)+1) + " <span>神真美好</span>", f("div span", "foo", "神真美好")},
 	} {
-		for _, minify := range []bool{false, true} {
-			c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
+		for _, variant := range []struct {
+			minify bool
+		}{
+			{minify: false},
+			{minify: true},
+		} {
+
+			c.Run(fmt.Sprintf("%s--minify-%t", test.name, variant.minify), func(c *qt.C) {
 				w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
-				if minify {
+				if variant.minify {
 					if skipMinifyTest[test.name] {
 						c.Skip("skip minify test")
 					}
 					v := viper.New()
 					m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
 					m.Minify(media.HTMLType, w, strings.NewReader(test.html))
+
 				} else {
-					fmt.Fprint(w, test.html)
+					var buff bytes.Buffer
+					buff.WriteString(test.html)
+					io.Copy(w, &buff)
 				}
 
 				got := w.collector.getHTMLElements()
 				c.Assert(got, qt.DeepEquals, test.expect)
 			})
 		}
 	}
+
 }
 
 func BenchmarkElementsCollectorWriter(b *testing.B) {
```
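For completeness, a rough sketch of how the rewritten collector is driven, modeled on the test file above. This is not part of the patch; it assumes it lives inside Hugo's `publisher` package, since the collector types are unexported.

```go
package publisher

import "fmt"

// sketchCollect feeds HTML to the collector in arbitrary chunks, the way
// minified output arrives, and returns the aggregated elements. Chunks must
// split on rune boundaries, since Write expects complete runes.
func sketchCollect(chunks ...string) HTMLElements {
	w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())

	// The lexer state and buffer persist across Write calls, so a single
	// element may be split between chunks.
	for _, chunk := range chunks {
		fmt.Fprint(w, chunk)
	}

	return w.collector.getHTMLElements()
}

// Example:
//	els := sketchCollect(`<div id="a" cla`, `ss="foo bar"><span>x</span></div>`)
//	// els.Tags    -> [div span]
//	// els.Classes -> [bar foo]
//	// els.IDs     -> [a]
```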