about summary refs log tree commit diff homepage
path: root/publisher/htmlElementsCollector.go
diff options
context:
space:
mode:
author Bjørn Erik Pedersen <[email protected]> 2021-05-19 03:45:36 +0200
committer Bjørn Erik Pedersen <[email protected]> 2021-05-19 03:45:36 +0200
commit dc6b7a75ff5b7fcb8a0b0e3f7ed406422d847624 (patch)
tree f567d55d5ba900a488d777cb5fb979f414d2e061 /publisher/htmlElementsCollector.go
parent 3f515f0e3395b24776ae24045b846ff2b33b8906 (diff)
download hugo-dc6b7a75ff5b7fcb8a0b0e3f7ed406422d847624.tar.gz
hugo-dc6b7a75ff5b7fcb8a0b0e3f7ed406422d847624.zip
Revert "publisher: Make the HTML element collector more robust"
This reverts commit ef0f1a726901d6c614040cfc2d7e8f9a2ca97816.
Diffstat (limited to 'publisher/htmlElementsCollector.go')
-rw-r--r-- publisher/htmlElementsCollector.go 379
1 files changed, 149 insertions, 230 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index 1bc1a09bc..9dc28c4c2 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -19,51 +19,12 @@ import (
"sort"
"strings"
"sync"
- "unicode"
- "unicode/utf8"
"golang.org/x/net/html"
"github.com/gohugoio/hugo/helpers"
)
-const eof = -1
-
-var (
- htmlJsonFixer = strings.NewReplacer(", ", "\n")
- jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
- classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
-
- skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
- skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`)
- endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
-
- exceptionList = map[string]bool{
- "thead": true,
- "tbody": true,
- "tfoot": true,
- "td": true,
- "tr": true,
- }
-)
-
-func newHTMLElementsCollector() *htmlElementsCollector {
- return &htmlElementsCollector{
- elementSet: make(map[string]bool),
- }
-}
-
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
- w := &htmlElementsCollectorWriter{
- collector: collector,
- state: htmlLexStart,
- }
-
- w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
-
- return w
-}
-
// HTMLElements holds lists of tags and attribute values for classes and id.
type HTMLElements struct {
Tags []string `json:"tags"`
@@ -87,12 +48,6 @@ func (h *HTMLElements) Sort() {
sort.Strings(h.IDs)
}
-type htmlElement struct {
- Tag string
- Classes []string
- IDs []string
-}
-
type htmlElementsCollector struct {
// Contains the raw HTML string. We will get the same element
// several times, and want to avoid costly reparsing when this
@@ -104,6 +59,12 @@ type htmlElementsCollector struct {
mu sync.RWMutex
}
+func newHTMLElementsCollector() *htmlElementsCollector {
+ return &htmlElementsCollector{
+ elementSet: make(map[string]bool),
+ }
+}
+
func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
var (
classes []string
@@ -132,118 +93,114 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
type htmlElementsCollectorWriter struct {
collector *htmlElementsCollector
+ buff bytes.Buffer
- r rune // Current rune
- width int // The width in bytes of r
- input []byte // The current slice written to Write
- pos int // The current position in input
-
- err error
-
- inQuote rune
-
- buff bytes.Buffer
+ isCollecting bool
+ inPreTag string
- // Current state
- state htmlCollectorStateFunc
+ inQuote bool
+ quoteValue byte
+}
- // Precompiled state funcs
- defaultLexElementInside htmlCollectorStateFunc
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+ return &htmlElementsCollectorWriter{
+ collector: collector,
+ }
}
-// Write collects HTML elements from p.
+// Write splits the incoming stream into single html element.
func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
n = len(p)
- w.input = p
- w.pos = 0
-
- for {
- w.r = w.next()
- if w.r == eof {
- return
+ i := 0
+
+ for i < len(p) {
+ // If we are not collecting, cycle through byte stream until start bracket "<" is found.
+ if !w.isCollecting {
+ for ; i < len(p); i++ {
+ b := p[i]
+ if b == '<' {
+ w.startCollecting()
+ break
+ }
+ }
}
- w.state = w.state(w)
- }
-}
-
-func (l *htmlElementsCollectorWriter) backup() {
- l.pos -= l.width
- l.r, _ = utf8.DecodeRune(l.input[l.pos:])
-}
-func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
- var s htmlCollectorStateFunc
- s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
- w.buff.WriteRune(w.r)
- if condition() {
- w.buff.Reset()
- return resolve
+ if w.isCollecting {
+ // If we are collecting, cycle through byte stream until end bracket ">" is found,
+ // disregard any ">" if within a quote,
+ // write bytes until found to buffer.
+ for ; i < len(p); i++ {
+ b := p[i]
+ w.toggleIfQuote(b)
+ w.buff.WriteByte(b)
+
+ if !w.inQuote && b == '>' {
+ w.endCollecting()
+ break
+ }
+ }
}
- return s
- }
- return s
-}
-func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
- var s htmlCollectorStateFunc
- s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
- if condition(w.r) {
- return resolve
- }
- return s
- }
- return s
-}
+ // If no end bracket ">" is found while collecting, but the stream ended
+ // this could mean we received chunks of a stream from e.g. the minify functionality
+ // next if loop will be skipped.
-// Starts with e.g. "<body " or "<div"
-func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
- var s htmlCollectorStateFunc
- s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
- w.buff.WriteRune(w.r)
-
- // Skip any text inside a quote.
- if w.r == '\'' || w.r == '"' {
- if w.inQuote == w.r {
- w.inQuote = 0
- } else if w.inQuote == 0 {
- w.inQuote = w.r
+ // At this point we have collected an element line between angle brackets "<" and ">".
+ if !w.isCollecting {
+ if w.buff.Len() == 0 {
+ continue
}
- }
- if w.inQuote != 0 {
- return s
- }
+ if w.inPreTag != "" { // within preformatted code block
+ s := w.buff.String()
+ w.buff.Reset()
+ if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
+ w.inPreTag = ""
+ }
+ continue
+ }
- if w.r == '>' {
+ // First check if we have processed this element before.
+ w.collector.mu.RLock()
// Work with the bytes slice as long as it's practical,
// to save memory allocations.
b := w.buff.Bytes()
- defer func() {
- w.buff.Reset()
- }()
-
- // First check if we have processed this element before.
- w.collector.mu.RLock()
-
+ // See https://github.com/dominikh/go-tools/issues/723
+ //lint:ignore S1030 This construct avoids memory allocation for the string.
seen := w.collector.elementSet[string(b)]
w.collector.mu.RUnlock()
if seen {
- return resolve
+ w.buff.Reset()
+ continue
+ }
+
+ // Filter out unwanted tags
+ // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
+ // comments and doctype tags
+ // end tags.
+ switch {
+ case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
+ w.buff.Reset()
+ continue
+ case bytes.HasPrefix(b, []byte("</")): // end tag
+ w.buff.Reset()
+ continue
}
s := w.buff.String()
+ w.buff.Reset()
- if s == "" {
- return resolve
+ // Check if a preformatted code block started.
+ if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
+ w.inPreTag = tagName
}
// Parse each collected element.
el, err := parseHTMLElement(s)
if err != nil {
- w.err = err
- return resolve
+ return n, err
}
// Write this tag to the element set.
@@ -251,137 +208,109 @@ func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStat
w.collector.elementSet[s] = true
w.collector.elements = append(w.collector.elements, el)
w.collector.mu.Unlock()
-
- return resolve
-
}
-
- return s
}
- return s
+ return
}
-func (l *htmlElementsCollectorWriter) next() rune {
- if l.pos >= len(l.input) {
- l.width = 0
- return eof
- }
-
- runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
- l.width = runeWidth
- l.pos += l.width
- return runeValue
+func (c *htmlElementsCollectorWriter) startCollecting() {
+ c.isCollecting = true
}
-// returns the next state in HTML element scanner.
-type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
+func (c *htmlElementsCollectorWriter) endCollecting() {
+ c.isCollecting = false
+ c.inQuote = false
+}
-// At "<", buffer empty.
-// Potentially starting a HTML element.
-func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
- if w.r == '>' || unicode.IsSpace(w.r) {
- if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
- w.buff.Reset()
- return htmlLexStart
+func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
+ if isQuote(b) {
+ if c.inQuote && b == c.quoteValue {
+ c.inQuote = false
+ } else if !c.inQuote {
+ c.inQuote = true
+ c.quoteValue = b
}
+ }
+}
- tagName := w.buff.Bytes()[1:]
-
- switch {
- case skipInnerElementRe.Match(tagName):
- // pre, script etc. We collect classes etc. on the surrounding
- // element, but skip the inner content.
- w.backup()
+func isQuote(b byte) bool {
+ return b == '"' || b == '\''
+}
- // tagName will be overwritten, so make a copy.
- tagNameCopy := make([]byte, len(tagName))
- copy(tagNameCopy, tagName)
+func parseStartTag(s string) (string, bool) {
+ s = strings.TrimPrefix(s, "<")
+ s = strings.TrimSuffix(s, ">")
- return w.lexElementInside(
- w.consumeBuffUntil(
- func() bool {
- if w.r != '>' {
- return false
- }
- m := endTagRe.FindSubmatch(w.buff.Bytes())
- if m == nil {
- return false
- }
- return bytes.EqualFold(m[1], tagNameCopy)
- },
- htmlLexStart,
- ))
- case skipAllElementRe.Match(tagName):
- // E.g. "<!DOCTYPE ..."
- w.buff.Reset()
- return w.consumeRuneUntil(func(r rune) bool {
- return r == '>'
- }, htmlLexStart)
- default:
- w.backup()
- return w.defaultLexElementInside
- }
+ spaceIndex := strings.Index(s, " ")
+ if spaceIndex != -1 {
+ s = s[:spaceIndex]
}
- w.buff.WriteRune(w.r)
+ return strings.ToLower(strings.TrimSpace(s)), true
+}
- // If it's a comment, skip to its end.
- if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
- w.buff.Reset()
- return htmlLexToEndOfComment
+func parseEndTag(s string) (string, bool) {
+ if !strings.HasPrefix(s, "</") {
+ return "", false
}
- return htmlLexElementStart
+ s = strings.TrimPrefix(s, "</")
+ s = strings.TrimSuffix(s, ">")
+
+ return strings.ToLower(strings.TrimSpace(s)), true
}
-// Entry state func.
-// Looks for a opening bracket, '<'.
-func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
- if w.r == '<' {
- w.backup()
- w.buff.Reset()
- return htmlLexElementStart
- }
+// No need to look inside these for HTML elements.
+func isPreFormatted(s string) bool {
+ return s == "pre" || s == "textarea" || s == "script" || s == "style"
+}
- return htmlLexStart
+type htmlElement struct {
+ Tag string
+ Classes []string
+ IDs []string
}
-// After "<!--", buff empty.
-func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
- w.buff.WriteRune(w.r)
+var (
+ htmlJsonFixer = strings.NewReplacer(", ", "\n")
+ jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
+ classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
- if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
- // Done, start looking for HTML elements again.
- return htmlLexStart
+ exceptionList = map[string]bool{
+ "thead": true,
+ "tbody": true,
+ "tfoot": true,
+ "td": true,
+ "tr": true,
}
-
- return htmlLexToEndOfComment
-}
+)
func parseHTMLElement(elStr string) (el htmlElement, err error) {
+ var tagBuffer string = ""
- tagName := parseStartTag(elStr)
-
- el.Tag = strings.ToLower(tagName)
- tagNameToParse := el.Tag
+ tagName, ok := parseStartTag(elStr)
+ if !ok {
+ return
+ }
// The net/html parser does not handle single table elements as input, e.g. tbody.
// We only care about the element/class/ids, so just store away the original tag name
// and pretend it's a <div>.
- if exceptionList[el.Tag] {
+ if exceptionList[tagName] {
+ tagBuffer = tagName
elStr = strings.Replace(elStr, tagName, "div", 1)
- tagNameToParse = "div"
}
n, err := html.Parse(strings.NewReader(elStr))
if err != nil {
return
}
-
var walk func(*html.Node)
walk = func(n *html.Node) {
- if n.Type == html.ElementNode && n.Data == tagNameToParse {
+ if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
+ el.Tag = n.Data
+
for _, a := range n.Attr {
switch {
case strings.EqualFold(a.Key, "id"):
@@ -416,20 +345,10 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
walk(n)
- return
-}
-
-// Variants of s
-// <body class="b a">
-// <div>
-func parseStartTag(s string) string {
- spaceIndex := strings.IndexFunc(s, func(r rune) bool {
- return unicode.IsSpace(r)
- })
-
- if spaceIndex == -1 {
- return s[1 : len(s)-1]
+ // did we replaced the start tag?
+ if tagBuffer != "" {
+ el.Tag = tagBuffer
}
- return s[1:spaceIndex]
+ return
}