diff options
author | Bjørn Erik Pedersen <[email protected]> | 2023-02-05 15:14:30 +0100 |
---|---|---|
committer | Bjørn Erik Pedersen <[email protected]> | 2023-02-05 20:01:39 +0100 |
commit | f9fc0e045bc1f72ba61fdf4a79b10a75a240394e (patch) | |
tree | d2c622aa89ff0aa20acfaf53e684833255671864 /publisher/htmlElementsCollector.go | |
parent | 4f4a1c00bfdc385c5afda9dcc1f259b1f9956991 (diff) | |
download | hugo-f9fc0e045bc1f72ba61fdf4a79b10a75a240394e.tar.gz hugo-f9fc0e045bc1f72ba61fdf4a79b10a75a240394e.zip |
Fix slow HTML elements collector for the pre case
```
name                           old time/op    new time/op    delta
ElementsCollectorWriterPre-10    25.2µs ± 1%     3.4µs ± 0%  -86.54%  (p=0.029 n=4+4)

name                           old alloc/op   new alloc/op   delta
ElementsCollectorWriterPre-10      624B ± 0%      142B ± 0%  -77.18%  (p=0.029 n=4+4)

name                           old allocs/op  new allocs/op  delta
ElementsCollectorWriterPre-10      16.0 ± 0%       6.0 ± 0%  -62.50%  (p=0.029 n=4+4)
```
Fixes #10698
Diffstat (limited to 'publisher/htmlElementsCollector.go')
-rw-r--r-- | publisher/htmlElementsCollector.go | 73 |
1 file changed, 65 insertions, 8 deletions
```diff
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index ca6e2d940..91e1237a9 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -36,7 +36,6 @@ var (
 
 	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
 	skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
-	endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
 
 	exceptionList = map[string]bool{
 		"thead": true,
@@ -312,11 +311,7 @@ func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc
 				if w.r != '>' {
 					return false
 				}
-				m := endTagRe.FindSubmatch(w.buff.Bytes())
-				if m == nil {
-					return false
-				}
-				return bytes.EqualFold(m[1], tagNameCopy)
+				return isClosedByTag(w.buff.Bytes(), tagNameCopy)
 			},
 			htmlLexStart,
 		))
@@ -428,8 +423,9 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
 }
 
 // Variants of s
-// <body class="b a">
-// <div>
+//
+//	<body class="b a">
+//	<div>
 func parseStartTag(s string) string {
 	spaceIndex := strings.IndexFunc(s, func(r rune) bool {
 		return unicode.IsSpace(r)
@@ -441,3 +437,64 @@ func parseStartTag(s string) string {
 
 	return s[1:spaceIndex]
 }
+
+// isClosedByTag reports whether b ends with a closing tag for tagName.
+func isClosedByTag(b, tagName []byte) bool {
+	if len(b) == 0 {
+		return false
+	}
+
+	if b[len(b)-1] != '>' {
+		return false
+	}
+
+	var (
+		lo int
+		hi int
+
+		state  int
+		inWord bool
+	)
+
+LOOP:
+	for i := len(b) - 2; i >= 0; i-- {
+		switch {
+		case b[i] == '<':
+			if state != 1 {
+				return false
+			}
+			state = 2
+			break LOOP
+		case b[i] == '/':
+			if state != 0 {
+				return false
+			}
+			state++
+			if inWord {
+				lo = i + 1
+				inWord = false
+			}
+		case isSpace(b[i]):
+			if inWord {
+				lo = i + 1
+				inWord = false
+			}
+		default:
+			if !inWord {
+				hi = i + 1
+				inWord = true
+			}
+		}
+	}
+
+	if state != 2 {
+		return false
+	}
+
+	return bytes.EqualFold(tagName, b[lo:hi])
+
+}
+
+func isSpace(b byte) bool {
+	return b == ' ' || b == '\t' || b == '\n'
+}
```