diff options
Diffstat (limited to 'publisher')
-rw-r--r-- | publisher/htmlElementsCollector.go | 73 | ||||
-rw-r--r-- | publisher/htmlElementsCollector_test.go | 28 |
2 files changed, 93 insertions, 8 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go index ca6e2d940..91e1237a9 100644 --- a/publisher/htmlElementsCollector.go +++ b/publisher/htmlElementsCollector.go @@ -36,7 +36,6 @@ var ( skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`) skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`) - endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`) exceptionList = map[string]bool{ "thead": true, @@ -312,11 +311,7 @@ func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc if w.r != '>' { return false } - m := endTagRe.FindSubmatch(w.buff.Bytes()) - if m == nil { - return false - } - return bytes.EqualFold(m[1], tagNameCopy) + return isClosedByTag(w.buff.Bytes(), tagNameCopy) }, htmlLexStart, )) @@ -428,8 +423,9 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) { } // Variants of s -// <body class="b a"> -// <div> +// +// <body class="b a"> +// <div> func parseStartTag(s string) string { spaceIndex := strings.IndexFunc(s, func(r rune) bool { return unicode.IsSpace(r) @@ -441,3 +437,64 @@ func parseStartTag(s string) string { return s[1:spaceIndex] } + +// isClosedByTag reports whether b ends with a closing tag for tagName. +func isClosedByTag(b, tagName []byte) bool { + if len(b) == 0 { + return false + } + + if b[len(b)-1] != '>' { + return false + } + + var ( + lo int + hi int + + state int + inWord bool + ) + +LOOP: + for i := len(b) - 2; i >= 0; i-- { + switch { + case b[i] == '<': + if state != 1 { + return false + } + state = 2 + break LOOP + case b[i] == '/': + if state != 0 { + return false + } + state++ + if inWord { + lo = i + 1 + inWord = false + } + case isSpace(b[i]): + if inWord { + lo = i + 1 + inWord = false + } + default: + if !inWord { + hi = i + 1 + inWord = true + } + } + } + + if state != 2 { + return false + } + + return bytes.EqualFold(tagName, b[lo:hi]) + +} + +func isSpace(b byte) bool { + return b == ' ' || b == '\t' || b == '\n' +} diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go index 8be8c46ac..11590e0a3 100644 --- a/publisher/htmlElementsCollector_test.go +++ b/publisher/htmlElementsCollector_test.go @@ -155,6 +155,34 @@ func TestClassCollector(t *testing.T) { } +func TestEndsWithTag(t *testing.T) { + c := qt.New((t)) + + for _, test := range []struct { + name string + s string + tagName string + expect bool + }{ + {"empty", "", "div", false}, + {"no match", "foo", "div", false}, + {"no close", "foo<div>", "div", false}, + {"no close 2", "foo/div>", "div", false}, + {"no close 2", "foo//div>", "div", false}, + {"no tag", "foo</>", "div", false}, + {"match", "foo</div>", "div", true}, + {"match space", "foo< / div>", "div", true}, + {"match space 2", "foo< / div \n>", "div", true}, + {"match case", "foo</DIV>", "div", true}, + } { + c.Run(test.name, func(c *qt.C) { + got := isClosedByTag([]byte(test.s), []byte(test.tagName)) + c.Assert(got, qt.Equals, test.expect) + }) + } + +} + func BenchmarkElementsCollectorWriter(b *testing.B) { const benchHTML = ` <!DOCTYPE html> |