diff options
Diffstat (limited to 'libs/html5lib/html5parser.py')
-rw-r--r-- | libs/html5lib/html5parser.py | 734 |
1 files changed, 369 insertions, 365 deletions
diff --git a/libs/html5lib/html5parser.py b/libs/html5lib/html5parser.py index 9d39b9d41..74d829d98 100644 --- a/libs/html5lib/html5parser.py +++ b/libs/html5lib/html5parser.py @@ -2,7 +2,6 @@ from __future__ import absolute_import, division, unicode_literals from six import with_metaclass, viewkeys import types -from collections import OrderedDict from . import _inputstream from . import _tokenizer @@ -119,8 +118,8 @@ class HTMLParser(object): self.tree = tree(namespaceHTMLElements) self.errors = [] - self.phases = dict([(name, cls(self, self.tree)) for name, cls in - getPhases(debug).items()]) + self.phases = {name: cls(self, self.tree) for name, cls in + getPhases(debug).items()} def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): @@ -202,7 +201,7 @@ class HTMLParser(object): DoctypeToken = tokenTypes["Doctype"] ParseErrorToken = tokenTypes["ParseError"] - for token in self.normalizedTokens(): + for token in self.tokenizer: prev_token = None new_token = token while new_token is not None: @@ -260,10 +259,6 @@ class HTMLParser(object): if reprocess: assert self.phase not in phases - def normalizedTokens(self): - for token in self.tokenizer: - yield self.normalizeToken(token) - def parse(self, stream, *args, **kwargs): """Parse a HTML document into a well-formed tree @@ -325,17 +320,6 @@ class HTMLParser(object): if self.strict: raise ParseError(E[errorcode] % datavars) - def normalizeToken(self, token): - # HTML5 specific normalizations to the token stream - if token["type"] == tokenTypes["StartTag"]: - raw = token["data"] - token["data"] = OrderedDict(raw) - if len(raw) > len(token["data"]): - # we had some duplicated attribute, fix so first wins - token["data"].update(raw[::-1]) - - return token - def adjustMathMLAttributes(self, token): adjust_attributes(token, adjustMathMLAttributes) @@ -413,16 +397,12 @@ class HTMLParser(object): def getPhases(debug): def log(function): """Logger that records which phase processes each token""" - type_names = dict((value, key) for key, value in - tokenTypes.items()) + type_names = {value: key for key, value in tokenTypes.items()} def wrapped(self, *args, **kwargs): if function.__name__.startswith("process") and len(args) > 0: token = args[0] - try: - info = {"type": type_names[token['type']]} - except: - raise + info = {"type": type_names[token['type']]} if token['type'] in tagTokenTypes: info["name"] = token['name'] @@ -446,10 +426,13 @@ def getPhases(debug): class Phase(with_metaclass(getMetaclass(debug, log))): """Base class for helper object that implements each phase of processing """ + __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") def __init__(self, parser, tree): self.parser = parser self.tree = tree + self.__startTagCache = {} + self.__endTagCache = {} def processEOF(self): raise NotImplementedError @@ -469,7 +452,21 @@ def getPhases(debug): self.tree.insertText(token["data"]) def processStartTag(self, token): - return self.startTagHandler[token["name"]](token) + # Note the caching is done here rather than BoundMethodDispatcher as doing it there + # requires a circular reference to the Phase, and this ends up with a significant + # (CPython 2.7, 3.8) GC cost when parsing many short inputs + name = token["name"] + # In Py2, using `in` is quicker in general than try/except KeyError + # In Py3, `in` is quicker when there are few cache hits (typically short inputs) + if name in self.__startTagCache: + func = self.__startTagCache[name] + else: + func = self.__startTagCache[name] = self.startTagHandler[name] + # bound the cache size in case we get loads of unknown tags + while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: + # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 + self.__startTagCache.pop(next(iter(self.__startTagCache))) + return func(token) def startTagHtml(self, token): if not self.parser.firstStartTag and token["name"] == "html": @@ -482,9 +479,25 @@ def getPhases(debug): self.parser.firstStartTag = False def processEndTag(self, token): - return self.endTagHandler[token["name"]](token) + # Note the caching is done here rather than BoundMethodDispatcher as doing it there + # requires a circular reference to the Phase, and this ends up with a significant + # (CPython 2.7, 3.8) GC cost when parsing many short inputs + name = token["name"] + # In Py2, using `in` is quicker in general than try/except KeyError + # In Py3, `in` is quicker when there are few cache hits (typically short inputs) + if name in self.__endTagCache: + func = self.__endTagCache[name] + else: + func = self.__endTagCache[name] = self.endTagHandler[name] + # bound the cache size in case we get loads of unknown tags + while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: + # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 + self.__endTagCache.pop(next(iter(self.__endTagCache))) + return func(token) class InitialPhase(Phase): + __slots__ = tuple() + def processSpaceCharacters(self, token): pass @@ -613,6 +626,8 @@ def getPhases(debug): return True class BeforeHtmlPhase(Phase): + __slots__ = tuple() + # helper methods def insertHtmlElement(self): self.tree.insertRoot(impliedTagToken("html", "StartTag")) @@ -648,19 +663,7 @@ def getPhases(debug): return token class BeforeHeadPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("head", self.startTagHead) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - (("head", "body", "html", "br"), self.endTagImplyHead) - ]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() def processEOF(self): self.startTagHead(impliedTagToken("head", "StartTag")) @@ -693,28 +696,19 @@ def getPhases(debug): self.parser.parseError("end-tag-after-implied-root", {"name": token["name"]}) + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + (("head", "body", "html", "br"), endTagImplyHead) + ]) + endTagHandler.default = endTagOther + class InHeadPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("title", self.startTagTitle), - (("noframes", "style"), self.startTagNoFramesStyle), - ("noscript", self.startTagNoscript), - ("script", self.startTagScript), - (("base", "basefont", "bgsound", "command", "link"), - self.startTagBaseLinkCommand), - ("meta", self.startTagMeta), - ("head", self.startTagHead) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("head", self.endTagHead), - (("br", "html", "body"), self.endTagHtmlBodyBr) - ]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() # the real thing def processEOF(self): @@ -796,22 +790,27 @@ def getPhases(debug): def anythingElse(self): self.endTagHead(impliedTagToken("head")) - class InHeadNoscriptPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("title", startTagTitle), + (("noframes", "style"), startTagNoFramesStyle), + ("noscript", startTagNoscript), + ("script", startTagScript), + (("base", "basefont", "bgsound", "command", "link"), + startTagBaseLinkCommand), + ("meta", startTagMeta), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("head", endTagHead), + (("br", "html", "body"), endTagHtmlBodyBr) + ]) + endTagHandler.default = endTagOther - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand), - (("head", "noscript"), self.startTagHeadNoscript), - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("noscript", self.endTagNoscript), - ("br", self.endTagBr), - ]) - self.endTagHandler.default = self.endTagOther + class InHeadNoscriptPhase(Phase): + __slots__ = tuple() def processEOF(self): self.parser.parseError("eof-in-head-noscript") @@ -860,23 +859,21 @@ def getPhases(debug): # Caller must raise parse error first! self.endTagNoscript(impliedTagToken("noscript")) + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand), + (("head", "noscript"), startTagHeadNoscript), + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("noscript", endTagNoscript), + ("br", endTagBr), + ]) + endTagHandler.default = endTagOther + class AfterHeadPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("body", self.startTagBody), - ("frameset", self.startTagFrameset), - (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", - "style", "title"), - self.startTagFromHead), - ("head", self.startTagHead) - ]) - self.startTagHandler.default = self.startTagOther - self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), - self.endTagHtmlBodyBr)]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() def processEOF(self): self.anythingElse() @@ -927,80 +924,30 @@ def getPhases(debug): self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("body", startTagBody), + ("frameset", startTagFrameset), + (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", + "style", "title"), + startTagFromHead), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther + endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), + endTagHtmlBodyBr)]) + endTagHandler.default = endTagOther + class InBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody # the really-really-really-very crazy mode - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) + __slots__ = ("processSpaceCharacters",) + def __init__(self, *args, **kwargs): + super(InBodyPhase, self).__init__(*args, **kwargs) # Set this to the default handler self.processSpaceCharacters = self.processSpaceCharactersNonPre - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("base", "basefont", "bgsound", "command", "link", "meta", - "script", "style", "title"), - self.startTagProcessInHead), - ("body", self.startTagBody), - ("frameset", self.startTagFrameset), - (("address", "article", "aside", "blockquote", "center", "details", - "dir", "div", "dl", "fieldset", "figcaption", "figure", - "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", - "section", "summary", "ul"), - self.startTagCloseP), - (headingElements, self.startTagHeading), - (("pre", "listing"), self.startTagPreListing), - ("form", self.startTagForm), - (("li", "dd", "dt"), self.startTagListItem), - ("plaintext", self.startTagPlaintext), - ("a", self.startTagA), - (("b", "big", "code", "em", "font", "i", "s", "small", "strike", - "strong", "tt", "u"), self.startTagFormatting), - ("nobr", self.startTagNobr), - ("button", self.startTagButton), - (("applet", "marquee", "object"), self.startTagAppletMarqueeObject), - ("xmp", self.startTagXmp), - ("table", self.startTagTable), - (("area", "br", "embed", "img", "keygen", "wbr"), - self.startTagVoidFormatting), - (("param", "source", "track"), self.startTagParamSource), - ("input", self.startTagInput), - ("hr", self.startTagHr), - ("image", self.startTagImage), - ("isindex", self.startTagIsIndex), - ("textarea", self.startTagTextarea), - ("iframe", self.startTagIFrame), - ("noscript", self.startTagNoscript), - (("noembed", "noframes"), self.startTagRawtext), - ("select", self.startTagSelect), - (("rp", "rt"), self.startTagRpRt), - (("option", "optgroup"), self.startTagOpt), - (("math"), self.startTagMath), - (("svg"), self.startTagSvg), - (("caption", "col", "colgroup", "frame", "head", - "tbody", "td", "tfoot", "th", "thead", - "tr"), self.startTagMisplaced) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("body", self.endTagBody), - ("html", self.endTagHtml), - (("address", "article", "aside", "blockquote", "button", "center", - "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", - "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", - "section", "summary", "ul"), self.endTagBlock), - ("form", self.endTagForm), - ("p", self.endTagP), - (("dd", "dt", "li"), self.endTagListItem), - (headingElements, self.endTagHeading), - (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", - "strike", "strong", "tt", "u"), self.endTagFormatting), - (("applet", "marquee", "object"), self.endTagAppletMarqueeObject), - ("br", self.endTagBr), - ]) - self.endTagHandler.default = self.endTagOther - def isMatchingFormattingElement(self, node1, node2): return (node1.name == node2.name and node1.namespace == node2.namespace and @@ -1650,14 +1597,73 @@ def getPhases(debug): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) break + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + (("base", "basefont", "bgsound", "command", "link", "meta", + "script", "style", "title"), + startTagProcessInHead), + ("body", startTagBody), + ("frameset", startTagFrameset), + (("address", "article", "aside", "blockquote", "center", "details", + "dir", "div", "dl", "fieldset", "figcaption", "figure", + "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", + "section", "summary", "ul"), + startTagCloseP), + (headingElements, startTagHeading), + (("pre", "listing"), startTagPreListing), + ("form", startTagForm), + (("li", "dd", "dt"), startTagListItem), + ("plaintext", startTagPlaintext), + ("a", startTagA), + (("b", "big", "code", "em", "font", "i", "s", "small", "strike", + "strong", "tt", "u"), startTagFormatting), + ("nobr", startTagNobr), + ("button", startTagButton), + (("applet", "marquee", "object"), startTagAppletMarqueeObject), + ("xmp", startTagXmp), + ("table", startTagTable), + (("area", "br", "embed", "img", "keygen", "wbr"), + startTagVoidFormatting), + (("param", "source", "track"), startTagParamSource), + ("input", startTagInput), + ("hr", startTagHr), + ("image", startTagImage), + ("isindex", startTagIsIndex), + ("textarea", startTagTextarea), + ("iframe", startTagIFrame), + ("noscript", startTagNoscript), + (("noembed", "noframes"), startTagRawtext), + ("select", startTagSelect), + (("rp", "rt"), startTagRpRt), + (("option", "optgroup"), startTagOpt), + (("math"), startTagMath), + (("svg"), startTagSvg), + (("caption", "col", "colgroup", "frame", "head", + "tbody", "td", "tfoot", "th", "thead", + "tr"), startTagMisplaced) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("body", endTagBody), + ("html", endTagHtml), + (("address", "article", "aside", "blockquote", "button", "center", + "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", + "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", + "section", "summary", "ul"), endTagBlock), + ("form", endTagForm), + ("p", endTagP), + (("dd", "dt", "li"), endTagListItem), + (headingElements, endTagHeading), + (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", + "strike", "strong", "tt", "u"), endTagFormatting), + (("applet", "marquee", "object"), endTagAppletMarqueeObject), + ("br", endTagBr), + ]) + endTagHandler.default = endTagOther + class TextPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([]) - self.startTagHandler.default = self.startTagOther - self.endTagHandler = _utils.MethodDispatcher([ - ("script", self.endTagScript)]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() def processCharacters(self, token): self.tree.insertText(token["data"]) @@ -1683,30 +1689,15 @@ def getPhases(debug): self.tree.openElements.pop() self.parser.phase = self.parser.originalPhase + startTagHandler = _utils.MethodDispatcher([]) + startTagHandler.default = startTagOther + endTagHandler = _utils.MethodDispatcher([ + ("script", endTagScript)]) + endTagHandler.default = endTagOther + class InTablePhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("caption", self.startTagCaption), - ("colgroup", self.startTagColgroup), - ("col", self.startTagCol), - (("tbody", "tfoot", "thead"), self.startTagRowGroup), - (("td", "th", "tr"), self.startTagImplyTbody), - ("table", self.startTagTable), - (("style", "script"), self.startTagStyleScript), - ("input", self.startTagInput), - ("form", self.startTagForm) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("table", self.endTagTable), - (("body", "caption", "col", "colgroup", "html", "tbody", "td", - "tfoot", "th", "thead", "tr"), self.endTagIgnore) - ]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() # helper methods def clearStackToTableContext(self): @@ -1828,9 +1819,32 @@ def getPhases(debug): self.parser.phases["inBody"].processEndTag(token) self.tree.insertFromTable = False + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("caption", startTagCaption), + ("colgroup", startTagColgroup), + ("col", startTagCol), + (("tbody", "tfoot", "thead"), startTagRowGroup), + (("td", "th", "tr"), startTagImplyTbody), + ("table", startTagTable), + (("style", "script"), startTagStyleScript), + ("input", startTagInput), + ("form", startTagForm) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("table", endTagTable), + (("body", "caption", "col", "colgroup", "html", "tbody", "td", + "tfoot", "th", "thead", "tr"), endTagIgnore) + ]) + endTagHandler.default = endTagOther + class InTableTextPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) + __slots__ = ("originalPhase", "characterTokens") + + def __init__(self, *args, **kwargs): + super(InTableTextPhase, self).__init__(*args, **kwargs) self.originalPhase = None self.characterTokens = [] @@ -1875,23 +1889,7 @@ def getPhases(debug): class InCaptionPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-caption - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", - "thead", "tr"), self.startTagTableElement) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("caption", self.endTagCaption), - ("table", self.endTagTable), - (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", - "thead", "tr"), self.endTagIgnore) - ]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() def ignoreEndTagCaption(self): return not self.tree.elementInScope("caption", variant="table") @@ -1944,23 +1942,24 @@ def getPhases(debug): def endTagOther(self, token): return self.parser.phases["inBody"].processEndTag(token) + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", + "thead", "tr"), startTagTableElement) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("caption", endTagCaption), + ("table", endTagTable), + (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", + "thead", "tr"), endTagIgnore) + ]) + endTagHandler.default = endTagOther + class InColumnGroupPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-column - - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("col", self.startTagCol) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("colgroup", self.endTagColgroup), - ("col", self.endTagCol) - ]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() def ignoreEndTagColgroup(self): return self.tree.openElements[-1].name == "html" @@ -2010,26 +2009,21 @@ def getPhases(debug): if not ignoreEndTag: return token + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("col", startTagCol) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("colgroup", endTagColgroup), + ("col", endTagCol) + ]) + endTagHandler.default = endTagOther + class InTableBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("tr", self.startTagTr), - (("td", "th"), self.startTagTableCell), - (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), - self.startTagTableOther) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), - ("table", self.endTagTable), - (("body", "caption", "col", "colgroup", "html", "td", "th", - "tr"), self.endTagIgnore) - ]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() # helper methods def clearStackToTableBodyContext(self): @@ -2108,26 +2102,26 @@ def getPhases(debug): def endTagOther(self, token): return self.parser.phases["inTable"].processEndTag(token) + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("tr", startTagTr), + (("td", "th"), startTagTableCell), + (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), + startTagTableOther) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + (("tbody", "tfoot", "thead"), endTagTableRowGroup), + ("table", endTagTable), + (("body", "caption", "col", "colgroup", "html", "td", "th", + "tr"), endTagIgnore) + ]) + endTagHandler.default = endTagOther + class InRowPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-row - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("td", "th"), self.startTagTableCell), - (("caption", "col", "colgroup", "tbody", "tfoot", "thead", - "tr"), self.startTagTableOther) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("tr", self.endTagTr), - ("table", self.endTagTable), - (("tbody", "tfoot", "thead"), self.endTagTableRowGroup), - (("body", "caption", "col", "colgroup", "html", "td", "th"), - self.endTagIgnore) - ]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() # helper methods (XXX unify this with other table helper methods) def clearStackToTableRowContext(self): @@ -2197,23 +2191,26 @@ def getPhases(debug): def endTagOther(self, token): return self.parser.phases["inTable"].processEndTag(token) + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + (("td", "th"), startTagTableCell), + (("caption", "col", "colgroup", "tbody", "tfoot", "thead", + "tr"), startTagTableOther) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("tr", endTagTr), + ("table", endTagTable), + (("tbody", "tfoot", "thead"), endTagTableRowGroup), + (("body", "caption", "col", "colgroup", "html", "td", "th"), + endTagIgnore) + ]) + endTagHandler.default = endTagOther + class InCellPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-cell - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", - "thead", "tr"), self.startTagTableOther) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - (("td", "th"), self.endTagTableCell), - (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore), - (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply) - ]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() # helper def closeCell(self): @@ -2273,26 +2270,22 @@ def getPhases(debug): def endTagOther(self, token): return self.parser.phases["inBody"].processEndTag(token) + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", + "thead", "tr"), startTagTableOther) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + (("td", "th"), endTagTableCell), + (("body", "caption", "col", "colgroup", "html"), endTagIgnore), + (("table", "tbody", "tfoot", "thead", "tr"), endTagImply) + ]) + endTagHandler.default = endTagOther + class InSelectPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("option", self.startTagOption), - ("optgroup", self.startTagOptgroup), - ("select", self.startTagSelect), - (("input", "keygen", "textarea"), self.startTagInput), - ("script", self.startTagScript) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([ - ("option", self.endTagOption), - ("optgroup", self.endTagOptgroup), - ("select", self.endTagSelect) - ]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() # http://www.whatwg.org/specs/web-apps/current-work/#in-select def processEOF(self): @@ -2373,21 +2366,25 @@ def getPhases(debug): self.parser.parseError("unexpected-end-tag-in-select", {"name": token["name"]}) - class InSelectInTablePhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), - self.startTagTable) - ]) - self.startTagHandler.default = self.startTagOther + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("option", startTagOption), + ("optgroup", startTagOptgroup), + ("select", startTagSelect), + (("input", "keygen", "textarea"), startTagInput), + ("script", startTagScript) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("option", endTagOption), + ("optgroup", endTagOptgroup), + ("select", endTagSelect) + ]) + endTagHandler.default = endTagOther - self.endTagHandler = _utils.MethodDispatcher([ - (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), - self.endTagTable) - ]) - self.endTagHandler.default = self.endTagOther + class InSelectInTablePhase(Phase): + __slots__ = tuple() def processEOF(self): self.parser.phases["inSelect"].processEOF() @@ -2412,7 +2409,21 @@ def getPhases(debug): def endTagOther(self, token): return self.parser.phases["inSelect"].processEndTag(token) + startTagHandler = _utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + startTagTable) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + endTagTable) + ]) + endTagHandler.default = endTagOther + class InForeignContentPhase(Phase): + __slots__ = tuple() + breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", "center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", @@ -2422,9 +2433,6 @@ def getPhases(debug): "span", "strong", "strike", "sub", "sup", "table", "tt", "u", "ul", "var"]) - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - def adjustSVGTagNames(self, token): replacements = {"altglyph": "altGlyph", "altglyphdef": "altGlyphDef", @@ -2478,7 +2486,7 @@ def getPhases(debug): currentNode = self.tree.openElements[-1] if (token["name"] in self.breakoutElements or (token["name"] == "font" and - set(token["data"].keys()) & set(["color", "face", "size"]))): + set(token["data"].keys()) & {"color", "face", "size"})): self.parser.parseError("unexpected-html-element-in-foreign-content", {"name": token["name"]}) while (self.tree.openElements[-1].namespace != @@ -2528,16 +2536,7 @@ def getPhases(debug): return new_token class AfterBodyPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) - - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml) - ]) - self.startTagHandler.default = self.startTagOther - - self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)]) - self.endTagHandler.default = self.endTagOther + __slots__ = tuple() def processEOF(self): # Stop parsing @@ -2574,23 +2573,17 @@ def getPhases(debug): self.parser.phase = self.parser.phases["inBody"] return token - class InFramesetPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml) + ]) + startTagHandler.default = startTagOther - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("frameset", self.startTagFrameset), - ("frame", self.startTagFrame), - ("noframes", self.startTagNoframes) - ]) - self.startTagHandler.default = self.startTagOther + endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)]) + endTagHandler.default = endTagOther - self.endTagHandler = _utils.MethodDispatcher([ - ("frameset", self.endTagFrameset) - ]) - self.endTagHandler.default = self.endTagOther + class InFramesetPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset + __slots__ = tuple() def processEOF(self): if self.tree.openElements[-1].name != "html": @@ -2631,21 +2624,22 @@ def getPhases(debug): self.parser.parseError("unexpected-end-tag-in-frameset", {"name": token["name"]}) - class AfterFramesetPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#after3 - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("frameset", startTagFrameset), + ("frame", startTagFrame), + ("noframes", startTagNoframes) + ]) + startTagHandler.default = startTagOther - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("noframes", self.startTagNoframes) - ]) - self.startTagHandler.default = self.startTagOther + endTagHandler = _utils.MethodDispatcher([ + ("frameset", endTagFrameset) + ]) + endTagHandler.default = endTagOther - self.endTagHandler = _utils.MethodDispatcher([ - ("html", self.endTagHtml) - ]) - self.endTagHandler.default = self.endTagOther + class AfterFramesetPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#after3 + __slots__ = tuple() def processEOF(self): # Stop parsing @@ -2668,14 +2662,19 @@ def getPhases(debug): self.parser.parseError("unexpected-end-tag-after-frameset", {"name": token["name"]}) - class AfterAfterBodyPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("noframes", startTagNoframes) + ]) + startTagHandler.default = startTagOther - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml) - ]) - self.startTagHandler.default = self.startTagOther + endTagHandler = _utils.MethodDispatcher([ + ("html", endTagHtml) + ]) + endTagHandler.default = endTagOther + + class AfterAfterBodyPhase(Phase): + __slots__ = tuple() def processEOF(self): pass @@ -2706,15 +2705,13 @@ def getPhases(debug): self.parser.phase = self.parser.phases["inBody"] return token - class AfterAfterFramesetPhase(Phase): - def __init__(self, parser, tree): - Phase.__init__(self, parser, tree) + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml) + ]) + startTagHandler.default = startTagOther - self.startTagHandler = _utils.MethodDispatcher([ - ("html", self.startTagHtml), - ("noframes", self.startTagNoFrames) - ]) - self.startTagHandler.default = self.startTagOther + class AfterAfterFramesetPhase(Phase): + __slots__ = tuple() def processEOF(self): pass @@ -2741,6 +2738,13 @@ def getPhases(debug): def processEndTag(self, token): self.parser.parseError("expected-eof-but-got-end-tag", {"name": token["name"]}) + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("noframes", startTagNoFrames) + ]) + startTagHandler.default = startTagOther + # pylint:enable=unused-argument return { @@ -2774,8 +2778,8 @@ def getPhases(debug): def adjust_attributes(token, replacements): needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) if needs_adjustment: - token['data'] = OrderedDict((replacements.get(k, k), v) - for k, v in token['data'].items()) + token['data'] = type(token['data'])((replacements.get(k, k), v) + for k, v in token['data'].items()) def impliedTagToken(name, type="EndTag", attributes=None, |