diff options
author | Louis Vézina <[email protected]> | 2019-09-13 15:12:26 -0400 |
---|---|---|
committer | Louis Vézina <[email protected]> | 2019-09-13 15:12:26 -0400 |
commit | 645952c61aba3cccb5ca919be966a7ba02d853fa (patch) | |
tree | 005362d11c260b4293b427ed8ba9d94e8b478ec3 /libs/bs4/builder | |
parent | 4e7e3a39d26cb3771876b43c82a747f6c6526dda (diff) | |
download | bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.tar.gz bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.zip |
WIP
Diffstat (limited to 'libs/bs4/builder')
-rw-r--r-- | libs/bs4/builder/__init__.py | 74 | ||||
-rw-r--r-- | libs/bs4/builder/_html5lib.py | 40 | ||||
-rw-r--r-- | libs/bs4/builder/_htmlparser.py | 72 | ||||
-rw-r--r-- | libs/bs4/builder/_lxml.py | 90 |
4 files changed, 192 insertions, 84 deletions
diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py index fdb3362fc..cc497cf0b 100644 --- a/libs/bs4/builder/__init__.py +++ b/libs/bs4/builder/__init__.py @@ -1,5 +1,5 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" from collections import defaultdict import itertools @@ -7,8 +7,7 @@ import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, - HTMLAwareEntitySubstitution, - whitespace_re + nonwhitespace_re ) __all__ = [ @@ -90,18 +89,46 @@ class TreeBuilder(object): is_xml = False picklable = False - preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. - + # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. - cdata_list_attributes = {} - - - def __init__(self): + DEFAULT_CDATA_LIST_ATTRIBUTES = {} + + DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + + USE_DEFAULT = object() + + def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT): + """Constructor. + + :param multi_valued_attributes: If this is set to None, the + TreeBuilder will not turn any values for attributes like + 'class' into lists. Setting this do a dictionary will + customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES + for an example. + + Internally, these are called "CDATA list attributes", but that + probably doesn't make sense to an end-user, so the argument name + is `multi_valued_attributes`. + + :param preserve_whitespace_tags: + """ self.soup = None - + if multi_valued_attributes is self.USE_DEFAULT: + multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES + self.cdata_list_attributes = multi_valued_attributes + if preserve_whitespace_tags is self.USE_DEFAULT: + preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS + self.preserve_whitespace_tags = preserve_whitespace_tags + + def initialize_soup(self, soup): + """The BeautifulSoup object has been initialized and is now + being associated with the TreeBuilder. + """ + self.soup = soup + def reset(self): pass @@ -125,7 +152,7 @@ class TreeBuilder(object): if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags - + def feed(self, markup): raise NotImplementedError() @@ -160,14 +187,14 @@ class TreeBuilder(object): universal = self.cdata_list_attributes.get('*', []) tag_specific = self.cdata_list_attributes.get( tag_name.lower(), None) - for attr in attrs.keys(): + for attr in list(attrs.keys()): if attr in universal or (tag_specific and attr in tag_specific): # We have a "class"-type attribute whose string # value is a whitespace-separated list of # values. Split it into a list. value = attrs[attr] - if isinstance(value, basestring): - values = whitespace_re.split(value) + if isinstance(value, str): + values = nonwhitespace_re.findall(value) else: # html5lib sometimes calls setAttributes twice # for the same tag when rearranging the parse @@ -231,15 +258,20 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags. """ - preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags empty_element_tags = set([ # These are from HTML5. 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', - - # These are from HTML4, removed in HTML5. - 'spacer', 'frame' + + # These are from earlier versions of HTML and are removed in HTML5. + 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' ]) + # The HTML standard defines these as block-level elements. Beautiful + # Soup does not treat these elements differently from other elements, + # but it may do so eventually, and this information is available if + # you need to use it. + block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) + # The HTML standard defines these attributes as containing a # space-separated list of values, not a single value. That is, # class="foo bar" means that the 'class' attribute has two values, @@ -247,7 +279,7 @@ class HTMLTreeBuilder(TreeBuilder): # encounter one of these attributes, we will parse its value into # a list of values if possible. Upon output, the list will be # converted back into a string. - cdata_list_attributes = { + DEFAULT_CDATA_LIST_ATTRIBUTES = { "*" : ['class', 'accesskey', 'dropzone'], "a" : ['rel', 'rev'], "link" : ['rel', 'rev'], @@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder): "output" : ["for"], } + DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + def set_up_substitutions(self, tag): # We are only interested in <meta> tags if tag.name != 'meta': diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py index 5f5489358..090bb61a8 100644 --- a/libs/bs4/builder/_html5lib.py +++ b/libs/bs4/builder/_html5lib.py @@ -1,5 +1,5 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" __all__ = [ 'HTML5TreeBuilder', @@ -15,7 +15,7 @@ from bs4.builder import ( ) from bs4.element import ( NamespacedAttribute, - whitespace_re, + nonwhitespace_re, ) import html5lib from html5lib.constants import ( @@ -33,7 +33,7 @@ try: # Pre-0.99999999 from html5lib.treebuilders import _base as treebuilder_base new_html5lib = False -except ImportError, e: +except ImportError as e: # 0.99999999 and up from html5lib.treebuilders import base as treebuilder_base new_html5lib = True @@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): parser = html5lib.HTMLParser(tree=self.create_treebuilder) extra_kwargs = dict() - if not isinstance(markup, unicode): + if not isinstance(markup, str): if new_html5lib: extra_kwargs['override_encoding'] = self.user_specified_encoding else: @@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): + if isinstance(markup, str): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: original_encoding = parser.tokenizer.stream.charEncoding[0] - if not isinstance(original_encoding, basestring): + if not isinstance(original_encoding, str): # In 0.99999999 and up, the encoding is an html5lib # Encoding object. We want to use a string for compatibility # with other tree builders. @@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<html><head></head><body>%s</body></html>' % fragment + return '<html><head></head><body>%s</body></html>' % fragment class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): @@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): rv.append("|%s<%s>" % (' ' * indent, name)) if element.attrs: attributes = [] - for name, value in element.attrs.items(): + for name, value in list(element.attrs.items()): if isinstance(name, NamespacedAttribute): name = "%s %s" % (prefixes[name.namespace], name.name) if isinstance(value, list): @@ -199,14 +199,14 @@ class AttrList(object): def __setitem__(self, name, value): # If this attribute is a multi-valued attribute for this element, # turn its value into a list. - list_attr = HTML5TreeBuilder.cdata_list_attributes + list_attr = self.element.cdata_list_attributes if (name in list_attr['*'] or (self.element.name in list_attr and name in list_attr[self.element.name])): # A node that is being cloned may have already undergone # this procedure. if not isinstance(value, list): - value = whitespace_re.split(value) + value = nonwhitespace_re.findall(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -229,7 +229,7 @@ class Element(treebuilder_base.Node): def appendChild(self, node): string_child = child = None - if isinstance(node, basestring): + if isinstance(node, str): # Some other piece of code decided to pass in a string # instead of creating a TextElement object to contain the # string. @@ -246,10 +246,10 @@ class Element(treebuilder_base.Node): child = node.element node.parent = self - if not isinstance(child, basestring) and child.parent is not None: + if not isinstance(child, str) and child.parent is not None: node.element.extract() - if (string_child and self.element.contents + if (string_child is not None and self.element.contents and self.element.contents[-1].__class__ == NavigableString): # We are appending a string onto another string. # TODO This has O(n^2) performance, for input like @@ -259,7 +259,7 @@ class Element(treebuilder_base.Node): old_element.replace_with(new_element) self.soup._most_recent_element = new_element else: - if isinstance(node, basestring): + if isinstance(node, str): # Create a brand new NavigableString from this string. child = self.soup.new_string(node) @@ -299,7 +299,7 @@ class Element(treebuilder_base.Node): self.soup.builder._replace_cdata_list_attribute_values( self.name, attributes) - for name, value in attributes.items(): + for name, value in list(attributes.items()): self.element[name] = value # The attributes may contain variables that need substitution. @@ -360,16 +360,16 @@ class Element(treebuilder_base.Node): # Set the first child's previous_element and previous_sibling # to elements within the new parent first_child = to_append[0] - if new_parents_last_descendant: + if new_parents_last_descendant is not None: first_child.previous_element = new_parents_last_descendant else: first_child.previous_element = new_parent_element first_child.previous_sibling = new_parents_last_child - if new_parents_last_descendant: + if new_parents_last_descendant is not None: new_parents_last_descendant.next_element = first_child else: new_parent_element.next_element = first_child - if new_parents_last_child: + if new_parents_last_child is not None: new_parents_last_child.next_sibling = first_child # Find the very last element being moved. It is now the @@ -379,7 +379,7 @@ class Element(treebuilder_base.Node): last_childs_last_descendant = to_append[-1]._last_descendant(False, True) last_childs_last_descendant.next_element = new_parents_last_descendant_next_element - if new_parents_last_descendant_next_element: + if new_parents_last_descendant_next_element is not None: # TODO: This code has no test coverage and I'm not sure # how to get html5lib to go through this path, but it's # just the other side of the previous line. diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py index 67890b3a3..ea549c356 100644 --- a/libs/bs4/builder/_htmlparser.py +++ b/libs/bs4/builder/_htmlparser.py @@ -1,17 +1,18 @@ +# encoding: utf-8 """Use the HTMLParser library to parse HTML files that aren't too bad.""" -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import HTMLParser +from html.parser import HTMLParser try: - from HTMLParser import HTMLParseError -except ImportError, e: + from html.parser import HTMLParseError +except ImportError as e: # HTMLParseError is removed in Python 3.5. Since it can never be # thrown in 3.5, we can just define our own class as a placeholder. class HTMLParseError(Exception): @@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser): # order. It's a list of closing tags we've already handled and # will ignore, assuming they ever show up. self.already_closed_empty_element = [] - + + def error(self, msg): + """In Python 3, HTMLParser subclasses must implement error(), although this + requirement doesn't appear to be documented. + + In Python 2, HTMLParser implements error() as raising an exception. + + In any event, this method is called only on very strange markup and our best strategy + is to pretend it didn't happen and keep going. + """ + warnings.warn(msg) + def handle_startendtag(self, name, attrs): # This is only called when the markup looks like # <tag/>. @@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser): else: real_name = int(name) - try: - data = unichr(real_name) - except (ValueError, OverflowError), e: - data = u"\N{REPLACEMENT CHARACTER}" - + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. “ + # instead of É for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. + for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError as e: + pass + if not data: + try: + data = chr(real_name) + except (ValueError, OverflowError) as e: + pass + data = data or "\N{REPLACEMENT CHARACTER}" self.handle_data(data) def handle_entityref(self, name): @@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser): if character is not None: data = character else: - data = "&%s;" % name + # If this were XML, it would be ambiguous whether "&foo" + # was an character entity reference with a missing + # semicolon or the literal string "&foo". Since this is + # HTML, we have a complete list of all character entity references, + # and this one wasn't found, so assume it's the literal string "&foo". + data = "&%s" % name self.handle_data(data) def handle_comment(self, data): @@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): NAME = HTMLPARSER features = [NAME, HTML, STRICT] - def __init__(self, *args, **kwargs): + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - kwargs['strict'] = False + parser_kwargs['strict'] = False if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - kwargs['convert_charrefs'] = False - self.parser_args = (args, kwargs) + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): @@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ - if isinstance(markup, unicode): + if isinstance(markup, str): yield (markup, None, None, False) return @@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) - except HTMLParseError, e: + parser.close() + except HTMLParseError as e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py index d2ca2872d..a490e2301 100644 --- a/libs/bs4/builder/_lxml.py +++ b/libs/bs4/builder/_lxml.py @@ -1,13 +1,18 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + __all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', ] +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable + from io import BytesIO -from StringIO import StringIO -import collections +from io import StringIO from lxml import etree from bs4.element import ( Comment, @@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector LXML = 'lxml' +def _invert(d): + "Invert a dictionary." + return dict((v,k) for k, v in list(d.items())) + class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser @@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder): # This namespace mapping is specified in the XML Namespace # standard. - DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} + DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') + + DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + + def initialize_soup(self, soup): + """Let the BeautifulSoup object know about the standard namespace + mapping. + """ + super(LXMLTreeBuilderForXML, self).initialize_soup(soup) + self._register_namespaces(self.DEFAULT_NSMAPS) + + def _register_namespaces(self, mapping): + """Let the BeautifulSoup object know about namespaces encountered + while parsing the document. + + This might be useful later on when creating CSS selectors. + """ + for key, value in list(mapping.items()): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same + # prefix, the first one in the document takes precedence. + self.soup._namespaces[key] = value def default_parser(self, encoding): # This can either return a parser object or a class, which @@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): # Use the default parser. parser = self.default_parser(encoding) - if isinstance(parser, collections.Callable): + if isinstance(parser, Callable): # Instantiate the parser with default arguments parser = parser(target=self, strip_cdata=False, encoding=encoding) return parser - def __init__(self, parser=None, empty_element_tags=None): + def __init__(self, parser=None, empty_element_tags=None, **kwargs): # TODO: Issue a warning if parser is present but not a # callable, since that means there's no way to create new # parsers for different encodings. @@ -71,8 +102,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): if empty_element_tags is not None: self.empty_element_tags = set(empty_element_tags) self.soup = None - self.nsmaps = [self.DEFAULT_NSMAPS] - + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + super(LXMLTreeBuilderForXML, self).__init__(**kwargs) + def _getNsTag(self, tag): # Split the namespace URL out of a fully-qualified lxml tag # name. Copied from lxml's src/lxml/sax.py. @@ -101,12 +133,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): else: self.processing_instruction_class = XMLProcessingInstruction - if isinstance(markup, unicode): + if isinstance(markup, str): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False - if isinstance(markup, unicode): + if isinstance(markup, str): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", @@ -121,7 +153,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) - elif isinstance(markup, unicode): + elif isinstance(markup, str): markup = StringIO(markup) # Call feed() at least once, even if the markup is empty, @@ -136,30 +168,36 @@ class LXMLTreeBuilderForXML(TreeBuilder): if len(data) != 0: self.parser.feed(data) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def close(self): - self.nsmaps = [self.DEFAULT_NSMAPS] + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] def start(self, name, attrs, nsmap={}): # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. attrs = dict(attrs) nsprefix = None # Invert each namespace map as it comes in. - if len(self.nsmaps) > 1: - # There are no new namespaces for this tag, but - # non-default namespaces are in play, so we need a - # separate tag stack to know when they end. - self.nsmaps.append(None) + if len(nsmap) == 0 and len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. + self.nsmaps.append(None) elif len(nsmap) > 0: # A new namespace mapping has come into play. - inverted_nsmap = dict((value, key) for key, value in nsmap.items()) - self.nsmaps.append(inverted_nsmap) + + # First, Let the BeautifulSoup object know about it. + self._register_namespaces(nsmap) + + # Then, add it to our running list of inverted namespace + # mappings. + self.nsmaps.append(_invert(nsmap)) + # Also treat the namespace mapping as a set of attributes on the # tag, so we can recreate it later. attrs = attrs.copy() - for prefix, namespace in nsmap.items(): + for prefix, namespace in list(nsmap.items()): attribute = NamespacedAttribute( "xmlns", prefix, "http://www.w3.org/2000/xmlns/") attrs[attribute] = namespace @@ -168,7 +206,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): # from lxml with namespaces attached to their names, and # turn then into NamespacedAttribute objects. new_attrs = {} - for attr, value in attrs.items(): + for attr, value in list(attrs.items()): namespace, attr = self._getNsTag(attr) if namespace is None: new_attrs[attr] = value @@ -228,7 +266,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment + return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): @@ -249,10 +287,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): self.parser = self.parser_for(encoding) self.parser.feed(markup) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'<html><body>%s</body></html>' % fragment + return '<html><body>%s</body></html>' % fragment |