WIP

author: Louis Vézina <[email protected]> 2019-09-13 15:12:26 -0400
committer: Louis Vézina <[email protected]> 2019-09-13 15:12:26 -0400
commit: 645952c61aba3cccb5ca919be966a7ba02d853fa (patch)
tree: 005362d11c260b4293b427ed8ba9d94e8b478ec3 /libs/bs4/builder
parent: 4e7e3a39d26cb3771876b43c82a747f6c6526dda (diff)
download: bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.tar.gz
bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.zip
4 files changed, 192 insertions, 84 deletions
diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py
index fdb3362fc..cc497cf0b 100644
--- a/libs/bs4/builder/__init__.py
+++ b/libs/bs4/builder/__init__.py
@@ -1,5 +1,5 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
 
 from collections import defaultdict
 import itertools
@@ -7,8 +7,7 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
-    HTMLAwareEntitySubstitution,
-    whitespace_re
+    nonwhitespace_re
     )
 
 __all__ = [
@@ -90,18 +89,46 @@ class TreeBuilder(object):
 
     is_xml = False
     picklable = False
-    preserve_whitespace_tags = set()
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.
-
+    
     # A value for these tag/attribute combinations is a space- or
     # comma-separated list of CDATA, rather than a single CDATA.
-    cdata_list_attributes = {}
-
-
-    def __init__(self):
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+
+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+    
+    USE_DEFAULT = object()
+    
+    def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
+        """Constructor.
+
+        :param multi_valued_attributes: If this is set to None, the
+        TreeBuilder will not turn any values for attributes like
+        'class' into lists. Setting this do a dictionary will
+        customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+        for an example.
+
+        Internally, these are called "CDATA list attributes", but that
+        probably doesn't make sense to an end-user, so the argument name
+        is `multi_valued_attributes`.
+
+        :param preserve_whitespace_tags:
+        """
         self.soup = None
-
+        if multi_valued_attributes is self.USE_DEFAULT:
+            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+        self.cdata_list_attributes = multi_valued_attributes
+        if preserve_whitespace_tags is self.USE_DEFAULT:
+            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+        self.preserve_whitespace_tags = preserve_whitespace_tags
+            
+    def initialize_soup(self, soup):
+        """The BeautifulSoup object has been initialized and is now
+        being associated with the TreeBuilder.
+        """
+        self.soup = soup
+        
     def reset(self):
         pass
 
@@ -125,7 +152,7 @@ class TreeBuilder(object):
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags
-
+    
     def feed(self, markup):
         raise NotImplementedError()
 
@@ -160,14 +187,14 @@ class TreeBuilder(object):
             universal = self.cdata_list_attributes.get('*', [])
             tag_specific = self.cdata_list_attributes.get(
                 tag_name.lower(), None)
-            for attr in attrs.keys():
+            for attr in list(attrs.keys()):
                 if attr in universal or (tag_specific and attr in tag_specific):
                     # We have a "class"-type attribute whose string
                     # value is a whitespace-separated list of
                     # values. Split it into a list.
                     value = attrs[attr]
-                    if isinstance(value, basestring):
-                        values = whitespace_re.split(value)
+                    if isinstance(value, str):
+                        values = nonwhitespace_re.findall(value)
                     else:
                         # html5lib sometimes calls setAttributes twice
                         # for the same tag when rearranging the parse
@@ -231,15 +258,20 @@ class HTMLTreeBuilder(TreeBuilder):
     Such as which tags are empty-element tags.
     """
 
-    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
     empty_element_tags = set([
         # These are from HTML5.
         'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
-        # These are from HTML4, removed in HTML5.
-        'spacer', 'frame'
+        
+        # These are from earlier versions of HTML and are removed in HTML5.
+        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
     ])
 
+    # The HTML standard defines these as block-level elements. Beautiful
+    # Soup does not treat these elements differently from other elements,
+    # but it may do so eventually, and this information is available if
+    # you need to use it.
+    block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+    
     # The HTML standard defines these attributes as containing a
     # space-separated list of values, not a single value. That is,
     # class="foo bar" means that the 'class' attribute has two values,
@@ -247,7 +279,7 @@ class HTMLTreeBuilder(TreeBuilder):
     # encounter one of these attributes, we will parse its value into
     # a list of values if possible. Upon output, the list will be
     # converted back into a string.
-    cdata_list_attributes = {
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {
         "*" : ['class', 'accesskey', 'dropzone'],
         "a" : ['rel', 'rev'],
         "link" :  ['rel', 'rev'],
@@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder):
         "output" : ["for"],
         }
 
+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+    
     def set_up_substitutions(self, tag):
         # We are only interested in <meta> tags
         if tag.name != 'meta':
diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py
index 5f5489358..090bb61a8 100644
--- a/libs/bs4/builder/_html5lib.py
+++ b/libs/bs4/builder/_html5lib.py
@@ -1,5 +1,5 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
 
 __all__ = [
     'HTML5TreeBuilder',
@@ -15,7 +15,7 @@ from bs4.builder import (
     )
 from bs4.element import (
     NamespacedAttribute,
-    whitespace_re,
+    nonwhitespace_re,
 )
 import html5lib
 from html5lib.constants import (
@@ -33,7 +33,7 @@ try:
     # Pre-0.99999999
     from html5lib.treebuilders import _base as treebuilder_base
     new_html5lib = False
-except ImportError, e:
+except ImportError as e:
     # 0.99999999 and up
     from html5lib.treebuilders import base as treebuilder_base
     new_html5lib = True
@@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
 
         extra_kwargs = dict()
-        if not isinstance(markup, unicode):
+        if not isinstance(markup, str):
             if new_html5lib:
                 extra_kwargs['override_encoding'] = self.user_specified_encoding
             else:
@@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         doc = parser.parse(markup, **extra_kwargs)
 
         # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # We need to special-case this because html5lib sets
             # charEncoding to UTF-8 if it gets Unicode input.
             doc.original_encoding = None
         else:
             original_encoding = parser.tokenizer.stream.charEncoding[0]
-            if not isinstance(original_encoding, basestring):
+            if not isinstance(original_encoding, str):
                 # In 0.99999999 and up, the encoding is an html5lib
                 # Encoding object. We want to use a string for compatibility
                 # with other tree builders.
@@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
 
     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
+        return '<html><head></head><body>%s</body></html>' % fragment
 
 
 class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
@@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
                 rv.append("|%s<%s>" % (' ' * indent, name))
                 if element.attrs:
                     attributes = []
-                    for name, value in element.attrs.items():
+                    for name, value in list(element.attrs.items()):
                         if isinstance(name, NamespacedAttribute):
                             name = "%s %s" % (prefixes[name.namespace], name.name)
                         if isinstance(value, list):
@@ -199,14 +199,14 @@ class AttrList(object):
     def __setitem__(self, name, value):
         # If this attribute is a multi-valued attribute for this element,
         # turn its value into a list.
-        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        list_attr = self.element.cdata_list_attributes
         if (name in list_attr['*']
             or (self.element.name in list_attr
                 and name in list_attr[self.element.name])):
             # A node that is being cloned may have already undergone
             # this procedure.
             if not isinstance(value, list):
-                value = whitespace_re.split(value)
+                value = nonwhitespace_re.findall(value)
         self.element[name] = value
     def items(self):
         return list(self.attrs.items())
@@ -229,7 +229,7 @@ class Element(treebuilder_base.Node):
 
     def appendChild(self, node):
         string_child = child = None
-        if isinstance(node, basestring):
+        if isinstance(node, str):
             # Some other piece of code decided to pass in a string
             # instead of creating a TextElement object to contain the
             # string.
@@ -246,10 +246,10 @@ class Element(treebuilder_base.Node):
             child = node.element
             node.parent = self
 
-        if not isinstance(child, basestring) and child.parent is not None:
+        if not isinstance(child, str) and child.parent is not None:
             node.element.extract()
 
-        if (string_child and self.element.contents
+        if (string_child is not None and self.element.contents
             and self.element.contents[-1].__class__ == NavigableString):
             # We are appending a string onto another string.
             # TODO This has O(n^2) performance, for input like
@@ -259,7 +259,7 @@ class Element(treebuilder_base.Node):
             old_element.replace_with(new_element)
             self.soup._most_recent_element = new_element
         else:
-            if isinstance(node, basestring):
+            if isinstance(node, str):
                 # Create a brand new NavigableString from this string.
                 child = self.soup.new_string(node)
 
@@ -299,7 +299,7 @@ class Element(treebuilder_base.Node):
 
             self.soup.builder._replace_cdata_list_attribute_values(
                 self.name, attributes)
-            for name, value in attributes.items():
+            for name, value in list(attributes.items()):
                 self.element[name] = value
 
             # The attributes may contain variables that need substitution.
@@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
             # Set the first child's previous_element and previous_sibling
             # to elements within the new parent
             first_child = to_append[0]
-            if new_parents_last_descendant:
+            if new_parents_last_descendant is not None:
                 first_child.previous_element = new_parents_last_descendant
             else:
                 first_child.previous_element = new_parent_element
             first_child.previous_sibling = new_parents_last_child
-            if new_parents_last_descendant:
+            if new_parents_last_descendant is not None:
                 new_parents_last_descendant.next_element = first_child
             else:
                 new_parent_element.next_element = first_child
-            if new_parents_last_child:
+            if new_parents_last_child is not None:
                 new_parents_last_child.next_sibling = first_child
 
             # Find the very last element being moved. It is now the
@@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
             last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
 
             last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
-            if new_parents_last_descendant_next_element:
+            if new_parents_last_descendant_next_element is not None:
                 # TODO: This code has no test coverage and I'm not sure
                 # how to get html5lib to go through this path, but it's
                 # just the other side of the previous line.
diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py
index 67890b3a3..ea549c356 100644
--- a/libs/bs4/builder/_htmlparser.py
+++ b/libs/bs4/builder/_htmlparser.py
@@ -1,17 +1,18 @@
+# encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""
 
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
 
 __all__ = [
     'HTMLParserTreeBuilder',
     ]
 
-from HTMLParser import HTMLParser
+from html.parser import HTMLParser
 
 try:
-    from HTMLParser import HTMLParseError
-except ImportError, e:
+    from html.parser import HTMLParseError
+except ImportError as e:
     # HTMLParseError is removed in Python 3.5. Since it can never be
     # thrown in 3.5, we can just define our own class as a placeholder.
     class HTMLParseError(Exception):
@@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
         # order. It's a list of closing tags we've already handled and
         # will ignore, assuming they ever show up.
         self.already_closed_empty_element = []
-    
+
+    def error(self, msg):
+        """In Python 3, HTMLParser subclasses must implement error(), although this
+        requirement doesn't appear to be documented.
+
+        In Python 2, HTMLParser implements error() as raising an exception.
+
+        In any event, this method is called only on very strange markup and our best strategy
+        is to pretend it didn't happen and keep going.
+        """
+        warnings.warn(msg)
+        
     def handle_startendtag(self, name, attrs):
         # This is only called when the markup looks like
         # <tag/>.
@@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
         else:
             real_name = int(name)
 
-        try:
-            data = unichr(real_name)
-        except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
-
+        data = None
+        if real_name < 256:
+            # HTML numeric entities are supposed to reference Unicode
+            # code points, but sometimes they reference code points in
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
+            # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
+            # code tries to detect this situation and compensate.
+            for encoding in (self.soup.original_encoding, 'windows-1252'):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError as e:
+                    pass
+        if not data:
+            try:
+                data = chr(real_name)
+            except (ValueError, OverflowError) as e:
+                pass
+        data = data or "\N{REPLACEMENT CHARACTER}"
         self.handle_data(data)
 
     def handle_entityref(self, name):
@@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
         if character is not None:
             data = character
         else:
-            data = "&%s;" % name
+            # If this were XML, it would be ambiguous whether "&foo"
+            # was an character entity reference with a missing
+            # semicolon or the literal string "&foo". Since this is
+            # HTML, we have a complete list of all character entity references,
+            # and this one wasn't found, so assume it's the literal string "&foo".
+            data = "&%s" % name
         self.handle_data(data)
 
     def handle_comment(self, data):
@@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     NAME = HTMLPARSER
     features = [NAME, HTML, STRICT]
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
+        parser_args = parser_args or []
+        parser_kwargs = parser_kwargs or {}
         if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
-            kwargs['strict'] = False
+            parser_kwargs['strict'] = False
         if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
-            kwargs['convert_charrefs'] = False
-        self.parser_args = (args, kwargs)
+            parser_kwargs['convert_charrefs'] = False
+        self.parser_args = (parser_args, parser_kwargs)
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
@@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         declared within markup, whether any characters had to be
         replaced with REPLACEMENT CHARACTER).
         """
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             yield (markup, None, None, False)
             return
 
@@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         parser.soup = self.soup
         try:
             parser.feed(markup)
-        except HTMLParseError, e:
+            parser.close()
+        except HTMLParseError as e:
             warnings.warn(RuntimeWarning(
                 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
             raise e
diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py
index d2ca2872d..a490e2301 100644
--- a/libs/bs4/builder/_lxml.py
+++ b/libs/bs4/builder/_lxml.py
@@ -1,13 +1,18 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
 __all__ = [
     'LXMLTreeBuilderForXML',
     'LXMLTreeBuilder',
     ]
 
+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError as e:
+    from collections import Callable
+
 from io import BytesIO
-from StringIO import StringIO
-import collections
+from io import StringIO
 from lxml import etree
 from bs4.element import (
     Comment,
@@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector
 
 LXML = 'lxml'
 
+def _invert(d):
+    "Invert a dictionary."
+    return dict((v,k) for k, v in list(d.items()))
+
 class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
@@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
     # This namespace mapping is specified in the XML Namespace
     # standard.
-    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+
+    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+
+    def initialize_soup(self, soup):
+        """Let the BeautifulSoup object know about the standard namespace
+        mapping.
+        """
+        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+        self._register_namespaces(self.DEFAULT_NSMAPS)
+
+    def _register_namespaces(self, mapping):
+        """Let the BeautifulSoup object know about namespaces encountered
+        while parsing the document.
+
+        This might be useful later on when creating CSS selectors.
+        """
+        for key, value in list(mapping.items()):
+            if key and key not in self.soup._namespaces:
+                # Let the BeautifulSoup object know about a new namespace.
+                # If there are multiple namespaces defined with the same
+                # prefix, the first one in the document takes precedence.
+                self.soup._namespaces[key] = value
 
     def default_parser(self, encoding):
         # This can either return a parser object or a class, which
@@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # Use the default parser.
         parser = self.default_parser(encoding)
 
-        if isinstance(parser, collections.Callable):
+        if isinstance(parser, Callable):
             # Instantiate the parser with default arguments
             parser = parser(target=self, strip_cdata=False, encoding=encoding)
         return parser
 
-    def __init__(self, parser=None, empty_element_tags=None):
+    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
         # TODO: Issue a warning if parser is present but not a
         # callable, since that means there's no way to create new
         # parsers for different encodings.
@@ -71,8 +102,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if empty_element_tags is not None:
             self.empty_element_tags = set(empty_element_tags)
         self.soup = None
-        self.nsmaps = [self.DEFAULT_NSMAPS]
-
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
+        
     def _getNsTag(self, tag):
         # Split the namespace URL out of a fully-qualified lxml tag
         # name. Copied from lxml's src/lxml/sax.py.
@@ -101,12 +133,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         else:
             self.processing_instruction_class = XMLProcessingInstruction
 
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
             yield markup, None, document_declared_encoding, False
 
-        if isinstance(markup, unicode):
+        if isinstance(markup, str):
             # No, apparently not. Convert the Unicode to UTF-8 and
             # tell lxml to parse it as UTF-8.
             yield (markup.encode("utf8"), "utf8",
@@ -121,7 +153,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def feed(self, markup):
         if isinstance(markup, bytes):
             markup = BytesIO(markup)
-        elif isinstance(markup, unicode):
+        elif isinstance(markup, str):
             markup = StringIO(markup)
 
         # Call feed() at least once, even if the markup is empty,
@@ -136,30 +168,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                 if len(data) != 0:
                     self.parser.feed(data)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(str(e))
 
     def close(self):
-        self.nsmaps = [self.DEFAULT_NSMAPS]
+        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
 
     def start(self, name, attrs, nsmap={}):
         # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
         attrs = dict(attrs)
         nsprefix = None
         # Invert each namespace map as it comes in.
-        if len(self.nsmaps) > 1:
-            # There are no new namespaces for this tag, but
-            # non-default namespaces are in play, so we need a
-            # separate tag stack to know when they end.
-            self.nsmaps.append(None)
+        if len(nsmap) == 0 and len(self.nsmaps) > 1:
+                # There are no new namespaces for this tag, but
+                # non-default namespaces are in play, so we need a
+                # separate tag stack to know when they end.
+                self.nsmaps.append(None)
         elif len(nsmap) > 0:
             # A new namespace mapping has come into play.
-            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
-            self.nsmaps.append(inverted_nsmap)
+
+            # First, Let the BeautifulSoup object know about it.
+            self._register_namespaces(nsmap)
+
+            # Then, add it to our running list of inverted namespace
+            # mappings.
+            self.nsmaps.append(_invert(nsmap))
+
             # Also treat the namespace mapping as a set of attributes on the
             # tag, so we can recreate it later.
             attrs = attrs.copy()
-            for prefix, namespace in nsmap.items():
+            for prefix, namespace in list(nsmap.items()):
                 attribute = NamespacedAttribute(
                     "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                 attrs[attribute] = namespace
@@ -168,7 +206,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # from lxml with namespaces attached to their names, and
         # turn then into NamespacedAttribute objects.
         new_attrs = {}
-        for attr, value in attrs.items():
+        for attr, value in list(attrs.items()):
             namespace, attr = self._getNsTag(attr)
             if namespace is None:
                 new_attrs[attr] = value
@@ -228,7 +266,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
 
 
 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@@ -249,10 +287,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
             self.parser = self.parser_for(encoding)
             self.parser.feed(markup)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(str(e))
 
 
     def test_fragment_to_document(self, fragment):
         """See `TreeBuilder`."""
-        return u'<html><body>%s</body></html>' % fragment
+        return '<html><body>%s</body></html>' % fragment
author	Louis Vézina <[email protected]>	2019-09-13 15:12:26 -0400
committer	Louis Vézina <[email protected]>	2019-09-13 15:12:26 -0400
commit	645952c61aba3cccb5ca919be966a7ba02d853fa (patch)
tree	005362d11c260b4293b427ed8ba9d94e8b478ec3 /libs/bs4/builder
parent	4e7e3a39d26cb3771876b43c82a747f6c6526dda (diff)
download	bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.tar.gz bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.zip