diff options
author | Louis Vézina <[email protected]> | 2019-09-24 06:23:11 -0400 |
---|---|---|
committer | Louis Vézina <[email protected]> | 2019-09-24 06:23:11 -0400 |
commit | 8227df459a8a9286a4b5e8829b95abad337fefe8 (patch) | |
tree | d3a5cfe429e76149886e22f05b7cf363d2e98643 /libs/bs4/builder | |
parent | 2b2fd4e8d7d69db132be57a81ab3703147f9c5a5 (diff) | |
download | bazarr-8227df459a8a9286a4b5e8829b95abad337fefe8.tar.gz bazarr-8227df459a8a9286a4b5e8829b95abad337fefe8.zip |
WIP
Diffstat (limited to 'libs/bs4/builder')
-rw-r--r-- | libs/bs4/builder/__init__.py | 4 | ||||
-rw-r--r-- | libs/bs4/builder/_html5lib.py | 16 | ||||
-rw-r--r-- | libs/bs4/builder/_htmlparser.py | 15 | ||||
-rw-r--r-- | libs/bs4/builder/_lxml.py | 12 |
4 files changed, 28 insertions, 19 deletions
diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py index fdb3362fc..4b44ef05f 100644 --- a/libs/bs4/builder/__init__.py +++ b/libs/bs4/builder/__init__.py @@ -1,6 +1,7 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import from collections import defaultdict import itertools import sys @@ -10,6 +11,7 @@ from bs4.element import ( HTMLAwareEntitySubstitution, whitespace_re ) +import six __all__ = [ 'HTMLTreeBuilder', @@ -166,7 +168,7 @@ class TreeBuilder(object): # value is a whitespace-separated list of # values. Split it into a list. value = attrs[attr] - if isinstance(value, basestring): + if isinstance(value, six.string_types): values = whitespace_re.split(value) else: # html5lib sometimes calls setAttributes twice diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py index 5f5489358..cf6063b83 100644 --- a/libs/bs4/builder/_html5lib.py +++ b/libs/bs4/builder/_html5lib.py @@ -1,6 +1,8 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import +import six __all__ = [ 'HTML5TreeBuilder', ] @@ -33,7 +35,7 @@ try: # Pre-0.99999999 from html5lib.treebuilders import _base as treebuilder_base new_html5lib = False -except ImportError, e: +except ImportError as e: # 0.99999999 and up from html5lib.treebuilders import base as treebuilder_base new_html5lib = True @@ -64,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): parser = html5lib.HTMLParser(tree=self.create_treebuilder) extra_kwargs = dict() - if not isinstance(markup, unicode): + if not isinstance(markup, six.text_type): if new_html5lib: extra_kwargs['override_encoding'] = self.user_specified_encoding else: @@ -72,13 +74,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): + if isinstance(markup, six.text_type): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: original_encoding = parser.tokenizer.stream.charEncoding[0] - if not isinstance(original_encoding, basestring): + if not isinstance(original_encoding, six.string_types): # In 0.99999999 and up, the encoding is an html5lib # Encoding object. We want to use a string for compatibility # with other tree builders. @@ -229,7 +231,7 @@ class Element(treebuilder_base.Node): def appendChild(self, node): string_child = child = None - if isinstance(node, basestring): + if isinstance(node, six.string_types): # Some other piece of code decided to pass in a string # instead of creating a TextElement object to contain the # string. @@ -246,7 +248,7 @@ class Element(treebuilder_base.Node): child = node.element node.parent = self - if not isinstance(child, basestring) and child.parent is not None: + if not isinstance(child, six.string_types) and child.parent is not None: node.element.extract() if (string_child and self.element.contents @@ -259,7 +261,7 @@ class Element(treebuilder_base.Node): old_element.replace_with(new_element) self.soup._most_recent_element = new_element else: - if isinstance(node, basestring): + if isinstance(node, six.string_types): # Create a brand new NavigableString from this string. child = self.soup.new_string(node) diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py index 67890b3a3..624028e3c 100644 --- a/libs/bs4/builder/_htmlparser.py +++ b/libs/bs4/builder/_htmlparser.py @@ -3,15 +3,18 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import +from six import unichr +import six __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import HTMLParser +from six.moves.html_parser import HTMLParser try: - from HTMLParser import HTMLParseError -except ImportError, e: + from six.moves.html_parser import HTMLParseError +except ImportError as e: # HTMLParseError is removed in Python 3.5. Since it can never be # thrown in 3.5, we can just define our own class as a placeholder. class HTMLParseError(Exception): @@ -131,7 +134,7 @@ class BeautifulSoupHTMLParser(HTMLParser): try: data = unichr(real_name) - except (ValueError, OverflowError), e: + except (ValueError, OverflowError) as e: data = u"\N{REPLACEMENT CHARACTER}" self.handle_data(data) @@ -196,7 +199,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ - if isinstance(markup, unicode): + if isinstance(markup, six.text_type): yield (markup, None, None, False) return @@ -213,7 +216,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) - except HTMLParseError, e: + except HTMLParseError as e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py index d2ca2872d..73f6e2b34 100644 --- a/libs/bs4/builder/_lxml.py +++ b/libs/bs4/builder/_lxml.py @@ -1,5 +1,7 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import +import six __all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', @@ -101,12 +103,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): else: self.processing_instruction_class = XMLProcessingInstruction - if isinstance(markup, unicode): + if isinstance(markup, six.text_type): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False - if isinstance(markup, unicode): + if isinstance(markup, six.text_type): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", @@ -121,7 +123,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) - elif isinstance(markup, unicode): + elif isinstance(markup, six.text_type): markup = StringIO(markup) # Call feed() at least once, even if the markup is empty, @@ -136,7 +138,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): if len(data) != 0: self.parser.feed(data) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def close(self): @@ -249,7 +251,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): self.parser = self.parser_for(encoding) self.parser.feed(markup) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) |