diff options
author | Louis Vézina <[email protected]> | 2018-10-20 22:53:12 -0400 |
---|---|---|
committer | Louis Vézina <[email protected]> | 2018-10-20 22:53:12 -0400 |
commit | d61bdfcd4fe2d4423d784cd1370acf630719e90f (patch) | |
tree | 0c97cbd0e9d66dd3e300a6427171827c28eb3f85 /libs/bs4 | |
parent | b9a987b57d361c87cc69bd18fdc39b42ded97199 (diff) | |
download | bazarr-d61bdfcd4fe2d4423d784cd1370acf630719e90f.tar.gz bazarr-d61bdfcd4fe2d4423d784cd1370acf630719e90f.zip |
Initial commit.
Diffstat (limited to 'libs/bs4')
-rw-r--r-- | libs/bs4/__init__.py | 87 | ||||
-rw-r--r-- | libs/bs4/builder/__init__.py | 16 | ||||
-rw-r--r-- | libs/bs4/builder/_htmlparser.py | 47 | ||||
-rw-r--r-- | libs/bs4/builder/_lxml.py | 18 | ||||
-rw-r--r-- | libs/bs4/dammit.py | 6 | ||||
-rw-r--r-- | libs/bs4/diagnose.py | 20 | ||||
-rw-r--r-- | libs/bs4/element.py | 155 | ||||
-rw-r--r-- | libs/bs4/testing.py | 42 | ||||
-rw-r--r-- | libs/bs4/tests/test_htmlparser.py | 15 | ||||
-rw-r--r-- | libs/bs4/tests/test_lxml.py | 6 | ||||
-rw-r--r-- | libs/bs4/tests/test_tree.py | 62 |
11 files changed, 96 insertions, 378 deletions
diff --git a/libs/bs4/__init__.py b/libs/bs4/__init__.py index 470177fdf..7a80452f7 100644 --- a/libs/bs4/__init__.py +++ b/libs/bs4/__init__.py @@ -21,15 +21,14 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ # found in the LICENSE file. __author__ = "Leonard Richardson ([email protected])" -__version__ = "4.6.3" -__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson" +__version__ = "4.6.0" +__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] import os import re -import sys import traceback import warnings @@ -83,46 +82,14 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, **kwargs): - """Constructor. - - :param markup: A string or a file-like object representing - markup to be parsed. - - :param features: Desirable features of the parser to be used. This - may be the name of a specific parser ("lxml", "lxml-xml", - "html.parser", or "html5lib") or it may be the type of markup - to be used ("html", "html5", "xml"). It's recommended that you - name a specific parser, so that Beautiful Soup gives you the - same results across platforms and virtual environments. - - :param builder: A specific TreeBuilder to use instead of looking one - up based on `features`. You shouldn't need to use this. - - :param parse_only: A SoupStrainer. Only parts of the document - matching the SoupStrainer will be considered. This is useful - when parsing part of a document that would otherwise be too - large to fit into memory. - - :param from_encoding: A string indicating the encoding of the - document to be parsed. Pass this in if Beautiful Soup is - guessing wrongly about the document's encoding. - - :param exclude_encodings: A list of strings indicating - encodings known to be wrong. Pass this in if you don't know - the document's encoding but you know Beautiful Soup's guess is - wrong. - - :param kwargs: For backwards compatibility purposes, the - constructor accepts certain keyword arguments used in - Beautiful Soup 3. None of these arguments do anything in - Beautiful Soup 4 and there's no need to actually pass keyword - arguments into the constructor. - """ + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser.""" if 'convertEntities' in kwargs: warnings.warn( @@ -204,35 +171,14 @@ class BeautifulSoup(Tag): else: markup_type = "HTML" - # This code adapted from warnings.py so that we get the same line - # of code as our warnings.warn() call gets, even if the answer is wrong - # (as it may be in a multithreading situation). - caller = None - try: - caller = sys._getframe(1) - except ValueError: - pass - if caller: - globals = caller.f_globals - line_number = caller.f_lineno - else: - globals = sys.__dict__ - line_number= 1 - filename = globals.get('__file__') - if filename: - fnl = filename.lower() - if fnl.endswith((".pyc", ".pyo")): - filename = filename[:-1] - if filename: - # If there is no filename at all, the user is most likely in a REPL, - # and the warning is not necessary. - values = dict( - filename=filename, - line_number=line_number, - parser=builder.NAME, - markup_type=markup_type - ) - warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) + caller = traceback.extract_stack()[0] + filename = caller[0] + line_number = caller[1] + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type)) self.builder = builder self.is_xml = builder.is_xml @@ -356,10 +302,9 @@ class BeautifulSoup(Tag): self.preserve_whitespace_tag_stack = [] self.pushTag(self) - def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs): + def new_tag(self, name, namespace=None, nsprefix=None, **attrs): """Create a new tag associated with this soup.""" - kwattrs.update(attrs) - return Tag(None, self.builder, name, namespace, nsprefix, kwattrs) + return Tag(None, self.builder, name, namespace, nsprefix, attrs) def new_string(self, s, subclass=NavigableString): """Create a new NavigableString associated with this soup.""" diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py index c9e3f3d31..fdb3362fc 100644 --- a/libs/bs4/builder/__init__.py +++ b/libs/bs4/builder/__init__.py @@ -93,7 +93,7 @@ class TreeBuilder(object): preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. - + # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. cdata_list_attributes = {} @@ -125,7 +125,7 @@ class TreeBuilder(object): if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags - + def feed(self, markup): raise NotImplementedError() @@ -235,17 +235,11 @@ class HTMLTreeBuilder(TreeBuilder): empty_element_tags = set([ # These are from HTML5. 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', - - # These are from earlier versions of HTML and are removed in HTML5. - 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' + + # These are from HTML4, removed in HTML5. + 'spacer', 'frame' ]) - # The HTML standard defines these as block-level elements. Beautiful - # Soup does not treat these elements differently from other elements, - # but it may do so eventually, and this information is available if - # you need to use it. - block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) - # The HTML standard defines these attributes as containing a # space-separated list of values, not a single value. That is, # class="foo bar" means that the 'class' attribute has two values, diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py index ee6c685df..67890b3a3 100644 --- a/libs/bs4/builder/_htmlparser.py +++ b/libs/bs4/builder/_htmlparser.py @@ -1,4 +1,3 @@ -# encoding: utf-8 """Use the HTMLParser library to parse HTML files that aren't too bad.""" # Use of this source code is governed by a BSD-style license that can be @@ -65,18 +64,7 @@ class BeautifulSoupHTMLParser(HTMLParser): # order. It's a list of closing tags we've already handled and # will ignore, assuming they ever show up. self.already_closed_empty_element = [] - - def error(self, msg): - """In Python 3, HTMLParser subclasses must implement error(), although this - requirement doesn't appear to be documented. - - In Python 2, HTMLParser implements error() as raising an exception. - - In any event, this method is called only on very strange markup and our best strategy - is to pretend it didn't happen and keep going. - """ - warnings.warn(msg) - + def handle_startendtag(self, name, attrs): # This is only called when the markup looks like # <tag/>. @@ -141,26 +129,11 @@ class BeautifulSoupHTMLParser(HTMLParser): else: real_name = int(name) - data = None - if real_name < 256: - # HTML numeric entities are supposed to reference Unicode - # code points, but sometimes they reference code points in - # some other encoding (ahem, Windows-1252). E.g. “ - # instead of É for LEFT DOUBLE QUOTATION MARK. This - # code tries to detect this situation and compensate. - for encoding in (self.soup.original_encoding, 'windows-1252'): - if not encoding: - continue - try: - data = bytearray([real_name]).decode(encoding) - except UnicodeDecodeError, e: - pass - if not data: - try: - data = unichr(real_name) - except (ValueError, OverflowError), e: - pass - data = data or u"\N{REPLACEMENT CHARACTER}" + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + data = u"\N{REPLACEMENT CHARACTER}" + self.handle_data(data) def handle_entityref(self, name): @@ -168,12 +141,7 @@ class BeautifulSoupHTMLParser(HTMLParser): if character is not None: data = character else: - # If this were XML, it would be ambiguous whether "&foo" - # was an character entity reference with a missing - # semicolon or the literal string "&foo". Since this is - # HTML, we have a complete list of all character entity references, - # and this one wasn't found, so assume it's the literal string "&foo". - data = "&%s" % name + data = "&%s;" % name self.handle_data(data) def handle_comment(self, data): @@ -245,7 +213,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) - parser.close() except HTMLParseError, e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py index 4a0f7de4b..d2ca2872d 100644 --- a/libs/bs4/builder/_lxml.py +++ b/libs/bs4/builder/_lxml.py @@ -5,13 +5,9 @@ __all__ = [ 'LXMLTreeBuilder', ] -try: - from collections.abc import Callable # Python 3.6 -except ImportError , e: - from collections import Callable - from io import BytesIO from StringIO import StringIO +import collections from lxml import etree from bs4.element import ( Comment, @@ -62,7 +58,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): # Use the default parser. parser = self.default_parser(encoding) - if isinstance(parser, Callable): + if isinstance(parser, collections.Callable): # Instantiate the parser with default arguments parser = parser(target=self, strip_cdata=False, encoding=encoding) return parser @@ -151,11 +147,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): attrs = dict(attrs) nsprefix = None # Invert each namespace map as it comes in. - if len(nsmap) == 0 and len(self.nsmaps) > 1: - # There are no new namespaces for this tag, but - # non-default namespaces are in play, so we need a - # separate tag stack to know when they end. - self.nsmaps.append(None) + if len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. + self.nsmaps.append(None) elif len(nsmap) > 0: # A new namespace mapping has come into play. inverted_nsmap = dict((value, key) for key, value in nsmap.items()) diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py index be46b3942..7965565f5 100644 --- a/libs/bs4/dammit.py +++ b/libs/bs4/dammit.py @@ -46,9 +46,9 @@ except ImportError: pass xml_encoding_re = re.compile( - '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I) + '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) html_meta_re = re.compile( - '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) class EntitySubstitution(object): @@ -82,7 +82,7 @@ class EntitySubstitution(object): } BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" ")") AMPERSAND_OR_BRACKET = re.compile("([<>&])") diff --git a/libs/bs4/diagnose.py b/libs/bs4/diagnose.py index 7a28c09a7..8768332f5 100644 --- a/libs/bs4/diagnose.py +++ b/libs/bs4/diagnose.py @@ -37,7 +37,7 @@ def diagnose(data): name) if 'lxml' in basic_parsers: - basic_parsers.append("lxml-xml") + basic_parsers.append(["lxml", "xml"]) try: from lxml import etree print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) @@ -56,27 +56,21 @@ def diagnose(data): if hasattr(data, 'read'): data = data.read() + elif os.path.exists(data): + print '"%s" looks like a filename. Reading data from the file.' % data + with open(data) as fp: + data = fp.read() elif data.startswith("http:") or data.startswith("https:"): print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." return - else: - try: - if os.path.exists(data): - print '"%s" looks like a filename. Reading data from the file.' % data - with open(data) as fp: - data = fp.read() - except ValueError: - # This can happen on some platforms when the 'filename' is - # too long. Assume it's data and not a filename. - pass - print + print for parser in basic_parsers: print "Trying to parse your markup with %s" % parser success = False try: - soup = BeautifulSoup(data, features=parser) + soup = BeautifulSoup(data, parser) success = True except Exception, e: print "%s could not parse the markup." % parser diff --git a/libs/bs4/element.py b/libs/bs4/element.py index 886eb91f2..9ef75f814 100644 --- a/libs/bs4/element.py +++ b/libs/bs4/element.py @@ -2,10 +2,7 @@ # found in the LICENSE file. __license__ = "MIT" -try: - from collections.abc import Callable # Python 3.6 -except ImportError , e: - from collections import Callable +import collections import re import shlex import sys @@ -15,7 +12,7 @@ from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) -whitespace_re = re.compile(r"\s+") +whitespace_re = re.compile("\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @@ -72,7 +69,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): The value of the 'content' attribute will be one of these objects. """ - CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) @@ -126,41 +123,6 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): return cls._substitute_if_appropriate( ns, EntitySubstitution.substitute_xml) -class Formatter(object): - """Contains information about how to format a parse tree.""" - - # By default, represent void elements as <tag/> rather than <tag> - void_element_close_prefix = '/' - - def substitute_entities(self, *args, **kwargs): - """Transform certain characters into named entities.""" - raise NotImplementedError() - -class HTMLFormatter(Formatter): - """The default HTML formatter.""" - def substitute(self, *args, **kwargs): - return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) - -class MinimalHTMLFormatter(Formatter): - """A minimal HTML formatter.""" - def substitute(self, *args, **kwargs): - return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs) - -class HTML5Formatter(HTMLFormatter): - """An HTML formatter that omits the slash in a void tag.""" - void_element_close_prefix = None - -class XMLFormatter(Formatter): - """Substitute only the essential XML entities.""" - def substitute(self, *args, **kwargs): - return EntitySubstitution.substitute_xml(*args, **kwargs) - -class HTMLXMLFormatter(Formatter): - """Format XML using HTML rules.""" - def substitute(self, *args, **kwargs): - return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) - - class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -169,49 +131,40 @@ class PageElement(object): # to methods like encode() and prettify(): # # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "html5" - The same as "html", but empty void tags are represented as - # <tag> rather than <tag/> - # "minimal" - Bare ampersands and angle brackets are converted to + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: & < > # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's # faster than "minimal". - # A callable function - it will be called on every string that needs to undergo entity substitution. - # A Formatter instance - Formatter.substitute(string) will be called on every string that + # A function - This function will be called on every string that # needs to undergo entity substitution. # - # In an HTML document, the default "html", "html5", and "minimal" - # functions will leave the contents of <script> and <style> tags - # alone. For an XML document, all tags will be given the same - # treatment. + # In an HTML document, the default "html" and "minimal" functions + # will leave the contents of <script> and <style> tags alone. For + # an XML document, all tags will be given the same treatment. HTML_FORMATTERS = { - "html" : HTMLFormatter(), - "html5" : HTML5Formatter(), - "minimal" : MinimalHTMLFormatter(), + "html" : HTMLAwareEntitySubstitution.substitute_html, + "minimal" : HTMLAwareEntitySubstitution.substitute_xml, None : None } XML_FORMATTERS = { - "html" : HTMLXMLFormatter(), - "minimal" : XMLFormatter(), + "html" : EntitySubstitution.substitute_html, + "minimal" : EntitySubstitution.substitute_xml, None : None } def format_string(self, s, formatter='minimal'): """Format the given string using the given formatter.""" - if isinstance(formatter, basestring): + if not callable(formatter): formatter = self._formatter_for_name(formatter) if formatter is None: output = s else: - if callable(formatter): - # Backwards compatibility -- you used to pass in a formatting method. - output = formatter(s) - else: - output = formatter.substitute(s) + output = formatter(s) return output @property @@ -241,9 +194,11 @@ class PageElement(object): def _formatter_for_name(self, name): "Look up a formatter function based on its name and the tree." if self._is_xml: - return self.XML_FORMATTERS.get(name, XMLFormatter()) + return self.XML_FORMATTERS.get( + name, EntitySubstitution.substitute_xml) else: - return self.HTML_FORMATTERS.get(name, HTMLFormatter()) + return self.HTML_FORMATTERS.get( + name, HTMLAwareEntitySubstitution.substitute_xml) def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): @@ -361,14 +316,6 @@ class PageElement(object): and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) - from bs4 import BeautifulSoup - if isinstance(new_child, BeautifulSoup): - # We don't want to end up with a situation where one BeautifulSoup - # object contains another. Insert the children one at a time. - for subchild in list(new_child.contents): - self.insert(position, subchild) - position += 1 - return position = min(position, len(self.contents)) if hasattr(new_child, 'parent') and new_child.parent is not None: # We're 'inserting' an element that's already one @@ -589,21 +536,14 @@ class PageElement(object): elif isinstance(name, basestring): # Optimization to find all tags with a given name. if name.count(':') == 1: - # This is a name with a prefix. If this is a namespace-aware document, - # we need to match the local name against tag.name. If not, - # we need to match the fully-qualified name against tag.name. - prefix, local_name = name.split(':', 1) + # This is a name with a prefix. + prefix, name = name.split(':', 1) else: prefix = None - local_name = name result = (element for element in generator if isinstance(element, Tag) - and ( - element.name == name - ) or ( - element.name == local_name - and (prefix is None or element.prefix == prefix) - ) + and element.name == name + and (prefix is None or element.prefix == prefix) ) return ResultSet(strainer, result) results = ResultSet(strainer) @@ -922,7 +862,7 @@ class Tag(PageElement): self.can_be_empty_element = builder.can_be_empty_element(name) else: self.can_be_empty_element = False - + parserClass = _alias("parser_class") # BS3 def __copy__(self): @@ -1106,10 +1046,8 @@ class Tag(PageElement): # BS3: soup.aTag -> "soup.find("a") tag_name = tag[:-3] warnings.warn( - '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( - name=tag_name - ) - ) + '.%sTag is deprecated, use .find("%s") instead.' % ( + tag_name, tag_name)) return self.find(tag_name) # We special case contents to avoid recursion. elif not tag.startswith("__") and not tag == "contents": @@ -1191,10 +1129,11 @@ class Tag(PageElement): encoding. """ - # First off, turn a string formatter into a Formatter object. This + # First off, turn a string formatter into a function. This # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter) and not callable(formatter): + if not callable(formatter): formatter = self._formatter_for_name(formatter) + attrs = [] if self.attrs: for key, val in sorted(self.attrs.items()): @@ -1223,9 +1162,7 @@ class Tag(PageElement): prefix = self.prefix + ":" if self.is_empty_element: - close = '' - if isinstance(formatter, Formatter): - close = formatter.void_element_close_prefix or close + close = '/' else: closeTag = '</%s%s>' % (prefix, self.name) @@ -1296,9 +1233,9 @@ class Tag(PageElement): :param formatter: The output formatter responsible for converting entities to Unicode characters. """ - # First off, turn a string formatter into a Formatter object. This + # First off, turn a string formatter into a function. This # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter) and not callable(formatter): + if not callable(formatter): formatter = self._formatter_for_name(formatter) pretty_print = (indent_level is not None) @@ -1411,29 +1348,15 @@ class Tag(PageElement): # Handle grouping selectors if ',' exists, ie: p,a if ',' in selector: context = [] - selectors = [x.strip() for x in selector.split(",")] - - # If a selector is mentioned multiple times we don't want - # to use it more than once. - used_selectors = set() - - # We also don't want to select the same element more than once, - # if it's matched by multiple selectors. - selected_object_ids = set() - for partial_selector in selectors: + for partial_selector in selector.split(','): + partial_selector = partial_selector.strip() if partial_selector == '': raise ValueError('Invalid group selection syntax: %s' % selector) - if partial_selector in used_selectors: - continue - used_selectors.add(partial_selector) candidates = self.select(partial_selector, limit=limit) for candidate in candidates: - # This lets us distinguish between distinct tags that - # represent the same markup. - object_id = id(candidate) - if object_id not in selected_object_ids: + if candidate not in context: context.append(candidate) - selected_object_ids.add(object_id) + if limit and len(context) >= limit: break return context @@ -1495,7 +1418,7 @@ class Tag(PageElement): if tag_name == '': raise ValueError( "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) found = [] if pseudo_attributes is None: pseudo_type = pseudo @@ -1729,7 +1652,7 @@ class SoupStrainer(object): markup = markup_name markup_attrs = markup call_function_with_tag_data = ( - isinstance(self.name, Callable) + isinstance(self.name, collections.Callable) and not isinstance(markup_name, Tag)) if ((not self.name) @@ -1809,7 +1732,7 @@ class SoupStrainer(object): # True matches any non-None value. return markup is not None - if isinstance(match_against, Callable): + if isinstance(match_against, collections.Callable): return match_against(markup) # Custom callables take the tag as an argument, but all diff --git a/libs/bs4/testing.py b/libs/bs4/testing.py index 745a9c4af..6ba2506c4 100644 --- a/libs/bs4/testing.py +++ b/libs/bs4/testing.py @@ -1,4 +1,3 @@ -# encoding: utf-8 """Helper classes for tests.""" # Use of this source code is governed by a BSD-style license that can be @@ -151,14 +150,6 @@ class HTMLTreeBuilderSmokeTest(object): soup.encode("utf-8").replace(b"\n", b""), markup.replace(b"\n", b"")) - def test_namespaced_html(self): - """When a namespaced XML document is parsed as HTML it should - be treated as HTML with weird tag names. - """ - markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>""" - soup = self.soup(markup) - self.assertEqual(2, len(soup.find_all("ns1:foo"))) - def test_processing_instruction(self): # We test both Unicode and bytestring to verify that # process_markup correctly sets processing_instruction_class @@ -320,26 +311,6 @@ Hello, world! def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') - def test_strings_resembling_character_entity_references(self): - # "&T" and "&p" look like incomplete character entities, but they are - # not. - self.assertSoupEquals( - u"<p>• AT&T is in the s&p 500</p>", - u"<p>\u2022 AT&T is in the s&p 500</p>" - ) - - def test_entities_in_foreign_document_encoding(self): - # “ and ” are invalid numeric entities referencing - # Windows-1252 characters. - references a character common - # to Windows-1252 and Unicode, and ☃ references a - # character only found in Unicode. - # - # All of these entities should be converted to Unicode - # characters. - markup = "<p>“Hello” -☃</p>" - soup = self.soup(markup) - self.assertEquals(u"“Hello” -☃", soup.p.string) - def test_entities_in_attributes_converted_to_unicode(self): expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' self.assertSoupEquals('<p id="piñata"></p>', expect) @@ -363,7 +334,7 @@ Hello, world! self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) - + def test_multipart_strings(self): "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") @@ -653,17 +624,6 @@ class XMLTreeBuilderSmokeTest(object): self.assertEqual( soup.encode("utf-8"), markup) - def test_nested_namespaces(self): - doc = b"""<?xml version="1.0" encoding="utf-8"?> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> -<parent xmlns="http://ns1/"> -<child xmlns="http://ns2/" xmlns:ns3="http://ns3/"> -<grandchild ns3:attr="value" xmlns="http://ns4/"/> -</child> -</parent>""" - soup = self.soup(doc) - self.assertEqual(doc, soup.encode()) - def test_formatter_processes_script_tag_for_xml_documents(self): doc = """ <script type="text/javascript"> diff --git a/libs/bs4/tests/test_htmlparser.py b/libs/bs4/tests/test_htmlparser.py index 0381c7d3a..d5cf0253f 100644 --- a/libs/bs4/tests/test_htmlparser.py +++ b/libs/bs4/tests/test_htmlparser.py @@ -5,7 +5,6 @@ from pdb import set_trace import pickle from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from bs4.builder import HTMLParserTreeBuilder -from bs4.builder._htmlparser import BeautifulSoupHTMLParser class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): @@ -33,17 +32,3 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): def test_redundant_empty_element_closing_tags(self): self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>") self.assertSoupEquals('</br></br></br>', "") - - def test_empty_element(self): - # This verifies that any buffered data present when the parser - # finishes working is handled. - self.assertSoupEquals("foo &# bar", "foo &# bar") - - -class TestHTMLParserSubclass(SoupTest): - def test_error(self): - """Verify that our HTMLParser subclass implements error() in a way - that doesn't cause a crash. - """ - parser = BeautifulSoupHTMLParser() - parser.error("don't crash") diff --git a/libs/bs4/tests/test_lxml.py b/libs/bs4/tests/test_lxml.py index 8a8f69047..a05870b91 100644 --- a/libs/bs4/tests/test_lxml.py +++ b/libs/bs4/tests/test_lxml.py @@ -46,12 +46,6 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "<p>foo�bar</p>", "<p>foobar</p>") - def test_entities_in_foreign_document_encoding(self): - # We can't implement this case correctly because by the time we - # hear about markup like "“", it's been (incorrectly) converted into - # a string like u'\x93' - pass - # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # test if an old version of lxml is installed. diff --git a/libs/bs4/tests/test_tree.py b/libs/bs4/tests/test_tree.py index 883cd8abb..c0e7c4080 100644 --- a/libs/bs4/tests/test_tree.py +++ b/libs/bs4/tests/test_tree.py @@ -605,7 +605,7 @@ class SiblingTest(TreeTest): </html>''' # All that whitespace looks good but makes the tests more # difficult. Get rid of it. - markup = re.compile(r"\n\s*").sub("", markup) + markup = re.compile("\n\s*").sub("", markup) self.tree = self.soup(markup) @@ -703,12 +703,12 @@ class TestTagCreation(SoupTest): """Test the ability to create new tags.""" def test_new_tag(self): soup = self.soup("") - new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"}) + new_tag = soup.new_tag("foo", bar="baz") self.assertTrue(isinstance(new_tag, Tag)) self.assertEqual("foo", new_tag.name) - self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs) + self.assertEqual(dict(bar="baz"), new_tag.attrs) self.assertEqual(None, new_tag.parent) - + def test_tag_inherits_self_closing_rules_from_builder(self): if XML_BUILDER_PRESENT: xml_soup = BeautifulSoup("", "lxml-xml") @@ -821,26 +821,6 @@ class TestTreeModification(SoupTest): soup = self.soup(text) self.assertRaises(ValueError, soup.a.insert, 0, soup.a) - def test_insert_beautifulsoup_object_inserts_children(self): - """Inserting one BeautifulSoup object into another actually inserts all - of its children -- you'll never combine BeautifulSoup objects. - """ - soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>") - - text = "<p>p2</p><p>p3</p>" - to_insert = self.soup(text) - soup.insert(1, to_insert) - - for i in soup.descendants: - assert not isinstance(i, BeautifulSoup) - - p1, p2, p3, p4 = list(soup.children) - self.assertEquals("And now, a word:", p1.string) - self.assertEquals("p2", p2.string) - self.assertEquals("p3", p3.string) - self.assertEquals("And we're back.", p4.string) - - def test_replace_with_maintains_next_element_throughout(self): soup = self.soup('<p><a>one</a><b>three</b></p>') a = soup.a @@ -1206,7 +1186,7 @@ class TestElementObjects(SoupTest): tag = soup.bTag self.assertEqual(soup.b, tag) self.assertEqual( - '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")', + '.bTag is deprecated, use .find("b") instead.', str(w[0].message)) def test_has_attr(self): @@ -1439,21 +1419,13 @@ class TestSubstitutions(SoupTest): u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) def test_formatter_html(self): - markup = u"<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" + markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) decoded = soup.decode(formatter="html") self.assertEqual( decoded, - self.document_for("<br/><b><<Sacré bleu!>></b>")) + self.document_for("<b><<Sacré bleu!>></b>")) - def test_formatter_html5(self): - markup = u"<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" - soup = self.soup(markup) - decoded = soup.decode(formatter="html5") - self.assertEqual( - decoded, - self.document_for("<br><b><<Sacré bleu!>></b>")) - def test_formatter_minimal(self): markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" soup = self.soup(markup) @@ -1474,14 +1446,14 @@ class TestSubstitutions(SoupTest): self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) def test_formatter_custom(self): - markup = u"<b><foo></b><b>bar</b><br/>" + markup = u"<b><foo></b><b>bar</b>" soup = self.soup(markup) decoded = soup.decode(formatter = lambda x: x.upper()) # Instead of normal entity conversion code, the custom # callable is called on every string. self.assertEqual( decoded, - self.document_for(u"<b><FOO></b><b>BAR</b><br>")) + self.document_for(u"<b><FOO></b><b>BAR</b>")) def test_formatter_is_run_on_attribute_values(self): markup = u'<a href="http://a.com?a=b&c=é">e</a>' @@ -1526,7 +1498,7 @@ class TestSubstitutions(SoupTest): u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', soup.div.prettify()) - def test_prettify_accepts_formatter_function(self): + def test_prettify_accepts_formatter(self): soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser') pretty = soup.prettify(formatter = lambda x: x.upper()) self.assertTrue("FOO" in pretty) @@ -2074,17 +2046,5 @@ class TestSoupSelector(TreeTest): def test_multiple_select_nested(self): self.assertSelects('body > div > x, y > z', ['xid', 'zidb']) - def test_select_duplicate_elements(self): - # When markup contains duplicate elements, a multiple select - # will find all of them. - markup = '<div class="c1"/><div class="c2"/><div class="c1"/>' - soup = BeautifulSoup(markup, 'html.parser') - selected = soup.select(".c1, .c2") - self.assertEquals(3, len(selected)) - - # Verify that find_all finds the same elements, though because - # of an implementation detail it finds them in a different - # order. - for element in soup.find_all(class_=['c1', 'c2']): - assert element in selected + |