summary | refs | log | tree | commit | diff | homepage
path: root/libs/bs4
diff options
context:
space:
mode:
author: Louis Vézina <[email protected]> 2018-10-20 22:53:12 -0400
committer: Louis Vézina <[email protected]> 2018-10-20 22:53:12 -0400
commit: d61bdfcd4fe2d4423d784cd1370acf630719e90f (patch)
tree: 0c97cbd0e9d66dd3e300a6427171827c28eb3f85 /libs/bs4
parent: b9a987b57d361c87cc69bd18fdc39b42ded97199 (diff)
download: bazarr-d61bdfcd4fe2d4423d784cd1370acf630719e90f.tar.gz
download: bazarr-d61bdfcd4fe2d4423d784cd1370acf630719e90f.zip
Initial commit.
Diffstat (limited to 'libs/bs4')
-rw-r--r-- libs/bs4/__init__.py 87
-rw-r--r-- libs/bs4/builder/__init__.py 16
-rw-r--r-- libs/bs4/builder/_htmlparser.py 47
-rw-r--r-- libs/bs4/builder/_lxml.py 18
-rw-r--r-- libs/bs4/dammit.py 6
-rw-r--r-- libs/bs4/diagnose.py 20
-rw-r--r-- libs/bs4/element.py 155
-rw-r--r-- libs/bs4/testing.py 42
-rw-r--r-- libs/bs4/tests/test_htmlparser.py 15
-rw-r--r-- libs/bs4/tests/test_lxml.py 6
-rw-r--r-- libs/bs4/tests/test_tree.py 62
11 files changed, 96 insertions, 378 deletions
diff --git a/libs/bs4/__init__.py b/libs/bs4/__init__.py
index 470177fdf..7a80452f7 100644
--- a/libs/bs4/__init__.py
+++ b/libs/bs4/__init__.py
@@ -21,15 +21,14 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
# found in the LICENSE file.
__author__ = "Leonard Richardson ([email protected])"
-__version__ = "4.6.3"
-__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
+__version__ = "4.6.0"
+__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import os
import re
-import sys
import traceback
import warnings
@@ -83,46 +82,14 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
- NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
+ NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
- """Constructor.
-
- :param markup: A string or a file-like object representing
- markup to be parsed.
-
- :param features: Desirable features of the parser to be used. This
- may be the name of a specific parser ("lxml", "lxml-xml",
- "html.parser", or "html5lib") or it may be the type of markup
- to be used ("html", "html5", "xml"). It's recommended that you
- name a specific parser, so that Beautiful Soup gives you the
- same results across platforms and virtual environments.
-
- :param builder: A specific TreeBuilder to use instead of looking one
- up based on `features`. You shouldn't need to use this.
-
- :param parse_only: A SoupStrainer. Only parts of the document
- matching the SoupStrainer will be considered. This is useful
- when parsing part of a document that would otherwise be too
- large to fit into memory.
-
- :param from_encoding: A string indicating the encoding of the
- document to be parsed. Pass this in if Beautiful Soup is
- guessing wrongly about the document's encoding.
-
- :param exclude_encodings: A list of strings indicating
- encodings known to be wrong. Pass this in if you don't know
- the document's encoding but you know Beautiful Soup's guess is
- wrong.
-
- :param kwargs: For backwards compatibility purposes, the
- constructor accepts certain keyword arguments used in
- Beautiful Soup 3. None of these arguments do anything in
- Beautiful Soup 4 and there's no need to actually pass keyword
- arguments into the constructor.
- """
+ """The Soup object is initialized as the 'root tag', and the
+ provided markup (which can be a string or a file-like object)
+ is fed into the underlying parser."""
if 'convertEntities' in kwargs:
warnings.warn(
@@ -204,35 +171,14 @@ class BeautifulSoup(Tag):
else:
markup_type = "HTML"
- # This code adapted from warnings.py so that we get the same line
- # of code as our warnings.warn() call gets, even if the answer is wrong
- # (as it may be in a multithreading situation).
- caller = None
- try:
- caller = sys._getframe(1)
- except ValueError:
- pass
- if caller:
- globals = caller.f_globals
- line_number = caller.f_lineno
- else:
- globals = sys.__dict__
- line_number= 1
- filename = globals.get('__file__')
- if filename:
- fnl = filename.lower()
- if fnl.endswith((".pyc", ".pyo")):
- filename = filename[:-1]
- if filename:
- # If there is no filename at all, the user is most likely in a REPL,
- # and the warning is not necessary.
- values = dict(
- filename=filename,
- line_number=line_number,
- parser=builder.NAME,
- markup_type=markup_type
- )
- warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+ caller = traceback.extract_stack()[0]
+ filename = caller[0]
+ line_number = caller[1]
+ warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+ filename=filename,
+ line_number=line_number,
+ parser=builder.NAME,
+ markup_type=markup_type))
self.builder = builder
self.is_xml = builder.is_xml
@@ -356,10 +302,9 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack = []
self.pushTag(self)
- def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
+ def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
- kwattrs.update(attrs)
- return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
+ return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py
index c9e3f3d31..fdb3362fc 100644
--- a/libs/bs4/builder/__init__.py
+++ b/libs/bs4/builder/__init__.py
@@ -93,7 +93,7 @@ class TreeBuilder(object):
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
-
+
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
@@ -125,7 +125,7 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
-
+
def feed(self, markup):
raise NotImplementedError()
@@ -235,17 +235,11 @@ class HTMLTreeBuilder(TreeBuilder):
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
- # These are from earlier versions of HTML and are removed in HTML5.
- 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
+
+ # These are from HTML4, removed in HTML5.
+ 'spacer', 'frame'
])
- # The HTML standard defines these as block-level elements. Beautiful
- # Soup does not treat these elements differently from other elements,
- # but it may do so eventually, and this information is available if
- # you need to use it.
- block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
-
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py
index ee6c685df..67890b3a3 100644
--- a/libs/bs4/builder/_htmlparser.py
+++ b/libs/bs4/builder/_htmlparser.py
@@ -1,4 +1,3 @@
-# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be
@@ -65,18 +64,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []
-
- def error(self, msg):
- """In Python 3, HTMLParser subclasses must implement error(), although this
- requirement doesn't appear to be documented.
-
- In Python 2, HTMLParser implements error() as raising an exception.
-
- In any event, this method is called only on very strange markup and our best strategy
- is to pretend it didn't happen and keep going.
- """
- warnings.warn(msg)
-
+
def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
@@ -141,26 +129,11 @@ class BeautifulSoupHTMLParser(HTMLParser):
else:
real_name = int(name)
- data = None
- if real_name < 256:
- # HTML numeric entities are supposed to reference Unicode
- # code points, but sometimes they reference code points in
- # some other encoding (ahem, Windows-1252). E.g. &#147;
- # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
- # code tries to detect this situation and compensate.
- for encoding in (self.soup.original_encoding, 'windows-1252'):
- if not encoding:
- continue
- try:
- data = bytearray([real_name]).decode(encoding)
- except UnicodeDecodeError, e:
- pass
- if not data:
- try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
- pass
- data = data or u"\N{REPLACEMENT CHARACTER}"
+ try:
+ data = unichr(real_name)
+ except (ValueError, OverflowError), e:
+ data = u"\N{REPLACEMENT CHARACTER}"
+
self.handle_data(data)
def handle_entityref(self, name):
@@ -168,12 +141,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None:
data = character
else:
- # If this were XML, it would be ambiguous whether "&foo"
- # was an character entity reference with a missing
- # semicolon or the literal string "&foo". Since this is
- # HTML, we have a complete list of all character entity references,
- # and this one wasn't found, so assume it's the literal string "&foo".
- data = "&%s" % name
+ data = "&%s;" % name
self.handle_data(data)
def handle_comment(self, data):
@@ -245,7 +213,6 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
- parser.close()
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py
index 4a0f7de4b..d2ca2872d 100644
--- a/libs/bs4/builder/_lxml.py
+++ b/libs/bs4/builder/_lxml.py
@@ -5,13 +5,9 @@ __all__ = [
'LXMLTreeBuilder',
]
-try:
- from collections.abc import Callable # Python 3.6
-except ImportError , e:
- from collections import Callable
-
from io import BytesIO
from StringIO import StringIO
+import collections
from lxml import etree
from bs4.element import (
Comment,
@@ -62,7 +58,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser.
parser = self.default_parser(encoding)
- if isinstance(parser, Callable):
+ if isinstance(parser, collections.Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
@@ -151,11 +147,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
- if len(nsmap) == 0 and len(self.nsmaps) > 1:
- # There are no new namespaces for this tag, but
- # non-default namespaces are in play, so we need a
- # separate tag stack to know when they end.
- self.nsmaps.append(None)
+ if len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
+ self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py
index be46b3942..7965565f5 100644
--- a/libs/bs4/dammit.py
+++ b/libs/bs4/dammit.py
@@ -46,9 +46,9 @@ except ImportError:
pass
xml_encoding_re = re.compile(
- '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
+ '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
- '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+ '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
@@ -82,7 +82,7 @@ class EntitySubstitution(object):
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
- "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
diff --git a/libs/bs4/diagnose.py b/libs/bs4/diagnose.py
index 7a28c09a7..8768332f5 100644
--- a/libs/bs4/diagnose.py
+++ b/libs/bs4/diagnose.py
@@ -37,7 +37,7 @@ def diagnose(data):
name)
if 'lxml' in basic_parsers:
- basic_parsers.append("lxml-xml")
+ basic_parsers.append(["lxml", "xml"])
try:
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
@@ -56,27 +56,21 @@ def diagnose(data):
if hasattr(data, 'read'):
data = data.read()
+ elif os.path.exists(data):
+ print '"%s" looks like a filename. Reading data from the file.' % data
+ with open(data) as fp:
+ data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
return
- else:
- try:
- if os.path.exists(data):
- print '"%s" looks like a filename. Reading data from the file.' % data
- with open(data) as fp:
- data = fp.read()
- except ValueError:
- # This can happen on some platforms when the 'filename' is
- # too long. Assume it's data and not a filename.
- pass
- print
+ print
for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser
success = False
try:
- soup = BeautifulSoup(data, features=parser)
+ soup = BeautifulSoup(data, parser)
success = True
except Exception, e:
print "%s could not parse the markup." % parser
diff --git a/libs/bs4/element.py b/libs/bs4/element.py
index 886eb91f2..9ef75f814 100644
--- a/libs/bs4/element.py
+++ b/libs/bs4/element.py
@@ -2,10 +2,7 @@
# found in the LICENSE file.
__license__ = "MIT"
-try:
- from collections.abc import Callable # Python 3.6
-except ImportError , e:
- from collections import Callable
+import collections
import re
import shlex
import sys
@@ -15,7 +12,7 @@ from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
-whitespace_re = re.compile(r"\s+")
+whitespace_re = re.compile("\s+")
def _alias(attr):
"""Alias one attribute name to another for backward compatibility"""
@@ -72,7 +69,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
The value of the 'content' attribute will be one of these objects.
"""
- CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
def __new__(cls, original_value):
match = cls.CHARSET_RE.search(original_value)
@@ -126,41 +123,6 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
-class Formatter(object):
- """Contains information about how to format a parse tree."""
-
- # By default, represent void elements as <tag/> rather than <tag>
- void_element_close_prefix = '/'
-
- def substitute_entities(self, *args, **kwargs):
- """Transform certain characters into named entities."""
- raise NotImplementedError()
-
-class HTMLFormatter(Formatter):
- """The default HTML formatter."""
- def substitute(self, *args, **kwargs):
- return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-
-class MinimalHTMLFormatter(Formatter):
- """A minimal HTML formatter."""
- def substitute(self, *args, **kwargs):
- return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
-
-class HTML5Formatter(HTMLFormatter):
- """An HTML formatter that omits the slash in a void tag."""
- void_element_close_prefix = None
-
-class XMLFormatter(Formatter):
- """Substitute only the essential XML entities."""
- def substitute(self, *args, **kwargs):
- return EntitySubstitution.substitute_xml(*args, **kwargs)
-
-class HTMLXMLFormatter(Formatter):
- """Format XML using HTML rules."""
- def substitute(self, *args, **kwargs):
- return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-
-
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -169,49 +131,40 @@ class PageElement(object):
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
- # are converted to those entities on output.
- # "html5" - The same as "html", but empty void tags are represented as
- # <tag> rather than <tag/>
- # "minimal" - Bare ampersands and angle brackets are converted to
+ # are converted to those entities on output.
+ # "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: &amp; &lt; &gt;
# None - The null formatter. Unicode characters are never
# converted to entities. This is not recommended, but it's
# faster than "minimal".
- # A callable function - it will be called on every string that needs to undergo entity substitution.
- # A Formatter instance - Formatter.substitute(string) will be called on every string that
+ # A function - This function will be called on every string that
# needs to undergo entity substitution.
#
- # In an HTML document, the default "html", "html5", and "minimal"
- # functions will leave the contents of <script> and <style> tags
- # alone. For an XML document, all tags will be given the same
- # treatment.
+ # In an HTML document, the default "html" and "minimal" functions
+ # will leave the contents of <script> and <style> tags alone. For
+ # an XML document, all tags will be given the same treatment.
HTML_FORMATTERS = {
- "html" : HTMLFormatter(),
- "html5" : HTML5Formatter(),
- "minimal" : MinimalHTMLFormatter(),
+ "html" : HTMLAwareEntitySubstitution.substitute_html,
+ "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
None : None
}
XML_FORMATTERS = {
- "html" : HTMLXMLFormatter(),
- "minimal" : XMLFormatter(),
+ "html" : EntitySubstitution.substitute_html,
+ "minimal" : EntitySubstitution.substitute_xml,
None : None
}
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter."""
- if isinstance(formatter, basestring):
+ if not callable(formatter):
formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
else:
- if callable(formatter):
- # Backwards compatibility -- you used to pass in a formatting method.
- output = formatter(s)
- else:
- output = formatter.substitute(s)
+ output = formatter(s)
return output
@property
@@ -241,9 +194,11 @@ class PageElement(object):
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
- return self.XML_FORMATTERS.get(name, XMLFormatter())
+ return self.XML_FORMATTERS.get(
+ name, EntitySubstitution.substitute_xml)
else:
- return self.HTML_FORMATTERS.get(name, HTMLFormatter())
+ return self.HTML_FORMATTERS.get(
+ name, HTMLAwareEntitySubstitution.substitute_xml)
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
@@ -361,14 +316,6 @@ class PageElement(object):
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
- from bs4 import BeautifulSoup
- if isinstance(new_child, BeautifulSoup):
- # We don't want to end up with a situation where one BeautifulSoup
- # object contains another. Insert the children one at a time.
- for subchild in list(new_child.contents):
- self.insert(position, subchild)
- position += 1
- return
position = min(position, len(self.contents))
if hasattr(new_child, 'parent') and new_child.parent is not None:
# We're 'inserting' an element that's already one
@@ -589,21 +536,14 @@ class PageElement(object):
elif isinstance(name, basestring):
# Optimization to find all tags with a given name.
if name.count(':') == 1:
- # This is a name with a prefix. If this is a namespace-aware document,
- # we need to match the local name against tag.name. If not,
- # we need to match the fully-qualified name against tag.name.
- prefix, local_name = name.split(':', 1)
+ # This is a name with a prefix.
+ prefix, name = name.split(':', 1)
else:
prefix = None
- local_name = name
result = (element for element in generator
if isinstance(element, Tag)
- and (
- element.name == name
- ) or (
- element.name == local_name
- and (prefix is None or element.prefix == prefix)
- )
+ and element.name == name
+ and (prefix is None or element.prefix == prefix)
)
return ResultSet(strainer, result)
results = ResultSet(strainer)
@@ -922,7 +862,7 @@ class Tag(PageElement):
self.can_be_empty_element = builder.can_be_empty_element(name)
else:
self.can_be_empty_element = False
-
+
parserClass = _alias("parser_class") # BS3
def __copy__(self):
@@ -1106,10 +1046,8 @@ class Tag(PageElement):
# BS3: soup.aTag -> "soup.find("a")
tag_name = tag[:-3]
warnings.warn(
- '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
- name=tag_name
- )
- )
+ '.%sTag is deprecated, use .find("%s") instead.' % (
+ tag_name, tag_name))
return self.find(tag_name)
# We special case contents to avoid recursion.
elif not tag.startswith("__") and not tag == "contents":
@@ -1191,10 +1129,11 @@ class Tag(PageElement):
encoding.
"""
- # First off, turn a string formatter into a Formatter object. This
+ # First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
- if not isinstance(formatter, Formatter) and not callable(formatter):
+ if not callable(formatter):
formatter = self._formatter_for_name(formatter)
+
attrs = []
if self.attrs:
for key, val in sorted(self.attrs.items()):
@@ -1223,9 +1162,7 @@ class Tag(PageElement):
prefix = self.prefix + ":"
if self.is_empty_element:
- close = ''
- if isinstance(formatter, Formatter):
- close = formatter.void_element_close_prefix or close
+ close = '/'
else:
closeTag = '</%s%s>' % (prefix, self.name)
@@ -1296,9 +1233,9 @@ class Tag(PageElement):
:param formatter: The output formatter responsible for converting
entities to Unicode characters.
"""
- # First off, turn a string formatter into a Formatter object. This
+ # First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
- if not isinstance(formatter, Formatter) and not callable(formatter):
+ if not callable(formatter):
formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None)
@@ -1411,29 +1348,15 @@ class Tag(PageElement):
# Handle grouping selectors if ',' exists, ie: p,a
if ',' in selector:
context = []
- selectors = [x.strip() for x in selector.split(",")]
-
- # If a selector is mentioned multiple times we don't want
- # to use it more than once.
- used_selectors = set()
-
- # We also don't want to select the same element more than once,
- # if it's matched by multiple selectors.
- selected_object_ids = set()
- for partial_selector in selectors:
+ for partial_selector in selector.split(','):
+ partial_selector = partial_selector.strip()
if partial_selector == '':
raise ValueError('Invalid group selection syntax: %s' % selector)
- if partial_selector in used_selectors:
- continue
- used_selectors.add(partial_selector)
candidates = self.select(partial_selector, limit=limit)
for candidate in candidates:
- # This lets us distinguish between distinct tags that
- # represent the same markup.
- object_id = id(candidate)
- if object_id not in selected_object_ids:
+ if candidate not in context:
context.append(candidate)
- selected_object_ids.add(object_id)
+
if limit and len(context) >= limit:
break
return context
@@ -1495,7 +1418,7 @@ class Tag(PageElement):
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
- pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+ pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is None:
pseudo_type = pseudo
@@ -1729,7 +1652,7 @@ class SoupStrainer(object):
markup = markup_name
markup_attrs = markup
call_function_with_tag_data = (
- isinstance(self.name, Callable)
+ isinstance(self.name, collections.Callable)
and not isinstance(markup_name, Tag))
if ((not self.name)
@@ -1809,7 +1732,7 @@ class SoupStrainer(object):
# True matches any non-None value.
return markup is not None
- if isinstance(match_against, Callable):
+ if isinstance(match_against, collections.Callable):
return match_against(markup)
# Custom callables take the tag as an argument, but all
diff --git a/libs/bs4/testing.py b/libs/bs4/testing.py
index 745a9c4af..6ba2506c4 100644
--- a/libs/bs4/testing.py
+++ b/libs/bs4/testing.py
@@ -1,4 +1,3 @@
-# encoding: utf-8
"""Helper classes for tests."""
# Use of this source code is governed by a BSD-style license that can be
@@ -151,14 +150,6 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
- def test_namespaced_html(self):
- """When a namespaced XML document is parsed as HTML it should
- be treated as HTML with weird tag names.
- """
- markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
- soup = self.soup(markup)
- self.assertEqual(2, len(soup.find_all("ns1:foo")))
-
def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class
@@ -320,26 +311,6 @@ Hello, world!
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
- def test_strings_resembling_character_entity_references(self):
- # "&T" and "&p" look like incomplete character entities, but they are
- # not.
- self.assertSoupEquals(
- u"<p>&bull; AT&T is in the s&p 500</p>",
- u"<p>\u2022 AT&amp;T is in the s&amp;p 500</p>"
- )
-
- def test_entities_in_foreign_document_encoding(self):
- # &#147; and &#148; are invalid numeric entities referencing
- # Windows-1252 characters. &#45; references a character common
- # to Windows-1252 and Unicode, and &#9731; references a
- # character only found in Unicode.
- #
- # All of these entities should be converted to Unicode
- # characters.
- markup = "<p>&#147;Hello&#148; &#45;&#9731;</p>"
- soup = self.soup(markup)
- self.assertEquals(u"“Hello” -☃", soup.p.string)
-
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
@@ -363,7 +334,7 @@ Hello, world!
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
-
+
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
@@ -653,17 +624,6 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual(
soup.encode("utf-8"), markup)
- def test_nested_namespaces(self):
- doc = b"""<?xml version="1.0" encoding="utf-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
-<parent xmlns="http://ns1/">
-<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
-<grandchild ns3:attr="value" xmlns="http://ns4/"/>
-</child>
-</parent>"""
- soup = self.soup(doc)
- self.assertEqual(doc, soup.encode())
-
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">
diff --git a/libs/bs4/tests/test_htmlparser.py b/libs/bs4/tests/test_htmlparser.py
index 0381c7d3a..d5cf0253f 100644
--- a/libs/bs4/tests/test_htmlparser.py
+++ b/libs/bs4/tests/test_htmlparser.py
@@ -5,7 +5,6 @@ from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
-from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@@ -33,17 +32,3 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
self.assertSoupEquals('</br></br></br>', "")
-
- def test_empty_element(self):
- # This verifies that any buffered data present when the parser
- # finishes working is handled.
- self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
-
-
-class TestHTMLParserSubclass(SoupTest):
- def test_error(self):
- """Verify that our HTMLParser subclass implements error() in a way
- that doesn't cause a crash.
- """
- parser = BeautifulSoupHTMLParser()
- parser.error("don't crash")
diff --git a/libs/bs4/tests/test_lxml.py b/libs/bs4/tests/test_lxml.py
index 8a8f69047..a05870b91 100644
--- a/libs/bs4/tests/test_lxml.py
+++ b/libs/bs4/tests/test_lxml.py
@@ -46,12 +46,6 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
- def test_entities_in_foreign_document_encoding(self):
- # We can't implement this case correctly because by the time we
- # hear about markup like "&#147;", it's been (incorrectly) converted into
- # a string like u'\x93'
- pass
-
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
diff --git a/libs/bs4/tests/test_tree.py b/libs/bs4/tests/test_tree.py
index 883cd8abb..c0e7c4080 100644
--- a/libs/bs4/tests/test_tree.py
+++ b/libs/bs4/tests/test_tree.py
@@ -605,7 +605,7 @@ class SiblingTest(TreeTest):
</html>'''
# All that whitespace looks good but makes the tests more
# difficult. Get rid of it.
- markup = re.compile(r"\n\s*").sub("", markup)
+ markup = re.compile("\n\s*").sub("", markup)
self.tree = self.soup(markup)
@@ -703,12 +703,12 @@ class TestTagCreation(SoupTest):
"""Test the ability to create new tags."""
def test_new_tag(self):
soup = self.soup("")
- new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
+ new_tag = soup.new_tag("foo", bar="baz")
self.assertTrue(isinstance(new_tag, Tag))
self.assertEqual("foo", new_tag.name)
- self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
+ self.assertEqual(dict(bar="baz"), new_tag.attrs)
self.assertEqual(None, new_tag.parent)
-
+
def test_tag_inherits_self_closing_rules_from_builder(self):
if XML_BUILDER_PRESENT:
xml_soup = BeautifulSoup("", "lxml-xml")
@@ -821,26 +821,6 @@ class TestTreeModification(SoupTest):
soup = self.soup(text)
self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
- def test_insert_beautifulsoup_object_inserts_children(self):
- """Inserting one BeautifulSoup object into another actually inserts all
- of its children -- you'll never combine BeautifulSoup objects.
- """
- soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")
-
- text = "<p>p2</p><p>p3</p>"
- to_insert = self.soup(text)
- soup.insert(1, to_insert)
-
- for i in soup.descendants:
- assert not isinstance(i, BeautifulSoup)
-
- p1, p2, p3, p4 = list(soup.children)
- self.assertEquals("And now, a word:", p1.string)
- self.assertEquals("p2", p2.string)
- self.assertEquals("p3", p3.string)
- self.assertEquals("And we're back.", p4.string)
-
-
def test_replace_with_maintains_next_element_throughout(self):
soup = self.soup('<p><a>one</a><b>three</b></p>')
a = soup.a
@@ -1206,7 +1186,7 @@ class TestElementObjects(SoupTest):
tag = soup.bTag
self.assertEqual(soup.b, tag)
self.assertEqual(
- '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
+ '.bTag is deprecated, use .find("b") instead.',
str(w[0].message))
def test_has_attr(self):
@@ -1439,21 +1419,13 @@ class TestSubstitutions(SoupTest):
u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_html(self):
- markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+ markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter="html")
self.assertEqual(
decoded,
- self.document_for("<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+ self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
- def test_formatter_html5(self):
- markup = u"<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
- soup = self.soup(markup)
- decoded = soup.decode(formatter="html5")
- self.assertEqual(
- decoded,
- self.document_for("<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
-
def test_formatter_minimal(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup)
@@ -1474,14 +1446,14 @@ class TestSubstitutions(SoupTest):
self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
def test_formatter_custom(self):
- markup = u"<b>&lt;foo&gt;</b><b>bar</b><br/>"
+ markup = u"<b>&lt;foo&gt;</b><b>bar</b>"
soup = self.soup(markup)
decoded = soup.decode(formatter = lambda x: x.upper())
# Instead of normal entity conversion code, the custom
# callable is called on every string.
self.assertEqual(
decoded,
- self.document_for(u"<b><FOO></b><b>BAR</b><br>"))
+ self.document_for(u"<b><FOO></b><b>BAR</b>"))
def test_formatter_is_run_on_attribute_values(self):
markup = u'<a href="http://a.com?a=b&c=é">e</a>'
@@ -1526,7 +1498,7 @@ class TestSubstitutions(SoupTest):
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
soup.div.prettify())
- def test_prettify_accepts_formatter_function(self):
+ def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
pretty = soup.prettify(formatter = lambda x: x.upper())
self.assertTrue("FOO" in pretty)
@@ -2074,17 +2046,5 @@ class TestSoupSelector(TreeTest):
def test_multiple_select_nested(self):
self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
- def test_select_duplicate_elements(self):
- # When markup contains duplicate elements, a multiple select
- # will find all of them.
- markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
- soup = BeautifulSoup(markup, 'html.parser')
- selected = soup.select(".c1, .c2")
- self.assertEquals(3, len(selected))
-
- # Verify that find_all finds the same elements, though because
- # of an implementation detail it finds them in a different
- # order.
- for element in soup.find_all(class_=['c1', 'c2']):
- assert element in selected
+