summary | refs | log | tree | commit | diff | homepage
path: root/libs/bs4/element.py
diff options
context:
space:
mode:
author Louis Vézina <[email protected]> 2018-10-20 22:53:12 -0400
committer Louis Vézina <[email protected]> 2018-10-20 22:53:12 -0400
commit d61bdfcd4fe2d4423d784cd1370acf630719e90f (patch)
tree 0c97cbd0e9d66dd3e300a6427171827c28eb3f85 /libs/bs4/element.py
parent b9a987b57d361c87cc69bd18fdc39b42ded97199 (diff)
download bazarr-d61bdfcd4fe2d4423d784cd1370acf630719e90f.tar.gz
bazarr-d61bdfcd4fe2d4423d784cd1370acf630719e90f.zip
Initial commit.
Diffstat (limited to 'libs/bs4/element.py')
-rw-r--r-- libs/bs4/element.py 155
1 files changed, 39 insertions, 116 deletions
diff --git a/libs/bs4/element.py b/libs/bs4/element.py
index 886eb91f2..9ef75f814 100644
--- a/libs/bs4/element.py
+++ b/libs/bs4/element.py
@@ -2,10 +2,7 @@
# found in the LICENSE file.
__license__ = "MIT"
-try:
- from collections.abc import Callable # Python 3.6
-except ImportError , e:
- from collections import Callable
+import collections
import re
import shlex
import sys
@@ -15,7 +12,7 @@ from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
-whitespace_re = re.compile(r"\s+")
+whitespace_re = re.compile("\s+")
def _alias(attr):
"""Alias one attribute name to another for backward compatibility"""
@@ -72,7 +69,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
The value of the 'content' attribute will be one of these objects.
"""
- CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
def __new__(cls, original_value):
match = cls.CHARSET_RE.search(original_value)
@@ -126,41 +123,6 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
-class Formatter(object):
- """Contains information about how to format a parse tree."""
-
- # By default, represent void elements as <tag/> rather than <tag>
- void_element_close_prefix = '/'
-
- def substitute_entities(self, *args, **kwargs):
- """Transform certain characters into named entities."""
- raise NotImplementedError()
-
-class HTMLFormatter(Formatter):
- """The default HTML formatter."""
- def substitute(self, *args, **kwargs):
- return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-
-class MinimalHTMLFormatter(Formatter):
- """A minimal HTML formatter."""
- def substitute(self, *args, **kwargs):
- return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
-
-class HTML5Formatter(HTMLFormatter):
- """An HTML formatter that omits the slash in a void tag."""
- void_element_close_prefix = None
-
-class XMLFormatter(Formatter):
- """Substitute only the essential XML entities."""
- def substitute(self, *args, **kwargs):
- return EntitySubstitution.substitute_xml(*args, **kwargs)
-
-class HTMLXMLFormatter(Formatter):
- """Format XML using HTML rules."""
- def substitute(self, *args, **kwargs):
- return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-
-
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -169,49 +131,40 @@ class PageElement(object):
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
- # are converted to those entities on output.
- # "html5" - The same as "html", but empty void tags are represented as
- # <tag> rather than <tag/>
- # "minimal" - Bare ampersands and angle brackets are converted to
+ # are converted to those entities on output.
+ # "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: &amp; &lt; &gt;
# None - The null formatter. Unicode characters are never
# converted to entities. This is not recommended, but it's
# faster than "minimal".
- # A callable function - it will be called on every string that needs to undergo entity substitution.
- # A Formatter instance - Formatter.substitute(string) will be called on every string that
+ # A function - This function will be called on every string that
# needs to undergo entity substitution.
#
- # In an HTML document, the default "html", "html5", and "minimal"
- # functions will leave the contents of <script> and <style> tags
- # alone. For an XML document, all tags will be given the same
- # treatment.
+ # In an HTML document, the default "html" and "minimal" functions
+ # will leave the contents of <script> and <style> tags alone. For
+ # an XML document, all tags will be given the same treatment.
HTML_FORMATTERS = {
- "html" : HTMLFormatter(),
- "html5" : HTML5Formatter(),
- "minimal" : MinimalHTMLFormatter(),
+ "html" : HTMLAwareEntitySubstitution.substitute_html,
+ "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
None : None
}
XML_FORMATTERS = {
- "html" : HTMLXMLFormatter(),
- "minimal" : XMLFormatter(),
+ "html" : EntitySubstitution.substitute_html,
+ "minimal" : EntitySubstitution.substitute_xml,
None : None
}
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter."""
- if isinstance(formatter, basestring):
+ if not callable(formatter):
formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
else:
- if callable(formatter):
- # Backwards compatibility -- you used to pass in a formatting method.
- output = formatter(s)
- else:
- output = formatter.substitute(s)
+ output = formatter(s)
return output
@property
@@ -241,9 +194,11 @@ class PageElement(object):
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
- return self.XML_FORMATTERS.get(name, XMLFormatter())
+ return self.XML_FORMATTERS.get(
+ name, EntitySubstitution.substitute_xml)
else:
- return self.HTML_FORMATTERS.get(name, HTMLFormatter())
+ return self.HTML_FORMATTERS.get(
+ name, HTMLAwareEntitySubstitution.substitute_xml)
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
@@ -361,14 +316,6 @@ class PageElement(object):
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
- from bs4 import BeautifulSoup
- if isinstance(new_child, BeautifulSoup):
- # We don't want to end up with a situation where one BeautifulSoup
- # object contains another. Insert the children one at a time.
- for subchild in list(new_child.contents):
- self.insert(position, subchild)
- position += 1
- return
position = min(position, len(self.contents))
if hasattr(new_child, 'parent') and new_child.parent is not None:
# We're 'inserting' an element that's already one
@@ -589,21 +536,14 @@ class PageElement(object):
elif isinstance(name, basestring):
# Optimization to find all tags with a given name.
if name.count(':') == 1:
- # This is a name with a prefix. If this is a namespace-aware document,
- # we need to match the local name against tag.name. If not,
- # we need to match the fully-qualified name against tag.name.
- prefix, local_name = name.split(':', 1)
+ # This is a name with a prefix.
+ prefix, name = name.split(':', 1)
else:
prefix = None
- local_name = name
result = (element for element in generator
if isinstance(element, Tag)
- and (
- element.name == name
- ) or (
- element.name == local_name
- and (prefix is None or element.prefix == prefix)
- )
+ and element.name == name
+ and (prefix is None or element.prefix == prefix)
)
return ResultSet(strainer, result)
results = ResultSet(strainer)
@@ -922,7 +862,7 @@ class Tag(PageElement):
self.can_be_empty_element = builder.can_be_empty_element(name)
else:
self.can_be_empty_element = False
-
+
parserClass = _alias("parser_class") # BS3
def __copy__(self):
@@ -1106,10 +1046,8 @@ class Tag(PageElement):
# BS3: soup.aTag -> "soup.find("a")
tag_name = tag[:-3]
warnings.warn(
- '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
- name=tag_name
- )
- )
+ '.%sTag is deprecated, use .find("%s") instead.' % (
+ tag_name, tag_name))
return self.find(tag_name)
# We special case contents to avoid recursion.
elif not tag.startswith("__") and not tag == "contents":
@@ -1191,10 +1129,11 @@ class Tag(PageElement):
encoding.
"""
- # First off, turn a string formatter into a Formatter object. This
+ # First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
- if not isinstance(formatter, Formatter) and not callable(formatter):
+ if not callable(formatter):
formatter = self._formatter_for_name(formatter)
+
attrs = []
if self.attrs:
for key, val in sorted(self.attrs.items()):
@@ -1223,9 +1162,7 @@ class Tag(PageElement):
prefix = self.prefix + ":"
if self.is_empty_element:
- close = ''
- if isinstance(formatter, Formatter):
- close = formatter.void_element_close_prefix or close
+ close = '/'
else:
closeTag = '</%s%s>' % (prefix, self.name)
@@ -1296,9 +1233,9 @@ class Tag(PageElement):
:param formatter: The output formatter responsible for converting
entities to Unicode characters.
"""
- # First off, turn a string formatter into a Formatter object. This
+ # First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
- if not isinstance(formatter, Formatter) and not callable(formatter):
+ if not callable(formatter):
formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None)
@@ -1411,29 +1348,15 @@ class Tag(PageElement):
# Handle grouping selectors if ',' exists, ie: p,a
if ',' in selector:
context = []
- selectors = [x.strip() for x in selector.split(",")]
-
- # If a selector is mentioned multiple times we don't want
- # to use it more than once.
- used_selectors = set()
-
- # We also don't want to select the same element more than once,
- # if it's matched by multiple selectors.
- selected_object_ids = set()
- for partial_selector in selectors:
+ for partial_selector in selector.split(','):
+ partial_selector = partial_selector.strip()
if partial_selector == '':
raise ValueError('Invalid group selection syntax: %s' % selector)
- if partial_selector in used_selectors:
- continue
- used_selectors.add(partial_selector)
candidates = self.select(partial_selector, limit=limit)
for candidate in candidates:
- # This lets us distinguish between distinct tags that
- # represent the same markup.
- object_id = id(candidate)
- if object_id not in selected_object_ids:
+ if candidate not in context:
context.append(candidate)
- selected_object_ids.add(object_id)
+
if limit and len(context) >= limit:
break
return context
@@ -1495,7 +1418,7 @@ class Tag(PageElement):
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
- pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+ pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is None:
pseudo_type = pseudo
@@ -1729,7 +1652,7 @@ class SoupStrainer(object):
markup = markup_name
markup_attrs = markup
call_function_with_tag_data = (
- isinstance(self.name, Callable)
+ isinstance(self.name, collections.Callable)
and not isinstance(markup_name, Tag))
if ((not self.name)
@@ -1809,7 +1732,7 @@ class SoupStrainer(object):
# True matches any non-None value.
return markup is not None
- if isinstance(match_against, Callable):
+ if isinstance(match_against, collections.Callable):
return match_against(markup)
# Custom callables take the tag as an argument, but all