diff options
author | Louis Vézina <[email protected]> | 2018-10-20 22:53:12 -0400 |
---|---|---|
committer | Louis Vézina <[email protected]> | 2018-10-20 22:53:12 -0400 |
commit | d61bdfcd4fe2d4423d784cd1370acf630719e90f (patch) | |
tree | 0c97cbd0e9d66dd3e300a6427171827c28eb3f85 /libs/bs4/element.py | |
parent | b9a987b57d361c87cc69bd18fdc39b42ded97199 (diff) | |
download | bazarr-d61bdfcd4fe2d4423d784cd1370acf630719e90f.tar.gz bazarr-d61bdfcd4fe2d4423d784cd1370acf630719e90f.zip |
Initial commit.
Diffstat (limited to 'libs/bs4/element.py')
-rw-r--r-- | libs/bs4/element.py | 155 |
1 files changed, 39 insertions, 116 deletions
diff --git a/libs/bs4/element.py b/libs/bs4/element.py index 886eb91f2..9ef75f814 100644 --- a/libs/bs4/element.py +++ b/libs/bs4/element.py @@ -2,10 +2,7 @@ # found in the LICENSE file. __license__ = "MIT" -try: - from collections.abc import Callable # Python 3.6 -except ImportError , e: - from collections import Callable +import collections import re import shlex import sys @@ -15,7 +12,7 @@ from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) -whitespace_re = re.compile(r"\s+") +whitespace_re = re.compile("\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @@ -72,7 +69,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): The value of the 'content' attribute will be one of these objects. """ - CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) @@ -126,41 +123,6 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): return cls._substitute_if_appropriate( ns, EntitySubstitution.substitute_xml) -class Formatter(object): - """Contains information about how to format a parse tree.""" - - # By default, represent void elements as <tag/> rather than <tag> - void_element_close_prefix = '/' - - def substitute_entities(self, *args, **kwargs): - """Transform certain characters into named entities.""" - raise NotImplementedError() - -class HTMLFormatter(Formatter): - """The default HTML formatter.""" - def substitute(self, *args, **kwargs): - return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) - -class MinimalHTMLFormatter(Formatter): - """A minimal HTML formatter.""" - def substitute(self, *args, **kwargs): - return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs) - -class HTML5Formatter(HTMLFormatter): - """An HTML formatter that omits the slash in a void tag.""" - void_element_close_prefix = None - -class XMLFormatter(Formatter): - """Substitute only the essential XML entities.""" - def substitute(self, *args, **kwargs): - return EntitySubstitution.substitute_xml(*args, **kwargs) - -class HTMLXMLFormatter(Formatter): - """Format XML using HTML rules.""" - def substitute(self, *args, **kwargs): - return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) - - class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -169,49 +131,40 @@ class PageElement(object): # to methods like encode() and prettify(): # # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "html5" - The same as "html", but empty void tags are represented as - # <tag> rather than <tag/> - # "minimal" - Bare ampersands and angle brackets are converted to + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: & < > # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's # faster than "minimal". - # A callable function - it will be called on every string that needs to undergo entity substitution. - # A Formatter instance - Formatter.substitute(string) will be called on every string that + # A function - This function will be called on every string that # needs to undergo entity substitution. # - # In an HTML document, the default "html", "html5", and "minimal" - # functions will leave the contents of <script> and <style> tags - # alone. For an XML document, all tags will be given the same - # treatment. + # In an HTML document, the default "html" and "minimal" functions + # will leave the contents of <script> and <style> tags alone. For + # an XML document, all tags will be given the same treatment. HTML_FORMATTERS = { - "html" : HTMLFormatter(), - "html5" : HTML5Formatter(), - "minimal" : MinimalHTMLFormatter(), + "html" : HTMLAwareEntitySubstitution.substitute_html, + "minimal" : HTMLAwareEntitySubstitution.substitute_xml, None : None } XML_FORMATTERS = { - "html" : HTMLXMLFormatter(), - "minimal" : XMLFormatter(), + "html" : EntitySubstitution.substitute_html, + "minimal" : EntitySubstitution.substitute_xml, None : None } def format_string(self, s, formatter='minimal'): """Format the given string using the given formatter.""" - if isinstance(formatter, basestring): + if not callable(formatter): formatter = self._formatter_for_name(formatter) if formatter is None: output = s else: - if callable(formatter): - # Backwards compatibility -- you used to pass in a formatting method. - output = formatter(s) - else: - output = formatter.substitute(s) + output = formatter(s) return output @property @@ -241,9 +194,11 @@ class PageElement(object): def _formatter_for_name(self, name): "Look up a formatter function based on its name and the tree." if self._is_xml: - return self.XML_FORMATTERS.get(name, XMLFormatter()) + return self.XML_FORMATTERS.get( + name, EntitySubstitution.substitute_xml) else: - return self.HTML_FORMATTERS.get(name, HTMLFormatter()) + return self.HTML_FORMATTERS.get( + name, HTMLAwareEntitySubstitution.substitute_xml) def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): @@ -361,14 +316,6 @@ class PageElement(object): and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) - from bs4 import BeautifulSoup - if isinstance(new_child, BeautifulSoup): - # We don't want to end up with a situation where one BeautifulSoup - # object contains another. Insert the children one at a time. - for subchild in list(new_child.contents): - self.insert(position, subchild) - position += 1 - return position = min(position, len(self.contents)) if hasattr(new_child, 'parent') and new_child.parent is not None: # We're 'inserting' an element that's already one @@ -589,21 +536,14 @@ class PageElement(object): elif isinstance(name, basestring): # Optimization to find all tags with a given name. if name.count(':') == 1: - # This is a name with a prefix. If this is a namespace-aware document, - # we need to match the local name against tag.name. If not, - # we need to match the fully-qualified name against tag.name. - prefix, local_name = name.split(':', 1) + # This is a name with a prefix. + prefix, name = name.split(':', 1) else: prefix = None - local_name = name result = (element for element in generator if isinstance(element, Tag) - and ( - element.name == name - ) or ( - element.name == local_name - and (prefix is None or element.prefix == prefix) - ) + and element.name == name + and (prefix is None or element.prefix == prefix) ) return ResultSet(strainer, result) results = ResultSet(strainer) @@ -922,7 +862,7 @@ class Tag(PageElement): self.can_be_empty_element = builder.can_be_empty_element(name) else: self.can_be_empty_element = False - + parserClass = _alias("parser_class") # BS3 def __copy__(self): @@ -1106,10 +1046,8 @@ class Tag(PageElement): # BS3: soup.aTag -> "soup.find("a") tag_name = tag[:-3] warnings.warn( - '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( - name=tag_name - ) - ) + '.%sTag is deprecated, use .find("%s") instead.' % ( + tag_name, tag_name)) return self.find(tag_name) # We special case contents to avoid recursion. elif not tag.startswith("__") and not tag == "contents": @@ -1191,10 +1129,11 @@ class Tag(PageElement): encoding. """ - # First off, turn a string formatter into a Formatter object. This + # First off, turn a string formatter into a function. This # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter) and not callable(formatter): + if not callable(formatter): formatter = self._formatter_for_name(formatter) + attrs = [] if self.attrs: for key, val in sorted(self.attrs.items()): @@ -1223,9 +1162,7 @@ class Tag(PageElement): prefix = self.prefix + ":" if self.is_empty_element: - close = '' - if isinstance(formatter, Formatter): - close = formatter.void_element_close_prefix or close + close = '/' else: closeTag = '</%s%s>' % (prefix, self.name) @@ -1296,9 +1233,9 @@ class Tag(PageElement): :param formatter: The output formatter responsible for converting entities to Unicode characters. """ - # First off, turn a string formatter into a Formatter object. This + # First off, turn a string formatter into a function. This # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter) and not callable(formatter): + if not callable(formatter): formatter = self._formatter_for_name(formatter) pretty_print = (indent_level is not None) @@ -1411,29 +1348,15 @@ class Tag(PageElement): # Handle grouping selectors if ',' exists, ie: p,a if ',' in selector: context = [] - selectors = [x.strip() for x in selector.split(",")] - - # If a selector is mentioned multiple times we don't want - # to use it more than once. - used_selectors = set() - - # We also don't want to select the same element more than once, - # if it's matched by multiple selectors. - selected_object_ids = set() - for partial_selector in selectors: + for partial_selector in selector.split(','): + partial_selector = partial_selector.strip() if partial_selector == '': raise ValueError('Invalid group selection syntax: %s' % selector) - if partial_selector in used_selectors: - continue - used_selectors.add(partial_selector) candidates = self.select(partial_selector, limit=limit) for candidate in candidates: - # This lets us distinguish between distinct tags that - # represent the same markup. - object_id = id(candidate) - if object_id not in selected_object_ids: + if candidate not in context: context.append(candidate) - selected_object_ids.add(object_id) + if limit and len(context) >= limit: break return context @@ -1495,7 +1418,7 @@ class Tag(PageElement): if tag_name == '': raise ValueError( "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) found = [] if pseudo_attributes is None: pseudo_type = pseudo @@ -1729,7 +1652,7 @@ class SoupStrainer(object): markup = markup_name markup_attrs = markup call_function_with_tag_data = ( - isinstance(self.name, Callable) + isinstance(self.name, collections.Callable) and not isinstance(markup_name, Tag)) if ((not self.name) @@ -1809,7 +1732,7 @@ class SoupStrainer(object): # True matches any non-None value. return markup is not None - if isinstance(match_against, Callable): + if isinstance(match_against, collections.Callable): return match_against(markup) # Custom callables take the tag as an argument, but all |