diff options
author | morpheus65535 <[email protected]> | 2022-01-23 23:07:52 -0500 |
---|---|---|
committer | morpheus65535 <[email protected]> | 2022-01-23 23:07:52 -0500 |
commit | 0c3c5a02a75bc61b6bf6e303de20e11741d2afac (patch) | |
tree | 30ae1d524ffe5d54172b7a4a8445d90c3461e659 /libs/bs4/element.py | |
parent | 36bf0d219d0432c20e6314e0ce752b36f4d88e3c (diff) | |
download | bazarr-0c3c5a02a75bc61b6bf6e303de20e11741d2afac.tar.gz bazarr-0c3c5a02a75bc61b6bf6e303de20e11741d2afac.zip |
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies.v1.0.3-beta.16
Diffstat (limited to 'libs/bs4/element.py')
-rw-r--r-- | libs/bs4/element.py | 978 |
1 files changed, 827 insertions, 151 deletions
diff --git a/libs/bs4/element.py b/libs/bs4/element.py index f16b1663e..82a986e42 100644 --- a/libs/bs4/element.py +++ b/libs/bs4/element.py @@ -43,12 +43,49 @@ def _alias(attr): return alias +# These encodings are recognized by Python (so PageElement.encode +# could theoretically support them) but XML and HTML don't recognize +# them (so they should not show up in an XML or HTML document as that +# document's encoding). +# +# If an XML document is encoded in one of these encodings, no encoding +# will be mentioned in the XML declaration. If an HTML document is +# encoded in one of these encodings, and the HTML document has a +# <meta> tag that mentions an encoding, the encoding will be given as +# the empty string. +# +# Source: +# https://docs.python.org/3/library/codecs.html#python-specific-encodings +PYTHON_SPECIFIC_ENCODINGS = set([ + "idna", + "mbcs", + "oem", + "palmos", + "punycode", + "raw_unicode_escape", + "undefined", + "unicode_escape", + "raw-unicode-escape", + "unicode-escape", + "string-escape", + "string_escape", +]) + + class NamespacedAttribute(str): + """A namespaced string (e.g. 'xml:lang') that remembers the namespace + ('xml') and the name ('lang') that were used to create it. + """ + + def __new__(cls, prefix, name=None, namespace=None): + if not name: + # This is the default namespace. Its name "has no value" + # per https://www.w3.org/TR/xml-names/#defaulting + name = None - def __new__(cls, prefix, name, namespace=None): - if name is None: + if not name: obj = str.__new__(cls, prefix) - elif prefix is None: + elif not prefix: # Not really namespaced. obj = str.__new__(cls, name) else: @@ -74,6 +111,11 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): return obj def encode(self, encoding): + """When an HTML document is being encoded to a given encoding, the + value of a meta tag's 'charset' is the name of the encoding. + """ + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' return encoding @@ -99,19 +141,39 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return obj def encode(self, encoding): + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' def rewrite(match): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" + """Contains the navigational information for some part of the page: + that is, its current location in the parse tree. + + NavigableString, Tag, etc. are all subclasses of PageElement. + """ def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): """Sets up the initial relations between this element and - other elements.""" + other elements. + + :param parent: The parent of this element. + + :param previous_element: The element parsed immediately before + this one. + + :param next_element: The element parsed immediately before + this one. + + :param previous_sibling: The most recently encountered element + on the same level of the parse tree as this one. + + :param previous_sibling: The next element to be encountered + on the same level of the parse tree as this one. + """ self.parent = parent self.previous_element = previous_element @@ -135,7 +197,11 @@ class PageElement(object): self.previous_sibling.next_sibling = self def format_string(self, s, formatter): - """Format the given string using the given formatter.""" + """Format the given string using the given formatter. + + :param s: A string. + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ if formatter is None: return s if not isinstance(formatter, Formatter): @@ -148,9 +214,10 @@ class PageElement(object): if necessary. :param formatter: Can be a Formatter object (used as-is), a - function (used as the entity substitution hook for an - XMLFormatter or HTMLFormatter), or a string (used to look up - an XMLFormatter or HTMLFormatter in the appropriate registry. + function (used as the entity substitution hook for an + XMLFormatter or HTMLFormatter), or a string (used to look + up an XMLFormatter or HTMLFormatter in the appropriate + registry. """ if isinstance(formatter, Formatter): return formatter @@ -158,7 +225,7 @@ class PageElement(object): c = XMLFormatter else: c = HTMLFormatter - if callable(formatter): + if isinstance(formatter, Callable): return c(entity_substitution=formatter) return c.REGISTRY[formatter] @@ -188,30 +255,82 @@ class PageElement(object): nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 - def replace_with(self, replace_with): + default = object() + def _all_strings(self, strip=False, types=default): + """Yield all strings of certain classes, possibly stripping them. + + This is implemented differently in Tag and NavigableString. + """ + raise NotImplementedError() + + @property + def stripped_strings(self): + """Yield all strings in this PageElement, stripping them first. + + :yield: A sequence of stripped strings. + """ + for string in self._all_strings(True): + yield string + + def get_text(self, separator="", strip=False, + types=default): + """Get all child strings of this PageElement, concatenated using the + given separator. + + :param separator: Strings will be concatenated using this separator. + + :param strip: If True, strings will be stripped before being + concatenated. + + :param types: A tuple of NavigableString subclasses. Any + strings of a subclass not found in this list will be + ignored. Although there are exceptions, the default + behavior in most cases is to consider only NavigableString + and CData objects. That means no comments, processing + instructions, etc. + + :return: A string. + """ + return separator.join([s for s in self._all_strings( + strip, types=types)]) + getText = get_text + text = property(get_text) + + def replace_with(self, *args): + """Replace this PageElement with one or more PageElements, keeping the + rest of the tree the same. + + :param args: One or more PageElements. + :return: `self`, no longer part of the tree. + """ if self.parent is None: raise ValueError( "Cannot replace one element with another when the " "element to be replaced is not part of a tree.") - if replace_with is self: + if len(args) == 1 and args[0] is self: return - if replace_with is self.parent: + if any(x is self.parent for x in args): raise ValueError("Cannot replace a Tag with its parent.") old_parent = self.parent my_index = self.parent.index(self) - self.extract() - old_parent.insert(my_index, replace_with) + self.extract(_self_index=my_index) + for idx, replace_with in enumerate(args, start=my_index): + old_parent.insert(idx, replace_with) return self replaceWith = replace_with # BS3 def unwrap(self): + """Replace this PageElement with its contents. + + :return: `self`, no longer part of the tree. + """ my_parent = self.parent if self.parent is None: raise ValueError( "Cannot replace an element with its contents when that" "element is not part of a tree.") my_index = self.parent.index(self) - self.extract() + self.extract(_self_index=my_index) for child in reversed(self.contents[:]): my_parent.insert(my_index, child) return self @@ -219,14 +338,29 @@ class PageElement(object): replaceWithChildren = unwrap # BS3 def wrap(self, wrap_inside): + """Wrap this PageElement inside another one. + + :param wrap_inside: A PageElement. + :return: `wrap_inside`, occupying the position in the tree that used + to be occupied by `self`, and with `self` inside it. + """ me = self.replace_with(wrap_inside) wrap_inside.append(me) return wrap_inside - def extract(self): - """Destructively rips this element out of the tree.""" + def extract(self, _self_index=None): + """Destructively rips this element out of the tree. + + :param _self_index: The location of this element in its parent's + .contents, if known. Passing this in allows for a performance + optimization. + + :return: `self`, no longer part of the tree. + """ if self.parent is not None: - del self.parent.contents[self.parent.index(self)] + if _self_index is None: + _self_index = self.parent.index(self) + del self.parent.contents[_self_index] #Find the two elements that would be next to each other if #this element (and any children) hadn't been parsed. Connect @@ -253,7 +387,12 @@ class PageElement(object): return self def _last_descendant(self, is_initialized=True, accept_self=True): - "Finds the last element beneath this object to be parsed." + """Finds the last element beneath this object to be parsed. + + :param is_initialized: Has `setup` been called on this PageElement + yet? + :param accept_self: Is `self` an acceptable answer to the question? + """ if is_initialized and self.next_sibling is not None: last_child = self.next_sibling.previous_element else: @@ -267,6 +406,14 @@ class PageElement(object): _lastRecursiveChild = _last_descendant def insert(self, position, new_child): + """Insert a new PageElement in the list of this PageElement's children. + + This works the same way as `list.insert`. + + :param position: The numeric position that should be occupied + in `self.children` by the new PageElement. + :param new_child: A PageElement. + """ if new_child is None: raise ValueError("Cannot insert None into a tag.") if new_child is self: @@ -341,19 +488,32 @@ class PageElement(object): self.contents.insert(position, new_child) def append(self, tag): - """Appends the given tag to the contents of this tag.""" + """Appends the given PageElement to the contents of this one. + + :param tag: A PageElement. + """ self.insert(len(self.contents), tag) def extend(self, tags): - """Appends the given tags to the contents of this tag.""" + """Appends the given PageElements to this one's contents. + + :param tags: A list of PageElements. + """ + if isinstance(tags, Tag): + # Calling self.append() on another tag's contents will change + # the list we're iterating over. Make a list that won't + # change. + tags = list(tags.contents) for tag in tags: self.append(tag) def insert_before(self, *args): """Makes the given element(s) the immediate predecessor of this one. - The elements will have the same parent, and the given elements + All the elements will have the same parent, and the given elements will be immediately before this one. + + :param args: One or more PageElements. """ parent = self.parent if parent is None: @@ -374,6 +534,8 @@ class PageElement(object): The elements will have the same parent, and the given elements will be immediately after this one. + + :param args: One or more PageElements. """ # Do all error checking before modifying the tree. parent = self.parent @@ -394,70 +556,172 @@ class PageElement(object): offset += 1 def find_next(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" + """Find the first PageElement that matches the given criteria and + appears later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self._find_one(self.find_all_next, name, attrs, text, **kwargs) findNext = find_next # BS3 def find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs): - """Returns all items that match the given criteria and appear - after this Tag in the document.""" + """Find all PageElements that match the given criteria and appear + later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet containing PageElements. + """ return self._find_all(name, attrs, text, limit, self.next_elements, **kwargs) findAllNext = find_all_next # BS3 def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" + """Find the closest sibling to this PageElement that matches the + given criteria and appears later in the document. + + All find_* methods take a common set of arguments. See the + online documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self._find_one(self.find_next_siblings, name, attrs, text, **kwargs) findNextSibling = find_next_sibling # BS3 def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" + """Find all siblings of this PageElement that match the given criteria + and appear later in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ return self._find_all(name, attrs, text, limit, self.next_siblings, **kwargs) findNextSiblings = find_next_siblings # BS3 fetchNextSiblings = find_next_siblings # BS2 def find_previous(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" + """Look backwards in the document from this PageElement and find the + first PageElement that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self._find_one( self.find_all_previous, name, attrs, text, **kwargs) findPrevious = find_previous # BS3 def find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" + """Look backwards in the document from this PageElement and find all + PageElements that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ return self._find_all(name, attrs, text, limit, self.previous_elements, **kwargs) findAllPrevious = find_all_previous # BS3 fetchPrevious = find_all_previous # BS2 def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" + """Returns the closest sibling to this PageElement that matches the + given criteria and appears earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self._find_one(self.find_previous_siblings, name, attrs, text, **kwargs) findPreviousSibling = find_previous_sibling # BS3 def find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" + """Returns all siblings to this PageElement that match the + given criteria and appear earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ return self._find_all(name, attrs, text, limit, self.previous_siblings, **kwargs) findPreviousSiblings = find_previous_siblings # BS3 fetchPreviousSiblings = find_previous_siblings # BS2 def find_parent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" + """Find the closest parent of this PageElement that matches the given + criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :kwargs: A dictionary of filters on attribute values. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ # NOTE: We can't use _find_one because findParents takes a different # set of arguments. r = None @@ -468,9 +732,19 @@ class PageElement(object): findParent = find_parent # BS3 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" + """Find all parents of this PageElement that match the given criteria. + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self._find_all(name, attrs, None, limit, self.parents, **kwargs) findParents = find_parents # BS3 @@ -478,10 +752,20 @@ class PageElement(object): @property def next(self): + """The PageElement, if any, that was parsed just after this one. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self.next_element @property def previous(self): + """The PageElement, if any, that was parsed just before this one. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self.previous_element #These methods do the real heavy lifting. @@ -549,6 +833,10 @@ class PageElement(object): #NavigableStrings and Tags. @property def next_elements(self): + """All PageElements that were parsed after this one. + + :yield: A sequence of PageElements. + """ i = self.next_element while i is not None: yield i @@ -556,6 +844,11 @@ class PageElement(object): @property def next_siblings(self): + """All PageElements that are siblings of this one but were parsed + later. + + :yield: A sequence of PageElements. + """ i = self.next_sibling while i is not None: yield i @@ -563,6 +856,10 @@ class PageElement(object): @property def previous_elements(self): + """All PageElements that were parsed before this one. + + :yield: A sequence of PageElements. + """ i = self.previous_element while i is not None: yield i @@ -570,6 +867,11 @@ class PageElement(object): @property def previous_siblings(self): + """All PageElements that are siblings of this one but were parsed + earlier. + + :yield: A sequence of PageElements. + """ i = self.previous_sibling while i is not None: yield i @@ -577,11 +879,23 @@ class PageElement(object): @property def parents(self): + """All PageElements that are parents of this PageElement. + + :yield: A sequence of PageElements. + """ i = self.parent while i is not None: yield i i = i.parent + @property + def decomposed(self): + """Check whether a PageElement has been decomposed. + + :rtype: bool + """ + return getattr(self, '_decomposed', False) or False + # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): @@ -601,6 +915,11 @@ class PageElement(object): class NavigableString(str, PageElement): + """A Python Unicode string that is part of a parse tree. + + When Beautiful Soup parses the markup <b>penguin</b>, it will + create a NavigableString for the string "penguin". + """ PREFIX = '' SUFFIX = '' @@ -646,35 +965,100 @@ class NavigableString(str, PageElement): self.__class__.__name__, attr)) def output_ready(self, formatter="minimal"): - """Run the string through the provided formatter.""" + """Run the string through the provided formatter. + + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ output = self.format_string(self, formatter) return self.PREFIX + output + self.SUFFIX @property def name(self): + """Since a NavigableString is not a Tag, it has no .name. + + This property is implemented so that code like this doesn't crash + when run on a mixture of Tag and NavigableString objects: + [x.name for x in tag.children] + """ return None @name.setter def name(self, name): + """Prevent NavigableString.name from ever being set.""" raise AttributeError("A NavigableString cannot be given a name.") + def _all_strings(self, strip=False, types=PageElement.default): + """Yield all strings of certain classes, possibly stripping them. + + This makes it easy for NavigableString to implement methods + like get_text() as conveniences, creating a consistent + text-extraction API across all PageElements. + + :param strip: If True, all strings will be stripped before being + yielded. + + :param types: A tuple of NavigableString subclasses. If this + NavigableString isn't one of those subclasses, the + sequence will be empty. By default, the subclasses + considered are NavigableString and CData objects. That + means no comments, processing instructions, etc. + + :yield: A sequence that either contains this string, or is empty. + + """ + if types is self.default: + # This is kept in Tag because it's full of subclasses of + # this class, which aren't defined until later in the file. + types = Tag.DEFAULT_INTERESTING_STRING_TYPES + + # Do nothing if the caller is looking for specific types of + # string, and we're of a different type. + my_type = type(self) + if types is not None: + if isinstance(types, type): + # Looking for a single type. + if my_type is not types: + return + elif my_type not in types: + # Looking for one of a list of types. + return + + value = self + if strip: + value = value.strip() + if len(value) > 0: + yield value + strings = property(_all_strings) + class PreformattedString(NavigableString): """A NavigableString not subject to the normal formatting rules. - The string will be passed into the formatter (to trigger side effects), - but the return value will be ignored. + This is an abstract class used for special kinds of strings such + as comments (the Comment class) and CDATA blocks (the CData + class). """ - + + PREFIX = '' + SUFFIX = '' + def output_ready(self, formatter=None): - """CData strings are passed into the formatter, purely - for any side effects. The return value is ignored. + """Make this string ready for output by adding any subclass-specific + prefix or suffix. + + :param formatter: A Formatter object, or a string naming one + of the standard formatters. The string will be passed into the + Formatter, but only to trigger any side effects: the return + value is ignored. + + :return: The string, with any subclass-specific prefix and + suffix added on. """ if formatter is not None: ignore = self.format_string(self, formatter) return self.PREFIX + self + self.SUFFIX class CData(PreformattedString): - + """A CDATA block.""" PREFIX = '<![CDATA[' SUFFIX = ']]>' @@ -690,20 +1074,32 @@ class XMLProcessingInstruction(ProcessingInstruction): SUFFIX = '?>' class Comment(PreformattedString): - + """An HTML or XML comment.""" PREFIX = '<!--' SUFFIX = '-->' class Declaration(PreformattedString): + """An XML declaration.""" PREFIX = '<?' SUFFIX = '?>' class Doctype(PreformattedString): - + """A document type declaration.""" @classmethod def for_name_and_ids(cls, name, pub_id, system_id): + """Generate an appropriate document type declaration for a given + public ID and system ID. + + :param name: The name of the document's root element, e.g. 'html'. + :param pub_id: The Formal Public Identifier for this document type, + e.g. '-//W3C//DTD XHTML 1.1//EN' + :param system_id: The system identifier for this document type, + e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' + + :return: A Doctype. + """ value = name or '' if pub_id is not None: value += ' PUBLIC "%s"' % pub_id @@ -718,15 +1114,80 @@ class Doctype(PreformattedString): SUFFIX = '>\n' +class Stylesheet(NavigableString): + """A NavigableString representing an stylesheet (probably + CSS). + + Used to distinguish embedded stylesheets from textual content. + """ + pass + + +class Script(NavigableString): + """A NavigableString representing an executable script (probably + Javascript). + + Used to distinguish executable code from textual content. + """ + pass + + +class TemplateString(NavigableString): + """A NavigableString representing a string found inside an HTML + template embedded in a larger document. + + Used to distinguish such strings from the main body of the document. + """ + pass + + class Tag(PageElement): + """Represents an HTML or XML tag that is part of a parse tree, along + with its attributes and contents. - """Represents a found HTML tag with its attributes and contents.""" + When Beautiful Soup parses the markup <b>penguin</b>, it will + create a Tag object representing the <b> tag. + """ def __init__(self, parser=None, builder=None, name=None, namespace=None, prefix=None, attrs=None, parent=None, previous=None, - is_xml=None): - "Basic constructor." - + is_xml=None, sourceline=None, sourcepos=None, + can_be_empty_element=None, cdata_list_attributes=None, + preserve_whitespace_tags=None, + interesting_string_types=None, + ): + """Basic constructor. + + :param parser: A BeautifulSoup object. + :param builder: A TreeBuilder. + :param name: The name of the tag. + :param namespace: The URI of this Tag's XML namespace, if any. + :param prefix: The prefix for this Tag's XML namespace, if any. + :param attrs: A dictionary of this Tag's attribute values. + :param parent: The PageElement to use as this Tag's parent. + :param previous: The PageElement that was parsed immediately before + this tag. + :param is_xml: If True, this is an XML tag. Otherwise, this is an + HTML tag. + :param sourceline: The line number where this tag was found in its + source document. + :param sourcepos: The character position within `sourceline` where this + tag was found. + :param can_be_empty_element: If True, this tag should be + represented as <tag/>. If False, this tag should be represented + as <tag></tag>. + :param cdata_list_attributes: A list of attributes whose values should + be treated as CDATA if they ever show up on this tag. + :param preserve_whitespace_tags: A list of tag names whose contents + should have their whitespace preserved. + :param interesting_string_types: This is a NavigableString + subclass or a tuple of them. When iterating over this + Tag's strings in methods like Tag.strings or Tag.get_text, + these are the types of strings that are interesting enough + to be considered. The default is to consider + NavigableString and CData the only interesting string + subtypes. + """ if parser is None: self.parser_class = None else: @@ -738,6 +1199,10 @@ class Tag(PageElement): self.name = name self.namespace = namespace self.prefix = prefix + if ((not builder or builder.store_line_numbers) + and (sourceline is not None or sourcepos is not None)): + self.sourceline = sourceline + self.sourcepos = sourcepos if attrs is None: attrs = {} elif attrs: @@ -761,10 +1226,13 @@ class Tag(PageElement): self.hidden = False if builder is None: - # In the absence of a TreeBuilder, assume this tag is nothing - # special. - self.can_be_empty_element = False - self.cdata_list_attributes = None + # In the absence of a TreeBuilder, use whatever values were + # passed in here. They're probably None, unless this is a copy of some + # other tag. + self.can_be_empty_element = can_be_empty_element + self.cdata_list_attributes = cdata_list_attributes + self.preserve_whitespace_tags = preserve_whitespace_tags + self.interesting_string_types = interesting_string_types else: # Set up any substitutions for this tag, such as the charset in a META tag. builder.set_up_substitutions(self) @@ -785,6 +1253,13 @@ class Tag(PageElement): # Keep track of the names that might cause this tag to be treated as a # whitespace-preserved tag. self.preserve_whitespace_tags = builder.preserve_whitespace_tags + + if self.name in builder.string_containers: + # This sort of tag uses a special string container + # subclass for most of its strings. When we ask the + self.interesting_string_types = builder.string_containers[self.name] + else: + self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES parserClass = _alias("parser_class") # BS3 @@ -792,8 +1267,14 @@ class Tag(PageElement): """A copy of a Tag is a new Tag, unconnected to the parse tree. Its contents are a copy of the old Tag's contents. """ - clone = type(self)(None, self.builder, self.name, self.namespace, - self.prefix, self.attrs, is_xml=self._is_xml) + clone = type(self)( + None, self.builder, self.name, self.namespace, + self.prefix, self.attrs, is_xml=self._is_xml, + sourceline=self.sourceline, sourcepos=self.sourcepos, + can_be_empty_element=self.can_be_empty_element, + cdata_list_attributes=self.cdata_list_attributes, + preserve_whitespace_tags=self.preserve_whitespace_tags + ) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) for child in self.contents: @@ -820,13 +1301,17 @@ class Tag(PageElement): @property def string(self): - """Convenience property to get the single string within this tag. + """Convenience property to get the single string within this + PageElement. - :Return: If this tag has a single string child, return value - is that string. If this tag has no children, or more than one - child, return value is None. If this tag has one child tag, + TODO It might make sense to have NavigableString.string return + itself. + + :return: If this element has a single string child, return + value is that string. If this element has one child tag, return value is the 'string' attribute of the child tag, - recursively. + recursively. If this element is itself a string, has no + children, or has more than one child, return value is None. """ if len(self.contents) != 1: return None @@ -837,57 +1322,75 @@ class Tag(PageElement): @string.setter def string(self, string): + """Replace this PageElement's contents with `string`.""" self.clear() self.append(string.__class__(string)) - def _all_strings(self, strip=False, types=(NavigableString, CData)): + DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData) + def _all_strings(self, strip=False, types=PageElement.default): """Yield all strings of certain classes, possibly stripping them. - By default, yields only NavigableString and CData objects. So - no comments, processing instructions, etc. + :param strip: If True, all strings will be stripped before being + yielded. + + :param types: A tuple of NavigableString subclasses. Any strings of + a subclass not found in this list will be ignored. By + default, the subclasses considered are the ones found in + self.interesting_string_types. If that's not specified, + only NavigableString and CData objects will be + considered. That means no comments, processing + instructions, etc. + + :yield: A sequence of strings. + """ + if types is self.default: + types = self.interesting_string_types + for descendant in self.descendants: - if ( - (types is None and not isinstance(descendant, NavigableString)) - or - (types is not None and type(descendant) not in types)): + if (types is None and not isinstance(descendant, NavigableString)): + continue + descendant_type = type(descendant) + if isinstance(types, type): + if descendant_type is not types: + # We're not interested in strings of this type. + continue + elif types is not None and descendant_type not in types: + # We're not interested in strings of this type. continue if strip: descendant = descendant.strip() if len(descendant) == 0: continue yield descendant - strings = property(_all_strings) - @property - def stripped_strings(self): - for string in self._all_strings(True): - yield string + def decompose(self): + """Recursively destroys this PageElement and its children. - def get_text(self, separator="", strip=False, - types=(NavigableString, CData)): - """ - Get all child strings, concatenated using the given separator. - """ - return separator.join([s for s in self._all_strings( - strip, types=types)]) - getText = get_text - text = property(get_text) + This element will be removed from the tree and wiped out; so + will everything beneath it. - def decompose(self): - """Recursively destroys the contents of this tree.""" + The behavior of a decomposed PageElement is undefined and you + should never use one for anything, but if you need to _check_ + whether an element has been decomposed, you can use the + `decomposed` property. + """ self.extract() i = self while i is not None: - next = i.next_element + n = i.next_element i.__dict__.clear() i.contents = [] - i = next - + i._decomposed = True + i = n + def clear(self, decompose=False): - """ - Extract all children. If decompose is True, decompose instead. + """Wipe out all children of this PageElement by calling extract() + on them. + + :param decompose: If this is True, decompose() (a more + destructive method) will be called instead of extract(). """ if decompose: for element in self.contents[:]: @@ -900,7 +1403,8 @@ class Tag(PageElement): element.extract() def smooth(self): - """Smooth out this element's children by consolidating consecutive strings. + """Smooth out this element's children by consolidating consecutive + strings. This makes pretty-printed output look more natural following a lot of operations that modified the tree. @@ -937,9 +1441,12 @@ class Tag(PageElement): a.replace_with(n) def index(self, element): - """ - Find the index of a child by identity, not value. Avoids issues with - tag.contents.index(element) getting the index of equal elements. + """Find the index of a child by identity, not value. + + Avoids issues with tag.contents.index(element) getting the + index of equal elements. + + :param element: Look for this PageElement in `self.contents`. """ for i, child in enumerate(self.contents): if child is element: @@ -953,29 +1460,37 @@ class Tag(PageElement): return self.attrs.get(key, default) def get_attribute_list(self, key, default=None): - """The same as get(), but always returns a list.""" + """The same as get(), but always returns a list. + + :param key: The attribute to look for. + :param default: Use this value if the attribute is not present + on this PageElement. + :return: A list of values, probably containing only a single + value. + """ value = self.get(key, default) if not isinstance(value, list): value = [value] return value def has_attr(self, key): + """Does this PageElement have an attribute with the given name?""" return key in self.attrs def __hash__(self): return str(self).__hash__() def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, + """tag[key] returns the value of the 'key' attribute for the Tag, and throws an exception if it's not there.""" return self.attrs[key] def __iter__(self): - "Iterating over a tag iterates over its contents." + "Iterating over a Tag iterates over its contents." return iter(self.contents) def __len__(self): - "The length of a tag is the length of its list of contents." + "The length of a Tag is the length of its list of contents." return len(self.contents) def __contains__(self, x): @@ -995,13 +1510,14 @@ class Tag(PageElement): self.attrs.pop(key, None) def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its + """Calling a Tag like a function is the same as calling its find_all() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" return self.find_all(*args, **kwargs) def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) + """Calling tag.subtag is the same as calling tag.find(name="subtag")""" + #print("Getattr %s.%s" % (self.__class__, tag)) if len(tag) > 3 and tag.endswith('Tag'): # BS3: soup.aTag -> "soup.find("a") tag_name = tag[:-3] @@ -1018,8 +1534,8 @@ class Tag(PageElement): "'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag.""" + """Returns true iff this Tag has the same name, the same attributes, + and the same contents (recursively) as `other`.""" if self is other: return True if (not hasattr(other, 'name') or @@ -1035,12 +1551,17 @@ class Tag(PageElement): return True def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, + """Returns true iff this Tag is not identical to `other`, as defined in __eq__.""" return not self == other def __repr__(self, encoding="unicode-escape"): - """Renders this tag as a string.""" + """Renders this PageElement as a string. + + :param encoding: The encoding to use (Python 2 only). + :return: Under Python 2, a bytestring; under Python 3, + a Unicode string. + """ if PY3K: # "The return value must be a string object", i.e. Unicode return self.decode() @@ -1051,9 +1572,15 @@ class Tag(PageElement): return self.encode(encoding) def __unicode__(self): + """Renders this PageElement as a Unicode string.""" return self.decode() def __str__(self): + """Renders this PageElement as a generic string. + + :return: Under Python 2, a UTF-8 bytestring; under Python 3, + a Unicode string. + """ if PY3K: return self.decode() else: @@ -1065,6 +1592,22 @@ class Tag(PageElement): def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, indent_level=None, formatter="minimal", errors="xmlcharrefreplace"): + """Render a bytestring representation of this PageElement and its + contents. + + :param encoding: The destination encoding. + :param indent_level: Each line of the rendering will be + indented this many spaces. Used internally in + recursive calls while pretty-printing. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :param errors: An error handling strategy such as + 'xmlcharrefreplace'. This value is passed along into + encode() and its value should be one of the constants + defined by Python. + :return: A bytestring. + + """ # Turn the data structure into Unicode, then encode the # Unicode. u = self.decode(indent_level, encoding, formatter) @@ -1073,14 +1616,20 @@ class Tag(PageElement): def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): - """Returns a Unicode representation of this tag and its contents. + """Render a Unicode representation of this PageElement and its + contents. + :param indent_level: Each line of the rendering will be + indented this many spaces. Used internally in + recursive calls while pretty-printing. :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a <META> tag that mentions the document's + encoding. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. """ # First off, turn a non-Formatter `formatter` into a Formatter @@ -1166,13 +1715,29 @@ class Tag(PageElement): return s def _should_pretty_print(self, indent_level): - """Should this tag be pretty-printed?""" + """Should this tag be pretty-printed? + + Most of them should, but some (such as <pre> in HTML + documents) should not. + """ return ( indent_level is not None - and self.name not in self.preserve_whitespace_tags + and ( + not self.preserve_whitespace_tags + or self.name not in self.preserve_whitespace_tags + ) ) def prettify(self, encoding=None, formatter="minimal"): + """Pretty-print this PageElement as a string. + + :param encoding: The eventual encoding of the string. If this is None, + a Unicode string will be returned. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :return: A Unicode string (if encoding==None) or a bytestring + (otherwise). + """ if encoding is None: return self.decode(True, formatter=formatter) else: @@ -1184,7 +1749,8 @@ class Tag(PageElement): """Renders the contents of this tag as a Unicode string. :param indent_level: Each line of the rendering will be - indented this many spaces. + indented this many spaces. Used internally in + recursive calls while pretty-printing. :param eventual_encoding: The tag is destined to be encoded into this encoding. decode_contents() is _not_ @@ -1226,23 +1792,26 @@ class Tag(PageElement): def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): - """Renders the contents of this tag as a bytestring. + """Renders the contents of this PageElement as a bytestring. :param indent_level: Each line of the rendering will be - indented this many spaces. + indented this many spaces. Used internally in + recursive calls while pretty-printing. :param eventual_encoding: The bytestring will be in this encoding. - :param formatter: The output formatter responsible for converting - entities to Unicode characters. - """ + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. + :return: A bytestring. + """ contents = self.decode_contents(indent_level, encoding, formatter) return contents.encode(encoding) # Old method for BS3 compatibility def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): + """Deprecated method for BS3 compatibility.""" if not prettyPrint: indentLevel = None return self.encode_contents( @@ -1252,27 +1821,47 @@ class Tag(PageElement): def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" + """Look in the children of this PageElement and find the first + PageElement that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param recursive: If this is True, find() will perform a + recursive search of this PageElement's children. Otherwise, + only the direct children will be considered. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ r = None l = self.find_all(name, attrs, recursive, text, 1, **kwargs) if l: r = l[0] return r - findChild = find + findChild = find #BS2 def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. - - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - + """Look in the children of this PageElement and find all + PageElements that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param recursive: If this is True, find_all() will perform a + recursive search of this PageElement's children. Otherwise, + only the direct children will be considered. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ generator = self.descendants if not recursive: generator = self.children @@ -1283,11 +1872,20 @@ class Tag(PageElement): #Generator methods @property def children(self): + """Iterate over all direct children of this PageElement. + + :yield: A sequence of PageElements. + """ # return iter() to make the purpose of the method clear return iter(self.contents) # XXX This seems to be untested. @property def descendants(self): + """Iterate over all children of this PageElement in a + breadth-first sequence. + + :yield: A sequence of PageElements. + """ if not len(self.contents): return stopNode = self._last_descendant().next_element @@ -1298,7 +1896,21 @@ class Tag(PageElement): # CSS selector code def select_one(self, selector, namespaces=None, **kwargs): - """Perform a CSS selection operation on the current element.""" + """Perform a CSS selection operation on the current element. + + :param selector: A CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.select() method. + + :return: A Tag. + :rtype: bs4.element.Tag + """ value = self.select(selector, namespaces, 1, **kwargs) if value: return value[0] @@ -1312,14 +1924,17 @@ class Tag(PageElement): :param selector: A string containing a CSS selector. :param namespaces: A dictionary mapping namespace prefixes - used in the CSS selector to namespace URIs. By default, - Beautiful Soup will use the prefixes it encountered while - parsing the document. + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. :param limit: After finding this number of results, stop looking. - :param kwargs: Any extra arguments you'd like to pass in to - soupsieve.select(). + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.select() method. + + :return: A ResultSet of Tags. + :rtype: bs4.element.ResultSet """ if namespaces is None: namespaces = self._namespaces @@ -1331,19 +1946,27 @@ class Tag(PageElement): "Cannot execute CSS selectors because the soupsieve package is not installed." ) - return soupsieve.select(selector, self, namespaces, limit, **kwargs) + results = soupsieve.select(selector, self, namespaces, limit, **kwargs) + + # We do this because it's more consistent and because + # ResultSet.__getattr__ has a helpful error message. + return ResultSet(None, results) # Old names for backwards compatibility def childGenerator(self): + """Deprecated generator.""" return self.children def recursiveChildGenerator(self): + """Deprecated generator.""" return self.descendants def has_key(self, key): - """This was kind of misleading because has_key() (attributes) - was different from __in__ (contents). has_key() is gone in - Python 3, anyway.""" + """Deprecated method. This was kind of misleading because has_key() + (attributes) was different from __in__ (contents). + + has_key() is gone in Python 3, anyway. + """ warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( key)) return self.has_attr(key) @@ -1351,9 +1974,26 @@ class Tag(PageElement): # Next, a couple classes to represent queries and their results. class SoupStrainer(object): """Encapsulates a number of ways of matching a markup element (tag or - text).""" + string). + + This is primarily used to underpin the find_* methods, but you can + create one yourself and pass it in as `parse_only` to the + `BeautifulSoup` constructor, to parse a subset of a large + document. + """ def __init__(self, name=None, attrs={}, text=None, **kwargs): + """Constructor. + + The SoupStrainer constructor takes the same arguments passed + into the find_* methods. See the online documentation for + detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param text: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + """ self.name = self._normalize_search_value(name) if not isinstance(attrs, dict): # Treat a non-dict value for attrs as a search for the 'class' @@ -1411,17 +2051,38 @@ class SoupStrainer(object): return str(str(value)) def __str__(self): + """A human-readable representation of this SoupStrainer.""" if self.text: return self.text else: return "%s|%s" % (self.name, self.attrs) def search_tag(self, markup_name=None, markup_attrs={}): + """Check whether a Tag with the given name and attributes would + match this SoupStrainer. + + Used prospectively to decide whether to even bother creating a Tag + object. + + :param markup_name: A tag name as found in some markup. + :param markup_attrs: A dictionary of attributes as found in some markup. + + :return: True if the prospective tag would match this SoupStrainer; + False otherwise. + """ found = None markup = None if isinstance(markup_name, Tag): markup = markup_name markup_attrs = markup + + if isinstance(self.name, str): + # Optimization for a very common case where the user is + # searching for a tag with one specific name, and we're + # looking at a tag with a different name. + if markup and not markup.prefix and self.name != markup.name: + return False + call_function_with_tag_data = ( isinstance(self.name, Callable) and not isinstance(markup_name, Tag)) @@ -1455,10 +2116,19 @@ class SoupStrainer(object): if found and self.text and not self._matches(found.string, self.text): found = None return found + + # For BS3 compatibility. searchTag = search_tag def search(self, markup): - # print 'looking for %s in %s' % (self, markup) + """Find all items in `markup` that match this SoupStrainer. + + Used by the core _find_all() method, which is ultimately + called by all find_* methods. + + :param markup: A PageElement or a list of them. + """ + # print('looking for %s in %s' % (self, markup)) found = None # If given a list of items, scan it for a text element that # matches. @@ -1484,7 +2154,7 @@ class SoupStrainer(object): return found def _matches(self, markup, match_against, already_tried=None): - # print u"Matching %s against %s" % (markup, match_against) + # print(u"Matching %s against %s" % (markup, match_against)) result = False if isinstance(markup, list) or isinstance(markup, tuple): # This should only happen when searching a multi-valued attribute @@ -1570,10 +2240,16 @@ class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" def __init__(self, source, result=()): + """Constructor. + + :param source: A SoupStrainer. + :param result: A list of PageElements. + """ super(ResultSet, self).__init__(result) self.source = source def __getattr__(self, key): + """Raise a helpful exception to explain a common code fix.""" raise AttributeError( - "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key + "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key ) |