aboutsummaryrefslogtreecommitdiffhomepage
path: root/libs/soupsieve/css_match.py
diff options
context:
space:
mode:
Diffstat (limited to 'libs/soupsieve/css_match.py')
-rw-r--r--libs/soupsieve/css_match.py1472
1 files changed, 1472 insertions, 0 deletions
diff --git a/libs/soupsieve/css_match.py b/libs/soupsieve/css_match.py
new file mode 100644
index 000000000..aa3be671e
--- /dev/null
+++ b/libs/soupsieve/css_match.py
@@ -0,0 +1,1472 @@
+"""CSS matcher."""
+from __future__ import unicode_literals
+from datetime import datetime
+from . import util
+import re
+from .import css_types as ct
+import unicodedata
+
# Empty tag pattern (whitespace okay)
# Matches a single character that is not a space, tab, CR, LF, or form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')

# Matches a run of characters that contains no space, tab, CR, LF, or form feed.
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking)
# These are the same combinators prefixed with `:`; the prefix is what
# distinguishes a forward-looking relation from a backward-looking one.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs compared against element and attribute namespaces.
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined flag masks built from the selector flags in `css_types`.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lowercased `dir` attribute value to a direction flag.
# `auto` maps to 0, which is distinct from "no entry" (`None` from `.get`).
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns for the textual forms of input values; each is anchored at both ends.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)

# Calendar constants used when validating parsed dates.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
+
+
+class _FakeParent(object):
+ """
+ Fake parent class.
+
+ When we have a fragment with no `BeautifulSoup` document object,
+ we can't evaluate `nth` selectors properly. Create a temporary
+ fake parent so we can traverse the root element as a child.
+ """
+
+ def __init__(self, element):
+ """Initialize."""
+
+ self.contents = [element]
+
+ def __len__(self):
+ """Length."""
+
+ return len(self.contents)
+
+
+class _DocumentNav(object):
+ """Navigate a Beautiful Soup document."""
+
+ @classmethod
+ def assert_valid_input(cls, tag):
+ """Check if valid input tag or document."""
+
+ # Fail on unexpected types.
+ if not cls.is_tag(tag):
+ raise TypeError("Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag)))
+
+ @staticmethod
+ def is_doc(obj):
+ """Is `BeautifulSoup` object."""
+
+ import bs4
+ return isinstance(obj, bs4.BeautifulSoup)
+
+ @staticmethod
+ def is_tag(obj):
+ """Is tag."""
+
+ import bs4
+ return isinstance(obj, bs4.Tag)
+
+ @staticmethod
+ def is_comment(obj):
+ """Is comment."""
+
+ import bs4
+ return isinstance(obj, bs4.Comment)
+
+ @staticmethod
+ def is_declaration(obj): # pragma: no cover
+ """Is declaration."""
+
+ import bs4
+ return isinstance(obj, bs4.Declaration)
+
+ @staticmethod
+ def is_cdata(obj):
+ """Is CDATA."""
+
+ import bs4
+ return isinstance(obj, bs4.CData)
+
+ @staticmethod
+ def is_processing_instruction(obj): # pragma: no cover
+ """Is processing instruction."""
+
+ import bs4
+ return isinstance(obj, bs4.ProcessingInstruction)
+
+ @staticmethod
+ def is_navigable_string(obj):
+ """Is navigable string."""
+
+ import bs4
+ return isinstance(obj, bs4.NavigableString)
+
+ @staticmethod
+ def is_special_string(obj):
+ """Is special string."""
+
+ import bs4
+ return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
+
+ @classmethod
+ def is_content_string(cls, obj):
+ """Check if node is content string."""
+
+ return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
+
+ @staticmethod
+ def create_fake_parent(el):
+ """Create fake parent for a given element."""
+
+ return _FakeParent(el)
+
+ @staticmethod
+ def is_xml_tree(el):
+ """Check if element (or document) is from a XML tree."""
+
+ return el._is_xml
+
+ def is_iframe(self, el):
+ """Check if element is an `iframe`."""
+
+ return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el)
+
+ def is_root(self, el):
+ """
+ Return whether element is a root element.
+
+ We check that the element is the root of the tree (which we have already pre-calculated),
+ and we check if it is the root element under an `iframe`.
+ """
+
+ root = self.root and self.root is el
+ if not root:
+ parent = self.get_parent(el)
+ root = parent is not None and self.is_html and self.is_iframe(parent)
+ return root
+
+ def get_contents(self, el, no_iframe=False):
+ """Get contents or contents in reverse."""
+ if not no_iframe or not self.is_iframe(el):
+ for content in el.contents:
+ yield content
+
+ def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False):
+ """Get children."""
+
+ if not no_iframe or not self.is_iframe(el):
+ last = len(el.contents) - 1
+ if start is None:
+ index = last if reverse else 0
+ else:
+ index = start
+ end = -1 if reverse else last + 1
+ incr = -1 if reverse else 1
+
+ if 0 <= index <= last:
+ while index != end:
+ node = el.contents[index]
+ index += incr
+ if not tags or self.is_tag(node):
+ yield node
+
+ def get_descendants(self, el, tags=True, no_iframe=False):
+ """Get descendants."""
+
+ if not no_iframe or not self.is_iframe(el):
+ next_good = None
+ for child in el.descendants:
+
+ if next_good is not None:
+ if child is not next_good:
+ continue
+ next_good = None
+
+ is_tag = self.is_tag(child)
+
+ if no_iframe and is_tag and self.is_iframe(child):
+ if child.next_sibling is not None:
+ next_good = child.next_sibling
+ else:
+ last_child = child
+ while self.is_tag(last_child) and last_child.contents:
+ last_child = last_child.contents[-1]
+ next_good = last_child.next_element
+ yield child
+ if next_good is None:
+ break
+ # Coverage isn't seeing this even though it's executed
+ continue # pragma: no cover
+
+ if not tags or is_tag:
+ yield child
+
+ def get_parent(self, el, no_iframe=False):
+ """Get parent."""
+
+ parent = el.parent
+ if no_iframe and parent is not None and self.is_iframe(parent):
+ parent = None
+ return parent
+
+ @staticmethod
+ def get_tag_name(el):
+ """Get tag."""
+
+ return el.name
+
+ @staticmethod
+ def get_prefix_name(el):
+ """Get prefix."""
+
+ return el.prefix
+
+ @staticmethod
+ def get_uri(el):
+ """Get namespace `URI`."""
+
+ return el.namespace
+
+ @classmethod
+ def get_next(cls, el, tags=True):
+ """Get next sibling tag."""
+
+ sibling = el.next_sibling
+ while tags and not cls.is_tag(sibling) and sibling is not None:
+ sibling = sibling.next_sibling
+ return sibling
+
+ @classmethod
+ def get_previous(cls, el, tags=True):
+ """Get previous sibling tag."""
+
+ sibling = el.previous_sibling
+ while tags and not cls.is_tag(sibling) and sibling is not None:
+ sibling = sibling.previous_sibling
+ return sibling
+
+ @staticmethod
+ def has_html_ns(el):
+ """
+ Check if element has an HTML namespace.
+
+ This is a bit different than whether a element is treated as having an HTML namespace,
+ like we do in the case of `is_html_tag`.
+ """
+
+ ns = getattr(el, 'namespace') if el else None
+ return ns and ns == NS_XHTML
+
+ @staticmethod
+ def split_namespace(el, attr_name):
+ """Return namespace and attribute name without the prefix."""
+
+ return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
+
+ @staticmethod
+ def get_attribute_by_name(el, name, default=None):
+ """Get attribute by name."""
+
+ value = default
+ if el._is_xml:
+ try:
+ value = el.attrs[name]
+ except KeyError:
+ pass
+ else:
+ for k, v in el.attrs.items():
+ if util.lower(k) == name:
+ value = v
+ break
+ return value
+
+ @staticmethod
+ def iter_attributes(el):
+ """Iterate attributes."""
+
+ for k, v in el.attrs.items():
+ yield k, v
+
+ @classmethod
+ def get_classes(cls, el):
+ """Get classes."""
+
+ classes = cls.get_attribute_by_name(el, 'class', [])
+ if isinstance(classes, util.ustr):
+ classes = RE_NOT_WS.findall(classes)
+ return classes
+
+ def get_text(self, el, no_iframe=False):
+ """Get text."""
+
+ return ''.join(
+ [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
+ )
+
+
class Inputs(object):
    """Class for parsing and validating input items."""

    @staticmethod
    def validate_day(year, month, day):
        """Validate day against the length of `month` in `year` (leap years included)."""

        max_days = LONG_MONTH
        if month == FEB:
            max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Validate week.

        A year has either 52 or 53 ISO weeks. December 28 is always in the
        last ISO week of its own year, so its week number is the maximum.
        (December 31 is not usable for this: it frequently belongs to week 1
        of the following year.)
        """

        # The Gregorian calendar repeats exactly every 400 years, so fold the
        # year into 1..400. This keeps the lookup valid for any positive year,
        # including ones outside the range `datetime` can represent.
        cycle_year = (year - 1) % 400 + 1
        max_week = datetime(cycle_year, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Validate month."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Validate year."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Validate hour."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Validate minutes."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value.

        `itype` is the input type name and `value` the attribute text.
        Returns a tuple of integers for the date and time types, a `float`
        for `number` and `range`, and `None` when the text is malformed,
        fails validation, or `itype` is not a handled type. Tuples are
        ordered most significant field first so they compare naturally.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
+
+
+class _Match(object):
+ """Perform CSS matching."""
+
+ def __init__(self, selectors, scope, namespaces, flags):
+ """Initialize."""
+
+ self.assert_valid_input(scope)
+ self.tag = scope
+ self.cached_meta_lang = []
+ self.cached_default_forms = []
+ self.cached_indeterminate_forms = []
+ self.selectors = selectors
+ self.namespaces = {} if namespaces is None else namespaces
+ self.flags = flags
+ self.iframe_restrict = False
+
+ # Find the root element for the whole tree
+ doc = scope
+ parent = self.get_parent(doc)
+ while parent:
+ doc = parent
+ parent = self.get_parent(doc)
+ root = None
+ if not self.is_doc(doc):
+ root = doc
+ else:
+ for child in self.get_children(doc):
+ root = child
+ break
+
+ self.root = root
+ self.scope = scope if scope is not doc else root
+ self.has_html_namespace = self.has_html_ns(root)
+
+ # A document can be both XML and HTML (XHTML)
+ self.is_xml = self.is_xml_tree(doc)
+ self.is_html = not self.is_xml or self.has_html_namespace
+
+ def supports_namespaces(self):
+ """Check if namespaces are supported in the HTML type."""
+
+ return self.is_xml or self.has_html_namespace
+
+ def get_tag_ns(self, el):
+ """Get tag namespace."""
+
+ if self.supports_namespaces():
+ namespace = ''
+ ns = self.get_uri(el)
+ if ns:
+ namespace = ns
+ else:
+ namespace = NS_XHTML
+ return namespace
+
+ def is_html_tag(self, el):
+ """Check if tag is in HTML namespace."""
+
+ return self.get_tag_ns(el) == NS_XHTML
+
+ def get_tag(self, el):
+ """Get tag."""
+
+ name = self.get_tag_name(el)
+ return util.lower(name) if name is not None and not self.is_xml else name
+
+ def get_prefix(self, el):
+ """Get prefix."""
+
+ prefix = self.get_prefix_name(el)
+ return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
+
+ def find_bidi(self, el):
+ """Get directionality from element text."""
+
+ for node in self.get_children(el, tags=False):
+
+ # Analyze child text nodes
+ if self.is_tag(node):
+
+ # Avoid analyzing certain elements specified in the specification.
+ direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
+ if (
+ self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
+ not self.is_html_tag(node) or
+ direction is not None
+ ):
+ continue # pragma: no cover
+
+ # Check directionality of this node's text
+ value = self.find_bidi(node)
+ if value is not None:
+ return value
+
+ # Direction could not be determined
+ continue # pragma: no cover
+
+ # Skip `doctype` comments, etc.
+ if self.is_special_string(node):
+ continue
+
+ # Analyze text nodes for directionality.
+ for c in node:
+ bidi = unicodedata.bidirectional(c)
+ if bidi in ('AL', 'R', 'L'):
+ return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
+ return None
+
+ def match_attribute_name(self, el, attr, prefix):
+ """Match attribute name and return value if it exists."""
+
+ value = None
+ if self.supports_namespaces():
+ value = None
+ # If we have not defined namespaces, we can't very well find them, so don't bother trying.
+ if prefix:
+ ns = self.namespaces.get(prefix)
+ if ns is None and prefix != '*':
+ return None
+ else:
+ ns = None
+
+ for k, v in self.iter_attributes(el):
+
+ # Get attribute parts
+ namespace, name = self.split_namespace(el, k)
+
+ # Can't match a prefix attribute as we haven't specified one to match
+ # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
+ if ns is None:
+ if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
+ value = v
+ break
+ # Coverage is not finding this even though it is executed.
+ # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
+ # Ignore the false positive message.
+ continue # pragma: no cover
+
+ # We can't match our desired prefix attribute as the attribute doesn't have a prefix
+ if namespace is None or ns != namespace and prefix != '*':
+ continue
+
+ # The attribute doesn't match.
+ if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
+ continue
+
+ value = v
+ break
+ else:
+ for k, v in self.iter_attributes(el):
+ if util.lower(attr) != util.lower(k):
+ continue
+ value = v
+ break
+ return value
+
+ def match_namespace(self, el, tag):
+ """Match the namespace of the element."""
+
+ match = True
+ namespace = self.get_tag_ns(el)
+ default_namespace = self.namespaces.get('')
+ tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix, None)
+ # We must match the default namespace if one is not provided
+ if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
+ match = False
+ # If we specified `|tag`, we must not have a namespace.
+ elif (tag.prefix is not None and tag.prefix == '' and namespace):
+ match = False
+ # Verify prefix matches
+ elif (
+ tag.prefix and
+ tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
+ ):
+ match = False
+ return match
+
+ def match_attributes(self, el, attributes):
+ """Match attributes."""
+
+ match = True
+ if attributes:
+ for a in attributes:
+ value = self.match_attribute_name(el, a.attribute, a.prefix)
+ pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
+ if isinstance(value, list):
+ value = ' '.join(value)
+ if value is None:
+ match = False
+ break
+ elif pattern is None:
+ continue
+ elif pattern.match(value) is None:
+ match = False
+ break
+ return match
+
+ def match_tagname(self, el, tag):
+ """Match tag name."""
+
+ name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
+ return not (
+ name is not None and
+ name not in (self.get_tag(el), '*')
+ )
+
+ def match_tag(self, el, tag):
+ """Match the tag."""
+
+ match = True
+ if tag is not None:
+ # Verify namespace
+ if not self.match_namespace(el, tag):
+ match = False
+ if not self.match_tagname(el, tag):
+ match = False
+ return match
+
+ def match_past_relations(self, el, relation):
+ """Match past relationship."""
+
+ found = False
+ if relation[0].rel_type == REL_PARENT:
+ parent = self.get_parent(el, no_iframe=self.iframe_restrict)
+ while not found and parent:
+ found = self.match_selectors(parent, relation)
+ parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
+ elif relation[0].rel_type == REL_CLOSE_PARENT:
+ parent = self.get_parent(el, no_iframe=self.iframe_restrict)
+ if parent:
+ found = self.match_selectors(parent, relation)
+ elif relation[0].rel_type == REL_SIBLING:
+ sibling = self.get_previous(el)
+ while not found and sibling:
+ found = self.match_selectors(sibling, relation)
+ sibling = self.get_previous(sibling)
+ elif relation[0].rel_type == REL_CLOSE_SIBLING:
+ sibling = self.get_previous(el)
+ if sibling and self.is_tag(sibling):
+ found = self.match_selectors(sibling, relation)
+ return found
+
+ def match_future_child(self, parent, relation, recursive=False):
+ """Match future child."""
+
+ match = False
+ children = self.get_descendants if recursive else self.get_children
+ for child in children(parent, no_iframe=self.iframe_restrict):
+ match = self.match_selectors(child, relation)
+ if match:
+ break
+ return match
+
+ def match_future_relations(self, el, relation):
+ """Match future relationship."""
+
+ found = False
+ if relation[0].rel_type == REL_HAS_PARENT:
+ found = self.match_future_child(el, relation, True)
+ elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
+ found = self.match_future_child(el, relation)
+ elif relation[0].rel_type == REL_HAS_SIBLING:
+ sibling = self.get_next(el)
+ while not found and sibling:
+ found = self.match_selectors(sibling, relation)
+ sibling = self.get_next(sibling)
+ elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
+ sibling = self.get_next(el)
+ if sibling and self.is_tag(sibling):
+ found = self.match_selectors(sibling, relation)
+ return found
+
+ def match_relations(self, el, relation):
+ """Match relationship to other elements."""
+
+ found = False
+
+ if relation[0].rel_type.startswith(':'):
+ found = self.match_future_relations(el, relation)
+ else:
+ found = self.match_past_relations(el, relation)
+
+ return found
+
+ def match_id(self, el, ids):
+ """Match element's ID."""
+
+ found = True
+ for i in ids:
+ if i != self.get_attribute_by_name(el, 'id', ''):
+ found = False
+ break
+ return found
+
+ def match_classes(self, el, classes):
+ """Match element's classes."""
+
+ current_classes = self.get_classes(el)
+ found = True
+ for c in classes:
+ if c not in current_classes:
+ found = False
+ break
+ return found
+
+ def match_root(self, el):
+ """Match element as root."""
+
+ is_root = self.is_root(el)
+ if is_root:
+ sibling = self.get_previous(el, tags=False)
+ while is_root and sibling is not None:
+ if (
+ self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
+ self.is_cdata(sibling)
+ ):
+ is_root = False
+ else:
+ sibling = self.get_previous(sibling, tags=False)
+ if is_root:
+ sibling = self.get_next(el, tags=False)
+ while is_root and sibling is not None:
+ if (
+ self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
+ self.is_cdata(sibling)
+ ):
+ is_root = False
+ else:
+ sibling = self.get_next(sibling, tags=False)
+ return is_root
+
+ def match_scope(self, el):
+ """Match element as scope."""
+
+ return self.scope is el
+
+ def match_nth_tag_type(self, el, child):
+ """Match tag type for `nth` matches."""
+
+ return(
+ (self.get_tag(child) == self.get_tag(el)) and
+ (self.get_tag_ns(child) == self.get_tag_ns(el))
+ )
+
    def match_nth(self, el, nth):
        """
        Match `nth` elements.

        `nth` is an iterable of `nth` selector objects. Each object is read
        for `selectors` (an optional `of S` selector list), `last` (count
        from the end), `a` and `b` (the terms of `a * count + b`), `n`
        (whether the index is variable or just the fixed value `a`) and
        `of_type` (count only siblings of the same tag type).

        Every entry must match for the result to be true; an empty `nth`
        matches. An element with no parent is evaluated under a temporary
        fake parent.
        """

        matched = True

        for n in nth:
            matched = False
            # The element itself must satisfy the `of S` selector before its position matters.
            if n.selectors and not self.match_selectors(el, n.selectors):
                break
            parent = self.get_parent(el)
            if parent is None:
                parent = self.create_fake_parent(el)
            last = n.last
            last_index = len(parent) - 1
            # Start from the far end when counting from the last child.
            index = last_index if last else 0
            # 1-based position among the siblings that qualify.
            relative_index = 0
            a = n.a
            b = n.b
            var = n.n
            count = 0
            count_incr = 1
            # Direction in which siblings are walked.
            factor = -1 if last else 1
            idx = last_idx = a * count + b if var else a

            # We can only adjust bounds within a variable index
            if var:
                # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
                # Otherwise, increment to try to get in bounds.
                adjust = None
                while idx < 1 or idx > last_index:
                    if idx < 0:
                        diff_low = 0 - idx
                        if adjust is not None and adjust == 1:
                            break
                        adjust = -1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = 0 - idx
                        if diff >= diff_low:
                            break
                    else:
                        diff_high = idx - last_index
                        if adjust is not None and adjust == -1:
                            break
                        adjust = 1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = idx - last_index
                        if diff >= diff_high:
                            break
                        diff_high = diff

                # If a < 0, our count is working backwards, so floor the index by increasing the count.
                # Find the count that yields the lowest, in bound value and use that.
                # Lastly reverse count increment so that we'll increase our index.
                lowest = count
                if a < 0:
                    while idx >= 1:
                        lowest = count
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                    count_incr = -1
                count = lowest
                idx = last_idx = a * count + b if var else a

            # Evaluate elements while our calculated nth index is still in range
            while 1 <= idx <= last_index + 1:
                child = None
                # Evaluate while our child index is still in range.
                for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                    index += factor
                    if not self.is_tag(child):
                        continue
                    # Handle `of S` in `nth-child`
                    if n.selectors and not self.match_selectors(child, n.selectors):
                        continue
                    # Handle `of-type`
                    if n.of_type and not self.match_nth_tag_type(el, child):
                        continue
                    relative_index += 1
                    if relative_index == idx:
                        if child is el:
                            matched = True
                        else:
                            # The target position is held by another element; resume from here for the next index.
                            break
                    if child is el:
                        break
                # Reaching the element ends the search whether or not it matched.
                if child is el:
                    break
                last_idx = idx
                count += count_incr
                if count < 0:
                    # Count is counting down and has now ventured into invalid territory.
                    break
                idx = a * count + b if var else a
                # An index that stops changing (fixed index, or `a` of zero) cannot make progress.
                if last_idx == idx:
                    break
            if not matched:
                break
        return matched
+
+ def match_empty(self, el):
+ """Check if element is empty (if requested)."""
+
+ is_empty = True
+ for child in self.get_children(el, tags=False):
+ if self.is_tag(child):
+ is_empty = False
+ break
+ elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):
+ is_empty = False
+ break
+ return is_empty
+
+ def match_subselectors(self, el, selectors):
+ """Match selectors."""
+
+ match = True
+ for sel in selectors:
+ if not self.match_selectors(el, sel):
+ match = False
+ return match
+
+ def match_contains(self, el, contains):
+ """Match element if it contains text."""
+
+ match = True
+ content = None
+ for contain_list in contains:
+ if content is None:
+ content = self.get_text(el, no_iframe=self.is_html)
+ found = False
+ for text in contain_list.text:
+ if text in content:
+ found = True
+ break
+ if not found:
+ match = False
+ return match
+
+ def match_default(self, el):
+ """Match default."""
+
+ match = False
+
+ # Find this input's form
+ form = None
+ parent = self.get_parent(el, no_iframe=True)
+ while parent and form is None:
+ if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
+ form = parent
+ else:
+ parent = self.get_parent(parent, no_iframe=True)
+
+ # Look in form cache to see if we've already located its default button
+ found_form = False
+ for f, t in self.cached_default_forms:
+ if f is form:
+ found_form = True
+ if t is el:
+ match = True
+ break
+
+ # We didn't have the form cached, so look for its default button
+ if not found_form:
+ for child in self.get_descendants(form, no_iframe=True):
+ name = self.get_tag(child)
+ # Can't do nested forms (haven't figured out why we never hit this)
+ if name == 'form': # pragma: no cover
+ break
+ if name in ('input', 'button'):
+ v = self.get_attribute_by_name(child, 'type', '')
+ if v and util.lower(v) == 'submit':
+ self.cached_default_forms.append([form, child])
+ if el is child:
+ match = True
+ break
+ return match
+
    def match_indeterminate(self, el):
        """
        Match indeterminate.

        `el` matches when no other radio `input` that shares its `name` and
        its form is checked. The result is cached per form and name.
        """

        match = False
        name = self.get_attribute_by_name(el, 'name')

        def get_parent_form(el):
            """
            Find this input's form.

            Falls back to the top most ancestor reachable without crossing
            an `iframe` when there is no enclosing `form`.
            """
            form = None
            parent = self.get_parent(el, no_iframe=True)
            while form is None:
                if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                    form = parent
                    break
                last_parent = parent
                parent = self.get_parent(parent, no_iframe=True)
                if parent is None:
                    form = last_parent
                    break
            return form

        form = get_parent_form(el)

        # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
        found_form = False
        for f, n, i in self.cached_indeterminate_forms:
            if f is form and n == name:
                found_form = True
                if i is True:
                    match = True
                break

        # We didn't have the form cached, so validate that the radio button is indeterminate
        if not found_form:
            checked = False
            for child in self.get_descendants(form, no_iframe=True):
                if child is el:
                    continue
                tag_name = self.get_tag(child)
                if tag_name == 'input':
                    is_radio = False
                    check = False
                    has_name = False
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'type' and util.lower(v) == 'radio':
                            is_radio = True
                        elif util.lower(k) == 'name' and v == name:
                            has_name = True
                        elif util.lower(k) == 'checked':
                            check = True
                        # The candidate must also resolve to the same form as `el`.
                        if is_radio and check and has_name and get_parent_form(child) is form:
                            checked = True
                            break
                if checked:
                    break
            if not checked:
                match = True
            self.cached_indeterminate_forms.append([form, name, match])

        return match
+
    def match_lang(self, el, langs):
        """
        Match languages.

        The language is taken from the closest `lang` (or namespaced
        `xml:lang`) attribute found on `el` or its ancestors. If none is
        found and the document is HTML, a `meta` tag in the document head
        with `http-equiv="content-language"` is consulted, and that lookup
        is cached per root.

        `langs` is a sequence of pattern groups. Every group needs at least
        one pattern that matches the language found.
        """

        match = False
        has_ns = self.supports_namespaces()
        root = self.root
        has_html_namespace = self.has_html_namespace

        # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
        parent = el
        found_lang = None
        last = None
        while not found_lang:
            has_html_ns = self.has_html_ns(parent)
            for k, v in self.iter_attributes(parent):
                attr_ns, attr = self.split_namespace(parent, k)
                if (
                    ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                    (
                        has_ns and not has_html_ns and attr_ns == NS_XML and
                        (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                    )
                ):
                    found_lang = v
                    break
            last = parent
            parent = self.get_parent(parent, no_iframe=self.is_html)

            if parent is None:
                # Ran out of ancestors: the last node visited acts as the root for the `meta` lookup.
                root = last
                has_html_namespace = self.has_html_ns(root)
                parent = last
                break

        # Use cached meta language.
        if not found_lang and self.cached_meta_lang:
            for cache in self.cached_meta_lang:
                if root is cache[0]:
                    found_lang = cache[1]

        # If we couldn't find a language, and the document is HTML, look to meta to determine language.
        if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
            # Find head
            found = False
            for tag in ('html', 'head'):
                found = False
                for child in self.get_children(parent, no_iframe=self.is_html):
                    if self.get_tag(child) == tag and self.is_html_tag(child):
                        found = True
                        parent = child
                        break
                if not found:  # pragma: no cover
                    break

            # Search meta tags
            if found:
                for child in parent:
                    if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                        c_lang = False
                        content = None
                        for k, v in self.iter_attributes(child):
                            if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                                c_lang = True
                            if util.lower(k) == 'content':
                                content = v
                            if c_lang and content:
                                found_lang = content
                                self.cached_meta_lang.append((root, found_lang))
                                break
                    if found_lang:
                        break
                if not found_lang:
                    # Cache the miss too; `False` is not `None`, so the search above is not repeated.
                    self.cached_meta_lang.append((root, False))

        # If we determined a language, compare.
        if found_lang:
            for patterns in langs:
                match = False
                for pattern in patterns:
                    if pattern.match(found_lang):
                        match = True
                if not match:
                    break

        return match
+
    def match_dir(self, el, directionality):
        """
        Check directionality.

        `directionality` is the direction flag being tested. The direction
        of `el` comes from its own `dir` attribute when that is `ltr` or
        `rtl`. For `dir="auto"` (and for `bdi` without `dir`) it is derived
        from text. Otherwise it is inherited from the parent, with the
        root defaulting to left to right. Parents are not followed across
        `iframe` boundaries.

        Returns `False` for a missing element, a non HTML element, or a
        request for both directions at once.
        """

        # If we have to match both left and right, we can't match either.
        if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
            return False

        if el is None or not self.is_html_tag(el):
            return False

        # Element has defined direction of left to right or right to left
        # `None` means no recognized `dir` value; 0 means `dir="auto"`.
        direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
        if direction not in (None, 0):
            return direction == directionality

        # Element is the document element (the root) and no direction assigned, assume left to right.
        is_root = self.is_root(el)
        if is_root and direction is None:
            return ct.SEL_DIR_LTR == directionality

        # If `input[type=telephone]` and no direction is assigned, assume left to right.
        name = self.get_tag(el)
        is_input = name == 'input'
        is_textarea = name == 'textarea'
        is_bdi = name == 'bdi'
        itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
        if is_input and itype == 'tel' and direction is None:
            return ct.SEL_DIR_LTR == directionality

        # Auto handling for text inputs
        if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
            if is_textarea:
                # A `textarea` keeps its value in its child text nodes.
                value = []
                for node in self.get_contents(el, no_iframe=True):
                    if self.is_content_string(node):
                        value.append(node)
                value = ''.join(value)
            else:
                value = self.get_attribute_by_name(el, 'value', '')
            if value:
                # The first strongly directional character decides.
                for c in value:
                    bidi = unicodedata.bidirectional(c)
                    if bidi in ('AL', 'R', 'L'):
                        direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                        return direction == directionality
                # Assume left to right
                return ct.SEL_DIR_LTR == directionality
            elif is_root:
                return ct.SEL_DIR_LTR == directionality
            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

        # Auto handling for `bdi` and other non text inputs.
        if (is_bdi and direction is None) or direction == 0:
            direction = self.find_bidi(el)
            if direction is not None:
                return direction == directionality
            elif is_root:
                return ct.SEL_DIR_LTR == directionality
            return self.match_dir(self.get_parent(el, no_iframe=True), directionality)

        # Match parents direction
        return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
+
+ def match_range(self, el, condition):
+ """
+ Match range.
+
+ Behavior is modeled after what we see in browsers. Browsers seem to evaluate
+ if the value is out of range, and if not, it is in range. So a missing value
+ will not evaluate out of range; therefore, value is in range. Personally, I
+ feel like this should evaluate as neither in or out of range.
+ """
+
+ out_of_range = False
+
+ itype = self.get_attribute_by_name(el, 'type').lower()
+ mn = self.get_attribute_by_name(el, 'min', None)
+ if mn is not None:
+ mn = Inputs.parse_value(itype, mn)
+ mx = self.get_attribute_by_name(el, 'max', None)
+ if mx is not None:
+ mx = Inputs.parse_value(itype, mx)
+
+ # There is no valid min or max, so we cannot evaluate a range
+ if mn is None and mx is None:
+ return False
+
+ value = self.get_attribute_by_name(el, 'value', None)
+ if value is not None:
+ value = Inputs.parse_value(itype, value)
+ if value is not None:
+ if itype in ("date", "datetime-local", "month", "week", "number", "range"):
+ if mn is not None and value < mn:
+ out_of_range = True
+ if not out_of_range and mx is not None and value > mx:
+ out_of_range = True
+ elif itype == "time":
+ if mn is not None and mx is not None and mn > mx:
+ # Time is periodic, so this is a reversed/discontinuous range
+ if value < mn and value > mx:
+ out_of_range = True
+ else:
+ if mn is not None and value < mn:
+ out_of_range = True
+ if not out_of_range and mx is not None and value > mx:
+ out_of_range = True
+
+ return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
+
+ def match_defined(self, el):
+ """
+ Match defined.
+
+ `:defined` is related to custom elements in a browser.
+
+ - If the document is XML (not XHTML), all tags will match.
+ - Tags that are not custom (don't have a hyphen) are marked defined.
+ - If the tag has a prefix (without or without a namespace), it will not match.
+
+ This is of course requires the parser to provide us with the proper prefix and namespace info,
+ if it doesn't, there is nothing we can do.
+ """
+
+ name = self.get_tag(el)
+ return (
+ name.find('-') == -1 or
+ name.find(':') != -1 or
+ self.get_prefix(el) is not None
+ )
+
+ def match_selectors(self, el, selectors):
+ """Check if element matches one of the selectors."""
+
+ match = False
+ is_not = selectors.is_not
+ is_html = selectors.is_html
+
+ # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
+ if is_html:
+ namespaces = self.namespaces
+ iframe_restrict = self.iframe_restrict
+ self.namespaces = {'html': NS_XHTML}
+ self.iframe_restrict = True
+
+ if not is_html or self.is_html:
+ for selector in selectors:
+ match = is_not
+ # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
+ if isinstance(selector, ct.SelectorNull):
+ continue
+ # Verify tag matches
+ if not self.match_tag(el, selector.tag):
+ continue
+ # Verify tag is defined
+ if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
+ continue
+ # Verify element is root
+ if selector.flags & ct.SEL_ROOT and not self.match_root(el):
+ continue
+ # Verify element is scope
+ if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
+ continue
+ # Verify `nth` matches
+ if not self.match_nth(el, selector.nth):
+ continue
+ if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
+ continue
+ # Verify id matches
+ if selector.ids and not self.match_id(el, selector.ids):
+ continue
+ # Verify classes match
+ if selector.classes and not self.match_classes(el, selector.classes):
+ continue
+ # Verify attribute(s) match
+ if not self.match_attributes(el, selector.attributes):
+ continue
+ # Verify ranges
+ if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
+ continue
+ # Verify language patterns
+ if selector.lang and not self.match_lang(el, selector.lang):
+ continue
+ # Verify pseudo selector patterns
+ if selector.selectors and not self.match_subselectors(el, selector.selectors):
+ continue
+ # Verify relationship selectors
+ if selector.relation and not self.match_relations(el, selector.relation):
+ continue
+ # Validate that the current default selector match corresponds to the first submit button in the form
+ if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
+ continue
+ # Validate that the unset radio button is among radio buttons with the same name in a form that are
+ # also not set.
+ if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
+ continue
+ # Validate element directionality
+ if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
+ continue
+ # Validate that the tag contains the specified text.
+ if not self.match_contains(el, selector.contains):
+ continue
+ match = not is_not
+ break
+
+ # Restore actual namespaces being used for external selector lists
+ if is_html:
+ self.namespaces = namespaces
+ self.iframe_restrict = iframe_restrict
+
+ return match
+
+ def select(self, limit=0):
+ """Match all tags under the targeted tag."""
+
+ if limit < 1:
+ limit = None
+
+ for child in self.get_descendants(self.tag):
+ if self.match(child):
+ yield child
+ if limit is not None:
+ limit -= 1
+ if limit < 1:
+ break
+
+ def closest(self):
+ """Match closest ancestor."""
+
+ current = self.tag
+ closest = None
+ while closest is None and current is not None:
+ if self.match(current):
+ closest = current
+ else:
+ current = self.get_parent(current)
+ return closest
+
+ def filter(self): # noqa A001
+ """Filter tag's children."""
+
+ return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
+
+ def match(self, el):
+ """Match."""
+
+ return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
+
+
class CSSMatch(_DocumentNav, _Match):
    """Beautiful Soup CSS matcher: `_DocumentNav` and `_Match` combined, with nothing added."""
+
+
class CommentsMatch(_DocumentNav):
    """Comments matcher."""

    def __init__(self, el):
        """Run `assert_valid_input` on `el` and keep it as the tag to search under."""

        self.assert_valid_input(el)
        self.tag = el

    def get_comments(self, limit=0):
        """
        Yield the comments found among the stored tag's descendants.

        A `limit` below 1 means "no limit"; otherwise iteration stops as soon as
        `limit` comments have been yielded.
        """

        remaining = None if limit < 1 else limit

        for node in self.get_descendants(self.tag, tags=False):
            if not self.is_comment(node):
                continue
            yield node
            if remaining is None:
                continue
            # Count down only when a limit is in effect.
            remaining -= 1
            if remaining < 1:
                break
+
+
class SoupSieve(ct.Immutable):
    """Compiled Soup Sieve selector matching object."""

    __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")

    def __init__(self, pattern, selectors, namespaces, custom, flags):
        """Store the pattern, compiled selectors, namespaces, custom selectors and flags via the base class."""

        super(SoupSieve, self).__init__(
            pattern=pattern, selectors=selectors, namespaces=namespaces, custom=custom, flags=flags
        )

    def _matcher(self, tag):
        """Build a `CSSMatch` for `tag` from this object's selectors, namespaces and flags."""

        return CSSMatch(self.selectors, tag, self.namespaces, self.flags)

    def match(self, tag):
        """Return the result of matching `tag` itself against the selectors."""

        matcher = self._matcher(tag)
        return matcher.match(tag)

    def closest(self, tag):
        """Return the closest match found by `CSSMatch.closest` starting from `tag`."""

        matcher = self._matcher(tag)
        return matcher.closest()

    def filter(self, iterable):  # noqa A001
        """
        Filter.

        `CSSMatch` can cache certain searches for tags of the same document,
        so if we are given a tag, all tags are from the same document,
        and we can take advantage of the optimization.

        Any other kind of iterable could have tags from different documents or detached tags,
        so for those, we use a new `CSSMatch` for each item in the iterable.
        """

        if CSSMatch.is_tag(iterable):
            return self._matcher(iterable).filter()

        kept = []
        for node in iterable:
            # Strings are skipped before any selector matching is attempted.
            if CSSMatch.is_navigable_string(node):
                continue
            if self.match(node):
                kept.append(node)
        return kept

    @util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
    def comments(self, tag, limit=0):
        """Return a list of the comments under `tag`, honoring `limit`."""

        return list(CommentsMatch(tag).get_comments(limit))

    @util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
    def icomments(self, tag, limit=0):
        """Lazily yield the comments under `tag`, honoring `limit`."""

        finder = CommentsMatch(tag)
        for found in finder.get_comments(limit):
            yield found

    def select_one(self, tag):
        """Return the first tag selected under `tag`, or `None` when nothing is selected."""

        found = self.select(tag, limit=1)
        if found:
            return found[0]
        return None

    def select(self, tag, limit=0):
        """Return the selected tags as a list (see `iselect`)."""

        return list(self.iselect(tag, limit))

    def iselect(self, tag, limit=0):
        """Lazily yield the tags selected under `tag`, honoring `limit`."""

        matcher = self._matcher(tag)
        for found in matcher.select(limit):
            yield found

    def __repr__(self):  # pragma: no cover
        """Representation."""

        template = "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})"
        return template.format(self.pattern, self.namespaces, self.custom, self.flags)

    __str__ = __repr__


ct.pickle_register(SoupSieve)