diff options
Diffstat (limited to 'libs/soupsieve/css_match.py')
-rw-r--r-- | libs/soupsieve/css_match.py | 1472 |
1 files changed, 1472 insertions, 0 deletions
diff --git a/libs/soupsieve/css_match.py b/libs/soupsieve/css_match.py new file mode 100644 index 000000000..aa3be671e --- /dev/null +++ b/libs/soupsieve/css_match.py @@ -0,0 +1,1472 @@ +"""CSS matcher.""" +from __future__ import unicode_literals +from datetime import datetime +from . import util +import re +from .import css_types as ct +import unicodedata + +# Empty tag pattern (whitespace okay) +RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') + +RE_NOT_WS = re.compile('[^ \t\r\n\f]+') + +# Relationships +REL_PARENT = ' ' +REL_CLOSE_PARENT = '>' +REL_SIBLING = '~' +REL_CLOSE_SIBLING = '+' + +# Relationships for :has() (forward looking) +REL_HAS_PARENT = ': ' +REL_HAS_CLOSE_PARENT = ':>' +REL_HAS_SIBLING = ':~' +REL_HAS_CLOSE_SIBLING = ':+' + +NS_XHTML = 'http://www.w3.org/1999/xhtml' +NS_XML = 'http://www.w3.org/XML/1998/namespace' + +DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL +RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE + +DIR_MAP = { + 'ltr': ct.SEL_DIR_LTR, + 'rtl': ct.SEL_DIR_RTL, + 'auto': 0 +} + +RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$") +RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$') +RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$') +RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$') +RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$') +RE_DATETIME = re.compile( + r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' +) + +MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November +FEB = 2 +SHORT_MONTH = 30 +LONG_MONTH = 31 +FEB_MONTH = 28 +FEB_LEAP_MONTH = 29 +DAYS_IN_WEEK = 7 + + +class _FakeParent(object): + """ + Fake parent class. + + When we have a fragment with no `BeautifulSoup` document object, + we can't evaluate `nth` selectors properly. Create a temporary + fake parent so we can traverse the root element as a child. 
+ """ + + def __init__(self, element): + """Initialize.""" + + self.contents = [element] + + def __len__(self): + """Length.""" + + return len(self.contents) + + +class _DocumentNav(object): + """Navigate a Beautiful Soup document.""" + + @classmethod + def assert_valid_input(cls, tag): + """Check if valid input tag or document.""" + + # Fail on unexpected types. + if not cls.is_tag(tag): + raise TypeError("Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag))) + + @staticmethod + def is_doc(obj): + """Is `BeautifulSoup` object.""" + + import bs4 + return isinstance(obj, bs4.BeautifulSoup) + + @staticmethod + def is_tag(obj): + """Is tag.""" + + import bs4 + return isinstance(obj, bs4.Tag) + + @staticmethod + def is_comment(obj): + """Is comment.""" + + import bs4 + return isinstance(obj, bs4.Comment) + + @staticmethod + def is_declaration(obj): # pragma: no cover + """Is declaration.""" + + import bs4 + return isinstance(obj, bs4.Declaration) + + @staticmethod + def is_cdata(obj): + """Is CDATA.""" + + import bs4 + return isinstance(obj, bs4.CData) + + @staticmethod + def is_processing_instruction(obj): # pragma: no cover + """Is processing instruction.""" + + import bs4 + return isinstance(obj, bs4.ProcessingInstruction) + + @staticmethod + def is_navigable_string(obj): + """Is navigable string.""" + + import bs4 + return isinstance(obj, bs4.NavigableString) + + @staticmethod + def is_special_string(obj): + """Is special string.""" + + import bs4 + return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) + + @classmethod + def is_content_string(cls, obj): + """Check if node is content string.""" + + return cls.is_navigable_string(obj) and not cls.is_special_string(obj) + + @staticmethod + def create_fake_parent(el): + """Create fake parent for a given element.""" + + return _FakeParent(el) + + @staticmethod + def is_xml_tree(el): + """Check if element (or document) is from a XML tree.""" + 
+ return el._is_xml + + def is_iframe(self, el): + """Check if element is an `iframe`.""" + + return ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el) + + def is_root(self, el): + """ + Return whether element is a root element. + + We check that the element is the root of the tree (which we have already pre-calculated), + and we check if it is the root element under an `iframe`. + """ + + root = self.root and self.root is el + if not root: + parent = self.get_parent(el) + root = parent is not None and self.is_html and self.is_iframe(parent) + return root + + def get_contents(self, el, no_iframe=False): + """Get contents or contents in reverse.""" + if not no_iframe or not self.is_iframe(el): + for content in el.contents: + yield content + + def get_children(self, el, start=None, reverse=False, tags=True, no_iframe=False): + """Get children.""" + + if not no_iframe or not self.is_iframe(el): + last = len(el.contents) - 1 + if start is None: + index = last if reverse else 0 + else: + index = start + end = -1 if reverse else last + 1 + incr = -1 if reverse else 1 + + if 0 <= index <= last: + while index != end: + node = el.contents[index] + index += incr + if not tags or self.is_tag(node): + yield node + + def get_descendants(self, el, tags=True, no_iframe=False): + """Get descendants.""" + + if not no_iframe or not self.is_iframe(el): + next_good = None + for child in el.descendants: + + if next_good is not None: + if child is not next_good: + continue + next_good = None + + is_tag = self.is_tag(child) + + if no_iframe and is_tag and self.is_iframe(child): + if child.next_sibling is not None: + next_good = child.next_sibling + else: + last_child = child + while self.is_tag(last_child) and last_child.contents: + last_child = last_child.contents[-1] + next_good = last_child.next_element + yield child + if next_good is None: + break + # Coverage isn't seeing this even though it's executed + continue # pragma: no cover + + 
if not tags or is_tag: + yield child + + def get_parent(self, el, no_iframe=False): + """Get parent.""" + + parent = el.parent + if no_iframe and parent is not None and self.is_iframe(parent): + parent = None + return parent + + @staticmethod + def get_tag_name(el): + """Get tag.""" + + return el.name + + @staticmethod + def get_prefix_name(el): + """Get prefix.""" + + return el.prefix + + @staticmethod + def get_uri(el): + """Get namespace `URI`.""" + + return el.namespace + + @classmethod + def get_next(cls, el, tags=True): + """Get next sibling tag.""" + + sibling = el.next_sibling + while tags and not cls.is_tag(sibling) and sibling is not None: + sibling = sibling.next_sibling + return sibling + + @classmethod + def get_previous(cls, el, tags=True): + """Get previous sibling tag.""" + + sibling = el.previous_sibling + while tags and not cls.is_tag(sibling) and sibling is not None: + sibling = sibling.previous_sibling + return sibling + + @staticmethod + def has_html_ns(el): + """ + Check if element has an HTML namespace. + + This is a bit different than whether a element is treated as having an HTML namespace, + like we do in the case of `is_html_tag`. 
class Inputs(object):
    """
    Parse and validate the string values of date, time, and numeric inputs.

    Every helper is a static or class method; the class only serves as a namespace.
    """

    @staticmethod
    def validate_day(year, month, day):
        """
        Validate day.

        Return `True` when `day` exists in `month` of `year`. February allows
        `FEB_LEAP_MONTH` days in Gregorian leap years and `FEB_MONTH` otherwise.
        """

        max_days = LONG_MONTH
        if month == FEB:
            is_leap = ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0)
            max_days = FEB_LEAP_MONTH if is_leap else FEB_MONTH
        elif month in MONTHS_30:
            max_days = SHORT_MONTH
        return 1 <= day <= max_days

    @staticmethod
    def validate_week(year, week):
        """
        Validate week.

        Return `True` when `week` is a valid ISO 8601 week number for `year`,
        i.e. between 1 and 52, or 53 for the years that have a 53rd week.
        """

        # December 28th always belongs to the last ISO week of its own year, whereas
        # December 31st may already belong to week 1 of the following year.
        # The Gregorian calendar repeats every 400 years (146097 days, a whole number of
        # weeks), so the year is folded into 1..400. That keeps any year accepted by the
        # input pattern (fewer or more than four significant digits) inside the range
        # that `datetime` supports without changing the answer.
        cycle_year = (year - 1) % 400 + 1
        max_week = datetime(cycle_year, 12, 28).isocalendar()[1]
        return 1 <= week <= max_week

    @staticmethod
    def validate_month(month):
        """Validate month (1 through 12)."""

        return 1 <= month <= 12

    @staticmethod
    def validate_year(year):
        """Validate year (1 or greater)."""

        return 1 <= year

    @staticmethod
    def validate_hour(hour):
        """Validate hour (0 through 23)."""

        return 0 <= hour <= 23

    @staticmethod
    def validate_minutes(minutes):
        """Validate minutes (0 through 59)."""

        return 0 <= minutes <= 59

    @classmethod
    def parse_value(cls, itype, value):
        """
        Parse the input value.

        `itype` is the input type name and `value` the raw attribute string.
        Returns a tuple of integers for `date`, `month`, `week`, `time`, and
        `datetime-local`, a float for `number` and `range`, and `None` when the
        type is not one of those, the string does not have the expected shape,
        or one of its components fails validation.
        """

        parsed = None
        if itype == "date":
            m = RE_DATE.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
                    parsed = (year, month, day)
        elif itype == "month":
            m = RE_MONTH.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                if cls.validate_year(year) and cls.validate_month(month):
                    parsed = (year, month)
        elif itype == "week":
            m = RE_WEEK.match(value)
            if m:
                year = int(m.group('year'), 10)
                week = int(m.group('week'), 10)
                if cls.validate_year(year) and cls.validate_week(year, week):
                    parsed = (year, week)
        elif itype == "time":
            m = RE_TIME.match(value)
            if m:
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if cls.validate_hour(hour) and cls.validate_minutes(minutes):
                    parsed = (hour, minutes)
        elif itype == "datetime-local":
            m = RE_DATETIME.match(value)
            if m:
                year = int(m.group('year'), 10)
                month = int(m.group('month'), 10)
                day = int(m.group('day'), 10)
                hour = int(m.group('hour'), 10)
                minutes = int(m.group('minutes'), 10)
                if (
                    cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
                    cls.validate_hour(hour) and cls.validate_minutes(minutes)
                ):
                    parsed = (year, month, day, hour, minutes)
        elif itype in ("number", "range"):
            m = RE_NUM.match(value)
            if m:
                parsed = float(m.group('value'))
        return parsed
self.cached_default_forms = [] + self.cached_indeterminate_forms = [] + self.selectors = selectors + self.namespaces = {} if namespaces is None else namespaces + self.flags = flags + self.iframe_restrict = False + + # Find the root element for the whole tree + doc = scope + parent = self.get_parent(doc) + while parent: + doc = parent + parent = self.get_parent(doc) + root = None + if not self.is_doc(doc): + root = doc + else: + for child in self.get_children(doc): + root = child + break + + self.root = root + self.scope = scope if scope is not doc else root + self.has_html_namespace = self.has_html_ns(root) + + # A document can be both XML and HTML (XHTML) + self.is_xml = self.is_xml_tree(doc) + self.is_html = not self.is_xml or self.has_html_namespace + + def supports_namespaces(self): + """Check if namespaces are supported in the HTML type.""" + + return self.is_xml or self.has_html_namespace + + def get_tag_ns(self, el): + """Get tag namespace.""" + + if self.supports_namespaces(): + namespace = '' + ns = self.get_uri(el) + if ns: + namespace = ns + else: + namespace = NS_XHTML + return namespace + + def is_html_tag(self, el): + """Check if tag is in HTML namespace.""" + + return self.get_tag_ns(el) == NS_XHTML + + def get_tag(self, el): + """Get tag.""" + + name = self.get_tag_name(el) + return util.lower(name) if name is not None and not self.is_xml else name + + def get_prefix(self, el): + """Get prefix.""" + + prefix = self.get_prefix_name(el) + return util.lower(prefix) if prefix is not None and not self.is_xml else prefix + + def find_bidi(self, el): + """Get directionality from element text.""" + + for node in self.get_children(el, tags=False): + + # Analyze child text nodes + if self.is_tag(node): + + # Avoid analyzing certain elements specified in the specification. 
+ direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) + if ( + self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or + not self.is_html_tag(node) or + direction is not None + ): + continue # pragma: no cover + + # Check directionality of this node's text + value = self.find_bidi(node) + if value is not None: + return value + + # Direction could not be determined + continue # pragma: no cover + + # Skip `doctype` comments, etc. + if self.is_special_string(node): + continue + + # Analyze text nodes for directionality. + for c in node: + bidi = unicodedata.bidirectional(c) + if bidi in ('AL', 'R', 'L'): + return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL + return None + + def match_attribute_name(self, el, attr, prefix): + """Match attribute name and return value if it exists.""" + + value = None + if self.supports_namespaces(): + value = None + # If we have not defined namespaces, we can't very well find them, so don't bother trying. + if prefix: + ns = self.namespaces.get(prefix) + if ns is None and prefix != '*': + return None + else: + ns = None + + for k, v in self.iter_attributes(el): + + # Get attribute parts + namespace, name = self.split_namespace(el, k) + + # Can't match a prefix attribute as we haven't specified one to match + # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`. + if ns is None: + if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)): + value = v + break + # Coverage is not finding this even though it is executed. + # Adding a print statement before this (and erasing coverage) causes coverage to find the line. + # Ignore the false positive message. + continue # pragma: no cover + + # We can't match our desired prefix attribute as the attribute doesn't have a prefix + if namespace is None or ns != namespace and prefix != '*': + continue + + # The attribute doesn't match. 
def match_attributes(self, el, attributes):
    """
    Match attributes.

    Every entry in `attributes` must name an attribute that exists on `el`.
    When the entry carries a pattern, the attribute value must also match it.
    For XML documents the entry's `xml_type_pattern` is preferred when set.
    List values are joined with single spaces before matching.
    """

    for selector_attr in attributes or ():
        found = self.match_attribute_name(el, selector_attr.attribute, selector_attr.prefix)
        if self.is_xml and selector_attr.xml_type_pattern:
            pattern = selector_attr.xml_type_pattern
        else:
            pattern = selector_attr.pattern
        if isinstance(found, list):
            found = ' '.join(found)
        # The attribute is missing altogether.
        if found is None:
            return False
        # Presence alone is enough when there is no pattern to test.
        if pattern is not None and pattern.match(found) is None:
            return False
    return True
"""Match past relationship.""" + + found = False + if relation[0].rel_type == REL_PARENT: + parent = self.get_parent(el, no_iframe=self.iframe_restrict) + while not found and parent: + found = self.match_selectors(parent, relation) + parent = self.get_parent(parent, no_iframe=self.iframe_restrict) + elif relation[0].rel_type == REL_CLOSE_PARENT: + parent = self.get_parent(el, no_iframe=self.iframe_restrict) + if parent: + found = self.match_selectors(parent, relation) + elif relation[0].rel_type == REL_SIBLING: + sibling = self.get_previous(el) + while not found and sibling: + found = self.match_selectors(sibling, relation) + sibling = self.get_previous(sibling) + elif relation[0].rel_type == REL_CLOSE_SIBLING: + sibling = self.get_previous(el) + if sibling and self.is_tag(sibling): + found = self.match_selectors(sibling, relation) + return found + + def match_future_child(self, parent, relation, recursive=False): + """Match future child.""" + + match = False + children = self.get_descendants if recursive else self.get_children + for child in children(parent, no_iframe=self.iframe_restrict): + match = self.match_selectors(child, relation) + if match: + break + return match + + def match_future_relations(self, el, relation): + """Match future relationship.""" + + found = False + if relation[0].rel_type == REL_HAS_PARENT: + found = self.match_future_child(el, relation, True) + elif relation[0].rel_type == REL_HAS_CLOSE_PARENT: + found = self.match_future_child(el, relation) + elif relation[0].rel_type == REL_HAS_SIBLING: + sibling = self.get_next(el) + while not found and sibling: + found = self.match_selectors(sibling, relation) + sibling = self.get_next(sibling) + elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING: + sibling = self.get_next(el) + if sibling and self.is_tag(sibling): + found = self.match_selectors(sibling, relation) + return found + + def match_relations(self, el, relation): + """Match relationship to other elements.""" + + found = False + + if 
def match_classes(self, el, classes):
    """
    Match element's classes.

    Return `True` only when every name in `classes` is among the classes
    reported for `el`; an empty `classes` sequence always matches.
    """

    present = self.get_classes(el)
    return all(name in present for name in classes)
count_incr = 1 + factor = -1 if last else 1 + idx = last_idx = a * count + b if var else a + + # We can only adjust bounds within a variable index + if var: + # Abort if our nth index is out of bounds and only getting further out of bounds as we increment. + # Otherwise, increment to try to get in bounds. + adjust = None + while idx < 1 or idx > last_index: + if idx < 0: + diff_low = 0 - idx + if adjust is not None and adjust == 1: + break + adjust = -1 + count += count_incr + idx = last_idx = a * count + b if var else a + diff = 0 - idx + if diff >= diff_low: + break + else: + diff_high = idx - last_index + if adjust is not None and adjust == -1: + break + adjust = 1 + count += count_incr + idx = last_idx = a * count + b if var else a + diff = idx - last_index + if diff >= diff_high: + break + diff_high = diff + + # If a < 0, our count is working backwards, so floor the index by increasing the count. + # Find the count that yields the lowest, in bound value and use that. + # Lastly reverse count increment so that we'll increase our index. + lowest = count + if a < 0: + while idx >= 1: + lowest = count + count += count_incr + idx = last_idx = a * count + b if var else a + count_incr = -1 + count = lowest + idx = last_idx = a * count + b if var else a + + # Evaluate elements while our calculated nth index is still in range + while 1 <= idx <= last_index + 1: + child = None + # Evaluate while our child index is still in range. 
def match_subselectors(self, el, selectors):
    """
    Match selectors.

    Return `True` when `el` satisfies every selector list in `selectors`
    (an empty sequence matches). Evaluation stops at the first selector list
    that fails, since the overall result can no longer change after that.
    """

    return all(self.match_selectors(el, sel) for sel in selectors)
def match_indeterminate(self, el):
    """
    Match indeterminate.

    Return `True` when no other `input` carrying `type="radio"`, a `checked`
    attribute, and the same `name` value as `el` is found under the form that
    owns `el`. Results are remembered per (form, name) pair in
    `self.cached_indeterminate_forms` so later buttons of the same group do
    not rescan the form.
    """

    match = False
    name = self.get_attribute_by_name(el, 'name')

    def get_parent_form(el):
        """
        Find this input's form.

        Falls back to the topmost reachable ancestor when no `form` encloses
        the element, and to the element itself when it has no reachable
        parent at all (detached, or sitting directly under an `iframe`).
        """

        form = el
        parent = self.get_parent(el, no_iframe=True)
        while parent is not None:
            form = parent
            if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                break
            parent = self.get_parent(parent, no_iframe=True)
        return form

    form = get_parent_form(el)

    # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
    found_form = False
    for f, n, i in self.cached_indeterminate_forms:
        if f is form and n == name:
            found_form = True
            if i is True:
                match = True
            break

    # We didn't have the form cached, so validate that the radio button is indeterminate
    if not found_form:
        checked = False
        for child in self.get_descendants(form, no_iframe=True):
            # The element being tested is never counted as the checked member of its own group.
            if child is el:
                continue
            if self.get_tag(child) != 'input':
                continue
            is_radio = False
            check = False
            has_name = False
            for k, v in self.iter_attributes(child):
                key = util.lower(k)
                if key == 'type' and util.lower(v) == 'radio':
                    is_radio = True
                elif key == 'name' and v == name:
                    has_name = True
                elif key == 'checked':
                    check = True
            # Only a checked radio button owned by the very same form decides the group.
            if is_radio and check and has_name and get_parent_form(child) is form:
                checked = True
                break
        match = not checked
        self.cached_indeterminate_forms.append([form, name, match])

    return match
+ if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')): + # Find head + found = False + for tag in ('html', 'head'): + found = False + for child in self.get_children(parent, no_iframe=self.is_html): + if self.get_tag(child) == tag and self.is_html_tag(child): + found = True + parent = child + break + if not found: # pragma: no cover + break + + # Search meta tags + if found: + for child in parent: + if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent): + c_lang = False + content = None + for k, v in self.iter_attributes(child): + if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language': + c_lang = True + if util.lower(k) == 'content': + content = v + if c_lang and content: + found_lang = content + self.cached_meta_lang.append((root, found_lang)) + break + if found_lang: + break + if not found_lang: + self.cached_meta_lang.append((root, False)) + + # If we determined a language, compare. + if found_lang: + for patterns in langs: + match = False + for pattern in patterns: + if pattern.match(found_lang): + match = True + if not match: + break + + return match + + def match_dir(self, el, directionality): + """Check directionality.""" + + # If we have to match both left and right, we can't match either. + if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL: + return False + + if el is None or not self.is_html_tag(el): + return False + + # Element has defined direction of left to right or right to left + direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None) + if direction not in (None, 0): + return direction == directionality + + # Element is the document element (the root) and no direction assigned, assume left to right. + is_root = self.is_root(el) + if is_root and direction is None: + return ct.SEL_DIR_LTR == directionality + + # If `input[type=telephone]` and no direction is assigned, assume left to right. 
def match_range(self, el, condition):
    """
    Match range.

    Behavior is modeled after what we see in browsers. Browsers seem to evaluate
    if the value is out of range, and if not, it is in range. So a missing value
    will not evaluate out of range; therefore, value is in range. Personally, I
    feel like this should evaluate as neither in or out of range.

    `condition` holds the range flag being tested. The result is `False` for
    either flag when the element has no parsable `min` or `max`, which
    includes an element with no `type` attribute.
    """

    # Default to an empty type so an element without `type` is simply reported as
    # having no evaluable range instead of raising on `None`.
    itype = util.lower(self.get_attribute_by_name(el, 'type', ''))
    mn = self.get_attribute_by_name(el, 'min', None)
    if mn is not None:
        mn = Inputs.parse_value(itype, mn)
    mx = self.get_attribute_by_name(el, 'max', None)
    if mx is not None:
        mx = Inputs.parse_value(itype, mx)

    # There is no valid min or max, so we cannot evaluate a range
    if mn is None and mx is None:
        return False

    value = self.get_attribute_by_name(el, 'value', None)
    if value is not None:
        value = Inputs.parse_value(itype, value)

    out_of_range = False
    if value is not None:
        if itype == "time" and mn is not None and mx is not None and mn > mx:
            # Time is periodic, so this is a reversed/discontinuous range:
            # only values strictly between `max` and `min` fall outside it.
            out_of_range = mx < value < mn
        else:
            out_of_range = (mn is not None and value < mn) or (mx is not None and value > mx)

    return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
+ """ + + name = self.get_tag(el) + return ( + name.find('-') == -1 or + name.find(':') != -1 or + self.get_prefix(el) is not None + ) + + def match_selectors(self, el, selectors): + """Check if element matches one of the selectors.""" + + match = False + is_not = selectors.is_not + is_html = selectors.is_html + + # Internal selector lists that use the HTML flag, will automatically get the `html` namespace. + if is_html: + namespaces = self.namespaces + iframe_restrict = self.iframe_restrict + self.namespaces = {'html': NS_XHTML} + self.iframe_restrict = True + + if not is_html or self.is_html: + for selector in selectors: + match = is_not + # We have a un-matchable situation (like `:focus` as you can focus an element in this environment) + if isinstance(selector, ct.SelectorNull): + continue + # Verify tag matches + if not self.match_tag(el, selector.tag): + continue + # Verify tag is defined + if selector.flags & ct.SEL_DEFINED and not self.match_defined(el): + continue + # Verify element is root + if selector.flags & ct.SEL_ROOT and not self.match_root(el): + continue + # Verify element is scope + if selector.flags & ct.SEL_SCOPE and not self.match_scope(el): + continue + # Verify `nth` matches + if not self.match_nth(el, selector.nth): + continue + if selector.flags & ct.SEL_EMPTY and not self.match_empty(el): + continue + # Verify id matches + if selector.ids and not self.match_id(el, selector.ids): + continue + # Verify classes match + if selector.classes and not self.match_classes(el, selector.classes): + continue + # Verify attribute(s) match + if not self.match_attributes(el, selector.attributes): + continue + # Verify ranges + if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES): + continue + # Verify language patterns + if selector.lang and not self.match_lang(el, selector.lang): + continue + # Verify pseudo selector patterns + if selector.selectors and not self.match_subselectors(el, selector.selectors): + continue + # Verify 
relationship selectors + if selector.relation and not self.match_relations(el, selector.relation): + continue + # Validate that the current default selector match corresponds to the first submit button in the form + if selector.flags & ct.SEL_DEFAULT and not self.match_default(el): + continue + # Validate that the unset radio button is among radio buttons with the same name in a form that are + # also not set. + if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el): + continue + # Validate element directionality + if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS): + continue + # Validate that the tag contains the specified text. + if not self.match_contains(el, selector.contains): + continue + match = not is_not + break + + # Restore actual namespaces being used for external selector lists + if is_html: + self.namespaces = namespaces + self.iframe_restrict = iframe_restrict + + return match + + def select(self, limit=0): + """Match all tags under the targeted tag.""" + + if limit < 1: + limit = None + + for child in self.get_descendants(self.tag): + if self.match(child): + yield child + if limit is not None: + limit -= 1 + if limit < 1: + break + + def closest(self): + """Match closest ancestor.""" + + current = self.tag + closest = None + while closest is None and current is not None: + if self.match(current): + closest = current + else: + current = self.get_parent(current) + return closest + + def filter(self): # noqa A001 + """Filter tag's children.""" + + return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)] + + def match(self, el): + """Match.""" + + return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) + + +class CSSMatch(_DocumentNav, _Match): + """The Beautiful Soup CSS match class.""" + + +class CommentsMatch(_DocumentNav): + """Comments matcher.""" + + def __init__(self, el): + """Initialize.""" + + 
self.assert_valid_input(el) + self.tag = el + + def get_comments(self, limit=0): + """Get comments.""" + + if limit < 1: + limit = None + + for child in self.get_descendants(self.tag, tags=False): + if self.is_comment(child): + yield child + if limit is not None: + limit -= 1 + if limit < 1: + break + + +class SoupSieve(ct.Immutable): + """Compiled Soup Sieve selector matching object.""" + + __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash") + + def __init__(self, pattern, selectors, namespaces, custom, flags): + """Initialize.""" + + super(SoupSieve, self).__init__( + pattern=pattern, + selectors=selectors, + namespaces=namespaces, + custom=custom, + flags=flags + ) + + def match(self, tag): + """Match.""" + + return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag) + + def closest(self, tag): + """Match closest ancestor.""" + + return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest() + + def filter(self, iterable): # noqa A001 + """ + Filter. + + `CSSMatch` can cache certain searches for tags of the same document, + so if we are given a tag, all tags are from the same document, + and we can take advantage of the optimization. + + Any other kind of iterable could have tags from different documents or detached tags, + so for those, we use a new `CSSMatch` for each item in the iterable. 
+ """ + + if CSSMatch.is_tag(iterable): + return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter() + else: + return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] + + @util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.") + def comments(self, tag, limit=0): + """Get comments only.""" + + return [comment for comment in CommentsMatch(tag).get_comments(limit)] + + @util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.") + def icomments(self, tag, limit=0): + """Iterate comments only.""" + + for comment in CommentsMatch(tag).get_comments(limit): + yield comment + + def select_one(self, tag): + """Select a single tag.""" + + tags = self.select(tag, limit=1) + return tags[0] if tags else None + + def select(self, tag, limit=0): + """Select the specified tags.""" + + return list(self.iselect(tag, limit)) + + def iselect(self, tag, limit=0): + """Iterate the specified tags.""" + + for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit): + yield el + + def __repr__(self): # pragma: no cover + """Representation.""" + + return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format( + self.pattern, + self.namespaces, + self.custom, + self.flags + ) + + __str__ = __repr__ + + +ct.pickle_register(SoupSieve) |