about summary refs log tree commit diff homepage
diff options
context:
space:
mode:
author dirkf <[email protected]> 2024-05-28 16:38:20 +0100
committer dirkf <[email protected]> 2024-05-30 15:46:36 +0100
commit 34484e49f5cd91a830f5459a5b673b7c05a22e24 (patch)
tree 1f59edbfaf69074316b551eda25bead767639510
parent 06da64ee51cd405b9392ba484cf7d3d31a88ee30 (diff)
download youtube-dl-34484e49f5cd91a830f5459a5b673b7c05a22e24.tar.gz
youtube-dl-34484e49f5cd91a830f5459a5b673b7c05a22e24.zip
[compat] Improve compat_etree_iterfind for Py2.6
Adapted from https://raw.githubusercontent.com/python/cpython/2.7/Lib/xml/etree/ElementPath.py
-rw-r--r-- youtube_dl/compat.py 214
1 files changed, 211 insertions, 3 deletions
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index d5485c7e8..0371896ab 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -2720,9 +2720,217 @@ if sys.version_info < (2, 7):
xpath = xpath.encode('ascii')
return xpath
- def compat_etree_iterfind(element, match):
- for from_ in element.findall(match):
- yield from_
+ # further code below based on CPython 2.7 source
+ import functools
+
# Tokenizer pattern for path expressions.  Every match is a 2-tuple:
# group 1 holds an operator or a quoted string, group 2 a tag-like name;
# a run of white space matches neither group and so yields ('', '').
_xpath_tokenizer_re = re.compile(r'''(?x)
    (                                   # (1)
        '[^']*'|"[^"]*"|                # quoted strings, or
        ::|//?|\.\.|\(\)|[/.*:[\]()@=]  # navigation specials
    )|                                  # or (2)
    ((?:\{[^}]+\})?[^/[\]()@=\s]+)|     # token: optional {ns}, no specials
    \s+                                 # or white space
    ''')


def _xpath_tokenizer(pattern, namespaces=None):
    """Yield ``(operator, name)`` token pairs for the path *pattern*.

    A name written as ``prefix:local`` (and not already in ``{uri}local``
    form) is rewritten to ``{uri}local`` by looking ``prefix`` up in the
    *namespaces* mapping.  All other tokens are passed through unchanged.

    Raises SyntaxError when such a prefix cannot be resolved, which
    includes the case where *namespaces* is None or empty.
    """
    for token in _xpath_tokenizer_re.findall(pattern):
        tag = token[1]
        if tag and tag[0] != "{" and ":" in tag:
            # split first, so that `prefix` is always bound by the time the
            # error message below needs it
            prefix, uri = tag.split(":", 1)
            try:
                if not namespaces:
                    raise KeyError(prefix)
                ns_uri = namespaces[prefix]
            except KeyError:
                raise SyntaxError("prefix %r not found in prefix map" % prefix)
            yield token[0], "{%s}%s" % (ns_uri, uri)
        else:
            yield token
+
def _get_parent_map(context):
    """Return the child -> parent mapping for the tree under ``context.root``.

    The mapping is built on first use by walking ``context.root`` and is
    stored on ``context.parent_map``, so later calls with the same context
    return the same dict.
    """
    if context.parent_map is not None:
        return context.parent_map
    child_to_parent = {}
    # publish the dict before filling it, as callers share it via the context
    context.parent_map = child_to_parent
    for parent in context.root.getiterator():
        for child in parent:
            child_to_parent[child] = parent
    return child_to_parent
+
def _select(context, result, filter_fn=lambda *_: True):
    """Yield the direct children of each element in *result*.

    A child is yielded only when ``filter_fn(child, parent)`` is true; the
    default filter accepts every child.  *context* is not used.
    """
    for parent in result:
        for child in parent:
            if not filter_fn(child, parent):
                continue
            yield child
+
def _prepare_child(next_, token):
    """Build a selector for the direct children whose tag is ``token[1]``.

    *next_* is not used.
    """
    wanted = token[1]

    def has_wanted_tag(child, _):
        return child.tag == wanted

    return functools.partial(_select, filter_fn=has_wanted_tag)
+
def _prepare_star(next_, token):
    """Build the selector for ``*``: every direct child, unfiltered.

    Neither argument is used; the unfiltered ``_select`` is the selector.
    """
    select_all_children = _select
    return select_all_children
+
def _prepare_self(next_, token):
    """Build the selector for ``.``: pass the current elements through.

    Neither argument is used.
    """
    def select(_, result):
        return (elem for elem in result)

    return select
+
def _prepare_descendant(next_, token):
    """Build the selector for ``//``, consuming the following token.

    The token after ``//`` must be ``*`` or a plain tag name; anything
    else raises SyntaxError("invalid descendant").  StopIteration from
    *next_* propagates when no token follows.
    """
    following = next(next_)
    op = following[0]
    if op == "*":
        tag = "*"
    elif op:
        raise SyntaxError("invalid descendant")
    else:
        tag = following[1]

    def select(context, result):
        for start in result:
            for node in start.getiterator(tag):
                # the walk includes its starting element; skip it
                if node is start:
                    continue
                yield node

    return select
+
def _prepare_parent(next_, token):
    """Build the selector for ``..``: the parents of the current elements.

    Each parent is yielded once, the first time it is met.  Elements
    without an entry in the parent map are skipped.  Neither argument
    is used.
    """
    def select(context, result):
        # FIXME: raise error if .. is applied at toplevel?
        parents = _get_parent_map(context)
        already_yielded = set()
        for elem in result:
            try:
                parent = parents[elem]
            except KeyError:
                continue
            if parent in already_yielded:
                continue
            already_yielded.add(parent)
            yield parent

    return select
+
def _prepare_predicate(next_, token):
    """Compile a ``[...]`` predicate into a selector function.

    Tokens are consumed from *next_* up to and including the closing
    ``]`` (or until the tokens run out).  The supported forms are
    ``[@attribute]``, ``[@attribute='value']``, ``[tag]``,
    ``[tag='text']``, ``[index]``, ``[last()]`` and ``[last()-index]``.

    Returns a callable ``select(context, result)`` yielding the elements
    of *result* that satisfy the predicate.

    Raises SyntaxError for any other predicate, or for a function other
    than ``last``.
    """
    signature = []
    predicate = []
    for token in next_:
        if token[0] == "]":
            break
        if token[0] and token[0][:1] in "'\"":
            # quoted string: record a bare quote as the signature item and
            # the unquoted contents as the value
            token = "'", token[0][1:-1]
        signature.append(token[0] or "-")
        predicate.append(token[1])

    def select(context, result, filter_fn=lambda _: True):
        for elem in result:
            if filter_fn(elem):
                yield elem

    signature = "".join(signature)
    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]
        return functools.partial(
            select, filter_fn=lambda el: el.get(key) is not None)
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]
        return functools.partial(
            select, filter_fn=lambda el: el.get(key) == value)
    if signature == "-" and not re.match(r"\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]
        return functools.partial(
            select, filter_fn=lambda el: el.find(tag) is not None)
    if signature == "-='" and not re.match(r"\d+$", predicate[0]):
        # [tag='value']
        tag = predicate[0]
        value = predicate[-1]

        def itertext(el):
            # All text inside `el` in document order: its own text, then
            # for each child the child's inner text followed by the tail
            # that comes after the child.  Tails must be included, or
            # mixed content such as <t>x<b>y</b>z</t> loses the "z".
            if el.text:
                yield el.text
            for child in el:
                for piece in itertext(child):
                    yield piece
                if child.tail:
                    yield child.tail

        def select(context, result):
            for elem in result:
                for e in elem.findall(tag):
                    if "".join(itertext(e)) == value:
                        yield elem
                        break
        return select
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # path positions are 1-based
            index = int(predicate[0]) - 1
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                # the "-N" part arrives as one name token, so this is the
                # negative list index for "N before the last"
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression")
            else:
                index = -1

        def select(context, result):
            parent_map = _get_parent_map(context)
            for elem in result:
                try:
                    parent = parent_map[elem]
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.findall(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except (IndexError, KeyError):
                    # no parent known, or no sibling at that position
                    pass
        return select
    raise SyntaxError("invalid predicate")
+
# Dispatch table: first item of a token -> factory that builds the selector
# for that step.  The "" key covers plain tag names, whose operator item is
# empty.
ops = dict([
    ("", _prepare_child),
    ("*", _prepare_star),
    (".", _prepare_self),
    ("..", _prepare_parent),
    ("//", _prepare_descendant),
    ("[", _prepare_predicate),
])

# Compiled selector lists, memoised by compat_etree_iterfind
_cache = dict()
+
class _SelectorContext:
    """State shared by the selectors while one path is evaluated."""

    def __init__(self, root):
        # element that the path is evaluated against
        self.root = root

    # child -> parent dict; stays None until _get_parent_map() builds it
    parent_map = None
+
+ ##
+ # Generate all matching objects.
+
def compat_etree_iterfind(elem, path, namespaces=None):
    """Return an iterable of the elements matching *path*, relative to *elem*.

    *path* uses the limited path syntax handled by the `ops` table; a
    trailing ``/`` is treated as ``/*``.  *namespaces*, if given, maps the
    prefixes used in *path* to namespace URIs.

    Raises SyntaxError for an absolute path (leading ``/``), for a path
    that ends where a further token is required, and for whatever the
    tokenizer or the selector factories reject.
    """
    # compile selector pattern
    if path[-1:] == "/":
        path = path + "*"  # implicit all (FIXME: keep this?)
    # The prefix map changes what a path compiles to, so it has to be part
    # of the cache key; keying on the path alone would hand back a selector
    # built for some earlier, different map.
    if namespaces:
        cache_key = (path, tuple(sorted(namespaces.items())))
    else:
        cache_key = (path, None)
    try:
        selector = _cache[cache_key]
    except KeyError:
        # crude bound on the cache: start over once it has grown large
        if len(_cache) > 100:
            _cache.clear()
        if path[:1] == "/":
            raise SyntaxError("cannot use absolute path on element")
        tokens = _xpath_tokenizer(path, namespaces)
        selector = []
        for token in tokens:
            if token[0] == "/":
                continue
            try:
                # a factory may pull further tokens from the same iterator
                selector.append(ops[token[0]](tokens, token))
            except StopIteration:
                raise SyntaxError("invalid path")
        _cache[cache_key] = selector
    # execute selector pattern
    result = [elem]
    context = _SelectorContext(elem)
    for select in selector:
        result = select(context, result)
    return result
+
+ # end of code based on CPython 2.7 source
+
else:
compat_xpath = lambda xpath: xpath