aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authordirkf <[email protected]>2024-12-12 04:13:06 +0000
committerdirkf <[email protected]>2024-12-16 12:38:51 +0000
commit94fd7746084d87a43e34b094c5db1325f91ce053 (patch)
tree4538e3e7186d9b2295c7337b19a0ef81991b9c44
parent5dee6213edddb5ea00775db6b3b73f8355144485 (diff)
downloadyoutube-dl-94fd7746084d87a43e34b094c5db1325f91ce053.tar.gz
youtube-dl-94fd7746084d87a43e34b094c5db1325f91ce053.zip
[jsinterp] Fix and improve split/join
* improve split/join edge cases * correctly implement regex split (not like re.split)
-rw-r--r--test/test_jsinterp.py19
-rw-r--r--youtube_dl/jsinterp.py48
2 files changed, 62 insertions, 5 deletions
diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py
index 52eaf1ed8..b6e87e9f1 100644
--- a/test/test_jsinterp.py
+++ b/test/test_jsinterp.py
@@ -483,6 +483,13 @@ class TestJSInterpreter(unittest.TestCase):
self._test(jsi, 't-e-s-t', args=[test_input, '-'])
self._test(jsi, '', args=[[], '-'])
+ self._test('function f(){return '
+ '[1, 1.0, "abc", {a: 1}, null, undefined, Infinity, NaN].join()}',
+ '1,1,abc,[object Object],,,Infinity,NaN')
+ self._test('function f(){return '
+ '[1, 1.0, "abc", {a: 1}, null, undefined, Infinity, NaN].join("~")}',
+ '1~1~abc~[object Object]~~~Infinity~NaN')
+
def test_split(self):
test_result = list('test')
tests = [
@@ -496,6 +503,18 @@ class TestJSInterpreter(unittest.TestCase):
self._test(jsi, test_result, args=['t-e-s-t', '-'])
self._test(jsi, [''], args=['', '-'])
self._test(jsi, [], args=['', ''])
+ # RegExp split
+ self._test('function f(){return "test".split(/(?:)/)}',
+ ['t', 'e', 's', 't'])
+ self._test('function f(){return "t-e-s-t".split(/[es-]+/)}',
+ ['t', 't'])
+ # from MDN: surrogate pairs aren't handled: case 1 fails
+ # self._test('function f(){return "😄😄".split(/(?:)/)}',
+ # ['\ud83d', '\ude04', '\ud83d', '\ude04'])
+ # case 2 fails on Py 3.0-3.2, which give the case 1 result instead
+ if sys.version_info >= (2, 6) and not ((3, 0) <= sys.version_info < (3, 3)):
+ self._test('function f(){return "😄😄".split(/(?:)/u)}',
+ ['😄', '😄'])
def test_slice(self):
self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice()}', [0, 1, 2, 3, 4, 5, 6, 7, 8])
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index 73934a6af..bec959946 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -397,6 +397,9 @@ class JSInterpreter(object):
RE_FLAGS = {
# special knowledge: Python's re flags are bitmask values, current max 128
# invent new bitmask values well above that for literal parsing
+ # JS 'u' flag is effectively always set (surrogate pairs aren't seen,
+ # but \u{...} and \p{...} escapes aren't handled); no additional JS 'v'
+ # features are supported
# TODO: execute matches with these flags (remaining: d, y)
'd': 1024, # Generate indices for substring matches
'g': 2048, # Global search
@@ -404,6 +407,7 @@ class JSInterpreter(object):
'm': re.M, # Multi-line search
's': re.S, # Allows . to match newline characters
'u': re.U, # Treat a pattern as a sequence of unicode code points
+ 'v': re.U, # Like 'u' with extended character class and \p{} syntax
'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string
}
@@ -1047,13 +1051,47 @@ class JSInterpreter(object):
raise self.Exception('Unsupported Math method ' + member, expr=expr)
if member == 'split':
- assertion(argvals, 'takes one or more arguments')
- assertion(len(argvals) == 1, 'with limit argument is not implemented')
- return obj.split(argvals[0]) if argvals[0] else list(obj)
+ assertion(len(argvals) <= 2, 'takes at most two arguments')
+ if len(argvals) > 1:
+ limit = argvals[1]
+ assertion(isinstance(limit, int) and limit >= 0, 'integer limit >= 0')
+ if limit == 0:
+ return []
+ else:
+ limit = 0
+ if len(argvals) == 0:
+ argvals = [JS_Undefined]
+ elif isinstance(argvals[0], self.JS_RegExp):
+ # avoid re.split(): similar to JS regex split, but not close enough
+
+ def where():
+ for m in argvals[0].finditer(obj):
+ yield m.span(0)
+ yield (None, None)
+
+ def splits(limit=limit):
+ i = 0
+ for j, jj in where():
+ if j == jj == 0:
+ continue
+ if j is None and i >= len(obj):
+ break
+ yield obj[i:j]
+ if jj is None or limit == 1:
+ break
+ limit -= 1
+ i = jj
+
+ return list(splits())
+ return (
+ obj.split(argvals[0], limit - 1) if argvals[0] and argvals[0] != JS_Undefined
+ else list(obj)[:limit or None])
elif member == 'join':
assertion(isinstance(obj, list), 'must be applied on a list')
- assertion(len(argvals) == 1, 'takes exactly one argument')
- return argvals[0].join(obj)
+ assertion(len(argvals) <= 1, 'takes at most one argument')
+ return (',' if len(argvals) == 0 else argvals[0]).join(
+ ('' if x in (None, JS_Undefined) else _js_toString(x))
+ for x in obj)
elif member == 'reverse':
assertion(not argvals, 'does not take any arguments')
obj.reverse()