diff options
author | dirkf <[email protected]> | 2024-01-27 18:17:09 +0000 |
---|---|---|
committer | dirkf <[email protected]> | 2024-02-02 12:36:05 +0000 |
commit | 4416f82c809a81737d68875dcb201e366d58dabd (patch) | |
tree | 9492c135e64ab5d2e97505543cfcb491741a66c2 | |
parent | bdda6b81df61f52eed2502c8ae624d297d918488 (diff) | |
download | youtube-dl-4416f82c809a81737d68875dcb201e366d58dabd.tar.gz youtube-dl-4416f82c809a81737d68875dcb201e366d58dabd.zip |
[Vbox7IE] Sanitise ld+json containing unexpected characters
* based on PR #29680
* added hack to force invoking `transform_source`
* fixes #26218
-rw-r--r-- | youtube_dl/extractor/vbox7.py | 22 |
1 files changed, 22 insertions, 0 deletions
```diff
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index c504c5311..d114ecb07 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -5,6 +5,7 @@ import re
 import time
 
 from .common import InfoExtractor
+from ..compat import compat_kwargs
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -75,6 +76,27 @@ class Vbox7IE(InfoExtractor):
         if mobj:
             return mobj.group('url')
 
+    # transform_source=None, fatal=True
+    def _parse_json(self, json_string, video_id, *args, **kwargs):
+        if '"@context"' in json_string[:30]:
+            # this is ld+json, or that's the way to bet
+            transform_source = args[0] if len(args) > 0 else kwargs.get('transform_source')
+            if not transform_source:
+
+                def fix_chars(src):
+                    # fix malformed ld+json: replace raw CRLFs with escaped LFs
+                    return re.sub(
+                        r'"[^"]+"', lambda m: re.sub(r'\r?\n', r'\\n', m.group(0)), src)
+
+                if len(args) > 0:
+                    args = (fix_chars,) + args[1:]
+                else:
+                    kwargs['transform_source'] = fix_chars
+                    kwargs = compat_kwargs(kwargs)
+
+        return super(Vbox7IE, self)._parse_json(
+            json_string, video_id, *args, **kwargs)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         url = 'https://vbox7.com/play:%s' % (video_id,)
```