From c4251b9aaa9a69e7f7b55197b3907e52b17150d4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 13 Jan 2017 10:08:51 +0100 Subject: [common] add possibility to customize akamai manifest host --- youtube_dl/extractor/common.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6fa7c334e..dce8c7d0d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1967,10 +1967,13 @@ class InfoExtractor(object): entries.append(media_info) return entries - def _extract_akamai_formats(self, manifest_url, video_id): + def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + hds_host = hosts.get('hds') + if hds_host: + f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) if 'hdcore=' not in f4m_url: f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign f4m_formats = self._extract_f4m_formats( @@ -1978,7 +1981,10 @@ class InfoExtractor(object): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) - m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + hls_host = hosts.get('hls') + if hls_host: + m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) -- cgit v1.2.3 From d04621daf451d601dba80dc0f2baa29e404e4ca6 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sun, 29 Jan 2017 05:36:53 +0700 Subject: [extractor/common] Fix duration per dash segment (closes #11868) --- youtube_dl/extractor/common.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dce8c7d0d..a3048fb59 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1751,14 +1751,16 @@ class InfoExtractor(object): # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI # or any YouTube dashsegments video fragments = [] - s_num = 0 - for segment_url in representation_ms_info['segment_urls']: - s = representation_ms_info['s'][s_num] + segment_index = 0 + timescale = representation_ms_info['timescale'] + for s in representation_ms_info['s']: + duration = float_or_none(s['d'], timescale) for r in range(s.get('r', 0) + 1): fragments.append({ - 'url': segment_url, - 'duration': float_or_none(s['d'], representation_ms_info['timescale']), + 'url': representation_ms_info['segment_urls'][segment_index], + 'duration': duration, }) + segment_index += 1 representation_ms_info['fragments'] = fragments # NB: MPD manifest may contain direct URLs to unfragmented media. # No fragments key is present in this case. -- cgit v1.2.3 From c58c2d63cbde07af66885829b7c3dbcdfbc096dd Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sun, 29 Jan 2017 05:56:43 +0700 Subject: [extractor/common] Document forgotten fragment base and path interfaces --- youtube_dl/extractor/common.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a3048fb59..fb484b6f2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -121,9 +121,19 @@ class InfoExtractor(object): download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", "m3u8", "m3u8_native" or "http_dash_segments". - * fragments A list of fragments of the fragmented media, - with the following entries: - * "url" (mandatory) - fragment's URL + * fragment_base_url + Base URL for fragments. Each fragment's path + value (if present) will be relative to + this URL. + * fragments A list of fragments of a fragmented media. + Each fragment entry must contain either an url + or a path. If an url is present it should be + considered by a client. Otherwise both path and + fragment_base_url must be present. Here is + the list of all potential fields: + * "url" - fragment's URL + * "path" - fragment's path relative to + fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) * preference Order number of this format. If this field is -- cgit v1.2.3 From e228616c6e73561f0c6d32d6b681bbba321c06aa Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sun, 29 Jan 2017 06:57:39 +0700 Subject: [extractor/common] Fix initialization template (closes #11605, closes #11825) --- youtube_dl/extractor/common.py | 48 ++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 14 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fb484b6f2..5a15a9536 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1637,12 +1637,12 @@ class InfoExtractor(object): segment_template = element.find(_add_ns('SegmentTemplate')) if segment_template is not None: extract_common(segment_template) - media_template = segment_template.get('media') - if media_template: - ms_info['media_template'] = media_template + media = segment_template.get('media') + if media: + ms_info['media'] = media initialization = segment_template.get('initialization') if initialization: - ms_info['initialization_url'] = initialization + ms_info['initialization'] = initialization else: extract_Initialization(segment_template) return ms_info @@ -1686,6 +1686,7 @@ class InfoExtractor(object): lang = representation_attrib.get('lang') url_el = representation.find(_add_ns('BaseURL')) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) + bandwidth = int_or_none(representation_attrib.get('bandwidth')) f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, @@ -1693,7 +1694,7 @@ class InfoExtractor(object): 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), - 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), + 'tbr': int_or_none(bandwidth, 1000), 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 'fps': int_or_none(representation_attrib.get('frameRate')), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, @@ -1702,13 +1703,32 @@ class InfoExtractor(object): } f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) - if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: - media_template = representation_ms_info['media_template'] - media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) - media_template.replace('$$', '$') + def prepare_template(template_name, identifiers): + t = representation_ms_info[template_name] + t = t.replace('$RepresentationID$', representation_id) + t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) + t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) + t.replace('$$', '$') + return t + + # @initialization is a regular template like @media one + # so it should be handled just the same way (see + # https://github.com/rg3/youtube-dl/issues/11605) + if 'initialization' in representation_ms_info: + initialization_template = prepare_template( + 'initialization', + # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and + # $Time$ shall not be included for @initialization thus + # only $Bandwidth$ remains + ('Bandwidth', )) + representation_ms_info['initialization_url'] = initialization_template % { + 'Bandwidth': bandwidth, + } + + if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: + + media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # can't be used at the same time @@ -1720,7 +1740,7 @@ class InfoExtractor(object): representation_ms_info['fragments'] = [{ 'url': media_template % { 'Number': segment_number, - 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), + 'Bandwidth': bandwidth, }, 'duration': segment_duration, } for segment_number in range( @@ -1738,7 +1758,7 @@ class InfoExtractor(object): def add_segment_url(): segment_url = media_template % { 'Time': segment_time, - 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), + 'Bandwidth': bandwidth, 'Number': segment_number, } representation_ms_info['fragments'].append({ @@ -1780,7 +1800,7 @@ class InfoExtractor(object): 'protocol': 'http_dash_segments', }) if 'initialization_url' in representation_ms_info: - initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) + initialization_url = representation_ms_info['initialization_url'] if not f.get('url'): f['url'] = initialization_url f['fragments'].append({'url': initialization_url}) -- cgit v1.2.3 From 45024183aea169dc898902388f782485de02cbac Mon Sep 17 00:00:00 2001 From: Mattias Wadman Date: Fri, 3 Feb 2017 05:10:13 +0100 Subject: [infoq] Add audio only format if available (#11565) * [infoq] Add audio only format if available Refactor cookie code into a function. Renamed formats to http_video, http_audio, rtmp_video Renamed extract functions to video instead of videos as they return one or no video. * [infoq] Rename to _extract_cookies as it more than one * [infoq] Remove redundant determine_ext * [infoq] Add comment about hardcoded URL * [infoq] Use _hidden_inputs instead of messy regex * [infoq] Probe if audio URL is valid Make it possible to pass headers to _is_valid_url * [infoq] Add audio only test --- youtube_dl/extractor/common.py | 4 +-- youtube_dl/extractor/infoq.py | 63 +++++++++++++++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 12 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5a15a9536..2c8ec1417 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1025,13 +1025,13 @@ class InfoExtractor(object): unique_formats.append(f) formats[:] = unique_formats - def _is_valid_url(self, url, video_id, item='video'): + def _is_valid_url(self, url, video_id, item='video', headers={}): url = self._proto_relative_url(url, scheme='http:') # For now assume non HTTP(S) URLs always valid if not (url.startswith('http://') or url.startswith('https://')): return True try: - self._request_webpage(url, video_id, 'Checking %s URL' % item) + self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True except ExtractorError as e: if isinstance(e.cause, compat_urllib_error.URLError): diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index cca0b8a93..9fb71e8ef 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import base64 -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) from ..utils import determine_ext from .bokecc import BokeCCBaseIE @@ -33,9 +36,21 @@ class InfoQIE(BokeCCBaseIE): 'ext': 'flv', 'description': 'md5:308d981fb28fa42f49f9568322c683ff', }, + }, { + 'url': 'https://www.infoq.com/presentations/Simple-Made-Easy', + 'md5': '0e34642d4d9ef44bf86f66f6399672db', + 'info_dict': { + 'id': 'Simple-Made-Easy', + 'title': 'Simple Made Easy', + 'ext': 'mp3', + 'description': 'md5:3e0e213a8bbd074796ef89ea35ada25b', + }, + 'params': { + 'format': 'bestaudio', + }, }] - def _extract_rtmp_videos(self, webpage): + def _extract_rtmp_video(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' @@ -47,28 +62,53 @@ class InfoQIE(BokeCCBaseIE): playpath = 'mp4:' + real_id return [{ - 'format_id': 'rtmp', + 'format_id': 'rtmp_video', 'url': video_url, 'ext': determine_ext(playpath), 'play_path': playpath, }] - def _extract_http_videos(self, webpage): - http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') - + def _extract_cookies(self, webpage): policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') + return 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( + policy, signature, key_pair_id) + def _extract_http_video(self, webpage): + http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') return [{ - 'format_id': 'http', + 'format_id': 'http_video', 'url': http_video_url, 'http_headers': { - 'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( - policy, signature, key_pair_id), + 'Cookie': self._extract_cookies(webpage) }, }] + def _extract_http_audio(self, webpage, video_id): + fields = self._hidden_inputs(webpage) + http_audio_url = fields['filename'] + if http_audio_url is None: + return [] + + cookies_header = {'Cookie': self._extract_cookies(webpage)} + + # base URL is found in the Location header in the response returned by + # GET https://www.infoq.com/mp3download.action?filename=... when logged in. + http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + + # audio file seem to be missing some times even if there is a download link + # so probe URL to make sure + if not self._is_valid_url(http_audio_url, video_id, headers=cookies_header): + return [] + + return [{ + 'format_id': 'http_audio', + 'url': http_audio_url, + 'vcodec': 'none', + 'http_headers': cookies_header, + }] + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -80,7 +120,10 @@ class InfoQIE(BokeCCBaseIE): # for China videos, HTTP video URL exists but always fails with 403 formats = self._extract_bokecc_formats(webpage, video_id) else: - formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) + formats = ( + self._extract_rtmp_video(webpage) + + self._extract_http_video(webpage) + + self._extract_http_audio(webpage, video_id)) self._sort_formats(formats) -- cgit v1.2.3 From 2aec7256ae3ef877cc57f7d27f0fa171a7f25a98 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Mon, 6 Feb 2017 00:20:30 +0700 Subject: [extractor/common] Speed-up media tags regex (closes #11979) --- youtube_dl/extractor/common.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2c8ec1417..519188622 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1959,7 +1959,12 @@ class InfoExtractor(object): media_tags = [(media_tag, media_type, '') for media_tag, media_type in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] - media_tags.extend(re.findall(r'(?s)(<(?Pvideo|audio)[^>]*>)(.*?)', webpage)) + media_tags.extend(re.findall( + # We only allow video|audio followed by a whitespace or '>'. + # Allowing more characters may end up in significant slow down (see + # https://github.com/rg3/youtube-dl/issues/11979, example URL: + # http://www.porntrex.com/maps/videositemap.xml). + r'(?s)(<(?Pvideo|audio)(?:\s+[^>]*)?>)(.*?)', webpage)) for media_tag, media_type, media_content in media_tags: media_info = { 'formats': [], -- cgit v1.2.3 From 242a14a1f6ea14b8e7de9aa1eddaf938de825e1e Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Tue, 7 Feb 2017 00:22:16 +0700 Subject: [extractor/common] Fix audio only with audio group in m3u8 (closes #11995) --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 519188622..0b4e2ac20 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1315,8 +1315,8 @@ class InfoExtractor(object): 'abr': abr, }) f.update(parse_codecs(last_info.get('CODECS'))) - if audio_in_video_stream.get(last_info.get('AUDIO')) is False: - # TODO: update acodec for for audio only formats with the same GROUP-ID + if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none': + # TODO: update acodec for audio only formats with the same GROUP-ID f['acodec'] = 'none' formats.append(f) last_info = {} -- cgit v1.2.3 From 08a00eef79c8f8b12211513b241404394ef6120c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 10 Feb 2017 16:51:41 +0100 Subject: [extractor/common] skip m3u8 manifests protected with Adobe Flash Access --- youtube_dl/extractor/common.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0b4e2ac20..9681453ca 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1208,6 +1208,9 @@ class InfoExtractor(object): m3u8_doc, urlh = res m3u8_url = urlh.geturl() + if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access + return [] + formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] format_url = lambda u: ( -- cgit v1.2.3 From a4a554a79354981fcab55de8eaab7b95a40bbb48 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 16 Feb 2017 23:42:36 +0800 Subject: [generic] Try parsing JWPlayer embedded videos (closes #12030) --- ChangeLog | 6 ++ youtube_dl/extractor/archiveorg.py | 4 +- youtube_dl/extractor/common.py | 118 +++++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 20 +++++ youtube_dl/extractor/jwplatform.py | 132 +------------------------------ youtube_dl/extractor/ondemandkorea.py | 4 +- youtube_dl/extractor/pornhub.py | 44 ----------- youtube_dl/extractor/pornoxo.py | 4 +- youtube_dl/extractor/rentv.py | 3 +- youtube_dl/extractor/rudo.py | 4 +- youtube_dl/extractor/screencastomatic.py | 4 +- youtube_dl/extractor/sendtonews.py | 4 +- youtube_dl/extractor/thisav.py | 4 +- youtube_dl/extractor/tvnoe.py | 4 +- youtube_dl/extractor/vidzi.py | 4 +- youtube_dl/extractor/wimp.py | 4 +- 16 files changed, 166 insertions(+), 197 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/ChangeLog b/ChangeLog index 8ef8a8307..4e69b03d0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors ++ [generic] Support complex JWPlayer embedded videos (#12030) + + version 2017.02.16 Core diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 486dff82d..e21045bed 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -1,13 +1,13 @@ from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( unified_strdate, clean_html, ) -class ArchiveOrgIE(JWPlatformBaseIE): +class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#]+)(?:[?].*)?$' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9681453ca..f6ff56eda 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -40,6 +40,7 @@ from ..utils import ( fix_xml_ampersands, float_or_none, int_or_none, + js_to_json, parse_iso8601, RegexNotFoundError, sanitize_filename, @@ -2073,6 +2074,123 @@ class InfoExtractor(object): }) return formats + @staticmethod + def _find_jwplayer_data(webpage): + mobj = re.search( + r'jwplayer\((?P[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P[^)]+)\)', + webpage) + if mobj: + return mobj.group('options') + + def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): + jwplayer_data = self._parse_json( + self._find_jwplayer_data(webpage), video_id, + transform_source=js_to_json) + return self._parse_jwplayer_data( + jwplayer_data, video_id, *args, **kwargs) + + def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + # JWPlayer backward compatibility: flattened playlists + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if 'playlist' not in jwplayer_data: + jwplayer_data = {'playlist': [jwplayer_data]} + + entries = [] + + # JWPlayer backward compatibility: single playlist item + # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 + if not isinstance(jwplayer_data['playlist'], list): + jwplayer_data['playlist'] = [jwplayer_data['playlist']] + + for video_data in jwplayer_data['playlist']: + # JWPlayer backward compatibility: flattened sources + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 + if 'sources' not in video_data: + video_data['sources'] = [video_data] + + this_video_id = video_id or video_data['mediaid'] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + if base_url: + source_url = compat_urlparse.urljoin(base_url, source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + source_url, this_video_id, mpd_id=mpd_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, + }) + else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like 1080p. + height = int_or_none(self._search_regex( + r'^(\d{3,})[pP]$', source.get('label') or '', + 'height', default=None)) + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': height, + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' + + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + self._sort_formats(formats) + + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if track.get('kind') != 'captions': + continue + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) + + entries.append({ + 'id': this_video_id, + 'title': video_data['title'] if require_title else video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), + 'subtitles': subtitles, + 'formats': formats, + }) + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries) + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a2b0298ec..3db31debe 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -20,6 +20,7 @@ from ..utils import ( float_or_none, HEADRequest, is_html, + js_to_json, orderedSet, sanitized_Request, smuggle_url, @@ -961,6 +962,16 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + # Complex jwplayer + { + 'url': 'http://www.indiedb.com/games/king-machine/videos', + 'info_dict': { + 'id': 'videos', + 'ext': 'mp4', + 'title': 'king machine trailer 1', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', @@ -2488,6 +2499,15 @@ class GenericIE(InfoExtractor): self._sort_formats(entry['formats']) return self.playlist_result(entries) + jwplayer_data_str = self._find_jwplayer_data(webpage) + if jwplayer_data_str: + try: + jwplayer_data = self._parse_json( + jwplayer_data_str, video_id, transform_source=js_to_json) + return self._parse_jwplayer_data(jwplayer_data, video_id) + except ExtractorError: + pass + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index aff7ab49a..33d55f770 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,139 +4,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - js_to_json, - mimetype2ext, - urljoin, -) -class JWPlatformBaseIE(InfoExtractor): - @staticmethod - def _find_jwplayer_data(webpage): - # TODO: Merge this with JWPlayer-related codes in generic.py - - mobj = re.search( - r'jwplayer\((?P[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P[^)]+)\)', - webpage) - if mobj: - return mobj.group('options') - - def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._parse_json( - self._find_jwplayer_data(webpage), video_id, - transform_source=js_to_json) - return self._parse_jwplayer_data( - jwplayer_data, video_id, *args, **kwargs) - - def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, - m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - - entries = [] - - # JWPlayer backward compatibility: single playlist item - # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] - - for video_data in jwplayer_data['playlist']: - # JWPlayer backward compatibility: flattened sources - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 - if 'sources' not in video_data: - video_data['sources'] = [video_data] - - this_video_id = video_id or video_data['mediaid'] - - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - source_url, this_video_id, mpd_id=mpd_id, fatal=False)) - # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - 'ext': ext, - }) - else: - height = int_or_none(source.get('height')) - if height is None: - # Often no height is provided but there is a label in - # format like 1080p. - height = int_or_none(self._search_regex( - r'^(\d{3,})[pP]$', source.get('label') or '', - 'height', default=None)) - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': height, - 'ext': ext, - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv' - - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) - self._sort_formats(formats) - - subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if track.get('kind') != 'captions': - continue - track_url = urljoin(base_url, track.get('file')) - if not track_url: - continue - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track_url) - }) - - entries.append({ - 'id': this_video_id, - 'title': video_data['title'] if require_title else video_data.get('title'), - 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), - 'subtitles': subtitles, - 'formats': formats, - }) - if len(entries) == 1: - return entries[0] - else: - return self.playlist_result(entries) - - -class JWPlatformIE(JWPlatformBaseIE): +class JWPlatformIE(InfoExtractor): _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P[a-zA-Z0-9]{8})' _TEST = { 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', diff --git a/youtube_dl/extractor/ondemandkorea.py b/youtube_dl/extractor/ondemandkorea.py index de1d6b08a..dcd157777 100644 --- a/youtube_dl/extractor/ondemandkorea.py +++ b/youtube_dl/extractor/ondemandkorea.py @@ -1,14 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( ExtractorError, js_to_json, ) -class OnDemandKoreaIE(JWPlatformBaseIE): +class OnDemandKoreaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html', diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 7a2737032..9b413590a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -169,50 +169,6 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - """ - video_variables = {} - for video_variablename, quote, video_variable in re.findall( - r'(player_quality_[0-9]{3,4}p\w+)\s*=\s*(["\'])(.+?)\2;', webpage): - video_variables[video_variablename] = video_variable - - video_urls = [] - for encoded_video_url in re.findall( - r'player_quality_[0-9]{3,4}p\s*=(.+?);', webpage): - for varname, varval in video_variables.items(): - encoded_video_url = encoded_video_url.replace(varname, varval) - video_urls.append(re.sub(r'[\s+]', '', encoded_video_url)) - - if webpage.find('"encrypted":true') != -1: - password = compat_urllib_parse_unquote_plus( - self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) - video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) - - formats = [] - for video_url in video_urls: - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format = '-'.join(format) - - m = re.match(r'^(?P[0-9]+)[pP]-(?P[0-9]+)[kK]$', format) - if m is None: - height = None - tbr = None - else: - height = int(m.group('height')) - tbr = int(m.group('tbr')) - - formats.append({ - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, - 'tbr': tbr, - 'height': height, - }) - self._sort_formats(formats) - """ - page_params = self._parse_json(self._search_regex( r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P{[^}]+})', webpage, 'page parameters', group='data', default='{}'), diff --git a/youtube_dl/extractor/pornoxo.py b/youtube_dl/extractor/pornoxo.py index 1a0cce7e0..2831368b6 100644 --- a/youtube_dl/extractor/pornoxo.py +++ b/youtube_dl/extractor/pornoxo.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( str_to_int, ) -class PornoXOIE(JWPlatformBaseIE): +class PornoXOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P\d+)/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index 422c02cff..d338b3a93 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -2,11 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .jwplatform import JWPlatformBaseIE from ..compat import compat_str -class RENTVIE(JWPlatformBaseIE): +class RENTVIE(InfoExtractor): _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P\d+)' _TESTS = [{ 'url': 'http://ren.tv/video/epizod/118577', diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py index 3bfe934d8..51644011e 100644 --- a/youtube_dl/extractor/rudo.py +++ b/youtube_dl/extractor/rudo.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( js_to_json, get_element_by_class, @@ -11,7 +11,7 @@ from ..utils import ( ) -class RudoIE(JWPlatformBaseIE): +class RudoIE(InfoExtractor): _VALID_URL = r'https?://rudo\.video/vod/(?P[0-9a-zA-Z]+)' _TEST = { diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index 94a2a37d2..b5e76c9af 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -1,11 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import js_to_json -class ScreencastOMaticIE(JWPlatformBaseIE): +class ScreencastOMaticIE(InfoExtractor): _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' _TEST = { 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py index 9880a5a78..9d9652949 100644 --- a/youtube_dl/extractor/sendtonews.py +++ b/youtube_dl/extractor/sendtonews.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( float_or_none, parse_iso8601, @@ -14,7 +14,7 @@ from ..utils import ( ) -class SendtoNewsIE(JWPlatformBaseIE): +class SendtoNewsIE(InfoExtractor): _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P[0-9A-Za-z-]+)' _TEST = { diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py index 4473a3c77..b7b3568cb 100644 --- a/youtube_dl/extractor/thisav.py +++ b/youtube_dl/extractor/thisav.py @@ -3,11 +3,11 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import remove_end -class ThisAVIE(JWPlatformBaseIE): +class ThisAVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P[0-9]+)/.*' _TESTS = [{ 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index 6d5c74826..1a5b76bf2 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( clean_html, get_element_by_class, @@ -9,7 +9,7 @@ from ..utils import ( ) -class TVNoeIE(JWPlatformBaseIE): +class TVNoeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P[0-9]+)' _TEST = { 'url': 'http://www.tvnoe.cz/video/10362', diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 9950c62ad..1f1828fce 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( decode_packed_codes, js_to_json, @@ -12,7 +12,7 @@ from ..utils import ( ) -class VidziIE(JWPlatformBaseIE): +class VidziIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?:embed-)?(?P[0-9a-zA-Z]+)' _TESTS = [{ 'url': 'http://vidzi.tv/cghql9yq6emu.html', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 54eb51427..c022fb33e 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals +from .common import InfoExtractor from .youtube import YoutubeIE -from .jwplatform import JWPlatformBaseIE -class WimpIE(JWPlatformBaseIE): +class WimpIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.wimp.com/maru-is-exhausted/', -- cgit v1.2.3 From 773f291dcbce486fefe24e1abd29735d374d0a9e Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sat, 4 Feb 2017 18:49:58 +0700 Subject: Add experimental geo restriction bypass mechanism Based on faking X-Forwarded-For HTTP header --- youtube_dl/YoutubeDL.py | 17 +++ youtube_dl/__init__.py | 2 + youtube_dl/extractor/common.py | 48 +++++++- youtube_dl/options.py | 12 ++ youtube_dl/utils.py | 267 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 340 insertions(+), 6 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a7bf5a1b0..ebace6b57 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -56,6 +56,8 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, + GeoRestrictedError, + ISO3166Utils, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -272,6 +274,13 @@ class YoutubeDL(object): If it returns None, the video is downloaded. match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. + bypass_geo_restriction: + Bypass geographic restriction via faking X-Forwarded-For + HTTP header (experimental) + bypass_geo_restriction_as_country: + Two-letter ISO 3166-2 country code that will be used for + explicit geographic restriction bypassing via faking + X-Forwarded-For HTTP header (experimental) The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. @@ -707,6 +716,14 @@ class YoutubeDL(object): return self.process_ie_result(ie_result, download, extra_info) else: return ie_result + except GeoRestrictedError as e: + msg = e.msg + if e.countries: + msg += '\nThis video is available in %s.' % ', '.join( + map(ISO3166Utils.short2full, e.countries)) + msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' + self.report_error(msg) + break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) break diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5c5b8094b..94f461a78 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -414,6 +414,8 @@ def _real_main(argv=None): 'cn_verification_proxy': opts.cn_verification_proxy, 'geo_verification_proxy': opts.geo_verification_proxy, 'config_location': opts.config_location, + 'bypass_geo_restriction': opts.bypass_geo_restriction, + 'bypass_geo_restriction_as_country': opts.bypass_geo_restriction_as_country, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f6ff56eda..96815099d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -6,6 +6,7 @@ import hashlib import json import netrc import os +import random import re import socket import sys @@ -39,6 +40,8 @@ from ..utils import ( ExtractorError, fix_xml_ampersands, float_or_none, + GeoRestrictedError, + GeoUtils, int_or_none, js_to_json, parse_iso8601, @@ -320,17 +323,25 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. + _BYPASS_GEO attribute may be set to False in order to disable + geo restriction bypass mechanisms for a particular extractor. + Though it won't disable explicit geo restriction bypass based on + country code provided with bypass_geo_restriction_as_country. + Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ _ready = False _downloader = None + _x_forwarded_for_ip = None + _BYPASS_GEO = True _WORKING = True def __init__(self, downloader=None): """Constructor. Receives an optional downloader.""" self._ready = False + self._x_forwarded_for_ip = None self.set_downloader(downloader) @classmethod @@ -359,6 +370,10 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" + if not self._x_forwarded_for_ip: + country_code = self._downloader.params.get('bypass_geo_restriction_as_country', None) + if country_code: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if not self._ready: self._real_initialize() self._ready = True @@ -366,8 +381,22 @@ class InfoExtractor(object): def extract(self, url): """Extracts URL information and returns it in list of dicts.""" try: - self.initialize() - return self._real_extract(url) + for _ in range(2): + try: + self.initialize() + return self._real_extract(url) + except GeoRestrictedError as e: + if (not self._downloader.params.get('bypass_geo_restriction_as_country', None) and + self._BYPASS_GEO and + self._downloader.params.get('bypass_geo_restriction', True) and + not self._x_forwarded_for_ip and + e.countries): + self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries)) + if self._x_forwarded_for_ip: + self.report_warning( + 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + continue + raise except ExtractorError: raise except compat_http_client.IncompleteRead as e: @@ -434,6 +463,15 @@ class InfoExtractor(object): if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal @@ -609,10 +647,8 @@ class InfoExtractor(object): expected=True) @staticmethod - def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): - raise ExtractorError( - '%s. You might want to use --proxy to workaround.' % msg, - expected=True) + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None): + raise GeoRestrictedError(msg, countries=countries) # Methods for following #608 @staticmethod diff --git a/youtube_dl/options.py b/youtube_dl/options.py index deff54324..2e194f6dc 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -549,6 +549,18 @@ def parseOpts(overrideArguments=None): 'Upper bound of a range for randomized sleep before each download ' '(maximum possible number of seconds to sleep). Must only be used ' 'along with --min-sleep-interval.')) + workarounds.add_option( + '--bypass-geo', + action='store_true', dest='bypass_geo_restriction', default=True, + help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + workarounds.add_option( + '--no-bypass-geo', + action='store_false', dest='bypass_geo_restriction', default=True, + help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + workarounds.add_option( + '--bypass-geo-as-country', metavar='CODE', + dest='bypass_geo_restriction_as_country', default=None, + help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity.add_option( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3f9e592e3..4e76b6b7b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -23,6 +23,7 @@ import operator import os import pipes import platform +import random import re import socket import ssl @@ -747,6 +748,18 @@ class RegexNotFoundError(ExtractorError): pass +class GeoRestrictedError(ExtractorError): + """Geographic restriction Error exception. + + This exception may be thrown when a video is not available from your + geographic location due to geographic restrictions imposed by a website. + """ + def __init__(self, msg, countries=None): + super(GeoRestrictedError, self).__init__(msg, expected=True) + self.msg = msg + self.countries = countries + + class DownloadError(YoutubeDLError): """Download Error exception. @@ -3027,6 +3040,260 @@ class ISO3166Utils(object): return cls._country_map.get(code.upper()) +class GeoUtils(object): + # Major IPv4 address blocks per country + _country_ip_map = { + 'AD': '85.94.160.0/19', + 'AE': '94.200.0.0/13', + 'AF': '149.54.0.0/17', + 'AG': '209.59.64.0/18', + 'AI': '204.14.248.0/21', + 'AL': '46.99.0.0/16', + 'AM': '46.70.0.0/15', + 'AO': '105.168.0.0/13', + 'AP': '159.117.192.0/21', + 'AR': '181.0.0.0/12', + 'AS': '202.70.112.0/20', + 'AT': '84.112.0.0/13', + 'AU': '1.128.0.0/11', + 'AW': '181.41.0.0/18', + 'AZ': '5.191.0.0/16', + 'BA': '31.176.128.0/17', + 'BB': '65.48.128.0/17', + 'BD': '114.130.0.0/16', + 'BE': '57.0.0.0/8', + 'BF': '129.45.128.0/17', + 'BG': '95.42.0.0/15', + 'BH': '37.131.0.0/17', + 'BI': '154.117.192.0/18', + 'BJ': '137.255.0.0/16', + 'BL': '192.131.134.0/24', + 'BM': '196.12.64.0/18', + 'BN': '156.31.0.0/16', + 'BO': '161.56.0.0/16', + 'BQ': '161.0.80.0/20', + 'BR': '152.240.0.0/12', + 'BS': '24.51.64.0/18', + 'BT': '119.2.96.0/19', + 'BW': '168.167.0.0/16', + 'BY': '178.120.0.0/13', + 'BZ': '179.42.192.0/18', + 'CA': '99.224.0.0/11', + 'CD': '41.243.0.0/16', + 'CF': '196.32.200.0/21', + 'CG': '197.214.128.0/17', + 'CH': '85.0.0.0/13', + 'CI': '154.232.0.0/14', + 'CK': '202.65.32.0/19', + 'CL': '152.172.0.0/14', + 'CM': '165.210.0.0/15', + 'CN': '36.128.0.0/10', + 'CO': '181.240.0.0/12', + 'CR': '201.192.0.0/12', + 'CU': '152.206.0.0/15', + 'CV': '165.90.96.0/19', + 'CW': '190.88.128.0/17', + 'CY': '46.198.0.0/15', + 'CZ': '88.100.0.0/14', + 'DE': '53.0.0.0/8', + 'DJ': '197.241.0.0/17', + 'DK': '87.48.0.0/12', + 'DM': '192.243.48.0/20', + 'DO': '152.166.0.0/15', + 'DZ': '41.96.0.0/12', + 'EC': '186.68.0.0/15', + 'EE': '90.190.0.0/15', + 'EG': '156.160.0.0/11', + 'ER': '196.200.96.0/20', + 'ES': '88.0.0.0/11', + 'ET': '196.188.0.0/14', + 'EU': '2.16.0.0/13', + 'FI': '91.152.0.0/13', + 'FJ': '144.120.0.0/16', + 'FM': '119.252.112.0/20', + 'FO': '88.85.32.0/19', + 'FR': '90.0.0.0/9', + 'GA': '41.158.0.0/15', + 'GB': '25.0.0.0/8', + 'GD': '74.122.88.0/21', + 'GE': '31.146.0.0/16', + 'GF': '161.22.64.0/18', + 'GG': '62.68.160.0/19', + 'GH': '45.208.0.0/14', + 'GI': '85.115.128.0/19', + 'GL': '88.83.0.0/19', + 'GM': '160.182.0.0/15', + 'GN': '197.149.192.0/18', + 'GP': '104.250.0.0/19', + 'GQ': '105.235.224.0/20', + 'GR': '94.64.0.0/13', + 'GT': '168.234.0.0/16', + 'GU': '168.123.0.0/16', + 'GW': '197.214.80.0/20', + 'GY': '181.41.64.0/18', + 'HK': '113.252.0.0/14', + 'HN': '181.210.0.0/16', + 'HR': '93.136.0.0/13', + 'HT': '148.102.128.0/17', + 'HU': '84.0.0.0/14', + 'ID': '39.192.0.0/10', + 'IE': '87.32.0.0/12', + 'IL': '79.176.0.0/13', + 'IM': '5.62.80.0/20', + 'IN': '117.192.0.0/10', + 'IO': '203.83.48.0/21', + 'IQ': '37.236.0.0/14', + 'IR': '2.176.0.0/12', + 'IS': '82.221.0.0/16', + 'IT': '79.0.0.0/10', + 'JE': '87.244.64.0/18', + 'JM': '72.27.0.0/17', + 'JO': '176.29.0.0/16', + 'JP': '126.0.0.0/8', + 'KE': '105.48.0.0/12', + 'KG': '158.181.128.0/17', + 'KH': '36.37.128.0/17', + 'KI': '103.25.140.0/22', + 'KM': '197.255.224.0/20', + 'KN': '198.32.32.0/19', + 'KP': '175.45.176.0/22', + 'KR': '175.192.0.0/10', + 'KW': '37.36.0.0/14', + 'KY': '64.96.0.0/15', + 'KZ': '2.72.0.0/13', + 'LA': '115.84.64.0/18', + 'LB': '178.135.0.0/16', + 'LC': '192.147.231.0/24', + 'LI': '82.117.0.0/19', + 'LK': '112.134.0.0/15', + 'LR': '41.86.0.0/19', + 'LS': '129.232.0.0/17', + 'LT': '78.56.0.0/13', + 'LU': '188.42.0.0/16', + 'LV': '46.109.0.0/16', + 'LY': '41.252.0.0/14', + 'MA': '105.128.0.0/11', + 'MC': '88.209.64.0/18', + 'MD': '37.246.0.0/16', + 'ME': '178.175.0.0/17', + 'MF': '74.112.232.0/21', + 'MG': '154.126.0.0/17', + 'MH': '117.103.88.0/21', + 'MK': '77.28.0.0/15', + 'ML': '154.118.128.0/18', + 'MM': '37.111.0.0/17', + 'MN': '49.0.128.0/17', + 'MO': '60.246.0.0/16', + 'MP': '202.88.64.0/20', + 'MQ': '109.203.224.0/19', + 'MR': '41.188.64.0/18', + 'MS': '208.90.112.0/22', + 'MT': '46.11.0.0/16', + 'MU': '105.16.0.0/12', + 'MV': '27.114.128.0/18', + 'MW': '105.234.0.0/16', + 'MX': '187.192.0.0/11', + 'MY': '175.136.0.0/13', + 'MZ': '197.218.0.0/15', + 'NA': '41.182.0.0/16', + 'NC': '101.101.0.0/18', + 'NE': '197.214.0.0/18', + 'NF': '203.17.240.0/22', + 'NG': '105.112.0.0/12', + 'NI': '186.76.0.0/15', + 'NL': '145.96.0.0/11', + 'NO': '84.208.0.0/13', + 'NP': '36.252.0.0/15', + 'NR': '203.98.224.0/19', + 'NU': '49.156.48.0/22', + 'NZ': '49.224.0.0/14', + 'OM': '5.36.0.0/15', + 'PA': '186.72.0.0/15', + 'PE': '186.160.0.0/14', + 'PF': '123.50.64.0/18', + 'PG': '124.240.192.0/19', + 'PH': '49.144.0.0/13', + 'PK': '39.32.0.0/11', + 'PL': '83.0.0.0/11', + 'PM': '70.36.0.0/20', + 'PR': '66.50.0.0/16', + 'PS': '188.161.0.0/16', + 'PT': '85.240.0.0/13', + 'PW': '202.124.224.0/20', + 'PY': '181.120.0.0/14', + 'QA': '37.210.0.0/15', + 'RE': '139.26.0.0/16', + 'RO': '79.112.0.0/13', + 'RS': '178.220.0.0/14', + 'RU': '5.136.0.0/13', + 'RW': '105.178.0.0/15', + 'SA': '188.48.0.0/13', + 'SB': '202.1.160.0/19', + 'SC': '154.192.0.0/11', + 'SD': '154.96.0.0/13', + 'SE': '78.64.0.0/12', + 'SG': '152.56.0.0/14', + 'SI': '188.196.0.0/14', + 'SK': '78.98.0.0/15', + 'SL': '197.215.0.0/17', + 'SM': '89.186.32.0/19', + 'SN': '41.82.0.0/15', + 'SO': '197.220.64.0/19', + 'SR': '186.179.128.0/17', + 'SS': '105.235.208.0/21', + 'ST': '197.159.160.0/19', + 'SV': '168.243.0.0/16', + 'SX': '190.102.0.0/20', + 'SY': '5.0.0.0/16', + 'SZ': '41.84.224.0/19', + 'TC': '65.255.48.0/20', + 'TD': '154.68.128.0/19', + 'TG': '196.168.0.0/14', + 'TH': '171.96.0.0/13', + 'TJ': '85.9.128.0/18', + 'TK': '27.96.24.0/21', + 'TL': '180.189.160.0/20', + 'TM': '95.85.96.0/19', + 'TN': '197.0.0.0/11', + 'TO': '175.176.144.0/21', + 'TR': '78.160.0.0/11', + 'TT': '186.44.0.0/15', + 'TV': '202.2.96.0/19', + 'TW': '120.96.0.0/11', + 'TZ': '156.156.0.0/14', + 'UA': '93.72.0.0/13', + 'UG': '154.224.0.0/13', + 'US': '3.0.0.0/8', + 'UY': '167.56.0.0/13', + 'UZ': '82.215.64.0/18', + 'VA': '212.77.0.0/19', + 'VC': '24.92.144.0/20', + 'VE': '186.88.0.0/13', + 'VG': '172.103.64.0/18', + 'VI': '146.226.0.0/16', + 'VN': '14.160.0.0/11', + 'VU': '202.80.32.0/20', + 'WF': '117.20.32.0/21', + 'WS': '202.4.32.0/19', + 'YE': '134.35.0.0/16', + 'YT': '41.242.116.0/22', + 'ZA': '41.0.0.0/11', + 'ZM': '165.56.0.0/13', + 'ZW': '41.85.192.0/19', + } + + @classmethod + def random_ipv4(cls, code): + block = cls._country_ip_map.get(code.upper()) + if not block: + return None + addr, preflen = block.split('/') + addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] + addr_max = addr_min | (0xffffffff >> int(preflen)) + return socket.inet_ntoa( + compat_struct_pack('!I', random.randint(addr_min, addr_max))) + + class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers -- cgit v1.2.3 From 0016b84e16965a07c52946c4672363153e8b18a2 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sat, 4 Feb 2017 21:06:07 +0700 Subject: Add faked X-Forwarded-For to formats' HTTP headers --- youtube_dl/YoutubeDL.py | 14 ++++++++++++++ youtube_dl/extractor/common.py | 5 ++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ebace6b57..1c04e46c1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -864,8 +864,14 @@ class YoutubeDL(object): if self.params.get('playlistrandom', False): random.shuffle(entries) + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for extra = { 'n_entries': n_entries, 'playlist': playlist, @@ -1250,6 +1256,11 @@ class YoutubeDL(object): if cookies: res['Cookie'] = cookies + if 'X-Forwarded-For' not in res: + x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') + if x_forwarded_for_ip: + res['X-Forwarded-For'] = x_forwarded_for_ip + return res def _calc_cookies(self, info_dict): @@ -1392,6 +1403,9 @@ class YoutubeDL(object): full_format_info = info_dict.copy() full_format_info.update(format) format['http_headers'] = self._calc_headers(full_format_info) + # Remove private housekeeping stuff + if '__x_forwarded_for_ip' in info_dict: + del info_dict['__x_forwarded_for_ip'] # TODO Central sorting goes here diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 96815099d..c1f7f28a0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -384,7 +384,10 @@ class InfoExtractor(object): for _ in range(2): try: self.initialize() - return self._real_extract(url) + ie_result = self._real_extract(url) + if self._x_forwarded_for_ip: + ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip + return ie_result except GeoRestrictedError as e: if (not self._downloader.params.get('bypass_geo_restriction_as_country', None) and self._BYPASS_GEO and -- cgit v1.2.3 From 0a840f584c3f1fedb6957c05587dec697143f2d5 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sun, 19 Feb 2017 01:53:41 +0700 Subject: Rename bypass geo restriction options --- youtube_dl/YoutubeDL.py | 5 ++--- youtube_dl/__init__.py | 4 ++-- youtube_dl/extractor/common.py | 8 ++++---- youtube_dl/options.py | 12 ++++++------ 4 files changed, 14 insertions(+), 15 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 1c04e46c1..68000dea2 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -274,10 +274,9 @@ class YoutubeDL(object): If it returns None, the video is downloaded. match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. - bypass_geo_restriction: - Bypass geographic restriction via faking X-Forwarded-For + geo_bypass: Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental) - bypass_geo_restriction_as_country: + geo_bypass_country: Two-letter ISO 3166-2 country code that will be used for explicit geographic restriction bypassing via faking X-Forwarded-For HTTP header (experimental) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 94f461a78..f91d29a7b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -414,8 +414,8 @@ def _real_main(argv=None): 'cn_verification_proxy': opts.cn_verification_proxy, 'geo_verification_proxy': opts.geo_verification_proxy, 'config_location': opts.config_location, - 'bypass_geo_restriction': opts.bypass_geo_restriction, - 'bypass_geo_restriction_as_country': opts.bypass_geo_restriction_as_country, + 'geo_bypass': opts.geo_bypass, + 'geo_bypass_country': opts.geo_bypass_country, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c1f7f28a0..6eb6a25b8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -326,7 +326,7 @@ class InfoExtractor(object): _BYPASS_GEO attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. Though it won't disable explicit geo restriction bypass based on - country code provided with bypass_geo_restriction_as_country. + country code provided with geo_bypass_country. Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. @@ -371,7 +371,7 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" if not self._x_forwarded_for_ip: - country_code = self._downloader.params.get('bypass_geo_restriction_as_country', None) + country_code = self._downloader.params.get('geo_bypass_country', None) if country_code: self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if not self._ready: @@ -389,9 +389,9 @@ class InfoExtractor(object): ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip return ie_result except GeoRestrictedError as e: - if (not self._downloader.params.get('bypass_geo_restriction_as_country', None) and + if (not self._downloader.params.get('geo_bypass_country', None) and self._BYPASS_GEO and - self._downloader.params.get('bypass_geo_restriction', True) and + self._downloader.params.get('geo_bypass', True) and not self._x_forwarded_for_ip and e.countries): self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries)) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 2e194f6dc..ae3f50754 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -550,16 +550,16 @@ def parseOpts(overrideArguments=None): '(maximum possible number of seconds to sleep). Must only be used ' 'along with --min-sleep-interval.')) workarounds.add_option( - '--bypass-geo', - action='store_true', dest='bypass_geo_restriction', default=True, + '--geo-bypass', + action='store_true', dest='geo_bypass', default=True, help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') workarounds.add_option( - '--no-bypass-geo', - action='store_false', dest='bypass_geo_restriction', default=True, + '--no-geo-bypass', + action='store_false', dest='geo_bypass', default=True, help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') workarounds.add_option( - '--bypass-geo-as-country', metavar='CODE', - dest='bypass_geo_restriction_as_country', default=None, + '--geo-bypass-country', metavar='CODE', + dest='geo_bypass_country', default=None, help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') -- cgit v1.2.3 From 4248dad92bd87650c791194276296b148f668e68 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sun, 19 Feb 2017 03:53:23 +0700 Subject: Improve geo bypass mechanism * Rename options to preffixly match with --geo-verification-proxy * Introduce _GEO_COUNTRIES for extractors * Implement faking IP right away for sites with known geo restriction --- youtube_dl/extractor/common.py | 57 +++++++++++++++++++++++++---------- youtube_dl/extractor/dramafever.py | 3 +- youtube_dl/extractor/go.py | 3 +- youtube_dl/extractor/itv.py | 4 ++- youtube_dl/extractor/nrk.py | 4 ++- youtube_dl/extractor/ondemandkorea.py | 3 +- youtube_dl/extractor/pbs.py | 5 ++- youtube_dl/extractor/srgssr.py | 6 ++-- youtube_dl/extractor/svt.py | 4 ++- youtube_dl/extractor/vbox7.py | 3 +- youtube_dl/extractor/vgtv.py | 5 +-- youtube_dl/extractor/viki.py | 2 +- youtube_dl/utils.py | 2 +- 13 files changed, 71 insertions(+), 30 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6eb6a25b8..272da74b6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -323,10 +323,15 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. - _BYPASS_GEO attribute may be set to False in order to disable + _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. Though it won't disable explicit geo restriction bypass based on - country code provided with geo_bypass_country. + country code provided with geo_bypass_country. (experimental) + + _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted + countries for this extractor. One of these countries will be used by + geo restriction bypass mechanism right away in order to bypass + geo restriction, of course, if the mechanism is not disabled. (experimental) Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. @@ -335,7 +340,8 @@ class InfoExtractor(object): _ready = False _downloader = None _x_forwarded_for_ip = None - _BYPASS_GEO = True + _GEO_BYPASS = True + _GEO_COUNTRIES = None _WORKING = True def __init__(self, downloader=None): @@ -370,13 +376,27 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" + self.__initialize_geo_bypass() + if not self._ready: + self._real_initialize() + self._ready = True + + def __initialize_geo_bypass(self): if not self._x_forwarded_for_ip: country_code = self._downloader.params.get('geo_bypass_country', None) + # If there is no explicit country for geo bypass specified and + # the extractor is known to be geo restricted let's fake IP + # as X-Forwarded-For right away. + if (not country_code and + self._GEO_BYPASS and + self._downloader.params.get('geo_bypass', True) and + self._GEO_COUNTRIES): + country_code = random.choice(self._GEO_COUNTRIES) if country_code: self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) - if not self._ready: - self._real_initialize() - self._ready = True + if self._downloader.params.get('verbose', False): + self._downloader.to_stdout( + '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) def extract(self, url): """Extracts URL information and returns it in list of dicts.""" @@ -389,16 +409,8 @@ class InfoExtractor(object): ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip return ie_result except GeoRestrictedError as e: - if (not self._downloader.params.get('geo_bypass_country', None) and - self._BYPASS_GEO and - self._downloader.params.get('geo_bypass', True) and - not self._x_forwarded_for_ip and - e.countries): - self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries)) - if self._x_forwarded_for_ip: - self.report_warning( - 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) - continue + if self.__maybe_fake_ip_and_retry(e.countries): + continue raise except ExtractorError: raise @@ -407,6 +419,19 @@ class InfoExtractor(object): except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e) + def __maybe_fake_ip_and_retry(self, countries): + if (not self._downloader.params.get('geo_bypass_country', None) and + self._GEO_BYPASS and + self._downloader.params.get('geo_bypass', True) and + not self._x_forwarded_for_ip and + countries): + self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries)) + if self._x_forwarded_for_ip: + self.report_warning( + 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + return True + return False + def set_downloader(self, downloader): """Sets the downloader for this IE.""" self._downloader = downloader diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 755db806a..e7abc8889 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -20,6 +20,7 @@ from ..utils import ( class DramaFeverBaseIE(AMPIE): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' + _GEO_COUNTRIES = ['US', 'CA'] _CONSUMER_SECRET = 'DA59dtVXYLxajktV' @@ -118,7 +119,7 @@ class DramaFeverIE(DramaFeverBaseIE): if isinstance(e.cause, compat_HTTPError): self.raise_geo_restricted( msg='Currently unavailable in your country', - countries=['US', 'CA']) + countries=self._GEO_COUNTRIES) raise series_id, episode_number = video_id.split('.') diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index ec902c670..b205bfc7c 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -37,6 +37,7 @@ class GoIE(AdobePassIE): } } _VALID_URL = r'https?://(?:(?P%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P\w+)|season-\d+/\d+-(?P[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _GEO_COUNTRIES = ['US'] _TESTS = [{ 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', 'info_dict': { @@ -104,7 +105,7 @@ class GoIE(AdobePassIE): for error in errors: if error.get('code') == 1002: self.raise_geo_restricted( - error['message'], countries=['US']) + error['message'], countries=self._GEO_COUNTRIES) error_message = ', '.join([error['message'] for error in errors]) raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) asset_url += '?' + entitlement['uplynkData']['sessionKey'] diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index aabde15f3..021c6b278 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -24,6 +24,7 @@ from ..utils import ( class ITVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-zA-Z]+)' + _GEO_COUNTRIES = ['GB'] _TEST = { 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', 'info_dict': { @@ -101,7 +102,8 @@ class ITVIE(InfoExtractor): fault_code = xpath_text(resp_env, './/faultcode') fault_string = xpath_text(resp_env, './/faultstring') if fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted(msg=fault_string, countries=['GB']) + self.raise_geo_restricted( + msg=fault_string, countries=self._GEO_COUNTRIES) raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) title = xpath_text(playlist, 'EpisodeTitle', fatal=True) video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 78ece33e1..13af9ed1f 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -14,6 +14,7 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['NO'] def _real_extract(self, url): video_id = self._match_id(url) @@ -93,7 +94,8 @@ class NRKBaseIE(InfoExtractor): # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* if 'IsGeoBlocked' in message_type: self.raise_geo_restricted( - msg=MESSAGES.get('ProgramIsGeoBlocked'), countries=['NO']) + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) raise ExtractorError( '%s said: %s' % (self.IE_NAME, MESSAGES.get( message_type, message_type)), diff --git a/youtube_dl/extractor/ondemandkorea.py b/youtube_dl/extractor/ondemandkorea.py index 0c85d549e..df1ce3c1d 100644 --- a/youtube_dl/extractor/ondemandkorea.py +++ b/youtube_dl/extractor/ondemandkorea.py @@ -10,6 +10,7 @@ from ..utils import ( class OnDemandKoreaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P[^/]+)\.html' + _GEO_COUNTRIES = ['US', 'CA'] _TEST = { 'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html', 'info_dict': { @@ -36,7 +37,7 @@ class OnDemandKoreaIE(InfoExtractor): if 'msg_block_01.png' in webpage: self.raise_geo_restricted( msg='This content is not available in your region', - countries=['US', 'CA']) + countries=self._GEO_COUNTRIES) if 'This video is only available to ODK PLUS members.' in webpage: raise ExtractorError( diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 64f47bae3..3e51b4dd7 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -193,6 +193,8 @@ class PBSIE(InfoExtractor): ) ''' % '|'.join(list(zip(*_STATIONS))[0]) + _GEO_COUNTRIES = ['US'] + _TESTS = [ { 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', @@ -492,7 +494,8 @@ class PBSIE(InfoExtractor): message = self._ERRORS.get( redirect_info['http_code'], redirect_info['message']) if redirect_info['http_code'] == 403: - self.raise_geo_restricted(msg=message, countries=['US']) + self.raise_geo_restricted( + msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( '%s said: %s' % (self.IE_NAME, message), expected=True) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index a35a0a538..bb73eb1d5 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -14,7 +14,8 @@ from ..utils import ( class SRGSSRIE(InfoExtractor): _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?Psrf|rts|rsi|rtr|swi):(?:[^:]+:)?(?Pvideo|audio):(?P[0-9a-f\-]{36}|\d+)' - _BYPASS_GEO = False + _GEO_BYPASS = False + _GEO_COUNTRIES = ['CH'] _ERRORS = { 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', @@ -43,7 +44,8 @@ class SRGSSRIE(InfoExtractor): if media_data.get('block') and media_data['block'] in self._ERRORS: message = self._ERRORS[media_data['block']] if media_data['block'] == 'GEOBLOCK': - self.raise_geo_restricted(msg=message, countries=['CH']) + self.raise_geo_restricted( + msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( '%s said: %s' % (self.IE_NAME, message), expected=True) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index f2a2200bf..9e2c9fcc6 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -13,6 +13,7 @@ from ..utils import ( class SVTBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['SE'] def _extract_video(self, video_info, video_id): formats = [] for vr in video_info['videoReferences']: @@ -39,7 +40,8 @@ class SVTBaseIE(InfoExtractor): }) if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): self.raise_geo_restricted( - 'This video is only available in Sweden', countries=['SE']) + 'This video is only available in Sweden', + countries=self._GEO_COUNTRIES) self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index f86d804c1..8152acefd 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -20,6 +20,7 @@ class Vbox7IE(InfoExtractor): ) (?P[\da-fA-F]+) ''' + _GEO_COUNTRIES = ['BG'] _TESTS = [{ 'url': 'http://vbox7.com/play:0946fff23c', 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', @@ -78,7 +79,7 @@ class Vbox7IE(InfoExtractor): video_url = video['src'] if '/na.mp4' in video_url: - self.raise_geo_restricted(countries=['BG']) + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) uploader = video.get('uploader') diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 1709fd6bb..0f8c156a7 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -14,7 +14,7 @@ from ..utils import ( class VGTVIE(XstreamIE): IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' - _BYPASS_GEO = False + _GEO_BYPASS = False _HOST_TO_APPNAME = { 'vgtv.no': 'vgtv', @@ -218,7 +218,8 @@ class VGTVIE(XstreamIE): properties = try_get( data, lambda x: x['streamConfiguration']['properties'], list) if properties and 'geoblocked' in properties: - raise self.raise_geo_restricted(countries=['NO']) + raise self.raise_geo_restricted( + countries=[host.rpartition('.')[-1].partition('/')[0].upper()]) self._sort_formats(info['formats']) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 68a74e246..e9c8bf824 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -27,7 +27,7 @@ class VikiBaseIE(InfoExtractor): _APP_VERSION = '2.2.5.1428709186' _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' - _BYPASS_GEO = False + _GEO_BYPASS = False _NETRC_MACHINE = 'viki' _token = None diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index cbf7639c5..17b83794a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3291,7 +3291,7 @@ class GeoUtils(object): addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] addr_max = addr_min | (0xffffffff >> int(preflen)) return compat_str(socket.inet_ntoa( - compat_struct_pack('!I', random.randint(addr_min, addr_max)))) + compat_struct_pack('!L', random.randint(addr_min, addr_max)))) class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): -- cgit v1.2.3 From 3ccdde8cb76cacb7b2b64469ca51d3b1877da1f6 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Mon, 20 Feb 2017 23:21:15 +0700 Subject: [extractor/common] Emphasize geo bypass APIs are experimental --- youtube_dl/extractor/common.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 272da74b6..1ae264722 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -333,6 +333,9 @@ class InfoExtractor(object): geo restriction bypass mechanism right away in order to bypass geo restriction, of course, if the mechanism is not disabled. (experimental) + NB: both these geo attributes are experimental and may change in future + or be completely removed. + Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ -- cgit v1.2.3 From e39b5d4ab83de7a466c6d4c9528d385758566b22 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Tue, 21 Feb 2017 23:00:43 +0700 Subject: [extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970) --- youtube_dl/extractor/common.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1ae264722..86aff3312 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -379,12 +379,31 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" - self.__initialize_geo_bypass() + self._initialize_geo_bypass(self._GEO_COUNTRIES) if not self._ready: self._real_initialize() self._ready = True - def __initialize_geo_bypass(self): + def _initialize_geo_bypass(self, countries): + """ + Initialize geo restriction bypass mechanism. + + This method is used to initialize geo bypass mechanism based on faking + X-Forwarded-For HTTP header. A random country from provided country list + is selected and a random IP brlonging to this country is generated. This + IP will be passed as X-Forwarded-For HTTP header in all subsequent + HTTP requests. + Method does nothing if no countries are specified. + + This method will be used for initial geo bypass mechanism initialization + during the instance initialization with _GEO_COUNTRIES. + + You may also manually call it from extractor's code if geo countries + information is not available beforehand (e.g. obtained during + extraction) or due to some another reason. + """ + if not countries: + return if not self._x_forwarded_for_ip: country_code = self._downloader.params.get('geo_bypass_country', None) # If there is no explicit country for geo bypass specified and @@ -393,8 +412,8 @@ class InfoExtractor(object): if (not country_code and self._GEO_BYPASS and self._downloader.params.get('geo_bypass', True) and - self._GEO_COUNTRIES): - country_code = random.choice(self._GEO_COUNTRIES) + countries): + country_code = random.choice(countries) if country_code: self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._downloader.params.get('verbose', False): -- cgit v1.2.3 From dc0a869e5ee7a75218a759706bb11f17c4de6b72 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Tue, 21 Feb 2017 23:05:31 +0700 Subject: [extractor/common] Fix typo --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 86aff3312..6d4789d96 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -390,7 +390,7 @@ class InfoExtractor(object): This method is used to initialize geo bypass mechanism based on faking X-Forwarded-For HTTP header. A random country from provided country list - is selected and a random IP brlonging to this country is generated. This + is selected and a random IP belonging to this country is generated. This IP will be passed as X-Forwarded-For HTTP header in all subsequent HTTP requests. Method does nothing if no countries are specified. -- cgit v1.2.3 From 336a76551b92db1c040cbf3c4a9b1857e125ad45 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Tue, 21 Feb 2017 23:09:41 +0700 Subject: [extractor/common] Do not quit _initialize_geo_bypass on empty countries --- youtube_dl/extractor/common.py | 3 --- 1 file changed, 3 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6d4789d96..a34fbbc9b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -393,7 +393,6 @@ class InfoExtractor(object): is selected and a random IP belonging to this country is generated. This IP will be passed as X-Forwarded-For HTTP header in all subsequent HTTP requests. - Method does nothing if no countries are specified. This method will be used for initial geo bypass mechanism initialization during the instance initialization with _GEO_COUNTRIES. @@ -402,8 +401,6 @@ class InfoExtractor(object): information is not available beforehand (e.g. obtained during extraction) or due to some another reason. """ - if not countries: - return if not self._x_forwarded_for_ip: country_code = self._downloader.params.get('geo_bypass_country', None) # If there is no explicit country for geo bypass specified and -- cgit v1.2.3 From eea0716cae1290fe08faea89e24a58ec91098638 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Tue, 21 Feb 2017 23:14:33 +0700 Subject: [extractor/common] Print origin country for fake IP --- youtube_dl/extractor/common.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a34fbbc9b..4252d6825 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -415,7 +415,8 @@ class InfoExtractor(object): self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._downloader.params.get('verbose', False): self._downloader.to_stdout( - '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + '[debug] Using fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) def extract(self, url): """Extracts URL information and returns it in list of dicts.""" @@ -444,10 +445,12 @@ class InfoExtractor(object): self._downloader.params.get('geo_bypass', True) and not self._x_forwarded_for_ip and countries): - self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries)) + country_code = random.choice(countries) + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._x_forwarded_for_ip: self.report_warning( - 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) return True return False -- cgit v1.2.3 From eeb0a9568442a4dbbf3478579abe2696fbe890e2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 25 Feb 2017 18:40:05 +0800 Subject: [extractor/common] Add 'preference' to _parse_html5_media_entries Some websites, like NJPWorld, put different qualities on different player pages. --- youtube_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4252d6825..eb3c091aa 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2010,7 +2010,7 @@ class InfoExtractor(object): }) return formats - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -2032,7 +2032,8 @@ class InfoExtractor(object): is_plain_url = False formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', - entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id) + entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, + preference=preference) elif ext == 'mpd': is_plain_url = False formats = self._extract_mpd_formats( -- cgit v1.2.3 From ed0cf9b38394b28bae5f05fd6b00c85a9c0e6755 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sun, 5 Mar 2017 23:22:27 +0700 Subject: [extractor/common] Move jwplayer formats extraction in separate method --- youtube_dl/extractor/common.py | 106 ++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 50 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb3c091aa..9b73a948c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2198,56 +2198,7 @@ class InfoExtractor(object): this_video_id = video_id or video_data['mediaid'] - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - source_url, this_video_id, mpd_id=mpd_id, fatal=False)) - # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - 'ext': ext, - }) - else: - height = int_or_none(source.get('height')) - if height is None: - # Often no height is provided but there is a label in - # format like 1080p. - height = int_or_none(self._search_regex( - r'^(\d{3,})[pP]$', source.get('label') or '', - 'height', default=None)) - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': height, - 'ext': ext, - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv' - - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) + formats = self._parse_jwplayer_formats(video_data['sources'], this_video_id) self._sort_formats(formats) subtitles = {} @@ -2278,6 +2229,61 @@ class InfoExtractor(object): else: return self.playlist_result(entries) + def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + formats = [] + for source in jwplayer_sources_data : + source_url = self._proto_relative_url(source['file']) + if base_url: + source_url = compat_urlparse.urljoin(base_url, source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + source_url, video_id, mpd_id=mpd_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, + }) + else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like 1080p. + height = int_or_none(self._search_regex( + r'^(\d{3,})[pP]$', source.get('label') or '', + 'height', default=None)) + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': height, + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' + + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + return formats + + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() -- cgit v1.2.3 From 0236cd0dfde1dda540c1067a9c5982d482005c47 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sun, 5 Mar 2017 23:25:03 +0700 Subject: [extractor/common] Improve height extraction and extract bitrate --- youtube_dl/extractor/common.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9b73a948c..2887db0c3 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2240,12 +2240,14 @@ class InfoExtractor(object): ext = mimetype2ext(source_type) or determine_ext(source_url) if source_type == 'hls' or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=m3u8_id, fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + elif source_type.startswith('audio') or ext in ( + 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): formats.append({ 'url': source_url, 'vcodec': 'none', @@ -2255,19 +2257,19 @@ class InfoExtractor(object): height = int_or_none(source.get('height')) if height is None: # Often no height is provided but there is a label in - # format like 1080p. + # format like "1080p", "720p SD", or 1080. height = int_or_none(self._search_regex( - r'^(\d{3,})[pP]$', source.get('label') or '', + r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), 'height', default=None)) a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), 'height': height, + 'tbr': int_or_none(source.get('bitrate')), 'ext': ext, } if source_url.startswith('rtmp'): a_format['ext'] = 'flv' - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as # of jwplayer.flash.swf rtmp_url_parts = re.split( -- cgit v1.2.3 From 1a2192cb904ff42a309ab2c2477fc226f8651f33 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sun, 5 Mar 2017 23:28:32 +0700 Subject: [extractor/common] Pass arguments to _parse_jwplayer_formats and PEP8 --- youtube_dl/extractor/common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2887db0c3..78dc5be24 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2198,7 +2198,9 @@ class InfoExtractor(object): this_video_id = video_id or video_data['mediaid'] - formats = self._parse_jwplayer_formats(video_data['sources'], this_video_id) + formats = self._parse_jwplayer_formats( + video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, + mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) self._sort_formats(formats) subtitles = {} @@ -2232,7 +2234,7 @@ class InfoExtractor(object): def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): formats = [] - for source in jwplayer_sources_data : + for source in jwplayer_sources_data: source_url = self._proto_relative_url(source['file']) if base_url: source_url = compat_urlparse.urljoin(base_url, source_url) @@ -2285,7 +2287,6 @@ class InfoExtractor(object): formats.append(a_format) return formats - def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() -- cgit v1.2.3 From b51dc9db0e6ffc6a7725d92fa2c5de45a5b1be20 Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Thu, 16 Mar 2017 03:30:53 +0700 Subject: [extractor/common] Extract SMIL formats from jwplayer --- youtube_dl/extractor/common.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78dc5be24..b51799bfa 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2247,6 +2247,9 @@ class InfoExtractor(object): elif ext == 'mpd': formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + source_url, video_id, fatal=False)) # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 elif source_type.startswith('audio') or ext in ( 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): -- cgit v1.2.3 From 46b18f2349670d395b9d84a57ee3d9b5d221ff4b Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Wed, 8 Mar 2017 18:13:54 -0500 Subject: [BostonGlobe] New. Nonstandard version of Brightcove. Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise pretty much just Brightcove. Except the Globe isn't all Brightcove videos, so fallback to Generic, too. Also, abstract playlist_from_matches() from generic.py to common.py, and use it here. History of these changes can be found in 51170427d4b1143572a498dedaee61863a5b2c5b. --- youtube_dl/extractor/bostonglobe.py | 72 ++++++++++++++++++++++++++++++++++ youtube_dl/extractor/common.py | 28 +++++++++----- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 77 +++++++++++++++++-------------------- 4 files changed, 126 insertions(+), 52 deletions(-) create mode 100644 youtube_dl/extractor/bostonglobe.py (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/bostonglobe.py b/youtube_dl/extractor/bostonglobe.py new file mode 100644 index 000000000..57882fbee --- /dev/null +++ b/youtube_dl/extractor/bostonglobe.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + extract_attributes, +) + + +class BostonGlobeIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P[^/]+)/\w+(?:\.html)?' + _TESTS = [ + { + 'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', + 'md5': '0a62181079c85c2d2b618c9a738aedaf', + 'info_dict': { + 'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', + 'id': '5320421710001', + 'ext': 'mp4', + 'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', + 'timestamp': 1486877593, + 'upload_date': '20170212', + 'uploader_id': '245991542', + }, + }, + { + # Embedded youtube video; we hand it off to the Generic extractor. + 'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', + 'md5': '582b40327089d5c0c949b3c54b13c24b', + 'info_dict': { + 'title': "Who Is Matt Damon's Favorite Batman?", + 'id': 'ZW1QCnlA6Qc', + 'ext': 'mp4', + 'upload_date': '20170217', + 'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', + 'uploader': 'The Late Late Show with James Corden', + 'uploader_id': 'TheLateLateShow', + }, + 'expected_warnings': ['404'], + }, + ] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + page_title = self._og_search_title(webpage, default=None) + + #